| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-05 20:11:24 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-05 20:11:24 +0300 |
| commit | 687ee0ad4e897e29f4b41f7a20c866d74c5e0660 (patch) | |
| tree | b31a2af35c24a54823674cdd126993b80daeac67 /net | |
| parent | 3ddf40e8c31964b744ff10abb48c8e36a83ec6e7 (diff) | |
| parent | 03a1eabc3f54469abd4f1784182851b2e29630cc (diff) | |
| download | linux-687ee0ad4e897e29f4b41f7a20c866d74c5e0660.tar.xz | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) BBR TCP congestion control, from Neal Cardwell, Yuchung Cheng and
   co. at Google. https://lwn.net/Articles/701165/ (a usage sketch
   follows this list)
2) Do TCP Small Queues for retransmits, from Eric Dumazet.
3) Support collect_md mode for all IPV4 and IPV6 tunnels, from Alexei
Starovoitov.
4) Allow cls_flower to classify packets in ip tunnels, from Amir Vadai.
5) Support DSA tagging in older mv88e6xxx switches, from Andrew Lunn.
6) Support GMAC protocol in iwlwifi mvm, from Ayala Beker.
7) Support ndo_poll_controller in mlx5, from Calvin Owens.
8) Move VRF processing to an output hook and allow l3mdev to be
loopback, from David Ahern.
9) Support SOCK_DESTROY for UDP sockets. Also from David Ahern.
10) Congestion control in RXRPC, from David Howells.
11) Support geneve RX offload in ixgbe, from Emil Tantilov.
12) When hitting pressure for new incoming TCP data SKBs, perform a
    partial rather than a full purge of the OFO queue (which could be
    huge). From Eric Dumazet.
13) Convert XFRM state and policy lookups to RCU, from Florian Westphal.
14) Add RX network flow classification to igb, from Gangfeng Huang.
15) Hardware offloading of eBPF in nfp driver, from Jakub Kicinski.
16) New skbmod packet action, from Jamal Hadi Salim.
17) Remove some inefficiencies in snmp proc output, from Jia He.
18) Add FIB notifications to properly propagate route changes to
hardware which is doing forwarding offloading. From Jiri Pirko.
19) New dsa driver for qca8xxx chips, from John Crispin.
20) Implement RFC7559 ipv6 router solicitation backoff, from Maciej
    Żenczykowski. (a backoff sketch follows this list)
21) Add L3 mode to ipvlan, from Mahesh Bandewar.
22) Support 802.1ad in mlx4, from Moshe Shemesh.
23) Support hardware LRO in mediatek driver, from Nelson Chang.
24) Add TC offloading to mlx5, from Or Gerlitz.
25) Convert various drivers to ethtool ksettings interfaces, from
Philippe Reynes.
26) TX max rate limiting for cxgb4, from Rahul Lakkireddy.
27) NAPI support for ath10k, from Rajkumar Manoharan.
28) Support XDP in mlx5, from Rana Shahout and Saeed Mahameed.
29) UDP replicast support in TIPC, from Richard Alpe.
30) Per-queue statistics for qed driver, from Sudarsana Reddy Kalluru.
31) Support BQL in thunderx driver, from Sunil Goutham.
32) TSO support in alx driver, from Tobias Regnery.
33) Add stream parser engine and use it in kcm.
34) Support async DHCP replies in ipconfig module, from Uwe
Kleine-König.
35) DSA port fast aging for mv88e6xxx driver, from Vivien Didelot.
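For item 1: BBR is selected per socket through the long-standing TCP_CONGESTION socket option (or system-wide via the net.ipv4.tcp_congestion_control sysctl). A minimal userspace sketch, assuming the tcp_bbr module can be loaded on the running kernel:

```c
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	char name[16] = { 0 };
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	/* Request BBR on this socket; fails (e.g. ENOENT) when the
	 * tcp_bbr module is not available. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bbr", 3) < 0)
		perror("setsockopt(TCP_CONGESTION)");
	/* Read back which congestion control the socket ended up with. */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %s\n", name);
	close(fd);
	return 0;
}
```

At this stage BBR also relies on the fq qdisc for packet pacing, so pairing it with `tc qdisc replace dev eth0 root fq` (interface name illustrative) is the typical deployment.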
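For item 20: RFC 7559 turns the fixed three router solicitations into potentially unbounded retransmissions governed by the RFC 3315 backoff algorithm (IRT = 4 s, MRT = 3600 s, randomization factor drawn from [-0.1, 0.1]). The self-contained sketch below only illustrates the resulting schedule; the kernel's implementation lives in net/ipv6/addrconf.c:

```c
#include <stdio.h>
#include <stdlib.h>

/* Uniform random factor in [-0.1, 0.1], per RFC 3315. */
static double rand_factor(void)
{
	return -0.1 + 0.2 * ((double)rand() / RAND_MAX);
}

int main(void)
{
	const double irt = 4.0;    /* RTR_SOLICITATION_INTERVAL, seconds */
	const double mrt = 3600.0; /* maximum retransmission time */
	double rt = irt + rand_factor() * irt;
	double elapsed = 0.0;

	for (int i = 1; i <= 12; i++) {
		elapsed += rt;
		printf("RS %2d sent at t=%7.1fs (interval %.1fs)\n",
		       i, elapsed, rt);
		rt = 2.0 * rt + rand_factor() * rt; /* double each round */
		if (rt > mrt)
			rt = mrt + rand_factor() * mrt; /* cap near MRT */
	}
	return 0;
}
```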
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1715 commits)
mlxsw: switchx2: Fix misuse of hard_header_len
mlxsw: spectrum: Fix misuse of hard_header_len
net/faraday: Stop NCSI device on shutdown
net/ncsi: Introduce ncsi_stop_dev()
net/ncsi: Rework the channel monitoring
net/ncsi: Allow to extend NCSI request properties
net/ncsi: Rework request index allocation
net/ncsi: Don't probe on the reserved channel ID (0x1f)
net/ncsi: Introduce NCSI_RESERVED_CHANNEL
net/ncsi: Avoid unused-value build warning from ia64-linux-gcc
net: Add netdev all_adj_list refcnt propagation to fix panic
net: phy: Add Edge-rate driver for Microsemi PHYs.
vmxnet3: Wake queue from reset work
i40e: avoid NULL pointer dereference and recursive errors on early PCI error
qed: Add RoCE ll2 & GSI support
qed: Add support for memory registeration verbs
qed: Add support for QP verbs
qed: PD,PKEY and CQ verb support
qed: Add support for RoCE hw init
qede: Add qedr framework
...
Diffstat (limited to 'net')
409 files changed, 23495 insertions, 11306 deletions
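A recurring mechanical change in the diff below replaces `defined(CONFIG_FOO) || defined(CONFIG_FOO_MODULE)` tests with `IS_ENABLED(CONFIG_FOO)` from <linux/kconfig.h>. The two forms are equivalent in preprocessor conditionals, but IS_ENABLED() additionally expands to a plain 0 or 1, so it also works in ordinary C code. A minimal illustration:

```c
#include <linux/kconfig.h>
#include <linux/printk.h>

/* Old style: only usable in preprocessor conditionals. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
#define HAVE_BRIDGE_OLD 1
#endif

/* New style: IS_ENABLED(option) is 1 when the option is y or m,
 * and 0 otherwise -- even when the macro is entirely undefined. */
#if IS_ENABLED(CONFIG_BRIDGE)
#define HAVE_BRIDGE_NEW 1
#endif

static void rx_path(void)
{
	/* Because it expands to 0/1 it is also valid in plain C: the
	 * dead branch is still type-checked but discarded at compile
	 * time, unlike code hidden inside an #ifdef block. */
	if (IS_ENABLED(CONFIG_BRIDGE))
		pr_debug("bridging enabled\n");
}
```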
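The largest single theme in the batman-adv portion of the diff is the addition of generic-netlink dump handlers (BATADV_CMD_GET_ORIGINATORS, BATADV_CMD_GET_NEIGHBORS, BATADV_CMD_GET_GATEWAYS, ...) alongside the older debugfs tables. A hedged userspace sketch of how such a dump is consumed, assuming libnl-3 and a mesh interface named bat0:

```c
/* Build: cc dump.c $(pkg-config --cflags --libs libnl-genl-3.0) */
#include <stdio.h>
#include <net/if.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/batman_adv.h>

/* One NLM_F_MULTI part arrives per originator/neighbour pair, as
 * emitted by batadv_iv_ogm_orig_dump_subentry() in the diff below. */
static int print_entry(struct nl_msg *msg, void *arg)
{
	struct nlattr *attrs[BATADV_ATTR_MAX + 1];

	if (genlmsg_parse(nlmsg_hdr(msg), 0, attrs, BATADV_ATTR_MAX, NULL))
		return NL_SKIP;
	if (attrs[BATADV_ATTR_TQ])
		printf("tq=%u best=%s\n",
		       nla_get_u8(attrs[BATADV_ATTR_TQ]),
		       attrs[BATADV_ATTR_FLAG_BEST] ? "yes" : "no");
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family;

	genl_connect(sk);
	family = genl_ctrl_resolve(sk, BATADV_NL_NAME); /* "batadv" */

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0,
		    NLM_F_DUMP, BATADV_CMD_GET_ORIGINATORS, 1);
	nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, if_nametoindex("bat0"));

	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, print_entry, NULL);
	nl_send_auto(sk, msg);
	nlmsg_free(msg);
	nl_recvmsgs_default(sk);

	nl_socket_free(sk);
	return 0;
}
```

The cb->args[] bucket/index/sub-entry cursor visible in batadv_iv_ogm_orig_dump() is what lets the kernel resume such a dump across multiple reply skbs.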
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 86450b7e2899..941df2fa4448 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -101,8 +101,6 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, ieee802154_be16_to_le16(&neigh->short_addr, lladdr_short); if (!lowpan_802154_is_valid_src_short_addr(neigh->short_addr)) neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); - } else { - neigh->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC); } write_unlock_bh(&n->lock); } diff --git a/net/Kconfig b/net/Kconfig index c2cdbce629bd..7b6cd340b72b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -369,6 +369,7 @@ source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" +source "net/strparser/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9bd20bb86cc6..4cafaa2b4667 100644 --- a/net/Makefile +++ b/net/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ obj-$(CONFIG_AF_KCM) += kcm/ +obj-$(CONFIG_STREAM_PARSER) += strparser/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index f066781be3c8..10d2bdce686e 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1278,7 +1278,7 @@ out: return err; } -#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE) +#if IS_ENABLED(CONFIG_IPDDP) static __inline__ int is_ip_over_ddp(struct sk_buff *skb) { return skb->data[12] == 22; diff --git a/net/atm/lec.c b/net/atm/lec.c index e574a7e9db6f..5d2693826afb 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -31,7 +31,7 @@ #include <linux/atmlec.h> /* Proxy LEC knows about bridging */ -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) #include "../bridge/br_private.h" static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 }; @@ -121,7 +121,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) { char *buff; @@ -155,7 +155,7 @@ static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) sk->sk_data_ready(sk); } } -#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ +#endif /* IS_ENABLED(CONFIG_BRIDGE) */ /* * Open/initialize the netdevice. 
This is called (in the current kernel) @@ -222,7 +222,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb, pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n", (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), (long)skb_end_pointer(skb)); -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) lec_handle_bridge(skb, dev); #endif @@ -426,7 +426,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) (unsigned short)(0xffff & mesg->content.normal.flag); break; case l_should_bridge: -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#if IS_ENABLED(CONFIG_BRIDGE) { pr_debug("%s: bridge zeppelin asks about %pM\n", dev->name, mesg->content.proxy.mac_addr); @@ -452,7 +452,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) sk->sk_data_ready(sk); } } -#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ +#endif /* IS_ENABLED(CONFIG_BRIDGE) */ break; default: pr_info("%s: Unknown message type %d\n", dev->name, mesg->type); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 0e982222d425..3b3b1a292ec8 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1007,7 +1007,7 @@ static int mpoa_event_listener(struct notifier_block *mpoa_notifier, if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; - if (dev->name == NULL || strncmp(dev->name, "lec", 3)) + if (strncmp(dev->name, "lec", 3)) return NOTIFY_DONE; /* we are only interested in lec:s */ switch (event) { diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 833bb145ba3c..f20742cbae6d 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -73,10 +73,21 @@ config BATMAN_ADV_MCAST reduce the air overhead while improving the reliability of multicast messages. -config BATMAN_ADV_DEBUG - bool "B.A.T.M.A.N. debugging" +config BATMAN_ADV_DEBUGFS + bool "batman-adv debugfs entries" depends on BATMAN_ADV depends on DEBUG_FS + default y + help + Enable this to export routing related debug tables via debugfs. + The information for each soft-interface and used hard-interface can be + found under batman_adv/ + + If unsure, say Y. + +config BATMAN_ADV_DEBUG + bool "B.A.T.M.A.N. debugging" + depends on BATMAN_ADV_DEBUGFS help This is an option for use by developers; most people should say N here. 
This enables compilation of support for diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index a83fc6c58d19..f724d3c98a81 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -24,14 +24,14 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o batman-adv-y += bitarray.o batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o -batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o batman-adv-y += fragmentation.o batman-adv-y += gateway_client.o batman-adv-y += gateway_common.o batman-adv-y += hard-interface.o batman-adv-y += hash.o -batman-adv-y += icmp_socket.o +batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 81dbbf569bd4..623d04302aa2 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -20,12 +20,18 @@ #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> +#include <linux/netlink.h> #include <linux/printk.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" +#include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; @@ -95,6 +101,7 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name) return 0; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) { struct batadv_algo_ops *bat_algo_ops; @@ -107,6 +114,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) return 0; } +#endif static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { @@ -138,3 +146,65 @@ static struct kparam_string batadv_param_string_ra = { module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); + +/** + * batadv_algo_dump_entry - fill in information about one supported routing + * algorithm + * @msg: netlink message to be sent back + * @portid: Port to reply to + * @seq: Sequence number of message + * @bat_algo_ops: Algorithm to be dumped + * + * Return: Error number, or 0 on success + */ +static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_algo_ops *bat_algo_ops) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_algo_dump - fill in information about supported routing + * algorithms + * @msg: netlink message to be sent back + * @cb: Parameters to the netlink request + * + * Return: Length of reply message. 
+ */ +int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_algo_ops *bat_algo_ops; + int skip = cb->args[0]; + int i = 0; + + hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { + if (i++ < skip) + continue; + + if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_algo_ops)) { + i--; + break; + } + } + + cb->args[0] = i; + + return msg->len; +} diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 860d773dd8fa..3b5b69cdd12b 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -22,7 +22,9 @@ #include <linux/types.h> +struct netlink_callback; struct seq_file; +struct sk_buff; extern char batadv_routing_algo[]; extern struct list_head batadv_hardif_list; @@ -31,5 +33,6 @@ void batadv_algo_init(void); int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); int batadv_algo_select(struct batadv_priv *bat_priv, char *name); int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); +int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb); #endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 19b0abd6c640..e2d18d0b1f06 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -35,6 +35,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/pkt_sched.h> #include <linux/printk.h> #include <linux/random.h> @@ -48,12 +49,17 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bitarray.h" +#include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "network-coding.h" #include "originator.h" #include "packet.h" @@ -318,17 +324,18 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) if (!orig_node->bat_iv.bcast_own_sum) goto free_orig_node; + kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) - goto free_orig_node; + goto free_orig_node_hash; return orig_node; -free_orig_node: - /* free twice, as batadv_orig_node_new sets refcount to 2 */ +free_orig_node_hash: batadv_orig_node_put(orig_node); +free_orig_node: batadv_orig_node_put(orig_node); return NULL; @@ -528,36 +535,25 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { struct net_device *soft_iface; - struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if = NULL; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); - goto out; + return; } soft_iface = forw_packet->if_incoming->soft_iface; - bat_priv = netdev_priv(soft_iface); if (WARN_ON(!forw_packet->if_outgoing)) - goto out; + return; if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface)) - goto out; + return; if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE) - goto out; - - primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if) - goto out; + return; /* only for one specific outgoing interface */ batadv_iv_ogm_send_to_if(forw_packet, forw_packet->if_outgoing); - -out: - if (primary_if) - 
batadv_hardif_put(primary_if); } /** @@ -685,19 +681,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, struct batadv_forw_packet *forw_packet_aggr; unsigned char *skb_buff; unsigned int skb_size; + atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; - /* own packet should always be scheduled */ - if (!own_packet) { - if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "batman packet queue full\n"); - return; - } - } - - forw_packet_aggr = kmalloc(sizeof(*forw_packet_aggr), GFP_ATOMIC); + forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, + queue_left, bat_priv); if (!forw_packet_aggr) - goto out_nomem; + return; if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) @@ -708,8 +697,11 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) - goto out_free_forw_packet; + if (!forw_packet_aggr->skb) { + batadv_forw_packet_free(forw_packet_aggr); + return; + } + forw_packet_aggr->skb->priority = TC_PRIO_CONTROL; skb_reserve(forw_packet_aggr->skb, ETH_HLEN); @@ -717,12 +709,7 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); - kref_get(&if_incoming->refcount); - kref_get(&if_outgoing->refcount); forw_packet_aggr->own = own_packet; - forw_packet_aggr->if_incoming = if_incoming; - forw_packet_aggr->if_outgoing = if_outgoing; - forw_packet_aggr->num_packets = 0; forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; forw_packet_aggr->send_time = send_time; @@ -741,13 +728,6 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, queue_delayed_work(batadv_event_workqueue, &forw_packet_aggr->delayed_work, send_time - jiffies); - - return; -out_free_forw_packet: - kfree(forw_packet_aggr); -out_nomem: - if (!own_packet) - atomic_inc(&bat_priv->batman_queue_left); } /* aggregate a new packet into the existing ogm packet */ @@ -1830,10 +1810,6 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) batadv_iv_ogm_schedule(forw_packet->if_incoming); out: - /* don't count own packet */ - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - batadv_forw_packet_free(forw_packet); } @@ -1879,6 +1855,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, return NET_RX_SUCCESS; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_iv_ogm_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -1976,8 +1953,239 @@ next: if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_iv_ogm_neigh_get_tq_avg - Get the TQ average for a neighbour on a + * given outgoing interface. + * @neigh_node: Neighbour of interest + * @if_outgoing: Outgoing interface of interest + * @tq_avg: Pointer of where to store the TQ average + * + * Return: False if no average TQ available, otherwise true. 
+ */ +static bool +batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, + struct batadv_hard_iface *if_outgoing, + u8 *tq_avg) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return false; + + *tq_avg = n_ifinfo->bat_iv.tq_avg; + batadv_neigh_ifinfo_put(n_ifinfo); + + return true; +} + +/** + * batadv_iv_ogm_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + void *hdr; + u8 tq_avg; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node, if_outgoing, &tq_avg)) + return 0; + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig) || + nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + u8 tq_avg_best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + if (!batadv_iv_ogm_neigh_get_tq_avg(neigh_node_best, if_outgoing, + &tq_avg_best)) + goto out; + + if (tq_avg_best == 0) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_iv_ogm_orig_dump_subentry(msg, portid, seq, + bat_priv, if_outgoing, + orig_node, neigh_node, + best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} /** + * batadv_iv_ogm_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_iv_ogm_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_iv_ogm_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, + &idx, &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_iv_hardif_neigh_print - print a single hop neighbour node * @seq: neighbour table seq_file struct * @hardif_neigh: hardif neighbour information @@ -2027,37 +2235,43 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif 
/** - * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors + * batadv_iv_ogm_neigh_diff - calculate tq difference of two neighbors * @neigh1: the first neighbor object of the comparison * @if_outgoing1: outgoing interface for the first neighbor * @neigh2: the second neighbor object of the comparison * @if_outgoing2: outgoing interface for the second neighbor + * @diff: pointer to integer receiving the calculated difference * - * Return: a value less, equal to or greater than 0 if the metric via neigh1 is - * lower, the same as or higher than the metric via neigh2 + * The content of *@diff is only valid when this function returns true. + * It is less, equal to or greater than 0 if the metric via neigh1 is lower, + * the same as or higher than the metric via neigh2 + * + * Return: true when the difference could be calculated, false otherwise */ -static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, - struct batadv_hard_iface *if_outgoing1, - struct batadv_neigh_node *neigh2, - struct batadv_hard_iface *if_outgoing2) +static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2, + int *diff) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; u8 tq1, tq2; - int diff; + bool ret = true; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!neigh1_ifinfo || !neigh2_ifinfo) { - diff = 0; + ret = false; goto out; } tq1 = neigh1_ifinfo->bat_iv.tq_avg; tq2 = neigh2_ifinfo->bat_iv.tq_avg; - diff = tq1 - tq2; + *diff = (int)tq1 - (int)tq2; out: if (neigh1_ifinfo) @@ -2065,6 +2279,162 @@ out: if (neigh2_ifinfo) batadv_neigh_ifinfo_put(neigh2_ifinfo); + return ret; +} + +/** + * batadv_iv_ogm_neigh_dump_neigh - Dump a neighbour into a netlink message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to be dumped + * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_iv_ogm_neigh_dump_hardif - Dump the neighbours of a hard interface + * into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: Hard interface to dump the neighbours for + * @idx_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_iv_ogm_neigh_dump_neigh(msg, portid, seq, + hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_iv_ogm_neigh_dump - Dump the neighbours into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dump to this hard interfaace + */ +static void +batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, + list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_iv_ogm_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, + hard_iface, &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + +/** + * batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors + * @neigh1: the first neighbor object of the comparison + * @if_outgoing1: outgoing interface for the first neighbor + * @neigh2: the second neighbor object of the comparison + * @if_outgoing2: outgoing interface for the second neighbor + * + * Return: a value less, equal to or greater than 0 if the metric via neigh1 is + * lower, the same as or higher than the metric via neigh2 + */ +static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, + struct batadv_hard_iface *if_outgoing1, + struct batadv_neigh_node *neigh2, + struct batadv_hard_iface *if_outgoing2) +{ + bool ret; + int diff; + + ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, + if_outgoing2, &diff); + if (!ret) + return 0; + return diff; } @@ -2085,36 +2455,341 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { - struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; - u8 tq1, tq2; bool ret; + int diff; - neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); - neigh2_ifinfo = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + ret = batadv_iv_ogm_neigh_diff(neigh1, if_outgoing1, neigh2, + if_outgoing2, &diff); + if (!ret) + return false; - /* we can't say that the metric is better */ - if (!neigh1_ifinfo || !neigh2_ifinfo) { - ret = false; + ret = diff > -BATADV_TQ_SIMILARITY_THRESHOLD; + return ret; +} + +static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* begin scheduling originator messages on that interface */ + batadv_iv_ogm_schedule(hard_iface); +} + +static struct batadv_gw_node * +batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) +{ 
+ struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo; + struct batadv_gw_node *gw_node, *curr_gw = NULL; + u64 max_gw_factor = 0; + u64 tmp_gw_factor = 0; + u8 max_tq = 0; + u8 tq_avg; + struct batadv_orig_node *orig_node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + orig_node = gw_node->orig_node; + router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router) + continue; + + router_ifinfo = batadv_neigh_ifinfo_get(router, + BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto next; + + if (!kref_get_unless_zero(&gw_node->refcount)) + goto next; + + tq_avg = router_ifinfo->bat_iv.tq_avg; + + switch (atomic_read(&bat_priv->gw.sel_class)) { + case 1: /* fast connection */ + tmp_gw_factor = tq_avg * tq_avg; + tmp_gw_factor *= gw_node->bandwidth_down; + tmp_gw_factor *= 100 * 100; + tmp_gw_factor >>= 18; + + if ((tmp_gw_factor > max_gw_factor) || + ((tmp_gw_factor == max_gw_factor) && + (tq_avg > max_tq))) { + if (curr_gw) + batadv_gw_node_put(curr_gw); + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + } + break; + + default: /* 2: stable connection (use best statistic) + * 3: fast-switch (use best statistic but change as + * soon as a better gateway appears) + * XX: late-switch (use best statistic but change as + * soon as a better gateway appears which has + * $routing_class more tq points) + */ + if (tq_avg > max_tq) { + if (curr_gw) + batadv_gw_node_put(curr_gw); + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + } + break; + } + + if (tq_avg > max_tq) + max_tq = tq_avg; + + if (tmp_gw_factor > max_gw_factor) + max_gw_factor = tmp_gw_factor; + + batadv_gw_node_put(gw_node); + +next: + batadv_neigh_node_put(router); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + } + rcu_read_unlock(); + + return curr_gw; +} + +static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node) +{ + struct batadv_neigh_ifinfo *router_orig_ifinfo = NULL; + struct batadv_neigh_ifinfo *router_gw_ifinfo = NULL; + struct batadv_neigh_node *router_gw = NULL; + struct batadv_neigh_node *router_orig = NULL; + u8 gw_tq_avg, orig_tq_avg; + bool ret = false; + + /* dynamic re-election is performed only on fast or late switch */ + if (atomic_read(&bat_priv->gw.sel_class) <= 2) + return false; + + router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); + if (!router_gw) { + ret = true; goto out; } - tq1 = neigh1_ifinfo->bat_iv.tq_avg; - tq2 = neigh2_ifinfo->bat_iv.tq_avg; - ret = (tq1 - tq2) > -BATADV_TQ_SIMILARITY_THRESHOLD; + router_gw_ifinfo = batadv_neigh_ifinfo_get(router_gw, + BATADV_IF_DEFAULT); + if (!router_gw_ifinfo) { + ret = true; + goto out; + } + + router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router_orig) + goto out; + + router_orig_ifinfo = batadv_neigh_ifinfo_get(router_orig, + BATADV_IF_DEFAULT); + if (!router_orig_ifinfo) + goto out; + + gw_tq_avg = router_gw_ifinfo->bat_iv.tq_avg; + orig_tq_avg = router_orig_ifinfo->bat_iv.tq_avg; + + /* the TQ value has to be better */ + if (orig_tq_avg < gw_tq_avg) + goto out; + /* if the routing class is greater than 3 the value tells us how much + * greater the TQ value of the new gateway must be + */ + if ((atomic_read(&bat_priv->gw.sel_class) > 3) && + (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) + goto out; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Restarting gateway selection: better gateway found (tq curr: 
%i, tq new: %i)\n", + gw_tq_avg, orig_tq_avg); + + ret = true; out: - if (neigh1_ifinfo) - batadv_neigh_ifinfo_put(neigh1_ifinfo); - if (neigh2_ifinfo) - batadv_neigh_ifinfo_put(neigh2_ifinfo); + if (router_gw_ifinfo) + batadv_neigh_ifinfo_put(router_gw_ifinfo); + if (router_orig_ifinfo) + batadv_neigh_ifinfo_put(router_orig_ifinfo); + if (router_gw) + batadv_neigh_node_put(router_gw); + if (router_orig) + batadv_neigh_node_put(router_orig); return ret; } -static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface) +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/* fails if orig_node has no router */ +static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv, + struct seq_file *seq, + const struct batadv_gw_node *gw_node) { - /* begin scheduling originator messages on that interface */ - batadv_iv_ogm_schedule(hard_iface); + struct batadv_gw_node *curr_gw; + struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + int ret = -1; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", + (curr_gw == gw_node ? "=>" : " "), + gw_node->orig_node->orig, + router_ifinfo->bat_iv.tq_avg, router->addr, + router->if_incoming->net_dev->name, + gw_node->bandwidth_down / 10, + gw_node->bandwidth_down % 10, + gw_node->bandwidth_up / 10, + gw_node->bandwidth_up % 10); + ret = seq_has_overflowed(seq) ? -1 : 0; + + if (curr_gw) + batadv_gw_node_put(curr_gw); +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +static void batadv_iv_gw_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct batadv_gw_node *gw_node; + int gw_count = 0; + + seq_puts(seq, + " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + /* fails if orig_node has no router */ + if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) + continue; + + gw_count++; + } + rcu_read_unlock(); + + if (gw_count == 0) + seq_puts(seq, "No gateways in range ...\n"); +} +#endif + +/** + * batadv_iv_gw_dump_entry - Dump a gateway into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @gw_node: Gateway to be dumped + * + * Return: Error code, or 0 on success + */ +static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_gw_node *gw_node) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_neigh_node *router; + struct batadv_gw_node *curr_gw; + int ret = -EINVAL; + void *hdr; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + ret = -EMSGSIZE; + + if (curr_gw == gw_node) + if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { + 
genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + gw_node->orig_node->orig) || + nla_put_u8(msg, BATADV_ATTR_TQ, router_ifinfo->bat_iv.tq_avg) || + nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, + router->addr) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + router->if_incoming->net_dev->name) || + nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, + gw_node->bandwidth_down) || + nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, + gw_node->bandwidth_up)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_iv_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + */ +static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_gw_node *gw_node; + int idx_skip = cb->args[0]; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (idx++ < idx_skip) + continue; + + if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, gw_node)) { + idx_skip = idx - 1; + goto unlock; + } + } + + idx_skip = idx; +unlock: + rcu_read_unlock(); + + cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_iv __read_mostly = { @@ -2129,14 +2804,28 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = { .neigh = { .cmp = batadv_iv_ogm_neigh_cmp, .is_similar_or_better = batadv_iv_ogm_neigh_is_sob, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_iv_neigh_print, +#endif + .dump = batadv_iv_ogm_neigh_dump, }, .orig = { +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_iv_ogm_orig_print, +#endif + .dump = batadv_iv_ogm_orig_dump, .free = batadv_iv_ogm_orig_free, .add_if = batadv_iv_ogm_orig_add_if, .del_if = batadv_iv_ogm_orig_del_if, }, + .gw = { + .get_best_gw_node = batadv_iv_gw_get_best_gw_node, + .is_eligible = batadv_iv_gw_is_eligible, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + .print = batadv_iv_gw_print, +#endif + .dump = batadv_iv_gw_dump, + }, }; int __init batadv_iv_init(void) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 0366cbf5e444..e79f6f01182e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -21,24 +21,38 @@ #include <linux/atomic.h> #include <linux/bug.h> #include <linux/cache.h> +#include <linux/errno.h> +#include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/kref.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" +#include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" #include "hash.h" +#include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +struct sk_buff; + static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); @@ -115,6 +129,7 @@ batadv_v_hardif_neigh_init(struct 
batadv_hardif_neigh_node *hardif_neigh) batadv_v_elp_throughput_metric_update); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_v_orig_print_neigh - print neighbors for the originator table * @orig_node: the orig_node for which the neighbors are printed @@ -198,8 +213,142 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_v_neigh_dump_neigh - Dump a neighbour into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hardif_neigh: Neighbour to dump + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hardif_neigh_node *hardif_neigh) +{ + void *hdr; + unsigned int last_seen_msecs; + u32 throughput; + + last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); + throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); + throughput = throughput * 100; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_NEIGHBORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + hardif_neigh->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + hardif_neigh->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} /** + * batadv_v_neigh_dump_hardif - Dump the neighbours of a hard interface into + * a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @hard_iface: The hard interface to be dumped + * @idx_s: Entries to be skipped + * + * This function assumes the caller holds rcu_read_lock(). 
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + int *idx_s) +{ + struct batadv_hardif_neigh_node *hardif_neigh; + int idx = 0; + + hlist_for_each_entry_rcu(hardif_neigh, + &hard_iface->neigh_list, list) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + + *idx_s = 0; + return 0; +} + +/** + * batadv_v_neigh_dump - Dump the neighbours of a hard interface into a + * message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @single_hardif: Limit dumping to this hard interface + */ +static void +batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *single_hardif) +{ + struct batadv_hard_iface *hard_iface; + int i_hardif = 0; + int i_hardif_s = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + rcu_read_lock(); + if (single_hardif) { + if (i_hardif_s == 0) { + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, single_hardif, + &idx) == 0) + i_hardif++; + } + } else { + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != bat_priv->soft_iface) + continue; + + if (i_hardif++ < i_hardif_s) + continue; + + if (batadv_v_neigh_dump_hardif(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, hard_iface, + &idx)) { + i_hardif--; + break; + } + } + } + rcu_read_unlock(); + + cb->args[0] = i_hardif; + cb->args[1] = idx; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_v_orig_print - print the originator table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct @@ -265,6 +414,205 @@ next: if (batman_count == 0) seq_puts(seq, "No batman nodes in range ...\n"); } +#endif + +/** + * batadv_v_orig_dump_subentry - Dump an originator subentry into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @neigh_node: Single hops neighbour + * @best: Is the best originator + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, + bool best) +{ + struct batadv_neigh_ifinfo *n_ifinfo; + unsigned int last_seen_msecs; + u32 throughput; + void *hdr; + + n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!n_ifinfo) + return 0; + + throughput = n_ifinfo->bat_v.throughput * 100; + + batadv_neigh_ifinfo_put(n_ifinfo); + + last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); + + if (if_outgoing != BATADV_IF_DEFAULT && + if_outgoing != neigh_node->if_incoming) + return 0; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_ORIGINATORS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || + nla_put(msg, 
BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, + neigh_node->addr) || + nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + neigh_node->if_incoming->net_dev->ifindex) || + nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, + last_seen_msecs)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_v_orig_dump_entry - Dump an originator entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @orig_node: Originator to dump + * @sub_s: Number of sub entries to skip + * + * This function assumes the caller holds rcu_read_lock(). + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct batadv_orig_node *orig_node, int *sub_s) +{ + struct batadv_neigh_node *neigh_node_best; + struct batadv_neigh_node *neigh_node; + int sub = 0; + bool best; + + neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); + if (!neigh_node_best) + goto out; + + hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { + if (sub++ < *sub_s) + continue; + + best = (neigh_node == neigh_node_best); + + if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, + neigh_node, best)) { + batadv_neigh_node_put(neigh_node_best); + + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + out: + if (neigh_node_best) + batadv_neigh_node_put(neigh_node_best); + + *sub_s = 0; + return 0; +} + +/** + * batadv_v_orig_dump_bucket - Dump an originator bucket into a + * message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + * @head: Bucket to be dumped + * @idx_s: Number of entries to be skipped + * @sub: Number of sub entries to be skipped + * + * Return: Error code, or 0 on success + */ +static int +batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_orig_node *orig_node; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, + if_outgoing, orig_node, sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_v_orig_dump - Dump the originators into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * @bat_priv: The bat priv with all the soft interface information + * @if_outgoing: Limit dump to entries with this outgoing interface + */ +static void +batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *if_outgoing) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + struct 
hlist_head *head; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_v_orig_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + bat_priv, if_outgoing, head, &idx, + &sub)) + break; + + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; +} static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, @@ -320,6 +668,365 @@ err_ifinfo1: return ret; } +static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv, + char *buff, size_t count) +{ + u32 old_class, class; + + if (!batadv_parse_throughput(bat_priv->soft_iface, buff, + "B.A.T.M.A.N. V GW selection class", + &class)) + return -EINVAL; + + old_class = atomic_read(&bat_priv->gw.sel_class); + atomic_set(&bat_priv->gw.sel_class, class); + + if (old_class != class) + batadv_gw_reselect(bat_priv); + + return count; +} + +static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff) +{ + u32 class = atomic_read(&bat_priv->gw.sel_class); + + return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10); +} + +/** + * batadv_v_gw_throughput_get - retrieve the GW-bandwidth for a given GW + * @gw_node: the GW to retrieve the metric for + * @bw: the pointer where the metric will be stored. The metric is computed as + * the minimum between the GW advertised throughput and the path throughput to + * it in the mesh + * + * Return: 0 on success, -1 on failure + */ +static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_orig_node *orig_node; + struct batadv_neigh_node *router; + int ret = -1; + + orig_node = gw_node->orig_node; + router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + /* the GW metric is computed as the minimum between the path throughput + * to reach the GW itself and the advertised bandwidth. 
+ * This gives us an approximation of the effective throughput that the + * client can expect via this particular GW node + */ + *bw = router_ifinfo->bat_v.throughput; + *bw = min_t(u32, *bw, gw_node->bandwidth_down); + + ret = 0; +out: + if (router) + batadv_neigh_node_put(router); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + + return ret; +} + +/** + * batadv_v_gw_get_best_gw_node - retrieve the best GW node + * @bat_priv: the bat priv with all the soft interface information + * + * Return: the GW node having the best GW-metric, NULL if no GW is known + */ +static struct batadv_gw_node * +batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) +{ + struct batadv_gw_node *gw_node, *curr_gw = NULL; + u32 max_bw = 0, bw; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (!kref_get_unless_zero(&gw_node->refcount)) + continue; + + if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) + goto next; + + if (curr_gw && (bw <= max_bw)) + goto next; + + if (curr_gw) + batadv_gw_node_put(curr_gw); + + curr_gw = gw_node; + kref_get(&curr_gw->refcount); + max_bw = bw; + +next: + batadv_gw_node_put(gw_node); + } + rcu_read_unlock(); + + return curr_gw; +} + +/** + * batadv_v_gw_is_eligible - check if a originator would be selected as GW + * @bat_priv: the bat priv with all the soft interface information + * @curr_gw_orig: originator representing the currently selected GW + * @orig_node: the originator representing the new candidate + * + * Return: true if orig_node can be selected as current GW, false otherwise + */ +static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node) +{ + struct batadv_gw_node *curr_gw = NULL, *orig_gw = NULL; + u32 gw_throughput, orig_throughput, threshold; + bool ret = false; + + threshold = atomic_read(&bat_priv->gw.sel_class); + + curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); + if (!curr_gw) { + ret = true; + goto out; + } + + if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { + ret = true; + goto out; + } + + orig_gw = batadv_gw_node_get(bat_priv, orig_node); + if (!orig_node) + goto out; + + if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) + goto out; + + if (orig_throughput < gw_throughput) + goto out; + + if ((orig_throughput - gw_throughput) < threshold) + goto out; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", + gw_throughput, orig_throughput); + + ret = true; +out: + if (curr_gw) + batadv_gw_node_put(curr_gw); + if (orig_gw) + batadv_gw_node_put(orig_gw); + + return ret; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/* fails if orig_node has no router */ +static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv, + struct seq_file *seq, + const struct batadv_gw_node *gw_node) +{ + struct batadv_gw_node *curr_gw; + struct batadv_neigh_node *router; + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + int ret = -1; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n", + (curr_gw == gw_node ? 
"=>" : " "), + gw_node->orig_node->orig, + router_ifinfo->bat_v.throughput / 10, + router_ifinfo->bat_v.throughput % 10, router->addr, + router->if_incoming->net_dev->name, + gw_node->bandwidth_down / 10, + gw_node->bandwidth_down % 10, + gw_node->bandwidth_up / 10, + gw_node->bandwidth_up % 10); + ret = seq_has_overflowed(seq) ? -1 : 0; + + if (curr_gw) + batadv_gw_node_put(curr_gw); +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_v_gw_print - print the gateway list + * @bat_priv: the bat priv with all the soft interface information + * @seq: gateway table seq_file struct + */ +static void batadv_v_gw_print(struct batadv_priv *bat_priv, + struct seq_file *seq) +{ + struct batadv_gw_node *gw_node; + int gw_count = 0; + + seq_puts(seq, + " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n"); + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + /* fails if orig_node has no router */ + if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0) + continue; + + gw_count++; + } + rcu_read_unlock(); + + if (gw_count == 0) + seq_puts(seq, "No gateways in range ...\n"); +} +#endif + +/** + * batadv_v_gw_dump_entry - Dump a gateway into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @gw_node: Gateway to be dumped + * + * Return: Error code, or 0 on success + */ +static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_gw_node *gw_node) +{ + struct batadv_neigh_ifinfo *router_ifinfo = NULL; + struct batadv_neigh_node *router; + struct batadv_gw_node *curr_gw; + int ret = -EINVAL; + void *hdr; + + router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); + if (!router) + goto out; + + router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); + if (!router_ifinfo) + goto out; + + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + ret = -EMSGSIZE; + + if (curr_gw == gw_node) { + if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { + genlmsg_cancel(msg, hdr); + goto out; + } + } + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + gw_node->orig_node->orig)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, + router_ifinfo->bat_v.throughput)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + router->if_incoming->net_dev->name)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, + gw_node->bandwidth_down)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); + if (router) + batadv_neigh_node_put(router); + return ret; +} + +/** + * batadv_v_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * 
@bat_priv: The bat priv with all the soft interface information + */ +static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *bat_priv) +{ + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_gw_node *gw_node; + int idx_skip = cb->args[0]; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { + if (idx++ < idx_skip) + continue; + + if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, gw_node)) { + idx_skip = idx - 1; + goto unlock; + } + } + + idx_skip = idx; +unlock: + rcu_read_unlock(); + + cb->args[0] = idx_skip; +} + static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { @@ -333,10 +1040,26 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_v_neigh_print, +#endif + .dump = batadv_v_neigh_dump, }, .orig = { +#ifdef CONFIG_BATMAN_ADV_DEBUGFS .print = batadv_v_orig_print, +#endif + .dump = batadv_v_orig_dump, + }, + .gw = { + .store_sel_class = batadv_v_store_sel_class, + .show_sel_class = batadv_v_show_sel_class, + .get_best_gw_node = batadv_v_gw_get_best_gw_node, + .is_eligible = batadv_v_gw_is_eligible, +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + .print = batadv_v_gw_print, +#endif + .dump = batadv_v_gw_dump, }, }; @@ -363,7 +1086,16 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { - return batadv_v_ogm_init(bat_priv); + int ret = 0; + + ret = batadv_v_ogm_init(bat_priv); + if (ret < 0) + return ret; + + /* set default throughput difference threshold to 5Mbps */ + atomic_set(&bat_priv->gw.sel_class, 50); + + return 0; } /** diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 6fbba4eb0617..1aeeadca620c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -73,13 +73,12 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv, if (!orig_node) return NULL; + kref_get(&orig_node->refcount); hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig, batadv_choose_orig, orig_node, &orig_node->hash_entry); if (hash_added != 0) { - /* orig_node->refcounter is initialised to 2 by - * batadv_orig_node_new() - */ + /* remove refcnt for newly created orig_node and hash entry */ batadv_orig_node_put(orig_node); batadv_orig_node_put(orig_node); orig_node = NULL; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ad2ffe16d29f..e7f690b571ea 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -35,6 +35,7 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -45,12 +46,18 @@ #include <linux/string.h> #include <linux/workqueue.h> #include <net/arp.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" +#include "soft-interface.h" #include "sysfs.h" #include "translation-table.h" @@ -519,11 +526,9 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, atomic_set(&entry->wait_periods, 0); 
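/* [Editorial sketch, not part of the patch: this hunk is one instance of a
 * refcount cleanup applied throughout the series -- kref_init() now accounts
 * only for the caller's reference, and an explicit kref_get() is taken
 * immediately before the object is published in a hash or list. A minimal,
 * self-contained illustration of that idiom follows; struct foo,
 * foo_release() and foo_create() are invented names, not batman-adv symbols.]
 */

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/slab.h>

struct foo {
	struct kref refcount;
	struct hlist_node node;
};

static void foo_release(struct kref *ref)
{
	struct foo *foo = container_of(ref, struct foo, refcount);

	kfree(foo);
}

static struct foo *foo_create(struct hlist_head *head)
{
	struct foo *foo;

	foo = kzalloc(sizeof(*foo), GFP_ATOMIC);
	if (!foo)
		return NULL;

	/* one reference, owned by the caller */
	kref_init(&foo->refcount);

	/* the list's reference is taken right where the pointer is
	 * published, instead of pre-charging the counter at allocation time
	 */
	kref_get(&foo->refcount);
	hlist_add_head(&foo->node, head);

	/* the caller eventually drops its reference via
	 * kref_put(&foo->refcount, foo_release)
	 */
	return foo;
}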
ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); - - /* one for the hash, one for returning */ kref_init(&entry->refcount); - kref_get(&entry->refcount); + kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, batadv_choose_backbone_gw, entry, @@ -711,12 +716,13 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, claim->lasttime = jiffies; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; - kref_init(&claim->refcount); - kref_get(&claim->refcount); + batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", mac, BATADV_PRINT_VID(vid)); + + kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, @@ -1148,7 +1154,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, /* Let the loopdetect frames on the mesh in any case. */ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) - return 0; + return false; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, @@ -1990,6 +1996,7 @@ out: return ret; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file * @seq: seq file to print on @@ -2050,8 +2057,172 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_bla_claim_dump_entry - dump one entry of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @claim: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_claim *claim) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(claim->backbone_gw->orig, + primary_addr); + + spin_lock_bh(&claim->backbone_gw->crc_lock); + backbone_crc = claim->backbone_gw->crc; + spin_unlock_bh(&claim->backbone_gw->crc_lock); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || + nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + claim->backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_claim_dump_bucket - dump one bucket of the claim table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. 
+ */ +static int +batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_claim *claim; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(claim, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_claim_dump_entry(msg, portid, seq, + primary_if, claim)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} /** + * batadv_bla_claim_dump - dump claim table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.claim_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_claim_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +/** * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq * file * @seq: seq file to print on @@ -2114,3 +2285,168 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_bla_backbone_dump_entry - dump one entry of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @backbone_gw: entry to dump + * + * Return: 0 or error code. 
+ */ +static int +batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct batadv_bla_backbone_gw *backbone_gw) +{ + u8 *primary_addr = primary_if->net_dev->dev_addr; + u16 backbone_crc; + bool is_own; + int msecs; + void *hdr; + int ret = -EINVAL; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); + if (!hdr) { + ret = -ENOBUFS; + goto out; + } + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); + + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); + + msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); + + if (is_own) + if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, + backbone_gw->orig) || + nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || + nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + backbone_crc) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + goto out; + } + + genlmsg_end(msg, hdr); + ret = 0; + +out: + return ret; +} + +/** + * batadv_bla_backbone_dump_bucket - dump one bucket of the backbone table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @primary_if: primary interface + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: always 0. + */ +static int +batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *primary_if, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_bla_backbone_gw *backbone_gw; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { + if (idx++ < *idx_skip) + continue; + if (batadv_bla_backbone_dump_entry(msg, portid, seq, + primary_if, backbone_gw)) { + *idx_skip = idx - 1; + goto unlock; + } + } + + *idx_skip = idx; +unlock: + rcu_read_unlock(); + return 0; +} + +/** + * batadv_bla_backbone_dump - dump backbone table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. 
+ */ +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->bla.backbone_hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_bla_backbone_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, + primary_if, head, &idx)) + break; + bucket++; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 0f01daeb359e..1ae93e46fb98 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -23,6 +23,7 @@ #include <linux/types.h> struct net_device; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -35,8 +36,10 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); +int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, @@ -47,7 +50,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); - +int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ @@ -112,6 +115,18 @@ static inline void batadv_bla_free(struct batadv_priv *bat_priv) { } +static inline int batadv_bla_claim_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +static inline int batadv_bla_backbone_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 1d68b6e63b96..b4ffba7dd583 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -31,6 +31,7 @@ #include <linux/stddef.h> #include <linux/stringify.h> #include <linux/sysfs.h> +#include <net/net_namespace.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -305,12 +306,16 @@ void batadv_debugfs_destroy(void) */ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) { + 
struct net *net = dev_net(hard_iface->net_dev); struct batadv_debuginfo **bat_debug; struct dentry *file; if (!batadv_debugfs) goto out; + if (net != &init_net) + return 0; + hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name, batadv_debugfs); if (!hard_iface->debug_dir) @@ -341,6 +346,11 @@ out: */ void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) { + struct net *net = dev_net(hard_iface->net_dev); + + if (net != &init_net) + return; + if (batadv_debugfs) { debugfs_remove_recursive(hard_iface->debug_dir); hard_iface->debug_dir = NULL; @@ -351,11 +361,15 @@ int batadv_debugfs_add_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_debuginfo **bat_debug; + struct net *net = dev_net(dev); struct dentry *file; if (!batadv_debugfs) goto out; + if (net != &init_net) + return 0; + bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs); if (!bat_priv->debug_dir) goto out; @@ -392,6 +406,10 @@ out: void batadv_debugfs_del_meshif(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct net *net = dev_net(dev); + + if (net != &init_net) + return; batadv_debug_log_cleanup(bat_priv); diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 1ab4e2e63afc..c68ff3dcb926 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -26,7 +26,7 @@ struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" -#if IS_ENABLED(CONFIG_DEBUG_FS) +#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS) void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b1cc8bfe11ac..e257efdc5d03 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -343,8 +343,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, ether_addr_copy(dat_entry->mac_addr, mac_addr); dat_entry->last_update = jiffies; kref_init(&dat_entry->refcount); - kref_get(&dat_entry->refcount); + kref_get(&dat_entry->refcount); hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat, batadv_hash_dat, dat_entry, &dat_entry->hash_entry); @@ -795,6 +795,7 @@ void batadv_dat_free(struct batadv_priv *bat_priv) batadv_dat_hash_free(bat_priv); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_dat_cache_seq_print_text - print the local DAT hash table * @seq: seq file to print on @@ -846,6 +847,7 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif /** * batadv_arp_get_type - parse an ARP packet and gets the type diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 63a805d3f96e..de055d64debe 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -20,6 +20,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> +#include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/if_ether.h> @@ -31,6 +32,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> @@ -39,13 +41,17 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/udp.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "gateway_common.h" #include "hard-interface.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "routing.h" +#include "soft-interface.h" #include 
"sysfs.h" #include "translation-table.h" @@ -80,12 +86,12 @@ static void batadv_gw_node_release(struct kref *ref) * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it * @gw_node: gateway node to free */ -static void batadv_gw_node_put(struct batadv_gw_node *gw_node) +void batadv_gw_node_put(struct batadv_gw_node *gw_node) { kref_put(&gw_node->refcount, batadv_gw_node_release); } -static struct batadv_gw_node * +struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; @@ -164,86 +170,6 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv) atomic_set(&bat_priv->gw.reselect, 1); } -static struct batadv_gw_node * -batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) -{ - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo; - struct batadv_gw_node *gw_node, *curr_gw = NULL; - u64 max_gw_factor = 0; - u64 tmp_gw_factor = 0; - u8 max_tq = 0; - u8 tq_avg; - struct batadv_orig_node *orig_node; - - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - orig_node = gw_node->orig_node; - router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); - if (!router) - continue; - - router_ifinfo = batadv_neigh_ifinfo_get(router, - BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto next; - - if (!kref_get_unless_zero(&gw_node->refcount)) - goto next; - - tq_avg = router_ifinfo->bat_iv.tq_avg; - - switch (atomic_read(&bat_priv->gw.sel_class)) { - case 1: /* fast connection */ - tmp_gw_factor = tq_avg * tq_avg; - tmp_gw_factor *= gw_node->bandwidth_down; - tmp_gw_factor *= 100 * 100; - tmp_gw_factor >>= 18; - - if ((tmp_gw_factor > max_gw_factor) || - ((tmp_gw_factor == max_gw_factor) && - (tq_avg > max_tq))) { - if (curr_gw) - batadv_gw_node_put(curr_gw); - curr_gw = gw_node; - kref_get(&curr_gw->refcount); - } - break; - - default: /* 2: stable connection (use best statistic) - * 3: fast-switch (use best statistic but change as - * soon as a better gateway appears) - * XX: late-switch (use best statistic but change as - * soon as a better gateway appears which has - * $routing_class more tq points) - */ - if (tq_avg > max_tq) { - if (curr_gw) - batadv_gw_node_put(curr_gw); - curr_gw = gw_node; - kref_get(&curr_gw->refcount); - } - break; - } - - if (tq_avg > max_tq) - max_tq = tq_avg; - - if (tmp_gw_factor > max_gw_factor) - max_gw_factor = tmp_gw_factor; - - batadv_gw_node_put(gw_node); - -next: - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - } - rcu_read_unlock(); - - return curr_gw; -} - /** * batadv_gw_check_client_stop - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information @@ -287,12 +213,19 @@ void batadv_gw_election(struct batadv_priv *bat_priv) if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; + if (!bat_priv->algo_ops->gw.get_best_gw_node) + goto out; + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) goto out; - next_gw = batadv_gw_get_best_gw_node(bat_priv); + /* if gw.reselect is set to 1 it means that a previous call to + * gw.is_eligible() said that we have a new best GW, therefore it can + * now be picked from the list and selected + */ + next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv); if (curr_gw == next_gw) goto out; @@ -360,70 +293,31 @@ out: void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node 
*orig_node) { - struct batadv_neigh_ifinfo *router_orig_tq = NULL; - struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; - struct batadv_neigh_node *router_gw = NULL; - struct batadv_neigh_node *router_orig = NULL; - u8 gw_tq_avg, orig_tq_avg; + + /* abort immediately if the routing algorithm does not support gateway + * election + */ + if (!bat_priv->algo_ops->gw.is_eligible) + return; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) goto reselect; - router_gw = batadv_orig_router_get(curr_gw_orig, BATADV_IF_DEFAULT); - if (!router_gw) - goto reselect; - - router_gw_tq = batadv_neigh_ifinfo_get(router_gw, - BATADV_IF_DEFAULT); - if (!router_gw_tq) - goto reselect; - /* this node already is the gateway */ if (curr_gw_orig == orig_node) goto out; - router_orig = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); - if (!router_orig) + if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig, + orig_node)) goto out; - router_orig_tq = batadv_neigh_ifinfo_get(router_orig, - BATADV_IF_DEFAULT); - if (!router_orig_tq) - goto out; - - gw_tq_avg = router_gw_tq->bat_iv.tq_avg; - orig_tq_avg = router_orig_tq->bat_iv.tq_avg; - - /* the TQ value has to be better */ - if (orig_tq_avg < gw_tq_avg) - goto out; - - /* if the routing class is greater than 3 the value tells us how much - * greater the TQ value of the new gateway must be - */ - if ((atomic_read(&bat_priv->gw.sel_class) > 3) && - (orig_tq_avg - gw_tq_avg < atomic_read(&bat_priv->gw.sel_class))) - goto out; - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Restarting gateway selection: better gateway found (tq curr: %i, tq new: %i)\n", - gw_tq_avg, orig_tq_avg); - reselect: batadv_gw_reselect(bat_priv); out: if (curr_gw_orig) batadv_orig_node_put(curr_gw_orig); - if (router_gw) - batadv_neigh_node_put(router_gw); - if (router_orig) - batadv_neigh_node_put(router_orig); - if (router_gw_tq) - batadv_neigh_ifinfo_put(router_gw_tq); - if (router_orig_tq) - batadv_neigh_ifinfo_put(router_orig_tq); } /** @@ -445,14 +339,15 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (!gw_node) return; - kref_get(&orig_node->refcount); + kref_init(&gw_node->refcount); INIT_HLIST_NODE(&gw_node->list); + kref_get(&orig_node->refcount); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - kref_init(&gw_node->refcount); spin_lock_bh(&bat_priv->gw.list_lock); + kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list); spin_unlock_bh(&bat_priv->gw.list_lock); @@ -463,6 +358,9 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); + + /* don't return reference to new gw_node */ + batadv_gw_node_put(gw_node); } /** @@ -472,9 +370,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, * * Return: gateway node if found or NULL otherwise. 
*/ -static struct batadv_gw_node * -batadv_gw_node_get(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node) +struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node) { struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; @@ -585,81 +482,87 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->gw.list_lock); } -/* fails if orig_node has no router */ -static int batadv_write_buffer_text(struct batadv_priv *bat_priv, - struct seq_file *seq, - const struct batadv_gw_node *gw_node) +#ifdef CONFIG_BATMAN_ADV_DEBUGFS +int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) { - struct batadv_gw_node *curr_gw; - struct batadv_neigh_node *router; - struct batadv_neigh_ifinfo *router_ifinfo = NULL; - int ret = -1; + struct net_device *net_dev = (struct net_device *)seq->private; + struct batadv_priv *bat_priv = netdev_priv(net_dev); + struct batadv_hard_iface *primary_if; - router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); - if (!router) - goto out; + primary_if = batadv_seq_print_text_primary_if_get(seq); + if (!primary_if) + return 0; - router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); - if (!router_ifinfo) - goto out; + seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n", + BATADV_SOURCE_VERSION, primary_if->net_dev->name, + primary_if->net_dev->dev_addr, net_dev->name, + bat_priv->algo_ops->name); - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); + batadv_hardif_put(primary_if); - seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n", - (curr_gw == gw_node ? "=>" : " "), - gw_node->orig_node->orig, - router_ifinfo->bat_iv.tq_avg, router->addr, - router->if_incoming->net_dev->name, - gw_node->bandwidth_down / 10, - gw_node->bandwidth_down % 10, - gw_node->bandwidth_up / 10, - gw_node->bandwidth_up % 10); - ret = seq_has_overflowed(seq) ? -1 : 0; + if (!bat_priv->algo_ops->gw.print) { + seq_puts(seq, + "No printing function for this routing protocol\n"); + return 0; + } - if (curr_gw) - batadv_gw_node_put(curr_gw); -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - return ret; + bat_priv->algo_ops->gw.print(bat_priv, seq); + + return 0; } +#endif -int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) +/** + * batadv_gw_dump - Dump gateways into a message + * @msg: Netlink message to dump into + * @cb: Control block containing additional options + * + * Return: Error code, or length of message + */ +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net_device *net_dev = (struct net_device *)seq->private; - struct batadv_priv *bat_priv = netdev_priv(net_dev); - struct batadv_hard_iface *primary_if; - struct batadv_gw_node *gw_node; - int gw_count = 0; - - primary_if = batadv_seq_print_text_primary_if_get(seq); - if (!primary_if) + struct batadv_hard_iface *primary_if = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; goto out; + } - seq_printf(seq, - " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth ... [B.A.T.M.A.N. 
adv %s, MainIF/MAC: %s/%pM (%s)]\n", - BATADV_SOURCE_VERSION, primary_if->net_dev->name, - primary_if->net_dev->dev_addr, net_dev->name); + bat_priv = netdev_priv(soft_iface); - rcu_read_lock(); - hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - /* fails if orig_node has no router */ - if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) - continue; + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } - gw_count++; + if (!bat_priv->algo_ops->gw.dump) { + ret = -EOPNOTSUPP; + goto out; } - rcu_read_unlock(); - if (gw_count == 0) - seq_puts(seq, "No gateways in range ...\n"); + bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); + + ret = msg->len; out: if (primary_if) batadv_hardif_put(primary_if); - return 0; + if (soft_iface) + dev_put(soft_iface); + + return ret; } /** diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 582dd8c413c8..859166d03561 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -23,6 +23,7 @@ #include <linux/types.h> struct batadv_tvlv_gateway_data; +struct netlink_callback; struct seq_file; struct sk_buff; @@ -39,10 +40,16 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); +void batadv_gw_node_put(struct batadv_gw_node *gw_node); +struct batadv_gw_node * +batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); +int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr); +struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index d7bc6a87bcc9..21184810d89f 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -241,10 +241,9 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig, &gateway); - /* restart gateway selection if fast or late switching was enabled */ + /* restart gateway selection */ if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) && - (atomic_read(&bat_priv->gw.sel_class) > 2) + (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 1f9080840566..08ce36147c4c 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -35,7 +35,8 @@ #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> -#include <linux/workqueue.h> +#include <net/net_namespace.h> +#include <net/rtnetlink.h> #include "bat_v.h" #include "bridge_loop_avoidance.h" @@ -85,25 +86,55 @@ out: } /** + * batadv_getlink_net - return link net namespace (or use fallback) + * @netdev: net_device to check + * @fallback_net: return in case get_link_net is not available for @netdev + * + * Return: result of rtnl_link_ops->get_link_net or @fallback_net + */ +static const struct net 
*batadv_getlink_net(const struct net_device *netdev, + const struct net *fallback_net) +{ + if (!netdev->rtnl_link_ops) + return fallback_net; + + if (!netdev->rtnl_link_ops->get_link_net) + return fallback_net; + + return netdev->rtnl_link_ops->get_link_net(netdev); +} + +/** * batadv_mutual_parents - check if two devices are each others parent - * @dev1: 1st net_device - * @dev2: 2nd net_device + * @dev1: 1st net dev + * @net1: 1st devices netns + * @dev2: 2nd net dev + * @net2: 2nd devices netns * * veth devices come in pairs and each is the parent of the other! * * Return: true if the devices are each others parent, otherwise false */ static bool batadv_mutual_parents(const struct net_device *dev1, - const struct net_device *dev2) + const struct net *net1, + const struct net_device *dev2, + const struct net *net2) { int dev1_parent_iflink = dev_get_iflink(dev1); int dev2_parent_iflink = dev_get_iflink(dev2); + const struct net *dev1_parent_net; + const struct net *dev2_parent_net; + + dev1_parent_net = batadv_getlink_net(dev1, net1); + dev2_parent_net = batadv_getlink_net(dev2, net2); if (!dev1_parent_iflink || !dev2_parent_iflink) return false; return (dev1_parent_iflink == dev2->ifindex) && - (dev2_parent_iflink == dev1->ifindex); + (dev2_parent_iflink == dev1->ifindex) && + net_eq(dev1_parent_net, net2) && + net_eq(dev2_parent_net, net1); } /** @@ -121,8 +152,9 @@ static bool batadv_mutual_parents(const struct net_device *dev1, */ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { - struct net_device *parent_dev; struct net *net = dev_net(net_dev); + struct net_device *parent_dev; + const struct net *parent_net; bool ret; /* check if this is a batman-adv mesh interface */ @@ -134,13 +166,16 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) dev_get_iflink(net_dev) == net_dev->ifindex) return false; + parent_net = batadv_getlink_net(net_dev, net); + /* recurse over the parent device */ - parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index((struct net *)parent_net, + dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; - if (batadv_mutual_parents(net_dev, parent_dev)) + if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net)) return false; ret = batadv_is_on_batman_iface(parent_dev); @@ -625,25 +660,6 @@ out: batadv_hardif_put(primary_if); } -/** - * batadv_hardif_remove_interface_finish - cleans up the remains of a hardif - * @work: work queue item - * - * Free the parts of the hard interface which can not be removed under - * rtnl lock (to prevent deadlock situations). 
- */ -static void batadv_hardif_remove_interface_finish(struct work_struct *work) -{ - struct batadv_hard_iface *hard_iface; - - hard_iface = container_of(work, struct batadv_hard_iface, - cleanup_work); - - batadv_debugfs_del_hardif(hard_iface); - batadv_sysfs_del_hardif(&hard_iface->hardif_obj); - batadv_hardif_put(hard_iface); -} - static struct batadv_hard_iface * batadv_hardif_add_interface(struct net_device *net_dev) { @@ -676,10 +692,9 @@ batadv_hardif_add_interface(struct net_device *net_dev) INIT_LIST_HEAD(&hard_iface->list); INIT_HLIST_HEAD(&hard_iface->neigh_list); - INIT_WORK(&hard_iface->cleanup_work, - batadv_hardif_remove_interface_finish); spin_lock_init(&hard_iface->neigh_list_lock); + kref_init(&hard_iface->refcount); hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT; if (batadv_is_wifi_netdev(net_dev)) @@ -687,11 +702,8 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_v_hardif_init(hard_iface); - /* extra reference for return */ - kref_init(&hard_iface->refcount); - kref_get(&hard_iface->refcount); - batadv_check_known_mac_addr(hard_iface->net_dev); + kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); return hard_iface; @@ -713,13 +725,15 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) /* first deactivate interface */ if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) batadv_hardif_disable_interface(hard_iface, - BATADV_IF_CLEANUP_AUTO); + BATADV_IF_CLEANUP_KEEP); if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) return; hard_iface->if_status = BATADV_IF_TO_BE_REMOVED; - queue_work(batadv_event_workqueue, &hard_iface->cleanup_work); + batadv_debugfs_del_hardif(hard_iface); + batadv_sysfs_del_hardif(&hard_iface->hardif_obj); + batadv_hardif_put(hard_iface); } void batadv_hardif_remove_interfaces(void) diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 618d5de06f20..e44a7da51431 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -26,9 +26,25 @@ struct batadv_icmp_header; #define BATADV_ICMP_SOCKET "socket" -void batadv_socket_init(void); int batadv_socket_setup(struct batadv_priv *bat_priv); + +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + +void batadv_socket_init(void); void batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len); +#else + +static inline void batadv_socket_init(void) +{ +} + +static inline void +batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len) +{ +} + +#endif + #endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index fe4c5e29f96b..2c017ab47557 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -82,6 +82,12 @@ static void batadv_recv_handler_init(void); static int __init batadv_init(void) { + int ret; + + ret = batadv_tt_cache_init(); + if (ret < 0) + return ret; + INIT_LIST_HEAD(&batadv_hardif_list); batadv_algo_init(); @@ -93,9 +99,8 @@ static int __init batadv_init(void) batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); - if (!batadv_event_workqueue) - return -ENOMEM; + goto err_create_wq; batadv_socket_init(); batadv_debugfs_init(); @@ -108,6 +113,11 @@ static int __init batadv_init(void) BATADV_SOURCE_VERSION, BATADV_COMPAT_VERSION); return 0; + +err_create_wq: + batadv_tt_cache_destroy(); + + return -ENOMEM; } static void __exit batadv_exit(void) @@ -123,6 +133,8 @@ static void __exit batadv_exit(void) batadv_event_workqueue = NULL; rcu_barrier(); 
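/* [Editorial sketch, not part of the patch: batadv_init switches here to the
 * common goto-unwind error-handling style so that the translation-table
 * caches created first are destroyed again when workqueue creation fails,
 * and batadv_exit tears the caches down last. A stripped-down sketch of that
 * shape follows; all example_* names are invented, and the single
 * kmem_cache stands in for the several caches the real patch creates.]
 */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct kmem_cache *example_cache;

static int example_cache_init(void)
{
	example_cache = kmem_cache_create("example_cache", 64, 0, 0, NULL);

	return example_cache ? 0 : -ENOMEM;
}

static void example_cache_destroy(void)
{
	kmem_cache_destroy(example_cache);
	example_cache = NULL;
}

static int __init example_init(void)
{
	int ret;

	ret = example_cache_init();
	if (ret < 0)
		return ret;		/* nothing acquired yet, plain return */

	example_wq = create_singlethread_workqueue("example_events");
	if (!example_wq)
		goto err_create_wq;	/* unwind what was set up before */

	return 0;

err_create_wq:
	example_cache_destroy();

	return -ENOMEM;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_wq);

	/* caches go away only after all their users are gone */
	example_cache_destroy();
}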
+ + batadv_tt_cache_destroy(); } int batadv_mesh_init(struct net_device *soft_iface) @@ -270,6 +282,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) return is_my_mac; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_seq_print_text_primary_if_get - called from debugfs table printing * function that requires the primary interface @@ -305,6 +318,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq) out: return primary_if; } +#endif /** * batadv_max_header_len - calculate maximum encapsulation overhead for a @@ -638,3 +652,4 @@ MODULE_AUTHOR(BATADV_DRIVER_AUTHOR); MODULE_DESCRIPTION(BATADV_DRIVER_DESC); MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE); MODULE_VERSION(BATADV_SOURCE_VERSION); +MODULE_ALIAS_RTNL_LINK("batadv"); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 06a860845434..09af21e27639 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.3" +#define BATADV_SOURCE_VERSION "2016.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index cc915073a753..13661f43386f 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -528,7 +528,7 @@ update: } return !(mcast_data.flags & - (BATADV_MCAST_WANT_ALL_IPV4 + BATADV_MCAST_WANT_ALL_IPV6)); + (BATADV_MCAST_WANT_ALL_IPV4 | BATADV_MCAST_WANT_ALL_IPV6)); } /** @@ -1134,6 +1134,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) BATADV_TVLV_HANDLER_OGM_CIFNOTFND); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_mcast_flags_print_header - print own mcast flags to debugfs table * @bat_priv: the bat priv with all the soft interface information @@ -1234,6 +1235,7 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) return 0; } +#endif /** * batadv_mcast_free - free the multicast optimizations structures diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 231f8eaf075b..64cb6acbe0a6 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -18,6 +18,8 @@ #include "netlink.h" #include "main.h" +#include <linux/atomic.h> +#include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/genetlink.h> @@ -26,24 +28,33 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/types.h> #include <net/genetlink.h> #include <net/netlink.h> +#include <net/sock.h> #include <uapi/linux/batman_adv.h> +#include "bat_algo.h" +#include "bridge_loop_avoidance.h" +#include "gateway_client.h" #include "hard-interface.h" +#include "originator.h" +#include "packet.h" #include "soft-interface.h" #include "tp_meter.h" +#include "translation-table.h" -struct sk_buff; - -static struct genl_family batadv_netlink_family = { +struct genl_family batadv_netlink_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, .name = BATADV_NL_NAME, .version = 1, .maxattr = BATADV_ATTR_MAX, + .netnsok = true, }; /* multicast groups */ @@ -51,11 +62,11 @@ enum batadv_netlink_multicast_groups { BATADV_NL_MCGRP_TPMETER, }; -static struct genl_multicast_group batadv_netlink_mcgrps[] = { +static const struct genl_multicast_group batadv_netlink_mcgrps[] = { [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, }; -static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { 
+static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, @@ -69,9 +80,44 @@ static struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, }; /** + * batadv_netlink_get_ifindex - Extract an interface index from a message + * @nlh: Message header + * @attrtype: Attribute which holds an interface index + * + * Return: interface index, or 0. + */ +int +batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) +{ + struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); + + return attr ? 
nla_get_u32(attr) : 0; +} + +/** * batadv_netlink_mesh_info_put - fill in generic information about mesh * interface * @msg: netlink message to be sent back @@ -93,8 +139,16 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, - soft_iface->dev_addr)) + soft_iface->dev_addr) || + nla_put_u8(msg, BATADV_ATTR_TT_TTVN, + (u8)atomic_read(&bat_priv->tt.vn))) + goto out; + +#ifdef CONFIG_BATMAN_ADV_BLA + if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, + ntohs(bat_priv->bla.claim_dest.group))) goto out; +#endif primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { @@ -380,6 +434,106 @@ out: return ret; } +/** + * batadv_netlink_dump_hardif_entry - Dump one hard interface into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @hard_iface: Hard interface to dump + * + * Return: error code, or 0 on success + */ +static int +batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_hard_iface *hard_iface) +{ + struct net_device *net_dev = hard_iface->net_dev; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, + BATADV_CMD_GET_HARDIFS); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, + net_dev->ifindex) || + nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, + net_dev->name) || + nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, + net_dev->dev_addr)) + goto nla_put_failure; + + if (hard_iface->if_status == BATADV_IF_ACTIVE) { + if (nla_put_flag(msg, BATADV_ATTR_ACTIVE)) + goto nla_put_failure; + } + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_netlink_dump_hardifs - Dump all hard interface into a messages + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: error code, or length of reply message on success + */ +static int +batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hard_iface *hard_iface; + int ifindex; + int portid = NETLINK_CB(cb->skb).portid; + int seq = cb->nlh->nlmsg_seq; + int skip = cb->args[0]; + int i = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface) + return -ENODEV; + + if (!batadv_softif_is_valid(soft_iface)) { + dev_put(soft_iface); + return -ENODEV; + } + + rcu_read_lock(); + + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->soft_iface != soft_iface) + continue; + + if (i++ < skip) + continue; + + if (batadv_netlink_dump_hardif_entry(msg, portid, seq, + hard_iface)) { + i--; + break; + } + } + + rcu_read_unlock(); + + dev_put(soft_iface); + + cb->args[0] = i; + + return msg->len; +} + static struct genl_ops batadv_netlink_ops[] = { { .cmd = BATADV_CMD_GET_MESH_INFO, @@ -399,6 +553,61 @@ static struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .doit = batadv_netlink_tp_meter_cancel, }, + { + .cmd = BATADV_CMD_GET_ROUTING_ALGOS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = 
batadv_algo_dump, + }, + { + .cmd = BATADV_CMD_GET_HARDIFS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_netlink_dump_hardifs, + }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_local_dump, + }, + { + .cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_tt_global_dump, + }, + { + .cmd = BATADV_CMD_GET_ORIGINATORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_orig_dump, + }, + { + .cmd = BATADV_CMD_GET_NEIGHBORS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_hardif_neigh_dump, + }, + { + .cmd = BATADV_CMD_GET_GATEWAYS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_gw_dump, + }, + { + .cmd = BATADV_CMD_GET_BLA_CLAIM, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_claim_dump, + }, + { + .cmd = BATADV_CMD_GET_BLA_BACKBONE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_bla_backbone_dump, + }, + }; /** diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 945653ab58c6..52eb16281aba 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -21,12 +21,18 @@ #include "main.h" #include <linux/types.h> +#include <net/genetlink.h> + +struct nlmsghdr; void batadv_netlink_register(void); void batadv_netlink_unregister(void); +int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, u32 cookie); +extern struct genl_family batadv_netlink_family; + #endif /* _NET_BATMAN_ADV_NETLINK_H_ */ diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 293ef4ffd4e1..e3baf697a35c 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -856,14 +856,12 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, if (!nc_node) return NULL; - kref_get(&orig_neigh_node->refcount); - /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); + kref_init(&nc_node->refcount); ether_addr_copy(nc_node->addr, orig_node->orig); + kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; - kref_init(&nc_node->refcount); - kref_get(&nc_node->refcount); /* Select ingoing or outgoing coding node */ if (in_coding) { @@ -879,6 +877,7 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, /* Add nc_node to orig_node */ spin_lock_bh(lock); + kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); spin_unlock_bh(lock); @@ -979,7 +978,6 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, INIT_LIST_HEAD(&nc_path->packet_list); spin_lock_init(&nc_path->packet_list_lock); kref_init(&nc_path->refcount); - kref_get(&nc_path->refcount); nc_path->last_valid = jiffies; ether_addr_copy(nc_path->next_hop, dst); ether_addr_copy(nc_path->prev_hop, src); @@ -989,6 +987,7 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, nc_path->next_hop); /* Add nc_path to hash table */ + kref_get(&nc_path->refcount); hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, batadv_nc_hash_choose, &nc_path_key, &nc_path->hash_entry); @@ -1882,6 +1881,7 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) batadv_hash_destroy(bat_priv->nc.decoding_hash); } +#ifdef 
CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_nc_nodes_seq_print_text - print the nc node information * @seq: seq file to print on @@ -1981,3 +1981,4 @@ int batadv_nc_init_debugfs(struct batadv_priv *bat_priv) out: return -ENOMEM; } +#endif diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 3940b5d24421..5f3bfc41aeb1 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -28,11 +28,15 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "distributed-arp-table.h" @@ -42,8 +46,10 @@ #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "network-coding.h" #include "routing.h" +#include "soft-interface.h" #include "translation-table.h" /* hash class keys */ @@ -127,9 +133,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, goto out; kref_init(&vlan->refcount); - kref_get(&vlan->refcount); vlan->vid = vid; + kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: @@ -380,6 +386,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, orig_ifinfo->if_outgoing = if_outgoing; INIT_HLIST_NODE(&orig_ifinfo->list); kref_init(&orig_ifinfo->refcount); + kref_get(&orig_ifinfo->refcount); hlist_add_head_rcu(&orig_ifinfo->list, &orig_node->ifinfo_list); @@ -453,9 +460,9 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); - kref_get(&neigh_ifinfo->refcount); neigh_ifinfo->if_outgoing = if_outgoing; + kref_get(&neigh_ifinfo->refcount); hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list); out: @@ -647,8 +654,8 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node, /* extra reference for return */ kref_init(&neigh_node->refcount); - kref_get(&neigh_node->refcount); + kref_get(&neigh_node->refcount); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, @@ -686,6 +693,7 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list * @seq: neighbour table seq_file struct @@ -719,6 +727,84 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset) bat_priv->algo_ops->neigh.print(bat_priv, seq); return 0; } +#endif + +/** + * batadv_hardif_neigh_dump - Dump to netlink the neighbor infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = 
-ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->neigh.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} /** * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for @@ -905,7 +991,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, /* extra reference for return */ kref_init(&orig_node->refcount); - kref_get(&orig_node->refcount); orig_node->bat_priv = bat_priv; ether_addr_copy(orig_node->orig, addr); @@ -1256,6 +1341,7 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) _batadv_purge_orig(bat_priv); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_orig_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1329,6 +1415,84 @@ out: batadv_hardif_put(hard_iface); return 0; } +#endif + +/** + * batadv_orig_dump - Dump to netlink the originator infos for a specific + * outgoing interface + * @msg: message to dump into + * @cb: parameters for the dump + * + * Return: 0 or error value + */ +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct net_device *hard_iface = NULL; + struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + int ret; + int ifindex, hard_ifindex; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (hard_ifindex) { + hard_iface = dev_get_by_index(net, hard_ifindex); + if (hard_iface) + hardif = batadv_hardif_get_by_netdev(hard_iface); + + if (!hardif) { + ret = -ENODEV; + goto out; + } + + if (hardif->soft_iface != soft_iface) { + ret = -ENOENT; + goto out; + } + } + + if (!bat_priv->algo_ops->orig.dump) { + ret = -EOPNOTSUPP; + goto out; + } + + bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif); + + ret = msg->len; + + out: + if (hardif) + batadv_hardif_put(hardif); + if (hard_iface) + dev_put(hard_iface); + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + return ret; +} int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num) diff --git a/net/batman-adv/originator.h 
b/net/batman-adv/originator.h index 566306bf05dc..ebc56183f358 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -31,7 +31,9 @@ #include "hash.h" +struct netlink_callback; struct seq_file; +struct sk_buff; bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); @@ -61,6 +63,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); +int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset); struct batadv_orig_ifinfo * @@ -72,6 +75,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); int batadv_orig_seq_print_text(struct seq_file *seq, void *offset); +int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset); int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, int max_if_num); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 6b011ff64dd8..6afc0b86950e 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -129,42 +129,6 @@ enum batadv_tt_data_flags { }; /** - * enum batadv_tt_client_flags - TT client specific flags - * @BATADV_TT_CLIENT_DEL: the client has to be deleted from the table - * @BATADV_TT_CLIENT_ROAM: the client roamed to/from another node and the new - * update telling its new real location has not been received/sent yet - * @BATADV_TT_CLIENT_WIFI: this client is connected through a wifi interface. - * This information is used by the "AP Isolation" feature - * @BATADV_TT_CLIENT_ISOLA: this client is considered "isolated". This - * information is used by the Extended Isolation feature - * @BATADV_TT_CLIENT_NOPURGE: this client should never be removed from the table - * @BATADV_TT_CLIENT_NEW: this client has been added to the local table but has - * not been announced yet - * @BATADV_TT_CLIENT_PENDING: this client is marked for removal but it is kept - * in the table for one more originator interval for consistency purposes - * @BATADV_TT_CLIENT_TEMP: this global client has been detected to be part of - * the network but no nnode has already announced it - * - * Bits from 0 to 7 are called _remote flags_ because they are sent on the wire. - * Bits from 8 to 15 are called _local flags_ because they are used for local - * computations only. - * - * Bits from 4 to 7 - a subset of remote flags - are ensured to be in sync with - * the other nodes in the network. To achieve this goal these flags are included - * in the TT CRC computation. 
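The batadv_tt_client_flags block being removed here does not disappear from the tree: with TT state now exported over generic netlink, this series moves the client flags into the userspace-visible header (note the new <uapi/linux/batman_adv.h> includes elsewhere in this merge), so the remote/local bit split becomes part of the netlink ABI. A minimal sketch of how that split is expressed as masks; the two mask names follow the ones batman-adv uses, while the helper is purely illustrative:

    #define BATADV_TT_REMOTE_MASK  0x00FF  /* bits 0-7: transmitted on the wire */
    #define BATADV_TT_SYNC_MASK    0x00F0  /* bits 4-7: covered by the TT CRC */

    /* illustrative helper: strip local-only flags before an entry is
     * announced to other nodes
     */
    static inline u16 batadv_tt_wire_flags(u16 flags)
    {
            return flags & BATADV_TT_REMOTE_MASK;
    }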
- */ -enum batadv_tt_client_flags { - BATADV_TT_CLIENT_DEL = BIT(0), - BATADV_TT_CLIENT_ROAM = BIT(1), - BATADV_TT_CLIENT_WIFI = BIT(4), - BATADV_TT_CLIENT_ISOLA = BIT(5), - BATADV_TT_CLIENT_NOPURGE = BIT(8), - BATADV_TT_CLIENT_NEW = BIT(9), - BATADV_TT_CLIENT_PENDING = BIT(10), - BATADV_TT_CLIENT_TEMP = BIT(11), -}; - -/** * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field * @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 3d199478c405..7e8dc648b95a 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -74,11 +74,23 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, if (!orig_ifinfo) return; - rcu_read_lock(); - curr_router = rcu_dereference(orig_ifinfo->router); - if (curr_router && !kref_get_unless_zero(&curr_router->refcount)) - curr_router = NULL; - rcu_read_unlock(); + spin_lock_bh(&orig_node->neigh_list_lock); + /* curr_router used earlier may not be the current orig_ifinfo->router + * anymore because it was dereferenced outside of the neigh_list_lock + * protected region. After the new best neighbor has replace the current + * best neighbor the reference counter needs to decrease. Consequently, + * the code needs to ensure the curr_router variable contains a pointer + * to the replaced best neighbor. + */ + curr_router = rcu_dereference_protected(orig_ifinfo->router, true); + + /* increase refcount of new best neighbor */ + if (neigh_node) + kref_get(&neigh_node->refcount); + + rcu_assign_pointer(orig_ifinfo->router, neigh_node); + spin_unlock_bh(&orig_node->neigh_list_lock); + batadv_orig_ifinfo_put(orig_ifinfo); /* route deleted */ if ((curr_router) && (!neigh_node)) { @@ -100,27 +112,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, curr_router->addr); } - if (curr_router) - batadv_neigh_node_put(curr_router); - - spin_lock_bh(&orig_node->neigh_list_lock); - /* curr_router used earlier may not be the current orig_ifinfo->router - * anymore because it was dereferenced outside of the neigh_list_lock - * protected region. After the new best neighbor has replace the current - * best neighbor the reference counter needs to decrease. Consequently, - * the code needs to ensure the curr_router variable contains a pointer - * to the replaced best neighbor. - */ - curr_router = rcu_dereference_protected(orig_ifinfo->router, true); - - /* increase refcount of new best neighbor */ - if (neigh_node) - kref_get(&neigh_node->refcount); - - rcu_assign_pointer(orig_ifinfo->router, neigh_node); - spin_unlock_bh(&orig_node->neigh_list_lock); - batadv_orig_ifinfo_put(orig_ifinfo); - /* decrease refcount of previous best neighbor */ if (curr_router) batadv_neigh_node_put(curr_router); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 6191159484df..8d4e1f578574 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -315,8 +315,7 @@ out: * * Wrap the given skb into a batman-adv unicast or unicast-4addr header * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied - * as packet_type. Then send this frame to the given orig_node and release a - * reference to this orig_node. + * as packet_type. Then send this frame to the given orig_node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
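The _batadv_update_route() rework above deserves a second look: the old code fetched the router under rcu_read_lock(), dropped the lock, and only later swapped the pointer under neigh_list_lock, so the reference it finally released was not necessarily the router it had just unpublished. The new ordering performs the lookup, the kref_get() for the new neighbor, and the pointer publication inside one locked region. Condensed, the pattern is (identifiers are the patch's own):

    spin_lock_bh(&orig_node->neigh_list_lock);
    /* safe without rcu_read_lock(): writers serialize on neigh_list_lock */
    curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
    if (neigh_node)
            kref_get(&neigh_node->refcount);  /* reference held by the pointer */
    rcu_assign_pointer(orig_ifinfo->router, neigh_node);
    spin_unlock_bh(&orig_node->neigh_list_lock);

    if (curr_router)
            batadv_neigh_node_put(curr_router);  /* the router unpublished above */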
*/ @@ -370,8 +369,6 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, ret = NET_XMIT_SUCCESS; out: - if (orig_node) - batadv_orig_node_put(orig_node); if (ret == NET_XMIT_DROP) kfree_skb(skb); return ret; @@ -403,6 +400,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; u8 *src, *dst; + int ret; src = ethhdr->h_source; dst = ethhdr->h_dest; @@ -414,8 +412,13 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, } orig_node = batadv_transtable_search(bat_priv, src, dst, vid); - return batadv_send_skb_unicast(bat_priv, skb, packet_type, - packet_subtype, orig_node, vid); + ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, + packet_subtype, orig_node, vid); + + if (orig_node) + batadv_orig_node_put(orig_node); + + return ret; } /** @@ -433,12 +436,25 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; + int ret; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, - BATADV_P_DATA, orig_node, vid); + ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); + + if (orig_node) + batadv_orig_node_put(orig_node); + + return ret; } +/** + * batadv_forw_packet_free - free a forwarding packet + * @forw_packet: The packet to free + * + * This frees a forwarding packet and releases any resources it might + * have claimed. + */ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) { kfree_skb(forw_packet->skb); @@ -446,9 +462,73 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet) batadv_hardif_put(forw_packet->if_incoming); if (forw_packet->if_outgoing) batadv_hardif_put(forw_packet->if_outgoing); + if (forw_packet->queue_left) + atomic_inc(forw_packet->queue_left); kfree(forw_packet); } +/** + * batadv_forw_packet_alloc - allocate a forwarding packet + * @if_incoming: The (optional) if_incoming to be grabbed + * @if_outgoing: The (optional) if_outgoing to be grabbed + * @queue_left: The (optional) queue counter to decrease + * @bat_priv: The bat_priv for the mesh of this forw_packet + * + * Allocates a forwarding packet and tries to get a reference to the + * (optional) if_incoming, if_outgoing and queue_left. If queue_left + * is NULL then bat_priv is optional, too. + * + * Return: An allocated forwarding packet on success, NULL otherwise. 
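batadv_forw_packet_alloc() (whose body follows), together with the queue_left handling added to batadv_forw_packet_free() above, ties the queue-slot accounting to the object lifetime; that is what lets the later hunks delete every open-coded atomic_inc() of bcast_queue_left/batman_queue_left on the error and purge paths. A usage sketch with the error handling condensed:

    struct batadv_forw_packet *forw_packet;

    /* reserves a slot in bcast_queue_left and references primary_if */
    forw_packet = batadv_forw_packet_alloc(primary_if, NULL,
                                           &bat_priv->bcast_queue_left,
                                           bat_priv);
    if (!forw_packet)
            return NETDEV_TX_BUSY;  /* queue full or out of memory */

    /* ... attach the skb and schedule the delayed work ... */

    batadv_forw_packet_free(forw_packet);  /* also gives the queue slot back */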
+ */ +struct batadv_forw_packet * +batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + atomic_t *queue_left, + struct batadv_priv *bat_priv) +{ + struct batadv_forw_packet *forw_packet; + const char *qname; + + if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) { + qname = "unknown"; + + if (queue_left == &bat_priv->bcast_queue_left) + qname = "bcast"; + + if (queue_left == &bat_priv->batman_queue_left) + qname = "batman"; + + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "%s queue is full\n", qname); + + return NULL; + } + + forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); + if (!forw_packet) + goto err; + + if (if_incoming) + kref_get(&if_incoming->refcount); + + if (if_outgoing) + kref_get(&if_outgoing->refcount); + + forw_packet->skb = NULL; + forw_packet->queue_left = queue_left; + forw_packet->if_incoming = if_incoming; + forw_packet->if_outgoing = if_outgoing; + forw_packet->num_packets = 0; + + return forw_packet; + +err: + if (queue_left) + atomic_inc(queue_left); + + return NULL; +} + static void _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, @@ -487,24 +567,20 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, struct batadv_bcast_packet *bcast_packet; struct sk_buff *newskb; - if (!batadv_atomic_dec_not_zero(&bat_priv->bcast_queue_left)) { - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "bcast packet queue full\n"); - goto out; - } - primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - goto out_and_inc; - - forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); + goto err; + forw_packet = batadv_forw_packet_alloc(primary_if, NULL, + &bat_priv->bcast_queue_left, + bat_priv); + batadv_hardif_put(primary_if); if (!forw_packet) - goto out_and_inc; + goto err; newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) - goto packet_free; + goto err_packet_free; /* as we have a copy now, it is safe to decrease the TTL */ bcast_packet = (struct batadv_bcast_packet *)newskb->data; @@ -513,11 +589,6 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, skb_reset_mac_header(newskb); forw_packet->skb = newskb; - forw_packet->if_incoming = primary_if; - forw_packet->if_outgoing = NULL; - - /* how often did we send the bcast packet ? 
*/ - forw_packet->num_packets = 0; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); @@ -525,13 +596,9 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, _batadv_add_bcast_packet_to_list(bat_priv, forw_packet, delay); return NETDEV_TX_OK; -packet_free: - kfree(forw_packet); -out_and_inc: - atomic_inc(&bat_priv->bcast_queue_left); -out: - if (primary_if) - batadv_hardif_put(primary_if); +err_packet_free: + batadv_forw_packet_free(forw_packet); +err: return NETDEV_TX_BUSY; } @@ -592,7 +659,6 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) out: batadv_forw_packet_free(forw_packet); - atomic_inc(&bat_priv->bcast_queue_left); } void @@ -633,9 +699,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); - if (!forw_packet->own) - atomic_inc(&bat_priv->bcast_queue_left); - batadv_forw_packet_free(forw_packet); } } @@ -663,9 +726,6 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); - if (!forw_packet->own) - atomic_inc(&bat_priv->batman_queue_left); - batadv_forw_packet_free(forw_packet); } } diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 7cecb7563b45..999f78683d9e 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -28,6 +28,12 @@ struct sk_buff; void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet); +struct batadv_forw_packet * +batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing, + atomic_t *queue_left, + struct batadv_priv *bat_priv); + int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 7527c0652dd5..49e16b6e0ba3 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -39,6 +39,7 @@ #include <linux/random.h> #include <linux/rculist.h> #include <linux/rcupdate.h> +#include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/socket.h> @@ -46,7 +47,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> -#include <linux/workqueue.h> #include "bat_algo.h" #include "bridge_loop_avoidance.h" @@ -57,6 +57,7 @@ #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" +#include "originator.h" #include "packet.h" #include "send.h" #include "sysfs.h" @@ -377,6 +378,8 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: + if (mcast_single_orig) + batadv_orig_node_put(mcast_single_orig); if (primary_if) batadv_hardif_put(primary_if); return NETDEV_TX_OK; @@ -591,6 +594,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) } spin_lock_bh(&bat_priv->softif_vlan_list_lock); + kref_get(&vlan->refcount); hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); spin_unlock_bh(&bat_priv->softif_vlan_list_lock); @@ -601,6 +605,9 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) bat_priv->soft_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); + /* don't return reference to new softif_vlan */ + batadv_softif_vlan_put(vlan); + return 0; } @@ -747,34 +754,6 @@ static void batadv_set_lockdep_class(struct net_device *dev) } /** - * batadv_softif_destroy_finish - cleans up the remains of a softif - * @work: work queue item - * - * Free the parts of the 
soft interface which can not be removed under - * rtnl lock (to prevent deadlock situations). - */ -static void batadv_softif_destroy_finish(struct work_struct *work) -{ - struct batadv_softif_vlan *vlan; - struct batadv_priv *bat_priv; - struct net_device *soft_iface; - - bat_priv = container_of(work, struct batadv_priv, - cleanup_work); - soft_iface = bat_priv->soft_iface; - - /* destroy the "untagged" VLAN */ - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); - if (vlan) { - batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_put(vlan); - } - - batadv_sysfs_del_meshif(soft_iface); - unregister_netdev(soft_iface); -} - -/** * batadv_softif_init_late - late stage initialization of soft interface * @dev: registered network device to modify * @@ -791,7 +770,6 @@ static int batadv_softif_init_late(struct net_device *dev) bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; - INIT_WORK(&bat_priv->cleanup_work, batadv_softif_destroy_finish); /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called @@ -1028,8 +1006,19 @@ struct net_device *batadv_softif_create(struct net *net, const char *name) void batadv_softif_destroy_sysfs(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_softif_vlan *vlan; - queue_work(batadv_event_workqueue, &bat_priv->cleanup_work); + ASSERT_RTNL(); + + /* destroy the "untagged" VLAN */ + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (vlan) { + batadv_softif_destroy_vlan(bat_priv, vlan); + batadv_softif_vlan_put(vlan); + } + + batadv_sysfs_del_meshif(soft_iface); + unregister_netdevice(soft_iface); } /** diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index fe9ca94ddee2..02d96f224c60 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -37,6 +37,7 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/stringify.h> +#include <linux/workqueue.h> #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" @@ -428,6 +429,13 @@ static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr, struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); int bytes_written; + /* GW mode is not available if the routing algorithm in use does not + * implement the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -ENOENT; + switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_CLIENT: bytes_written = sprintf(buff, "%s\n", @@ -455,6 +463,13 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, char *curr_gw_mode_str; int gw_mode_tmp = -1; + /* toggling GW mode is allowed only if the routing algorithm in use + * provides the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -EINVAL; + if (buff[count - 1] == '\n') buff[count - 1] = '\0'; @@ -514,6 +529,50 @@ static ssize_t batadv_store_gw_mode(struct kobject *kobj, return count; } +static ssize_t batadv_show_gw_sel_class(struct kobject *kobj, + struct attribute *attr, char *buff) +{ + struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + + /* GW selection class is not available if the routing algorithm in use + * does not implement the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -ENOENT; + + if (bat_priv->algo_ops->gw.show_sel_class) + return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff); + + return sprintf(buff, 
"%i\n", atomic_read(&bat_priv->gw.sel_class)); +} + +static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, + struct attribute *attr, char *buff, + size_t count) +{ + struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); + + /* setting the GW selection class is allowed only if the routing + * algorithm in use implements the GW API + */ + if (!bat_priv->algo_ops->gw.get_best_gw_node || + !bat_priv->algo_ops->gw.is_eligible) + return -EINVAL; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (bat_priv->algo_ops->gw.store_sel_class) + return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff, + count); + + return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, + batadv_post_gw_reselect, attr, + &bat_priv->gw.sel_class, + bat_priv->soft_iface); +} + static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, struct attribute *attr, char *buff) { @@ -625,8 +684,8 @@ BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, S_IRUGO | S_IWUSR, 2 * BATADV_JITTER, INT_MAX, NULL); BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, S_IRUGO | S_IWUSR, 0, BATADV_TQ_MAX_VALUE, NULL); -BATADV_ATTR_SIF_UINT(gw_sel_class, gw.sel_class, S_IRUGO | S_IWUSR, 1, - BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect); +static BATADV_ATTR(gw_sel_class, S_IRUGO | S_IWUSR, batadv_show_gw_sel_class, + batadv_store_gw_sel_class); static BATADV_ATTR(gw_bandwidth, S_IRUGO | S_IWUSR, batadv_show_gw_bwidth, batadv_store_gw_bwidth); #ifdef CONFIG_BATMAN_ADV_MCAST @@ -712,6 +771,8 @@ rem_attr: for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); + kobject_del(bat_priv->mesh_obj); kobject_put(bat_priv->mesh_obj); bat_priv->mesh_obj = NULL; out: @@ -726,6 +787,8 @@ void batadv_sysfs_del_meshif(struct net_device *dev) for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr)); + kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE); + kobject_del(bat_priv->mesh_obj); kobject_put(bat_priv->mesh_obj); bat_priv->mesh_obj = NULL; } @@ -781,6 +844,10 @@ rem_attr: for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); + if (vlan->kobj != bat_priv->mesh_obj) { + kobject_uevent(vlan->kobj, KOBJ_REMOVE); + kobject_del(vlan->kobj); + } kobject_put(vlan->kobj); vlan->kobj = NULL; out: @@ -800,6 +867,10 @@ void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv, for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr)); + if (vlan->kobj != bat_priv->mesh_obj) { + kobject_uevent(vlan->kobj, KOBJ_REMOVE); + kobject_del(vlan->kobj); + } kobject_put(vlan->kobj); vlan->kobj = NULL; } @@ -828,31 +899,31 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj, return length; } -static ssize_t batadv_store_mesh_iface(struct kobject *kobj, - struct attribute *attr, char *buff, - size_t count) +/** + * batadv_store_mesh_iface_finish - store new hardif mesh_iface state + * @net_dev: netdevice to add/remove to/from batman-adv soft-interface + * @ifname: name of soft-interface to modify + * + * Changes the parts of the hard+soft interface which can not be modified under + * sysfs lock (to prevent deadlock situations). 
+ * + * Return: 0 on success, < 0 on failure + */ +static int batadv_store_mesh_iface_finish(struct net_device *net_dev, + char ifname[IFNAMSIZ]) { - struct net_device *net_dev = batadv_kobj_to_netdev(kobj); struct net *net = dev_net(net_dev); struct batadv_hard_iface *hard_iface; - int status_tmp = -1; - int ret = count; + int status_tmp; + int ret = 0; + + ASSERT_RTNL(); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface) - return count; - - if (buff[count - 1] == '\n') - buff[count - 1] = '\0'; - - if (strlen(buff) >= IFNAMSIZ) { - pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", - buff); - batadv_hardif_put(hard_iface); - return -EINVAL; - } + return 0; - if (strncmp(buff, "none", 4) == 0) + if (strncmp(ifname, "none", 4) == 0) status_tmp = BATADV_IF_NOT_IN_USE; else status_tmp = BATADV_IF_I_WANT_YOU; @@ -861,15 +932,13 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, goto out; if ((hard_iface->soft_iface) && - (strncmp(hard_iface->soft_iface->name, buff, IFNAMSIZ) == 0) + (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)) goto out; - rtnl_lock(); - if (status_tmp == BATADV_IF_NOT_IN_USE) { batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_AUTO); - goto unlock; + goto out; } /* if the interface already is in use */ @@ -877,15 +946,71 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_AUTO); - ret = batadv_hardif_enable_interface(hard_iface, net, buff); - -unlock: - rtnl_unlock(); + ret = batadv_hardif_enable_interface(hard_iface, net, ifname); out: batadv_hardif_put(hard_iface); return ret; } +/** + * batadv_store_mesh_iface_work - store new hardif mesh_iface state + * @work: work queue item + * + * Changes the parts of the hard+soft interface which can not be modified under + * sysfs lock (to prevent deadlock situations). 
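The store handler itself (next hunk) shrinks to queueing a batadv_store_mesh_work item: the rtnl-locked reconfiguration now runs from batadv_event_workqueue instead of the sysfs write path, which breaks the rtnl-vs-sysfs lock inversion the old inline rtnl_lock() could deadlock on. The shape of the deferral, condensed from the patch's own code:

    dev_hold(net_dev);  /* keep the netdev alive until the work item runs */
    INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
    store_work->net_dev = net_dev;
    strlcpy(store_work->soft_iface_name, buff,
            sizeof(store_work->soft_iface_name));
    queue_work(batadv_event_workqueue, &store_work->work);
    /* the work item takes rtnl_lock(), applies the change, then releases
     * the netdev with dev_put() and frees itself
     */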
+ */ +static void batadv_store_mesh_iface_work(struct work_struct *work) +{ + struct batadv_store_mesh_work *store_work; + int ret; + + store_work = container_of(work, struct batadv_store_mesh_work, work); + + rtnl_lock(); + ret = batadv_store_mesh_iface_finish(store_work->net_dev, + store_work->soft_iface_name); + rtnl_unlock(); + + if (ret < 0) + pr_err("Failed to store new mesh_iface state %s for %s: %d\n", + store_work->soft_iface_name, store_work->net_dev->name, + ret); + + dev_put(store_work->net_dev); + kfree(store_work); +} + +static ssize_t batadv_store_mesh_iface(struct kobject *kobj, + struct attribute *attr, char *buff, + size_t count) +{ + struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct batadv_store_mesh_work *store_work; + + if (buff[count - 1] == '\n') + buff[count - 1] = '\0'; + + if (strlen(buff) >= IFNAMSIZ) { + pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n", + buff); + return -EINVAL; + } + + store_work = kmalloc(sizeof(*store_work), GFP_KERNEL); + if (!store_work) + return -ENOMEM; + + dev_hold(net_dev); + INIT_WORK(&store_work->work, batadv_store_mesh_iface_work); + store_work->net_dev = net_dev; + strlcpy(store_work->soft_iface_name, buff, + sizeof(store_work->soft_iface_name)); + + queue_work(batadv_event_workqueue, &store_work->work); + + return count; +} + static ssize_t batadv_show_iface_status(struct kobject *kobj, struct attribute *attr, char *buff) { @@ -1048,6 +1173,8 @@ out: void batadv_sysfs_del_hardif(struct kobject **hardif_obj) { + kobject_uevent(*hardif_obj, KOBJ_REMOVE); + kobject_del(*hardif_obj); kobject_put(*hardif_obj); *hardif_obj = NULL; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7e6df7a4964a..7f663092f6de 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -22,12 +22,14 @@ #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> +#include <linux/cache.h> #include <linux/compiler.h> #include <linux/crc32c.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/if_ether.h> +#include <linux/init.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kernel.h> @@ -35,25 +37,39 @@ #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> +#include <net/genetlink.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "multicast.h" +#include "netlink.h" #include "originator.h" #include "packet.h" #include "soft-interface.h" #include "tvlv.h" +static struct kmem_cache *batadv_tl_cache __read_mostly; +static struct kmem_cache *batadv_tg_cache __read_mostly; +static struct kmem_cache *batadv_tt_orig_cache __read_mostly; +static struct kmem_cache *batadv_tt_change_cache __read_mostly; +static struct kmem_cache *batadv_tt_req_cache __read_mostly; +static struct kmem_cache *batadv_tt_roam_cache __read_mostly; + /* hash class keys */ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; @@ -205,6 +221,20 @@ 
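The six kmem_cache pools declared above replace plain kmalloc()/kfree_rcu() for the translation-table objects. Dedicated caches reduce allocator overhead for these small, high-churn structures, but they carry one constraint: kfree_rcu() can only return memory to the generic slab, so every RCU-deferred free must become an explicit call_rcu() callback that ends in kmem_cache_free(), which is what the new *_free_rcu() helpers below provide. The generic shape of the conversion, for a hypothetical struct foo:

    struct foo {
            struct kref refcount;
            struct rcu_head rcu;
    };

    static struct kmem_cache *foo_cache __read_mostly;

    static void foo_free_rcu(struct rcu_head *rcu)
    {
            struct foo *f = container_of(rcu, struct foo, rcu);

            kmem_cache_free(foo_cache, f);  /* kfree_rcu() would call kfree() */
    }

    static void foo_release(struct kref *ref)
    {
            struct foo *f = container_of(ref, struct foo, refcount);

            call_rcu(&f->rcu, foo_free_rcu);  /* wait out the grace period */
    }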
batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, } /** + * batadv_tt_local_entry_free_rcu - free the tt_local_entry + * @rcu: rcu pointer of the tt_local_entry + */ +static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_local_entry *tt_local_entry; + + tt_local_entry = container_of(rcu, struct batadv_tt_local_entry, + common.rcu); + + kmem_cache_free(batadv_tl_cache, tt_local_entry); +} + +/** * batadv_tt_local_entry_release - release tt_local_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node @@ -218,7 +248,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) batadv_softif_vlan_put(tt_local_entry->vlan); - kfree_rcu(tt_local_entry, common.rcu); + call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu); } /** @@ -234,6 +264,20 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) } /** + * batadv_tt_global_entry_free_rcu - free the tt_global_entry + * @rcu: rcu pointer of the tt_global_entry + */ +static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_global_entry *tt_global_entry; + + tt_global_entry = container_of(rcu, struct batadv_tt_global_entry, + common.rcu); + + kmem_cache_free(batadv_tg_cache, tt_global_entry); +} + +/** * batadv_tt_global_entry_release - release tt_global_entry from lists and queue * for free after rcu grace period * @ref: kref pointer of the nc_node @@ -246,7 +290,8 @@ static void batadv_tt_global_entry_release(struct kref *ref) common.refcount); batadv_tt_global_del_orig_list(tt_global_entry); - kfree_rcu(tt_global_entry, common.rcu); + + call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu); } /** @@ -384,6 +429,19 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node, } /** + * batadv_tt_orig_list_entry_free_rcu - free the orig_entry + * @rcu: rcu pointer of the orig_entry + */ +static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) +{ + struct batadv_tt_orig_list_entry *orig_entry; + + orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); + + kmem_cache_free(batadv_tt_orig_cache, orig_entry); +} + +/** * batadv_tt_orig_list_entry_release - release tt orig entry from lists and * queue for free after rcu grace period * @ref: kref pointer of the tt orig entry @@ -396,7 +454,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) refcount); batadv_orig_node_put(orig_entry->orig_node); - kfree_rcu(orig_entry, rcu); + call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu); } /** @@ -426,7 +484,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, bool event_removed = false; bool del_op_requested, del_op_entry; - tt_change_node = kmalloc(sizeof(*tt_change_node), GFP_ATOMIC); + tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) return; @@ -467,8 +525,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, continue; del: list_del(&entry->list); - kfree(entry); - kfree(tt_change_node); + kmem_cache_free(batadv_tt_change_cache, entry); + kmem_cache_free(batadv_tt_change_cache, tt_change_node); event_removed = true; goto unlock; } @@ -646,7 +704,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, goto out; } - tt_local = kmalloc(sizeof(*tt_local), GFP_ATOMIC); + tt_local = kmem_cache_alloc(batadv_tl_cache, GFP_ATOMIC); if (!tt_local) goto out; @@ -656,7 +714,7 @@ bool batadv_tt_local_add(struct net_device 
*soft_iface, const u8 *addr, net_ratelimited_function(batadv_info, soft_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, BATADV_PRINT_VID(vid)); - kfree(tt_local); + kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; } @@ -676,7 +734,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (batadv_is_wifi_netdev(in_dev)) tt_local->common.flags |= BATADV_TT_CLIENT_WIFI; kref_init(&tt_local->common.refcount); - kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; tt_local->vlan = vlan; @@ -688,6 +745,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; + kref_get(&tt_local->common.refcount); hash_added = batadv_hash_add(bat_priv->tt.local_hash, batadv_compare_tt, batadv_choose_tt, &tt_local->common, &tt_local->common.hash_entry); @@ -959,7 +1017,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) tt_diff_entries_count++; } list_del(&entry->list); - kfree(entry); + kmem_cache_free(batadv_tt_change_cache, entry); } spin_unlock_bh(&bat_priv->tt.changes_list_lock); @@ -989,6 +1047,7 @@ container_register: kfree(tt_data); } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; @@ -1056,6 +1115,165 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_tt_local_dump_entry - Dump one TT local entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_tt_common_entry *common) +{ + void *hdr; + struct batadv_softif_vlan *vlan; + struct batadv_tt_local_entry *local; + unsigned int last_seen_msecs; + u32 crc; + + local = container_of(common, struct batadv_tt_local_entry, common); + last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); + + vlan = batadv_softif_vlan_get(bat_priv, common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_softif_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_LOCAL); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (!(common->flags & BATADV_TT_CLIENT_NOPURGE) && + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_local_dump_bucket - Dump one TT local bucket into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @head: Pointer to the list containing the local tt entries + * @idx_s: Number of entries to skip + * + * Return: Error code, or 0 on 
success + */ +static int +batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct hlist_head *head, int *idx_s) +{ + struct batadv_tt_common_entry *common; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(common, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv, + common)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + return 0; +} + +/** + * batadv_tt_local_dump - Dump TT local entries into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: Error code, or 0 on success + */ +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_hashtable *hash; + struct hlist_head *head; + int ret; + int ifindex; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int portid = NETLINK_CB(cb->skb).portid; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hash = bat_priv->tt.local_hash; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, head, &idx)) + break; + + bucket++; + } + + ret = msg->len; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + cb->args[0] = bucket; + cb->args[1] = idx; + + return ret; +} static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, @@ -1259,7 +1477,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { list_del(&entry->list); - kfree(entry); + kmem_cache_free(batadv_tt_change_cache, entry); } atomic_set(&bat_priv->tt.local_changes, 0); @@ -1341,7 +1559,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, goto out; } - orig_entry = kzalloc(sizeof(*orig_entry), GFP_ATOMIC); + orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); if (!orig_entry) goto out; @@ -1351,9 +1569,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; kref_init(&orig_entry->refcount); - kref_get(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); + kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); spin_unlock_bh(&tt_global->list_lock); @@ -1411,7 +1629,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, goto out; if (!tt_global_entry) { - tt_global_entry = kzalloc(sizeof(*tt_global_entry), GFP_ATOMIC); + tt_global_entry = kmem_cache_zalloc(batadv_tg_cache, + GFP_ATOMIC); if (!tt_global_entry) goto out; @@ -1428,13 +1647,13 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, if (flags & BATADV_TT_CLIENT_ROAM) tt_global_entry->roam_at = jiffies; kref_init(&common->refcount); - kref_get(&common->refcount); common->added_at = 
jiffies; INIT_HLIST_HEAD(&tt_global_entry->orig_list); atomic_set(&tt_global_entry->orig_list_count, 0); spin_lock_init(&tt_global_entry->list_lock); + kref_get(&common->refcount); hash_added = batadv_hash_add(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, common, @@ -1579,6 +1798,7 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, return best_entry; } +#ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_tt_global_print_entry - print all orig nodes who announce the address * for this global entry @@ -1702,6 +1922,219 @@ out: batadv_hardif_put(primary_if); return 0; } +#endif + +/** + * batadv_tt_global_dump_subentry - Dump one originator entry of a TT global entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @common: tt local & tt global common data + * @orig: Originator node announcing a non-mesh client + * @best: Is the best originator for the TT entry + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_tt_common_entry *common, + struct batadv_tt_orig_list_entry *orig, + bool best) +{ + void *hdr; + struct batadv_orig_node_vlan *vlan; + u8 last_ttvn; + u32 crc; + + vlan = batadv_orig_node_vlan_get(orig->orig_node, + common->vid); + if (!vlan) + return 0; + + crc = vlan->tt.crc; + + batadv_orig_node_vlan_put(vlan); + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, + BATADV_CMD_GET_TRANSTABLE_GLOBAL); + if (!hdr) + return -ENOBUFS; + + last_ttvn = atomic_read(&orig->orig_node->last_ttvn); + + if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) || + nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig->orig_node->orig) || + nla_put_u8(msg, BATADV_ATTR_TT_TTVN, orig->ttvn) || + nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || + nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || + nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + goto nla_put_failure; + + if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_tt_global_dump_entry - Dump one TT global entry into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @common: tt local & tt global common data + * @sub_s: Number of entries to skip + * + * This function assumes the caller holds rcu_read_lock(). 
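The global-table dump below walks three nested levels (hash bucket, TT entry, per-originator subentry), so it keeps three cursors in cb->args[] across netlink dump invocations. When the message fills up, each level rewinds its counter by one, so the first entry that did not fit is the first one retried on the next pass. The resume logic, condensed:

    int bucket = cb->args[0];  /* hash bucket to resume from */
    int idx = cb->args[1];     /* entry within that bucket */
    int sub = cb->args[2];     /* subentry within that entry */

    while (bucket < hash->size) {
            /* returns -EMSGSIZE (after rewinding idx/sub) when msg is full */
            if (batadv_tt_global_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
                                             bat_priv, &hash->table[bucket],
                                             &idx, &sub))
                    break;
            bucket++;
    }

    cb->args[0] = bucket;  /* the netlink core re-invokes the dump with these */
    cb->args[1] = idx;
    cb->args[2] = sub;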
+ * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct batadv_tt_common_entry *common, int *sub_s) +{ + struct batadv_tt_orig_list_entry *orig_entry, *best_entry; + struct batadv_tt_global_entry *global; + struct hlist_head *head; + int sub = 0; + bool best; + + global = container_of(common, struct batadv_tt_global_entry, common); + best_entry = batadv_transtable_best_orig(bat_priv, global); + head = &global->orig_list; + + hlist_for_each_entry_rcu(orig_entry, head, list) { + if (sub++ < *sub_s) + continue; + + best = (orig_entry == best_entry); + + if (batadv_tt_global_dump_subentry(msg, portid, seq, common, + orig_entry, best)) { + *sub_s = sub - 1; + return -EMSGSIZE; + } + } + + *sub_s = 0; + return 0; +} + +/** + * batadv_tt_global_dump_bucket - Dump one TT global bucket into a message + * @msg: Netlink message to dump into + * @portid: Port making netlink request + * @seq: Sequence number of netlink message + * @bat_priv: The bat priv with all the soft interface information + * @head: Pointer to the list containing the global tt entries + * @idx_s: Number of entries to skip + * @sub: Number of subentries to skip within the current entry + * + * Return: Error code, or 0 on success + */ +static int +batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, + struct hlist_head *head, int *idx_s, int *sub) +{ + struct batadv_tt_common_entry *common; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(common, head, hash_entry) { + if (idx++ < *idx_s) + continue; + + if (batadv_tt_global_dump_entry(msg, portid, seq, bat_priv, + common, sub)) { + rcu_read_unlock(); + *idx_s = idx - 1; + return -EMSGSIZE; + } + } + rcu_read_unlock(); + + *idx_s = 0; + *sub = 0; + return 0; +} + +/** + * batadv_tt_global_dump - Dump TT global entries into a message + * @msg: Netlink message to dump into + * @cb: Parameters from query + * + * Return: Error code, or length of message on success + */ +int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + struct batadv_hard_iface *primary_if = NULL; + struct batadv_hashtable *hash; + struct hlist_head *head; + int ret; + int ifindex; + int bucket = cb->args[0]; + int idx = cb->args[1]; + int sub = cb->args[2]; + int portid = NETLINK_CB(cb->skb).portid; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + hash = bat_priv->tt.global_hash; + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_tt_global_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, bat_priv, + head, &idx, &sub)) + break; + + bucket++; + } + + ret = msg->len; + + out: + if (primary_if) + batadv_hardif_put(primary_if); + if (soft_iface) + dev_put(soft_iface); + + cb->args[0] = bucket; + cb->args[1] = idx; + cb->args[2] = sub; + + return ret; +} /** * _batadv_tt_global_del_orig_entry - remove and free an orig_entry @@ -2280,7 +2713,7 @@ static void batadv_tt_req_node_release(struct kref *ref) tt_req_node = 
container_of(ref, struct batadv_tt_req_node, refcount); - kfree(tt_req_node); + kmem_cache_free(batadv_tt_req_cache, tt_req_node); } /** @@ -2367,7 +2800,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, goto unlock; } - tt_req_node = kmalloc(sizeof(*tt_req_node), GFP_ATOMIC); + tt_req_node = kmem_cache_alloc(batadv_tt_req_cache, GFP_ATOMIC); if (!tt_req_node) goto unlock; @@ -3104,7 +3537,7 @@ static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) list_for_each_entry_safe(node, safe, &bat_priv->tt.roam_list, list) { list_del(&node->list); - kfree(node); + kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); @@ -3121,7 +3554,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) continue; list_del(&node->list); - kfree(node); + kmem_cache_free(batadv_tt_roam_cache, node); } spin_unlock_bh(&bat_priv->tt.roam_list_lock); } @@ -3162,7 +3595,8 @@ static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) } if (!ret) { - tt_roam_node = kmalloc(sizeof(*tt_roam_node), GFP_ATOMIC); + tt_roam_node = kmem_cache_alloc(batadv_tt_roam_cache, + GFP_ATOMIC); if (!tt_roam_node) goto unlock; @@ -3865,3 +4299,85 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, return ret; } + +/** + * batadv_tt_cache_init - Initialize tt memory object cache + * + * Return: 0 on success or negative error number in case of failure. + */ +int __init batadv_tt_cache_init(void) +{ + size_t tl_size = sizeof(struct batadv_tt_local_entry); + size_t tg_size = sizeof(struct batadv_tt_global_entry); + size_t tt_orig_size = sizeof(struct batadv_tt_orig_list_entry); + size_t tt_change_size = sizeof(struct batadv_tt_change_node); + size_t tt_req_size = sizeof(struct batadv_tt_req_node); + size_t tt_roam_size = sizeof(struct batadv_tt_roam_node); + + batadv_tl_cache = kmem_cache_create("batadv_tl_cache", tl_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tl_cache) + return -ENOMEM; + + batadv_tg_cache = kmem_cache_create("batadv_tg_cache", tg_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tg_cache) + goto err_tt_tl_destroy; + + batadv_tt_orig_cache = kmem_cache_create("batadv_tt_orig_cache", + tt_orig_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_orig_cache) + goto err_tt_tg_destroy; + + batadv_tt_change_cache = kmem_cache_create("batadv_tt_change_cache", + tt_change_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_change_cache) + goto err_tt_orig_destroy; + + batadv_tt_req_cache = kmem_cache_create("batadv_tt_req_cache", + tt_req_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_req_cache) + goto err_tt_change_destroy; + + batadv_tt_roam_cache = kmem_cache_create("batadv_tt_roam_cache", + tt_roam_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!batadv_tt_roam_cache) + goto err_tt_req_destroy; + + return 0; + +err_tt_req_destroy: + kmem_cache_destroy(batadv_tt_req_cache); + batadv_tt_req_cache = NULL; +err_tt_change_destroy: + kmem_cache_destroy(batadv_tt_change_cache); + batadv_tt_change_cache = NULL; +err_tt_orig_destroy: + kmem_cache_destroy(batadv_tt_orig_cache); + batadv_tt_orig_cache = NULL; +err_tt_tg_destroy: + kmem_cache_destroy(batadv_tg_cache); + batadv_tg_cache = NULL; +err_tt_tl_destroy: + kmem_cache_destroy(batadv_tl_cache); + batadv_tl_cache = NULL; + + return -ENOMEM; +} + +/** + * batadv_tt_cache_destroy - Destroy tt memory object cache + */ +void batadv_tt_cache_destroy(void) +{ + kmem_cache_destroy(batadv_tl_cache); + kmem_cache_destroy(batadv_tg_cache); + 
kmem_cache_destroy(batadv_tt_orig_cache); + kmem_cache_destroy(batadv_tt_change_cache); + kmem_cache_destroy(batadv_tt_req_cache); + kmem_cache_destroy(batadv_tt_roam_cache); +} diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 7c7e2c006bfe..783fdba84db2 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -22,8 +22,10 @@ #include <linux/types.h> +struct netlink_callback; struct net_device; struct seq_file; +struct sk_buff; int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, @@ -33,6 +35,8 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const char *message, bool roaming); int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); +int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); +int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); @@ -59,4 +63,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); +int batadv_tt_cache_init(void); +void batadv_tt_cache_destroy(void); + #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 3d1cf0fb112d..77654f055f24 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -257,8 +257,13 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv_old); + + kref_get(&tvlv_new->refcount); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); + + /* don't return reference to new tvlv_container */ + batadv_tvlv_container_put(tvlv_new); } /** @@ -542,8 +547,12 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, INIT_HLIST_NODE(&tvlv_handler->list); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); + + /* don't return reference to new tvlv_handler */ + batadv_tvlv_handler_put(tvlv_handler); } /** diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index a64522c3b45d..b3dd1a381aad 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -28,6 +28,7 @@ #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/netdevice.h> +#include <linux/netlink.h> #include <linux/sched.h> /* for linux/wait.h */ #include <linux/spinlock.h> #include <linux/types.h> @@ -132,7 +133,6 @@ struct batadv_hard_iface_bat_v { * @rcu: struct used for freeing in an RCU-safe manner * @bat_iv: per hard-interface B.A.T.M.A.N. IV data * @bat_v: per hard-interface B.A.T.M.A.N. 
V data - * @cleanup_work: work queue callback item for hard-interface deinit * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs * @neigh_list: list of unique single hop neighbors via this interface * @neigh_list_lock: lock protecting neigh_list @@ -152,7 +152,6 @@ struct batadv_hard_iface { #ifdef CONFIG_BATMAN_ADV_BATMAN_V struct batadv_hard_iface_bat_v bat_v; #endif - struct work_struct cleanup_work; struct dentry *debug_dir; struct hlist_head neigh_list; /* neigh_list_lock protects: neigh_list */ @@ -1015,7 +1014,6 @@ struct batadv_priv_bat_v { * @forw_bcast_list_lock: lock protecting forw_bcast_list * @tp_list_lock: spinlock protecting @tp_list * @orig_work: work queue callback item for orig node purging - * @cleanup_work: work queue callback item for soft-interface deinit * @primary_if: one of the hard-interfaces assigned to this mesh interface * becomes the primary interface * @algo_ops: routing algorithm used by this mesh interface @@ -1074,7 +1072,6 @@ struct batadv_priv { spinlock_t tp_list_lock; /* protects tp_list */ atomic_t tp_num; struct delayed_work orig_work; - struct work_struct cleanup_work; struct batadv_hard_iface __rcu *primary_if; /* rcu protected pointer */ struct batadv_algo_ops *algo_ops; struct hlist_head softif_vlan_list; @@ -1379,6 +1376,7 @@ struct batadv_skb_cb { * locally generated packet * @if_outgoing: packet where the packet should be sent to, or NULL if * unspecified + * @queue_left: The queue (counter) this packet was applied to */ struct batadv_forw_packet { struct hlist_node list; @@ -1391,11 +1389,13 @@ struct batadv_forw_packet { struct delayed_work delayed_work; struct batadv_hard_iface *if_incoming; struct batadv_hard_iface *if_outgoing; + atomic_t *queue_left; }; /** * struct batadv_algo_iface_ops - mesh algorithm callbacks (interface specific) * @activate: start routing mechanisms when hard-interface is brought up + * (optional) * @enable: init routing info when hard-interface is enabled * @disable: de-init routing info when hard-interface is disabled * @update_mac: (re-)init mac addresses of the protocol information @@ -1413,11 +1413,13 @@ struct batadv_algo_iface_ops { /** * struct batadv_algo_neigh_ops - mesh algorithm callbacks (neighbour specific) * @hardif_init: called on creation of single hop entry + * (optional) * @cmp: compare the metrics of two neighbors for their respective outgoing * interfaces * @is_similar_or_better: check if neigh1 is equally similar or better than * neigh2 for their respective outgoing interface from the metric prospective * @print: print the single hop neighbor list (optional) + * @dump: dump neighbors to a netlink socket (optional) */ struct batadv_algo_neigh_ops { void (*hardif_init)(struct batadv_hardif_neigh_node *neigh); @@ -1429,26 +1431,64 @@ struct batadv_algo_neigh_ops { struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS void (*print)(struct batadv_priv *priv, struct seq_file *seq); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); }; /** * struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific) * @free: free the resources allocated by the routing algorithm for an orig_node - * object + * object (optional) * @add_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a new hard-interface being added into the mesh + * orig_node due to a new 
hard-interface being added into the mesh + * orig_node due to a new hard-interface being added into the mesh (optional) * @del_if: ask the routing algorithm to apply the needed changes to the - * orig_node due to a hard-interface being removed from the mesh + * orig_node due to a hard-interface being removed from the mesh (optional) * @print: print the originator table (optional) + * @dump: dump originators to a netlink socket (optional) */ struct batadv_algo_orig_ops { void (*free)(struct batadv_orig_node *orig_node); int (*add_if)(struct batadv_orig_node *orig_node, int max_if_num); int (*del_if)(struct batadv_orig_node *orig_node, int max_if_num, int del_if_num); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS void (*print)(struct batadv_priv *priv, struct seq_file *seq, struct batadv_hard_iface *hard_iface); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv, + struct batadv_hard_iface *hard_iface); +}; + +/** + * struct batadv_algo_gw_ops - mesh algorithm callbacks (GW specific) + * @store_sel_class: parses and stores a new GW selection class (optional) + * @show_sel_class: prints the current GW selection class (optional) + * @get_best_gw_node: select the best GW from the list of available nodes + * (optional) + * @is_eligible: check if a newly discovered GW is a potential candidate for + * the election as best GW (optional) + * @print: print the gateway table (optional) + * @dump: dump gateways to a netlink socket (optional) + */ +struct batadv_algo_gw_ops { + ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff, + size_t count); + ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff); + struct batadv_gw_node *(*get_best_gw_node) + (struct batadv_priv *bat_priv); + bool (*is_eligible)(struct batadv_priv *bat_priv, + struct batadv_orig_node *curr_gw_orig, + struct batadv_orig_node *orig_node); +#ifdef CONFIG_BATMAN_ADV_DEBUGFS + void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq); +#endif + void (*dump)(struct sk_buff *msg, struct netlink_callback *cb, + struct batadv_priv *priv); }; /** @@ -1458,6 +1498,7 @@ struct batadv_algo_orig_ops { * @iface: callbacks related to interface handling * @neigh: callbacks related to neighbors handling * @orig: callbacks related to originators handling + * @gw: callbacks related to GW mode */ struct batadv_algo_ops { struct hlist_node list; @@ -1465,6 +1506,7 @@ struct batadv_algo_ops { struct batadv_algo_iface_ops iface; struct batadv_algo_neigh_ops neigh; struct batadv_algo_orig_ops orig; + struct batadv_algo_gw_ops gw; }; /** @@ -1564,4 +1606,17 @@ enum batadv_tvlv_handler_flags { BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2), }; +/** + * struct batadv_store_mesh_work - Work queue item to detach add/del interface + * from sysfs locks + * @net_dev: netdevice to add/remove to/from batman-adv soft-interface + * @soft_iface_name: name of soft-interface to modify + * @work: work queue item + */ +struct batadv_store_mesh_work { + struct net_device *net_dev; + char soft_iface_name[IFNAMSIZ]; + struct work_struct work; +}; + #endif /* _NET_BATMAN_ADV_TYPES_H_ */ diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b5f729d08d2..1aff2da9bc74 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -26,11 +26,13 @@ #include <linux/module.h> #include <linux/debugfs.h> +#include <linux/stringify.h> #include <asm/ioctls.h> #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> +#include "leds.h" #include "selftest.h" /* Bluetooth sockets */ @@ -712,13 +714,16 @@ static struct net_proto_family
bt_sock_family_ops = { struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); +#define VERSION __stringify(BT_SUBSYS_VERSION) "." \ + __stringify(BT_SUBSYS_REVISION) + static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); - BT_INFO("Core ver %s", BT_SUBSYS_VERSION); + BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) @@ -726,6 +731,8 @@ static int __init bt_init(void) bt_debugfs = debugfs_create_dir("bluetooth", NULL); + bt_leds_init(); + err = bt_sysfs_init(); if (err < 0) return err; @@ -785,6 +792,8 @@ static void __exit bt_exit(void) bt_sysfs_cleanup(); + bt_leds_cleanup(); + debugfs_remove_recursive(bt_debugfs); } @@ -792,7 +801,7 @@ subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); -MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION); -MODULE_VERSION(BT_SUBSYS_VERSION); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index ddf8432fe8fb..3ac89e9ace71 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1562,6 +1562,7 @@ int hci_dev_do_close(struct hci_dev *hdev) auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF); if (!auto_off && hdev->dev_type == HCI_PRIMARY && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_MGMT)) __mgmt_power_off(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index b0e23dfc5c34..c8135680c43e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -971,14 +971,14 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +static u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) { - u8 ad_len = 0; size_t name_len; + int max_len; + max_len = HCI_MAX_AD_LENGTH - ad_len - 2; name_len = strlen(hdev->dev_name); - if (name_len > 0) { - size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2; + if (name_len > 0 && max_len > 0) { if (name_len > max_len) { name_len = max_len; @@ -997,22 +997,42 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) return ad_len; } +static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) +{ + return append_local_name(hdev, ptr, 0); +} + static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, u8 *ptr) { struct adv_info *adv_instance; + u32 instance_flags; + u8 scan_rsp_len = 0; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return 0; - /* TODO: Set the appropriate entries based on advertising instance flags - * here once flags other than 0 are supported. 
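
The TODO above is replaced in the following hunk by flag-driven assembly of the scan response: an appearance field is emitted first, then the instance's own scan response data, then (optionally) the shortened or complete local name. A rough user-space sketch of the resulting TLV layout; only the standard EIR type codes and the 31-byte HCI_MAX_AD_LENGTH limit are taken from the spec, everything else (including the sample appearance value) is illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EIR_NAME_SHORT    0x08 /* standard EIR/AD type codes */
#define EIR_NAME_COMPLETE 0x09
#define EIR_APPEARANCE    0x19
#define HCI_MAX_AD_LENGTH 31

static uint8_t append_appearance(uint8_t *ptr, uint16_t appearance)
{
	ptr[0] = 3; /* length byte covers type + two value bytes */
	ptr[1] = EIR_APPEARANCE;
	ptr[2] = appearance & 0xff; /* little endian, as on the wire */
	ptr[3] = appearance >> 8;
	return 4;
}

static uint8_t append_name(uint8_t *ptr, uint8_t used, const char *name)
{
	int max_len = HCI_MAX_AD_LENGTH - used - 2;
	size_t name_len = strlen(name);
	uint8_t type = EIR_NAME_COMPLETE;

	if (name_len == 0 || max_len <= 0)
		return 0; /* no room left, skip the name entirely */

	if (name_len > (size_t)max_len) {
		name_len = max_len; /* truncated names use the "short" type */
		type = EIR_NAME_SHORT;
	}

	ptr[0] = name_len + 1;
	ptr[1] = type;
	memcpy(ptr + 2, name, name_len);
	return name_len + 2;
}

int main(void)
{
	uint8_t buf[HCI_MAX_AD_LENGTH];
	uint8_t len = append_appearance(buf, 0x0341); /* sample value only */

	len += append_name(buf + len, len, "an-overly-long-device-name-here");
	for (uint8_t i = 0; i < len; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}
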
- */ + instance_flags = adv_instance->flags; + + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + ptr[0] = 3; + ptr[1] = EIR_APPEARANCE; + put_unaligned_le16(hdev->appearance, ptr + 2); + scan_rsp_len += 4; + ptr += 4; + } + memcpy(ptr, adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); - return adv_instance->scan_rsp_len; + scan_rsp_len += adv_instance->scan_rsp_len; + ptr += adv_instance->scan_rsp_len; + + if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) + scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); + + return scan_rsp_len; } void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) @@ -1194,7 +1214,7 @@ static void adv_timeout_expire(struct work_struct *work) hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, instance, false); + hci_req_clear_adv_instance(hdev, NULL, &req, instance, false); if (list_empty(&hdev->adv_instances)) __hci_req_disable_advertising(&req); @@ -1284,8 +1304,9 @@ static void cancel_adv_timeout(struct hci_dev *hdev) * setting. * - force == false: Only instances that have a timeout will be removed. */ -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force) +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force) { struct adv_info *adv_instance, *n, *next_instance = NULL; int err; @@ -1311,7 +1332,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, rem_inst = adv_instance->instance; err = hci_remove_adv_instance(hdev, rem_inst); if (!err) - mgmt_advertising_removed(NULL, hdev, rem_inst); + mgmt_advertising_removed(sk, hdev, rem_inst); } } else { adv_instance = hci_find_adv_instance(hdev, instance); @@ -1325,7 +1346,7 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, err = hci_remove_adv_instance(hdev, instance); if (!err) - mgmt_advertising_removed(NULL, hdev, instance); + mgmt_advertising_removed(sk, hdev, instance); } } @@ -1716,7 +1737,7 @@ void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, * function. To be safe hard-code one of the * values that's suitable for SCO. 
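
The one-line fix in the next hunk swaps a disconnect reason for an actual rejection reason: HCI_ERROR_REMOTE_LOW_RESOURCES is defined for a terminated connection, while Reject_Synchronous_Connection_Request expects a rejection code. For reference, the two constants as defined by the Bluetooth core specification and mirrored in the kernel's hci.h:

/* Bluetooth core specification error codes; values shown for reference. */
enum {
	HCI_ERROR_REJ_LIMITED_RESOURCES = 0x0d, /* valid in a reject command */
	HCI_ERROR_REMOTE_LOW_RESOURCES  = 0x14, /* a disconnect reason only */
};
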
*/ - rej.reason = HCI_ERROR_REMOTE_LOW_RESOURCES; + rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, sizeof(rej), &rej); diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index b2d044bdc732..ac1e11006f38 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -73,8 +73,9 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance); int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, bool force); -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req, - u8 instance, bool force); +void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, + struct hci_request *req, u8 instance, + bool force); void __hci_req_update_class(struct hci_request *req); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 96f04b7b9556..48f9471e7c85 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/utsname.h> +#include <linux/sched.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> @@ -38,6 +39,8 @@ static LIST_HEAD(mgmt_chan_list); static DEFINE_MUTEX(mgmt_chan_list_lock); +static DEFINE_IDA(sock_cookie_ida); + static atomic_t monitor_promisc = ATOMIC_INIT(0); /* ----- HCI socket interface ----- */ @@ -52,6 +55,8 @@ struct hci_pinfo { __u32 cmsg_mask; unsigned short channel; unsigned long flags; + __u32 cookie; + char comm[TASK_COMM_LEN]; }; void hci_sock_set_flag(struct sock *sk, int nr) @@ -74,6 +79,38 @@ unsigned short hci_sock_get_channel(struct sock *sk) return hci_pi(sk)->channel; } +u32 hci_sock_get_cookie(struct sock *sk) +{ + return hci_pi(sk)->cookie; +} + +static bool hci_sock_gen_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (!id) { + id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL); + if (id < 0) + id = 0xffffffff; + + hci_pi(sk)->cookie = id; + get_task_comm(hci_pi(sk)->comm, current); + return true; + } + + return false; +} + +static void hci_sock_free_cookie(struct sock *sk) +{ + int id = hci_pi(sk)->cookie; + + if (id) { + hci_pi(sk)->cookie = 0xffffffff; + ida_simple_remove(&sock_cookie_ida, id); + } +} + static inline int hci_test_bit(int nr, const void *addr) { return *((const __u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); @@ -305,6 +342,60 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(skb_copy); } +void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event, + void *data, u16 data_len, ktime_t tstamp, + int flag, struct sock *skip_sk) +{ + struct sock *sk; + __le16 index; + + if (hdev) + index = cpu_to_le16(hdev->id); + else + index = cpu_to_le16(MGMT_INDEX_NONE); + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + if (hci_pi(sk)->channel != HCI_CHANNEL_CONTROL) + continue; + + /* Ignore socket without the flag set */ + if (!hci_sock_test_flag(sk, flag)) + continue; + + /* Skip the original socket */ + if (sk == skip_sk) + continue; + + skb = bt_skb_alloc(6 + data_len, GFP_ATOMIC); + if (!skb) + continue; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(event, skb_put(skb, 2)); + + if (data) + memcpy(skb_put(skb, data_len), data, data_len); + + skb->tstamp = tstamp; + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + 
hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) { struct hci_mon_hdr *hdr; @@ -384,6 +475,129 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event) return skb; } +static struct sk_buff *create_monitor_ctrl_open(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + u16 format; + u8 ver[3]; + u32 flags; + + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: + format = 0x0000; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; + case HCI_CHANNEL_USER: + format = 0x0001; + ver[0] = BT_SUBSYS_VERSION; + put_unaligned_le16(BT_SUBSYS_REVISION, ver + 1); + break; + case HCI_CHANNEL_CONTROL: + format = 0x0002; + mgmt_fill_version_info(ver); + break; + default: + /* No message for unsupported format */ + return NULL; + } + + skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC); + if (!skb) + return NULL; + + flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(format, skb_put(skb, 2)); + memcpy(skb_put(skb, sizeof(ver)), ver, sizeof(ver)); + put_unaligned_le32(flags, skb_put(skb, 4)); + *skb_put(skb, 1) = TASK_COMM_LEN; + memcpy(skb_put(skb, TASK_COMM_LEN), hci_pi(sk)->comm, TASK_COMM_LEN); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_OPEN); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_close(struct sock *sk) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + /* No message needed when cookie is not present */ + if (!hci_pi(sk)->cookie) + return NULL; + + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: + case HCI_CHANNEL_CONTROL: + break; + default: + /* No message for unsupported format */ + return NULL; + } + + skb = bt_skb_alloc(4, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_CLOSE); + if (hci_pi(sk)->hdev) + hdr->index = cpu_to_le16(hci_pi(sk)->hdev->id); + else + hdr->index = cpu_to_le16(HCI_DEV_NONE); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + +static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index, + u16 opcode, u16 len, + const void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_COMMAND); + hdr->index = cpu_to_le16(index); + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + static void __printf(2, 3) send_monitor_note(struct sock *sk, const char *fmt, ...) 
{ @@ -458,6 +672,26 @@ static void send_monitor_replay(struct sock *sk) read_unlock(&hci_dev_list_lock); } +static void send_monitor_control_replay(struct sock *mon_sk) +{ + struct sock *sk; + + read_lock(&hci_sk_list.lock); + + sk_for_each(sk, &hci_sk_list.head) { + struct sk_buff *skb; + + skb = create_monitor_ctrl_open(sk); + if (!skb) + continue; + + if (sock_queue_rcv_skb(mon_sk, skb)) + kfree_skb(skb); + } + + read_unlock(&hci_sk_list.lock); +} + /* Generate internal stack event */ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) { @@ -585,6 +819,7 @@ static int hci_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct hci_dev *hdev; + struct sk_buff *skb; BT_DBG("sock %p sk %p", sock, sk); @@ -593,8 +828,24 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; - if (hci_pi(sk)->channel == HCI_CHANNEL_MONITOR) + switch (hci_pi(sk)->channel) { + case HCI_CHANNEL_MONITOR: atomic_dec(&monitor_promisc); + break; + case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: + case HCI_CHANNEL_CONTROL: + /* Send event to monitor */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + hci_sock_free_cookie(sk); + break; + } bt_sock_unlink(&hci_sk_list, sk); @@ -721,6 +972,27 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, goto done; } + /* When calling an ioctl on an unbound raw socket, ensure + * that the monitor gets informed. Ensure that the resulting event + * is only sent once by checking if the cookie exists or not. The + * socket cookie will only ever be generated once for the lifetime + * of a given socket. + */ + if (hci_sock_gen_cookie(sk)) { + struct sk_buff *skb; + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + release_sock(sk); switch (cmd) { @@ -784,6 +1056,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, struct sockaddr_hci haddr; struct sock *sk = sock->sk; struct hci_dev *hdev = NULL; + struct sk_buff *skb; int len, err = 0; BT_DBG("sock %p sk %p", sock, sk); @@ -822,7 +1095,35 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, atomic_inc(&hdev->promisc); } + hci_pi(sk)->channel = haddr.hci_channel; + + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * then an ioctl has already been issued against + * an unbound socket and with that triggered an open + * notification. Send a close notification first to + * allow the state transition to bound.
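
The cookie machinery used throughout these hunks boils down to an allocate-once pattern: zero means "no cookie yet", and the caller that actually allocates the identifier is the only one told so. A minimal kernel-style sketch mirroring the semantics of hci_sock_gen_cookie() above; the example_* names are hypothetical:

#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/types.h>

static DEFINE_IDA(example_cookie_ida);

struct example_sock {
	u32 cookie; /* 0 until the first event-generating operation */
};

/* Returns true only for the caller that allocated the cookie;
 * later callers find it set and just keep using it. */
static bool example_gen_cookie(struct example_sock *es)
{
	int id = es->cookie;

	if (id)
		return false;

	id = ida_simple_get(&example_cookie_ida, 1, 0, GFP_KERNEL);
	if (id < 0)
		id = 0xffffffff; /* fall back to a marker value, as the patch does */

	es->cookie = id;
	return true;
}
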
+ */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + + if (capable(CAP_NET_ADMIN)) + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->hdev = hdev; + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } break; case HCI_CHANNEL_USER: @@ -884,9 +1185,38 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } } - atomic_inc(&hdev->promisc); + hci_pi(sk)->channel = haddr.hci_channel; + + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been assigned, + * this socket will transition from a raw socket into + * a user channel socket. For a clean transition, send + * the close notification first. + */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + + /* The user channel is restricted to CAP_NET_ADMIN + * capabilities and with that implicitly trusted. + */ + hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); hci_pi(sk)->hdev = hdev; + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + + atomic_inc(&hdev->promisc); break; case HCI_CHANNEL_MONITOR: @@ -900,6 +1230,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + hci_pi(sk)->channel = haddr.hci_channel; + /* The monitor interface is restricted to CAP_NET_RAW * capabilities and with that implicitly trusted. */ @@ -908,9 +1240,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, send_monitor_note(sk, "Linux version %s (%s)", init_utsname()->release, init_utsname()->machine); - send_monitor_note(sk, "Bluetooth subsystem version %s", - BT_SUBSYS_VERSION); + send_monitor_note(sk, "Bluetooth subsystem version %u.%u", + BT_SUBSYS_VERSION, BT_SUBSYS_REVISION); send_monitor_replay(sk); + send_monitor_control_replay(sk); atomic_inc(&monitor_promisc); break; @@ -925,6 +1258,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = -EPERM; goto done; } + + hci_pi(sk)->channel = haddr.hci_channel; break; default: @@ -946,6 +1281,8 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, if (capable(CAP_NET_ADMIN)) hci_sock_set_flag(sk, HCI_SOCK_TRUSTED); + hci_pi(sk)->channel = haddr.hci_channel; + /* At the moment the index and unconfigured index events * are enabled unconditionally. Setting them on each * socket when binding keeps this functionality. They @@ -956,16 +1293,40 @@ * received by untrusted users. Example for such events * are changes to settings, class of device, name etc. */ - if (haddr.hci_channel == HCI_CHANNEL_CONTROL) { + if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) { + if (!hci_sock_gen_cookie(sk)) { + /* In the case when a cookie has already been + * assigned, this socket will transition from + * a raw socket into a control socket. To + * allow for a clean transition, send the + * close notification first.
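
Binding to HCI_CHANNEL_USER, as handled in the case above, is how user space takes exclusive ownership of a controller; with this patch such a bind now also emits open (and, on a re-bind, close) notifications to the monitor. A minimal sketch of the bind itself, assuming BlueZ's libbluetooth headers for the sockaddr_hci layout and channel constant (the kernel additionally requires CAP_NET_ADMIN and a powered-off device):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/hci.h>

int main(void)
{
	struct sockaddr_hci addr;
	int fd;

	fd = socket(AF_BLUETOOTH, SOCK_RAW, BTPROTO_HCI);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.hci_family = AF_BLUETOOTH;
	addr.hci_dev = 0;                    /* hci0 */
	addr.hci_channel = HCI_CHANNEL_USER; /* exclusive user channel */

	/* This bind is what triggers the monitor open notification. */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		fprintf(stderr, "bind: %s\n", strerror(errno));

	close(fd);
	return 0;
}
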
+ */ + skb = create_monitor_ctrl_close(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + + /* Send event to monitor */ + skb = create_monitor_ctrl_open(sk); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + hci_sock_set_flag(sk, HCI_MGMT_INDEX_EVENTS); hci_sock_set_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS); - hci_sock_set_flag(sk, HCI_MGMT_GENERIC_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_OPTION_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_SETTING_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); } break; } - - hci_pi(sk)->channel = haddr.hci_channel; sk->sk_state = BT_BOUND; done: @@ -1133,6 +1494,19 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, goto done; } + if (chan->channel == HCI_CHANNEL_CONTROL) { + struct sk_buff *skb; + + /* Send event to monitor */ + skb = create_monitor_ctrl_command(sk, index, opcode, len, + buf + sizeof(*hdr)); + if (skb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(skb); + } + } + if (opcode >= chan->handler_count || chan->handlers[opcode].func == NULL) { BT_DBG("Unknown op %u", opcode); @@ -1440,6 +1814,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -1523,6 +1900,9 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); + if (level != SOL_HCI) + return -ENOPROTOOPT; + if (get_user(len, optlen)) return -EFAULT; diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c index 8319c8440c89..cb670b5594eb 100644 --- a/net/bluetooth/leds.c +++ b/net/bluetooth/leds.c @@ -11,6 +11,8 @@ #include "leds.h" +DEFINE_LED_TRIGGER(bt_power_led_trigger); + struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; @@ -24,6 +26,21 @@ void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); + + if (!enabled) { + struct hci_dev *d; + + read_lock(&hci_dev_list_lock); + + list_for_each_entry(d, &hci_dev_list, list) { + if (test_bit(HCI_UP, &d->flags)) + enabled = true; + } + + read_unlock(&hci_dev_list_lock); + } + + led_trigger_event(bt_power_led_trigger, enabled ? 
LED_FULL : LED_OFF); } static void power_activate(struct led_classdev *led_cdev) @@ -72,3 +89,13 @@ void hci_leds_init(struct hci_dev *hdev) /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } + +void bt_leds_init(void) +{ + led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); +} + +void bt_leds_cleanup(void) +{ + led_trigger_unregister_simple(bt_power_led_trigger); +} diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h index a9c4d6ea01cf..08725a2fbd9b 100644 --- a/net/bluetooth/leds.h +++ b/net/bluetooth/leds.h @@ -7,10 +7,20 @@ */ #if IS_ENABLED(CONFIG_BT_LEDS) + void hci_leds_update_powered(struct hci_dev *hdev, bool enabled); void hci_leds_init(struct hci_dev *hdev); + +void bt_leds_init(void); +void bt_leds_cleanup(void); + #else + static inline void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) {} static inline void hci_leds_init(struct hci_dev *hdev) {} + +static inline void bt_leds_init(void) {} +static inline void bt_leds_cleanup(void) {} + #endif diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7639290b6de3..19b8a5e9420d 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 13 +#define MGMT_REVISION 14 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -104,6 +104,8 @@ static const u16 mgmt_commands[] = { MGMT_OP_REMOVE_ADVERTISING, MGMT_OP_GET_ADV_SIZE_INFO, MGMT_OP_START_LIMITED_DISCOVERY, + MGMT_OP_READ_EXT_INFO, + MGMT_OP_SET_APPEARANCE, }; static const u16 mgmt_events[] = { @@ -141,6 +143,7 @@ static const u16 mgmt_events[] = { MGMT_EV_LOCAL_OOB_DATA_UPDATED, MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -149,6 +152,7 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_UNCONF_INDEX_LIST, MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, + MGMT_OP_READ_EXT_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -162,6 +166,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_NEW_CONFIG_OPTIONS, MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, + MGMT_EV_EXT_INFO_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -256,13 +261,6 @@ static int mgmt_limited_event(u16 event, struct hci_dev *hdev, void *data, flag, skip_sk); } -static int mgmt_generic_event(u16 event, struct hci_dev *hdev, void *data, - u16 len, struct sock *skip_sk) -{ - return mgmt_send_event(event, hdev, HCI_CHANNEL_CONTROL, data, len, - HCI_MGMT_GENERIC_EVENTS, skip_sk); -} - static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len, struct sock *skip_sk) { @@ -278,6 +276,14 @@ static u8 le_addr_type(u8 mgmt_addr_type) return ADDR_LE_DEV_RANDOM; } +void mgmt_fill_version_info(void *ver) +{ + struct mgmt_rp_read_version *rp = ver; + + rp->version = MGMT_VERSION; + rp->revision = cpu_to_le16(MGMT_REVISION); +} + static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -285,8 +291,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG("sock %p", sk); - rp.version = MGMT_VERSION; - rp.revision = cpu_to_le16(MGMT_REVISION); + mgmt_fill_version_info(&rp); return mgmt_cmd_complete(sk, MGMT_INDEX_NONE, MGMT_OP_READ_VERSION, 0, &rp, sizeof(rp)); @@ -572,8 +577,8 @@ static int new_options(struct hci_dev *hdev, struct sock *skip) { __le32 options = get_missing_options(hdev); - return 
mgmt_generic_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, - sizeof(options), skip); + return mgmt_limited_event(MGMT_EV_NEW_CONFIG_OPTIONS, hdev, &options, + sizeof(options), HCI_MGMT_OPTION_EVENTS, skip); } static int send_options_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) @@ -862,6 +867,107 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, sizeof(rp)); } +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, + u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} + +static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) +{ + u16 eir_len = 0; + size_t name_len; + + if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + eir_len = eir_append_data(eir, eir_len, EIR_CLASS_OF_DEV, + hdev->dev_class, 3); + + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE, + hdev->appearance); + + name_len = strlen(hdev->dev_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, + hdev->dev_name, name_len); + + name_len = strlen(hdev->short_name); + eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT, + hdev->short_name, name_len); + + return eir_len; +} + +static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[512]; + struct mgmt_rp_read_ext_info *rp = (void *)buf; + u16 eir_len; + + BT_DBG("sock %p %s", sk, hdev->name); + + memset(&buf, 0, sizeof(buf)); + + hci_dev_lock(hdev); + + bacpy(&rp->bdaddr, &hdev->bdaddr); + + rp->version = hdev->hci_ver; + rp->manufacturer = cpu_to_le16(hdev->manufacturer); + + rp->supported_settings = cpu_to_le32(get_supported_settings(hdev)); + rp->current_settings = cpu_to_le32(get_current_settings(hdev)); + + + eir_len = append_eir_data_to_buf(hdev, rp->eir); + rp->eir_len = cpu_to_le16(eir_len); + + hci_dev_unlock(hdev); + + /* If this command is called at least once, then the events + * for class of device and local name changes are disabled + * and only the new extended controller information event + * is used. 
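
The eir_append_data()/eir_append_le16() helpers above implement the usual length/type/value framing, where the leading length byte counts the type byte plus the value bytes. A standalone replica of that framing under the same assumptions (the EIR type codes are the standard ones; the sample class-of-device bytes are made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t eir_append_data(uint8_t *eir, uint16_t eir_len, uint8_t type,
				const uint8_t *data, uint8_t data_len)
{
	eir[eir_len++] = 1 + data_len; /* length byte covers type + value */
	eir[eir_len++] = type;
	memcpy(&eir[eir_len], data, data_len);
	return eir_len + data_len;
}

static uint16_t eir_append_le16(uint8_t *eir, uint16_t eir_len, uint8_t type,
				uint16_t data)
{
	eir[eir_len++] = 1 + 2;
	eir[eir_len++] = type;
	eir[eir_len++] = data & 0xff; /* little endian */
	eir[eir_len++] = data >> 8;
	return eir_len;
}

int main(void)
{
	uint8_t eir[512] = { 0 };
	uint16_t len = 0;
	const uint8_t cod[3] = { 0x0c, 0x02, 0x5a }; /* sample class of device */

	len = eir_append_data(eir, len, 0x0d /* EIR_CLASS_OF_DEV */, cod, 3);
	len = eir_append_le16(eir, len, 0x19 /* EIR_APPEARANCE */, 0x0080);
	len = eir_append_data(eir, len, 0x09 /* EIR_NAME_COMPLETE */,
			      (const uint8_t *)"example", 7);

	for (uint16_t i = 0; i < len; i++)
		printf("%02x ", eir[i]);
	printf("\n");
	return 0;
}
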
+ */ + hci_sock_set_flag(sk, HCI_MGMT_EXT_INFO_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_DEV_CLASS_EVENTS); + hci_sock_clear_flag(sk, HCI_MGMT_LOCAL_NAME_EVENTS); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_EXT_INFO, 0, rp, + sizeof(*rp) + eir_len); +} + +static int ext_info_changed(struct hci_dev *hdev, struct sock *skip) +{ + char buf[512]; + struct mgmt_ev_ext_info_changed *ev = (void *)buf; + u16 eir_len; + + memset(buf, 0, sizeof(buf)); + + eir_len = append_eir_data_to_buf(hdev, ev->eir); + ev->eir_len = cpu_to_le16(eir_len); + + return mgmt_limited_event(MGMT_EV_EXT_INFO_CHANGED, hdev, ev, + sizeof(*ev) + eir_len, + HCI_MGMT_EXT_INFO_EVENTS, skip); +} + static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) { __le32 settings = cpu_to_le32(get_current_settings(hdev)); @@ -922,7 +1028,7 @@ static int clean_up_hci_state(struct hci_dev *hdev) hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); } - hci_req_clear_adv_instance(hdev, NULL, 0x00, false); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false); if (hci_dev_test_flag(hdev, HCI_LE_ADV)) __hci_req_disable_advertising(&req); @@ -1000,8 +1106,8 @@ static int new_settings(struct hci_dev *hdev, struct sock *skip) { __le32 ev = cpu_to_le32(get_current_settings(hdev)); - return mgmt_generic_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, - sizeof(ev), skip); + return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev, + sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip); } int mgmt_new_settings(struct hci_dev *hdev) @@ -1690,7 +1796,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) enabled = lmp_host_le_capable(hdev); if (!val) - hci_req_clear_adv_instance(hdev, NULL, 0x00, true); + hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true); if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -2435,6 +2541,8 @@ static int send_pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, if (!cmd) return -ENOMEM; + cmd->cmd_complete = addr_cmd_complete; + err = hci_send_cmd(hdev, HCI_OP_PIN_CODE_NEG_REPLY, sizeof(cp->addr.bdaddr), &cp->addr.bdaddr); if (err < 0) @@ -2513,8 +2621,8 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, BT_DBG(""); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) - return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, - MGMT_STATUS_INVALID_PARAMS, NULL, 0); + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, + MGMT_STATUS_INVALID_PARAMS); hci_dev_lock(hdev); @@ -2932,6 +3040,35 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, HCI_OP_USER_PASSKEY_NEG_REPLY, 0); } +static void adv_expire(struct hci_dev *hdev, u32 flags) +{ + struct adv_info *adv_instance; + struct hci_request req; + int err; + + adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + if (!adv_instance) + return; + + /* stop if current instance doesn't need to be changed */ + if (!(adv_instance->flags & flags)) + return; + + cancel_adv_timeout(hdev); + + adv_instance = hci_get_next_instance(hdev, adv_instance->instance); + if (!adv_instance) + return; + + hci_req_init(&req, hdev); + err = __hci_req_schedule_adv_instance(&req, adv_instance->instance, + true); + if (err) + return; + + hci_req_run(&req, NULL); +} + static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_cp_set_local_name *cp; @@ -2947,13 +3084,17 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) cp = cmd->param; - if (status) + if (status) { 
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, mgmt_status(status)); - else + } else { mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0, cp, sizeof(*cp)); + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME); + } + mgmt_pending_remove(cmd); unlock: @@ -2993,8 +3134,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, if (err < 0) goto failed; - err = mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, - data, len, sk); + err = mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, data, + len, HCI_MGMT_LOCAL_NAME_EVENTS, sk); + ext_info_changed(hdev, sk); goto failed; } @@ -3017,7 +3159,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, /* The name is stored in the scan response data and so * no need to update the advertising data here. */ - if (lmp_le_capable(hdev)) + if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING)) __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance); err = hci_req_run(&req, set_name_complete); @@ -3029,6 +3171,40 @@ failed: return err; } +static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, + u16 len) +{ + struct mgmt_cp_set_appearance *cp = data; + u16 apperance; + int err; + + BT_DBG(""); + + if (!lmp_le_capable(hdev)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, + MGMT_STATUS_NOT_SUPPORTED); + + apperance = le16_to_cpu(cp->appearance); + + hci_dev_lock(hdev); + + if (hdev->appearance != apperance) { + hdev->appearance = apperance; + + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) + adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE); + + ext_info_changed(hdev, sk); + } + + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_APPEARANCE, 0, NULL, + 0); + + hci_dev_unlock(hdev); + + return err; +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -4869,7 +5045,7 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) int err; memset(&rp, 0, sizeof(rp)); - memcpy(&rp.addr, &cmd->param, sizeof(rp.addr)); + memcpy(&rp.addr, cmd->param, sizeof(rp.addr)); if (status) goto complete; @@ -5501,17 +5677,6 @@ unlock: return err; } -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, u8 *data, - u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -5815,6 +5980,8 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_FLAG_DISCOV; flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; flags |= MGMT_ADV_FLAG_MANAGED_FLAGS; + flags |= MGMT_ADV_FLAG_APPEARANCE; + flags |= MGMT_ADV_FLAG_LOCAL_NAME; if (hdev->adv_tx_power != HCI_TX_POWER_INVALID) flags |= MGMT_ADV_FLAG_TX_POWER; @@ -5871,28 +6038,59 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, return err; } -static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, - u8 len, bool is_adv_data) +static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) { u8 max_len = HCI_MAX_AD_LENGTH; - int i, cur_len; - bool flags_managed = false; - bool tx_power_managed = false; if (is_adv_data) { if (adv_flags & (MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) { - flags_managed = true; + MGMT_ADV_FLAG_MANAGED_FLAGS)) max_len -= 3; - } - if (adv_flags &
MGMT_ADV_FLAG_TX_POWER) { - tx_power_managed = true; + if (adv_flags & MGMT_ADV_FLAG_TX_POWER) max_len -= 3; - } + } else { + /* at least 1 byte of name should fit in */ + if (adv_flags & MGMT_ADV_FLAG_LOCAL_NAME) + max_len -= 3; + + if (adv_flags & (MGMT_ADV_FLAG_APPEARANCE)) + max_len -= 4; } + return max_len; +} + +static bool flags_managed(u32 adv_flags) +{ + return adv_flags & (MGMT_ADV_FLAG_DISCOV | + MGMT_ADV_FLAG_LIMITED_DISCOV | + MGMT_ADV_FLAG_MANAGED_FLAGS); +} + +static bool tx_power_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_TX_POWER; +} + +static bool name_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_LOCAL_NAME; +} + +static bool appearance_managed(u32 adv_flags) +{ + return adv_flags & MGMT_ADV_FLAG_APPEARANCE; +} + +static bool tlv_data_is_valid(u32 adv_flags, u8 *data, u8 len, bool is_adv_data) +{ + int i, cur_len; + u8 max_len; + + max_len = tlv_data_max_len(adv_flags, is_adv_data); + if (len > max_len) return false; @@ -5900,10 +6098,21 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data, for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) { cur_len = data[i]; - if (flags_managed && data[i + 1] == EIR_FLAGS) + if (data[i + 1] == EIR_FLAGS && + (!is_adv_data || flags_managed(adv_flags))) + return false; + + if (data[i + 1] == EIR_TX_POWER && tx_power_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_NAME_COMPLETE && name_managed(adv_flags)) return false; - if (tx_power_managed && data[i + 1] == EIR_TX_POWER) + if (data[i + 1] == EIR_NAME_SHORT && name_managed(adv_flags)) + return false; + + if (data[i + 1] == EIR_APPEARANCE && + appearance_managed(adv_flags)) return false; /* If the current field length would exceed the total data @@ -6027,8 +6236,8 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, goto unlock; } - if (!tlv_data_is_valid(hdev, flags, cp->data, cp->adv_data_len, true) || - !tlv_data_is_valid(hdev, flags, cp->data + cp->adv_data_len, + if (!tlv_data_is_valid(flags, cp->data, cp->adv_data_len, true) || + !tlv_data_is_valid(flags, cp->data + cp->adv_data_len, cp->scan_rsp_len, false)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_INVALID_PARAMS); @@ -6175,7 +6384,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, hci_req_init(&req, hdev); - hci_req_clear_adv_instance(hdev, &req, cp->instance, true); + hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true); if (list_empty(&hdev->adv_instances)) __hci_req_disable_advertising(&req); @@ -6211,23 +6420,6 @@ unlock: return err; } -static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data) -{ - u8 max_len = HCI_MAX_AD_LENGTH; - - if (is_adv_data) { - if (adv_flags & (MGMT_ADV_FLAG_DISCOV | - MGMT_ADV_FLAG_LIMITED_DISCOV | - MGMT_ADV_FLAG_MANAGED_FLAGS)) - max_len -= 3; - - if (adv_flags & MGMT_ADV_FLAG_TX_POWER) - max_len -= 3; - } - - return max_len; -} - static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -6356,6 +6548,9 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE }, { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE }, { start_limited_discovery, MGMT_START_DISCOVERY_SIZE }, + { read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, + { set_appearance, MGMT_SET_APPEARANCE_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -6494,9 +6689,12 @@ void __mgmt_power_off(struct hci_dev *hdev) mgmt_pending_foreach(0, hdev, 
cmd_complete_rsp, &status); - if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - zero_cod, sizeof(zero_cod), NULL); + if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, + zero_cod, sizeof(zero_cod), + HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } new_settings(hdev, match.sk); @@ -7092,9 +7290,11 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); - if (!status) - mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, - dev_class, 3, NULL); + if (!status) { + mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, + 3, HCI_MGMT_DEV_CLASS_EVENTS, NULL); + ext_info_changed(hdev, NULL); + } if (match.sk) sock_put(match.sk); @@ -7123,8 +7323,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status) return; } - mgmt_generic_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), - cmd ? cmd->sk : NULL); + mgmt_limited_event(MGMT_EV_LOCAL_NAME_CHANGED, hdev, &ev, sizeof(ev), + HCI_MGMT_LOCAL_NAME_EVENTS, cmd ? cmd->sk : NULL); + ext_info_changed(hdev, cmd ? cmd->sk : NULL); } static inline bool has_uuid(u8 *uuid, u16 uuid_count, u8 (*uuids)[16]) diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 8c30c7eb8bef..c933bd08c1fe 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -21,12 +21,41 @@ SOFTWARE IS DISCLAIMED. */ +#include <asm/unaligned.h> + #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> +#include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" +static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, + u16 opcode, u16 len, void *buf) +{ + struct hci_mon_hdr *hdr; + struct sk_buff *skb; + + skb = bt_skb_alloc(6 + len, GFP_ATOMIC); + if (!skb) + return NULL; + + put_unaligned_le32(cookie, skb_put(skb, 4)); + put_unaligned_le16(opcode, skb_put(skb, 2)); + + if (buf) + memcpy(skb_put(skb, len), buf, len); + + __net_timestamp(skb); + + hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE); + hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); + hdr->index = index; + hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); + + return skb; +} + int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { @@ -52,14 +81,18 @@ int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, __net_timestamp(skb); hci_send_to_channel(channel, skb, flag, skip_sk); - kfree_skb(skb); + if (channel == HCI_CHANNEL_CONTROL) + hci_send_monitor_ctrl_event(hdev, event, data, data_len, + skb_get_ktime(skb), flag, skip_sk); + + kfree_skb(skb); return 0; } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; @@ -80,17 +113,30 @@ int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) ev->status = status; ev->opcode = cpu_to_le16(cmd); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_STATUS, sizeof(*ev), ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, 
mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { - struct sk_buff *skb; + struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; @@ -114,10 +160,24 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, if (rp) memcpy(ev->data, rp, rp_len); + mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), + MGMT_EV_CMD_COMPLETE, + sizeof(*ev) + rp_len, ev); + if (mskb) + skb->tstamp = mskb->tstamp; + else + __net_timestamp(skb); + err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); + if (mskb) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, + HCI_SOCK_TRUSTED, NULL); + kfree_skb(mskb); + } + return err; } diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 4c1a16a96ae5..43faf2aea2ab 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -3387,7 +3387,10 @@ int smp_register(struct hci_dev *hdev) if (!lmp_sc_capable(hdev)) { debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs, hdev, &force_bredr_smp_fops); - return 0; + + /* Flag can be already set here (due to power toggle) */ + if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP)) + return 0; } if (WARN_ON(hdev->smp_bredr_data)) { diff --git a/net/bridge/Makefile b/net/bridge/Makefile index a1cda5d4718d..0aefc011b668 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -20,4 +20,6 @@ bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o +bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o + obj-$(CONFIG_NETFILTER) += netfilter/ diff --git a/net/bridge/br.c b/net/bridge/br.c index 3addc05b9a16..889e5640455f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -227,9 +227,11 @@ static int __init br_init(void) br_fdb_test_addr_hook = br_fdb_test_addr; #endif - pr_info("bridge: automatic filtering via arp/ip/ip6tables has been " - "deprecated. Update your scripts to load br_netfilter if you " +#if IS_MODULE(CONFIG_BRIDGE_NETFILTER) + pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " + "by default. 
Update your scripts to load br_netfilter if you " "need this.\n"); +#endif return 0; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 09f26940aba5..89a687f3c0a3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -62,10 +62,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) goto out; if (is_broadcast_ether_addr(dest)) { - br_flood(br, skb, false, false, true); + br_flood(br, skb, BR_PKT_BROADCAST, false, true); } else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { - br_flood(br, skb, false, false, true); + br_flood(br, skb, BR_PKT_MULTICAST, false, true); goto out; } if (br_multicast_rcv(br, NULL, skb, vid)) { @@ -78,11 +78,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_multicast_querier_exists(br, eth_hdr(skb))) br_multicast_flood(mdst, skb, false, true); else - br_flood(br, skb, false, false, true); + br_flood(br, skb, BR_PKT_MULTICAST, false, true); } else if ((dst = __br_fdb_get(br, dest, vid)) != NULL) { br_forward(dst->dst, skb, false, true); } else { - br_flood(br, skb, true, false, true); + br_flood(br, skb, BR_PKT_UNICAST, false, true); } out: rcu_read_unlock(); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index cd620fab41b0..6b43c8c88f19 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -710,24 +710,27 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { struct net_bridge *br = netdev_priv(dev); + int err = 0; int i; if (!(dev->priv_flags & IFF_EBRIDGE)) goto out; - if (!filter_dev) - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (!filter_dev) { + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (err < 0) + goto out; + } for (i = 0; i < BR_HASH_SIZE; i++) { struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { - int err; - if (idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; if (filter_dev && @@ -750,17 +753,15 @@ int br_fdb_dump(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI); - if (err < 0) { - cb->args[1] = err; - break; - } + if (err < 0) + goto out; skip: - ++idx; + *idx += 1; } } out: - return idx; + return err; } /* Update (create or replace) forwarding database entry */ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 63a83d8d7da3..7cb41aee4c82 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -29,7 +29,8 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING; + br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING && + nbp_switchdev_allowed_egress(p, skb); } int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -175,7 +176,7 @@ out: /* called under rcu_read_lock */ void br_flood(struct net_bridge *br, struct sk_buff *skb, - bool unicast, bool local_rcv, bool local_orig) + enum br_pkt_type pkt_type, bool local_rcv, bool local_orig) { u8 igmp_type = br_multicast_igmp_type(skb); struct net_bridge_port *prev = NULL; @@ -183,7 +184,10 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, list_for_each_entry_rcu(p, &br->port_list, list) { /* Do not flood unicast traffic to ports that turn it off */ - if (unicast && !(p->flags & BR_FLOOD)) + if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) + continue; + if (pkt_type 
== BR_PKT_MULTICAST && + !(p->flags & BR_MCAST_FLOOD)) continue; /* Do not flood to ports that enable proxy ARP */ diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f2fede05d32c..ed0dd3340084 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -362,7 +362,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING | BR_FLOOD; + p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); @@ -545,6 +545,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err5; + err = nbp_switchdev_mark_set(p); + if (err) + goto err6; + dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -566,7 +570,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) err = nbp_vlan_init(p); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err6; + goto err7; } spin_lock_bh(&br->lock); @@ -589,12 +593,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return 0; -err6: +err7: list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); +err6: netdev_upper_dev_unlink(dev, br->dev); - err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index abe11f085479..855b72fbe1da 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -128,11 +128,12 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - bool local_rcv = false, mcast_hit = false, unicast = true; struct net_bridge_port *p = br_port_get_rcu(skb->dev); const unsigned char *dest = eth_hdr(skb)->h_dest; + enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; + bool local_rcv, mcast_hit = false; struct net_bridge *br; u16 vid = 0; @@ -142,29 +143,36 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid)) goto out; + nbp_switchdev_frame_mark(p, skb); + /* insert into forwarding database after filtering to avoid spoofing */ br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); - if (!is_broadcast_ether_addr(dest) && is_multicast_ether_addr(dest) && - br_multicast_rcv(br, p, skb, vid)) - goto drop; + local_rcv = !!(br->dev->flags & IFF_PROMISC); + if (is_multicast_ether_addr(dest)) { + /* by definition the broadcast is also a multicast address */ + if (is_broadcast_ether_addr(dest)) { + pkt_type = BR_PKT_BROADCAST; + local_rcv = true; + } else { + pkt_type = BR_PKT_MULTICAST; + if (br_multicast_rcv(br, p, skb, vid)) + goto drop; + } + } if (p->state == BR_STATE_LEARNING) goto drop; BR_INPUT_SKB_CB(skb)->brdev = br->dev; - local_rcv = !!(br->dev->flags & IFF_PROMISC); - if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) br_do_proxy_arp(skb, br, vid, p); - if (is_broadcast_ether_addr(dest)) { - local_rcv = true; - unicast = false; - } else if (is_multicast_ether_addr(dest)) { + switch (pkt_type) { + case BR_PKT_MULTICAST: mdst = br_mdb_get(br, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(br, eth_hdr(skb))) { @@ -178,18 +186,22 @@ int 
br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb local_rcv = true; br->dev->stats.multicast++; } - unicast = false; - } else if ((dst = __br_fdb_get(br, dest, vid)) && dst->is_local) { - /* Do not forward the packet since it's local. */ - return br_pass_frame_up(skb); + break; + case BR_PKT_UNICAST: + dst = __br_fdb_get(br, dest, vid); + default: + break; } if (dst) { + if (dst->is_local) + return br_pass_frame_up(skb); + dst->used = jiffies; br_forward(dst->dst, skb, local_rcv, false); } else { if (!mcast_hit) - br_flood(br, skb, unicast, local_rcv, false); + br_flood(br, skb, pkt_type, local_rcv, false); else br_multicast_flood(mdst, skb, local_rcv, false); } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 77e7f69bf80d..2fe9345c1407 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -30,6 +30,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_arp.h> #include <linux/in_route.h> +#include <linux/rculist.h> #include <linux/inetdevice.h> #include <net/ip.h> @@ -395,11 +396,10 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, - NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, + NULL, + br_nf_pre_routing_finish); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -417,10 +417,8 @@ bridged_dnat: skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); - + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL, + br_handle_frame_finish); return 0; } @@ -992,6 +990,43 @@ static struct notifier_block brnf_notifier __read_mostly = { .notifier_call = brnf_device_event, }; +/* recursively invokes nf_hook_slow (again), skipping already-called + * hooks (< NF_BR_PRI_BRNF). + * + * Called with rcu read lock held. 
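
br_nf_hook_thresh(), completed in the next hunk, re-enters the bridge PRE_ROUTING chain while skipping every hook that already ran, i.e. everything with a priority up to and including NF_BR_PRI_BRNF (0 in enum nf_br_hook_priorities). A toy model of that skip-then-resume walk over a priority-sorted hook list; all toy_* names are hypothetical:

#include <stdio.h>

struct toy_hook {
	int priority;
	const char *name;
	struct toy_hook *next;
};

#define TOY_PRI_BRNF 0

static void run_remaining(struct toy_hook *head)
{
	/* Skip entries that were already called earlier in this round. */
	while (head && head->priority <= TOY_PRI_BRNF)
		head = head->next;

	for (; head; head = head->next)
		printf("invoking %s (priority %d)\n", head->name, head->priority);
}

int main(void)
{
	struct toy_hook filter = { -200, "ebtables filter", NULL };
	struct toy_hook brnf   = { 0, "br_netfilter", NULL };
	struct toy_hook later  = { 200, "post-brnf hook", NULL };

	filter.next = &brnf;
	brnf.next = &later;
	run_remaining(&filter); /* only "post-brnf hook" runs again */
	return 0;
}
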
+ */ +int br_nf_hook_thresh(unsigned int hook, struct net *net, + struct sock *sk, struct sk_buff *skb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct net *, struct sock *, + struct sk_buff *)) +{ + struct nf_hook_entry *elem; + struct nf_hook_state state; + int ret; + + elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); + + while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) + elem = rcu_dereference(elem->next); + + if (!elem) + return okfn(net, sk, skb); + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, + NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); + + ret = nf_hook_slow(skb, &state); + rcu_read_unlock(); + if (ret == 1) + ret = okfn(net, sk, skb); + + return ret; +} + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 5e59a8457e7b..5989661c659f 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -187,10 +187,9 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, - net, sk, skb, skb->dev, NULL, - br_nf_pre_routing_finish_bridge, - 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, + net, sk, skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge); return 0; } ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr); @@ -207,9 +206,8 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc skb->dev = nf_bridge->physindev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); - NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, net, sk, skb, - skb->dev, NULL, - br_handle_frame_finish, 1); + br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, + skb->dev, NULL, br_handle_frame_finish); return 0; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f2a29e467e78..e99037c6f7b7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -169,10 +169,15 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u32(skb, IFLA_BRPORT_COST, p->path_cost) || nla_put_u8(skb, IFLA_BRPORT_MODE, mode) || nla_put_u8(skb, IFLA_BRPORT_GUARD, !!(p->flags & BR_BPDU_GUARD)) || - nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || - nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || + nla_put_u8(skb, IFLA_BRPORT_PROTECT, + !!(p->flags & BR_ROOT_BLOCK)) || + nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, + !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || - nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, + !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, + !!(p->flags & BR_MCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -630,6 +635,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, 
IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); @@ -1245,14 +1251,30 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } -static size_t bridge_get_linkxstats_size(const struct net_device *dev) +static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) { - struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; + struct net_bridge *br; int numvls = 0; - vg = br_vlan_group(br); + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + br = netdev_priv(dev); + vg = br_vlan_group(br); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + p = br_port_get_rtnl(dev); + if (!p) + return 0; + br = p->br; + vg = nbp_vlan_group(p); + break; + default: + return 0; + } + if (vg) { /* we need to count all, even placeholder entries */ list_for_each_entry(v, &vg->vlan_list, vlist) @@ -1264,45 +1286,42 @@ static size_t bridge_get_linkxstats_size(const struct net_device *dev) nla_total_size(0); } -static size_t brport_get_linkxstats_size(const struct net_device *dev) -{ - return nla_total_size(sizeof(struct br_mcast_stats)) + - nla_total_size(0); -} - -static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) +static int br_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx, int attr) { - size_t retsize = 0; + struct nlattr *nla __maybe_unused; + struct net_bridge_port *p = NULL; + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge *br; + struct nlattr *nest; + int vl_idx = 0; switch (attr) { case IFLA_STATS_LINK_XSTATS: - retsize = bridge_get_linkxstats_size(dev); + br = netdev_priv(dev); + vg = br_vlan_group(br); break; case IFLA_STATS_LINK_XSTATS_SLAVE: - retsize = brport_get_linkxstats_size(dev); + p = br_port_get_rtnl(dev); + if (!p) + return 0; + br = p->br; + vg = nbp_vlan_group(p); break; + default: + return -EINVAL; } - return retsize; -} - -static int bridge_fill_linkxstats(struct sk_buff *skb, - const struct net_device *dev, - int *prividx) -{ - struct net_bridge *br = netdev_priv(dev); - struct nlattr *nla __maybe_unused; - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *v; - struct nlattr *nest; - int vl_idx = 0; - nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); if (!nest) return -EMSGSIZE; - vg = br_vlan_group(br); if (vg) { + u16 pvid; + + pvid = br_get_pvid(vg); list_for_each_entry(v, &vg->vlan_list, vlist) { struct bridge_vlan_xstats vxi; struct br_vlan_stats stats; @@ -1311,6 +1330,9 @@ static int bridge_fill_linkxstats(struct sk_buff *skb, continue; memset(&vxi, 0, sizeof(vxi)); vxi.vid = v->vid; + vxi.flags = v->flags; + if (v->vid == pvid) + vxi.flags |= BRIDGE_VLAN_INFO_PVID; br_vlan_get_stats(v, &stats); vxi.rx_bytes = stats.rx_bytes; vxi.rx_packets = stats.rx_packets; @@ -1329,7 +1351,7 @@ static int bridge_fill_linkxstats(struct sk_buff *skb, BRIDGE_XSTATS_PAD); if (!nla) goto nla_put_failure; - br_multicast_get_stats(br, NULL, nla_data(nla)); + br_multicast_get_stats(br, p, nla_data(nla)); } #endif nla_nest_end(skb, nest); @@ -1344,52 +1366,6 @@ nla_put_failure: return -EMSGSIZE; } -static int brport_fill_linkxstats(struct sk_buff *skb, - const struct net_device *dev, - int *prividx) -{ - struct net_bridge_port *p = br_port_get_rtnl(dev); - struct nlattr *nla __maybe_unused; - struct nlattr *nest; - - if (!p) - return 0; - - nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); - if (!nest) - return 
-EMSGSIZE; -#ifdef CONFIG_BRIDGE_IGMP_SNOOPING - nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_MCAST, - sizeof(struct br_mcast_stats), - BRIDGE_XSTATS_PAD); - if (!nla) { - nla_nest_end(skb, nest); - return -EMSGSIZE; - } - br_multicast_get_stats(p->br, p, nla_data(nla)); -#endif - nla_nest_end(skb, nest); - - return 0; -} - -static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, - int *prividx, int attr) -{ - int ret = -EINVAL; - - switch (attr) { - case IFLA_STATS_LINK_XSTATS: - ret = bridge_fill_linkxstats(skb, dev, prividx); - break; - case IFLA_STATS_LINK_XSTATS_SLAVE: - ret = brport_fill_linkxstats(skb, dev, prividx); - break; - } - - return ret; -} - static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, .get_link_af_size = br_get_link_af_size_filtered, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aac2a6e6b008..1b63177e0ccd 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,6 +251,9 @@ struct net_bridge_port #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; #endif +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -359,6 +362,11 @@ struct net_bridge struct timer_list gc_timer; struct kobject *ifobj; u32 auto_cnt; + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif + #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; @@ -381,6 +389,10 @@ struct br_input_skb_cb { #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool vlan_filtered; #endif + +#ifdef CONFIG_NET_SWITCHDEV + int offload_fwd_mark; +#endif }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -496,7 +508,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 nlh_flags); int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, struct net_device *fdev, int idx); + struct net_device *dev, struct net_device *fdev, int *idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, @@ -505,12 +517,17 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); /* br_forward.c */ +enum br_pkt_type { + BR_PKT_UNICAST, + BR_PKT_MULTICAST, + BR_PKT_BROADCAST +}; int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb); void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, bool local_rcv, bool local_orig); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb); void br_flood(struct net_bridge *br, struct sk_buff *skb, - bool unicast, bool local_rcv, bool local_orig); + enum br_pkt_type pkt_type, bool local_rcv, bool local_orig); /* br_if.c */ void br_port_carrier_check(struct net_bridge_port *p); @@ -1034,4 +1051,29 @@ static inline int br_sysfs_addbr(struct net_device *dev) { return 0; } static inline void br_sysfs_delbr(struct net_device *dev) { return; } #endif /* CONFIG_SYSFS */ +/* br_switchdev.c */ +#ifdef CONFIG_NET_SWITCHDEV +int nbp_switchdev_mark_set(struct net_bridge_port *p); +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb); +bool nbp_switchdev_allowed_egress(const struct 
net_bridge_port *p, + const struct sk_buff *skb); +#else +static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + return 0; +} + +static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return true; +} +#endif /* CONFIG_NET_SWITCHDEV */ + #endif diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 341caa0ca63a..d8ad73b38de2 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -134,17 +134,36 @@ void br_stp_disable_port(struct net_bridge_port *p) br_become_root_bridge(br); } -static void br_stp_start(struct net_bridge *br) +static int br_stp_call_user(struct net_bridge *br, char *arg) { - int r; - char *argv[] = { BR_STP_PROG, br->dev->name, "start", NULL }; + char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL }; char *envp[] = { NULL }; + int rc; + + /* call userspace STP and report program errors */ + rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); + if (rc > 0) { + if (rc & 0xff) + br_debug(br, BR_STP_PROG " received signal %d\n", + rc & 0x7f); + else + br_debug(br, BR_STP_PROG " exited with code %d\n", + (rc >> 8) & 0xff); + } + + return rc; +} + +static void br_stp_start(struct net_bridge *br) +{ struct net_bridge_port *p; + int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); - else - r = -ENOENT; + err = br_stp_call_user(br, "start"); + + if (err && err != -ENOENT) + br_err(br, "failed to start userspace STP (%d)\n", err); spin_lock_bh(&br->lock); @@ -153,9 +172,10 @@ static void br_stp_start(struct net_bridge *br) else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); - if (r == 0) { + if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); + /* Stop hello and hold timers */ del_timer(&br->hello_timer); list_for_each_entry(p, &br->port_list, list) @@ -173,14 +193,13 @@ static void br_stp_start(struct net_bridge *br) static void br_stp_stop(struct net_bridge *br) { - int r; - char *argv[] = { BR_STP_PROG, br->dev->name, "stop", NULL }; - char *envp[] = { NULL }; struct net_bridge_port *p; + int err; if (br->stp_enabled == BR_USER_STP) { - r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); - br_info(br, "userspace STP stopped, return code %d\n", r); + err = br_stp_call_user(br, "stop"); + if (err) + br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ mod_timer(&br->hello_timer, jiffies + br->hello_time); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c new file mode 100644 index 000000000000..f4097b900de1 --- /dev/null +++ b/net/bridge/br_switchdev.c @@ -0,0 +1,57 @@ +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <net/switchdev.h> + +#include "br_private.h" + +static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + + /* dev is yet to be added to the port list. 
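+ * Ports that report the same switchdev parent id (i.e. that sit on
+ * the same offloading ASIC) share one offload_fwd_mark; a port on a
+ * previously unseen ASIC gets a freshly incremented per-bridge mark.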
*/ + list_for_each_entry(p, &br->port_list, list) { + if (switchdev_port_same_parent_id(dev, p->dev)) + return p->offload_fwd_mark; + } + + return ++br->offload_fwd_mark; +} + +int nbp_switchdev_mark_set(struct net_bridge_port *p) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; + int err; + + ASSERT_RTNL(); + + err = switchdev_port_attr_get(p->dev, &attr); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); + + return 0; +} + +void nbp_switchdev_frame_mark(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) + BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; +} + +bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + return !skb->offload_fwd_mark || + BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; +} diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 1e04d4d44273..e657258e1f2c 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -171,6 +171,7 @@ BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); +BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 152300d164ac..9a11086ba6ff 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -91,7 +91,7 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, if (loginfo->type == NF_LOG_TYPE_LOG) bitmask = loginfo->u.log.logflags; else - bitmask = NF_LOG_MASK; + bitmask = NF_LOG_DEFAULT_MASK; if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 203964997a51..2e7c4f974340 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -24,7 +24,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) return EBT_DROP; if (par->hooknum != NF_BR_BROUTING) - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ ether_addr_copy(eth_hdr(skb)->h_dest, br_port_get_rcu(par->in)->br->dev->dev_addr); else diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 0833c251aef7..f5c11bbe27db 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -146,7 +146,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 1; if (NF_INVF(e, EBT_IOUT, ebt_dev_check(e->out, out))) return 1; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ if (in && (p = br_port_get_rcu(in)) != NULL && NF_INVF(e, EBT_ILOGICALIN, ebt_dev_check(e->logical_in, p->br->dev))) diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index 5d9953a90929..1663df598545 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -50,8 +50,7 @@ static struct nf_logger nf_bridge_logger __read_mostly = { static int __net_init nf_log_bridge_net_init(struct net *net) { - nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); - return 0; + return nf_log_set(net, 
NFPROTO_BRIDGE, &nf_bridge_logger); } static void __net_exit nf_log_bridge_net_exit(struct net *net) diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index a78c4e2826e5..97afdc0744e6 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -13,79 +13,11 @@ #include <linux/module.h> #include <linux/netfilter_bridge.h> #include <net/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables_bridge.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> -int nft_bridge_iphdr_validate(struct sk_buff *skb) -{ - struct iphdr *iph; - u32 len; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return 0; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - return 0; - - len = ntohs(iph->tot_len); - if (skb->len < len) - return 0; - else if (len < (iph->ihl*4)) - return 0; - - if (!pskb_may_pull(skb, iph->ihl*4)) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_iphdr_validate); - -int nft_bridge_ip6hdr_validate(struct sk_buff *skb) -{ - struct ipv6hdr *hdr; - u32 pkt_len; - - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - return 0; - - hdr = ipv6_hdr(skb); - if (hdr->version != 6) - return 0; - - pkt_len = ntohs(hdr->payload_len); - if (pkt_len + sizeof(struct ipv6hdr) > skb->len) - return 0; - - return 1; -} -EXPORT_SYMBOL_GPL(nft_bridge_ip6hdr_validate); - -static inline void nft_bridge_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - if (nft_bridge_iphdr_validate(skb)) - nft_set_pktinfo_ipv4(pkt, skb, state); - else - nft_set_pktinfo(pkt, skb, state); -} - -static inline void nft_bridge_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (nft_bridge_ip6hdr_validate(skb) && - nft_set_pktinfo_ipv6(pkt, skb, state) == 0) - return; -#endif - nft_set_pktinfo(pkt, skb, state); -} - static unsigned int nft_do_chain_bridge(void *priv, struct sk_buff *skb, @@ -95,13 +27,13 @@ nft_do_chain_bridge(void *priv, switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): - nft_bridge_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_bridge_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } @@ -207,12 +139,20 @@ static int __init nf_tables_bridge_init(void) int ret; nf_register_afinfo(&nf_br_afinfo); - nft_register_chain_type(&filter_bridge); + ret = nft_register_chain_type(&filter_bridge); + if (ret < 0) + goto err1; + ret = register_pernet_subsys(&nf_tables_bridge_net_ops); - if (ret < 0) { - nft_unregister_chain_type(&filter_bridge); - nf_unregister_afinfo(&nf_br_afinfo); - } + if (ret < 0) + goto err2; + + return ret; + +err2: + nft_unregister_chain_type(&filter_bridge); +err1: + nf_unregister_afinfo(&nf_br_afinfo); return ret; } diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 0b77ffbc27d6..4b3df6b0e3b9 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,7 +14,6 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> -#include <net/netfilter/nf_tables_bridge.h> #include 
<net/netfilter/ipv4/nf_reject.h> #include <net/netfilter/ipv6/nf_reject.h> #include <linux/ip.h> @@ -37,6 +36,30 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb, skb_pull(nskb, ETH_HLEN); } +static int nft_bridge_iphdr_validate(struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return 0; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + return 0; + + len = ntohs(iph->tot_len); + if (skb->len < len) + return 0; + else if (len < (iph->ihl*4)) + return 0; + + if (!pskb_may_pull(skb, iph->ihl*4)) + return 0; + + return 1; +} + /* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT) * or the bridge port (NF_BRIDGE PREROUTING). */ @@ -143,6 +166,25 @@ static void nft_reject_br_send_v4_unreach(struct net *net, br_forward(br_port_get_rcu(dev), nskb, false, true); } +static int nft_bridge_ip6hdr_validate(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + + hdr = ipv6_hdr(skb); + if (hdr->version != 6) + return 0; + + pkt_len = ntohs(hdr->payload_len); + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + return 0; + + return 1; +} + static void nft_reject_br_send_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index ea6312057a71..f1fe26f66458 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3355,16 +3355,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); -#ifdef CONFIG_NET_SWITCHDEV - /* Don't forward if offload device already forwarded */ - if (skb->offload_fwd_mark && - skb->offload_fwd_mark == dev->offload_fwd_mark) { - consume_skb(skb); - rc = NET_XMIT_SUCCESS; - goto out; - } -#endif - txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); @@ -3914,8 +3904,7 @@ static void net_tx_action(struct softirq_action *h) } } -#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ - (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) +#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) /* This hook is defined here for ATM LANE */ int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr) __read_mostly; @@ -4066,12 +4055,17 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, { #ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { + int ingress_retval; + if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - return nf_hook_ingress(skb); + rcu_read_lock(); + ingress_retval = nf_hook_ingress(skb); + rcu_read_unlock(); + return ingress_retval; } #endif /* CONFIG_NETFILTER_INGRESS */ return 0; @@ -4308,32 +4302,53 @@ int netif_receive_skb(struct sk_buff *skb) } EXPORT_SYMBOL(netif_receive_skb); -/* Network device is going away, flush any packets still pending - * Called with irqs disabled. 
- */ -static void flush_backlog(void *arg) +DEFINE_PER_CPU(struct work_struct, flush_works); + +/* Network device is going away, flush any packets still pending */ +static void flush_backlog(struct work_struct *work) { - struct net_device *dev = arg; - struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct sk_buff *skb, *tmp; + struct softnet_data *sd; + + local_bh_disable(); + sd = this_cpu_ptr(&softnet_data); + local_irq_disable(); rps_lock(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev == dev) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); kfree_skb(skb); input_queue_head_incr(sd); } } rps_unlock(sd); + local_irq_enable(); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev == dev) { + if (skb->dev->reg_state == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); kfree_skb(skb); input_queue_head_incr(sd); } } + local_bh_enable(); +} + +static void flush_all_backlogs(void) +{ + unsigned int cpu; + + get_online_cpus(); + + for_each_online_cpu(cpu) + queue_work_on(cpu, system_highpri_wq, + per_cpu_ptr(&flush_works, cpu)); + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(&flush_works, cpu)); + + put_online_cpus(); } static int napi_gro_complete(struct sk_buff *skb) @@ -4821,8 +4836,9 @@ static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) static int process_backlog(struct napi_struct *napi, int quota) { - int work = 0; struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); + bool again = true; + int work = 0; /* Check if we have pending ipi, its better to send them now, * not waiting net_rx_action() end. @@ -4833,23 +4849,20 @@ static int process_backlog(struct napi_struct *napi, int quota) } napi->weight = weight_p; - local_irq_disable(); - while (1) { + while (again) { struct sk_buff *skb; while ((skb = __skb_dequeue(&sd->process_queue))) { rcu_read_lock(); - local_irq_enable(); __netif_receive_skb(skb); rcu_read_unlock(); - local_irq_disable(); input_queue_head_incr(sd); - if (++work >= quota) { - local_irq_enable(); + if (++work >= quota) return work; - } + } + local_irq_disable(); rps_lock(sd); if (skb_queue_empty(&sd->input_pkt_queue)) { /* @@ -4861,16 +4874,14 @@ static int process_backlog(struct napi_struct *napi, int quota) * and we dont need an smp_mb() memory barrier. 
*/ napi->state = 0; - rps_unlock(sd); - - break; + again = false; + } else { + skb_queue_splice_tail_init(&sd->input_pkt_queue, + &sd->process_queue); } - - skb_queue_splice_tail_init(&sd->input_pkt_queue, - &sd->process_queue); rps_unlock(sd); + local_irq_enable(); } - local_irq_enable(); return work; } @@ -5578,6 +5589,7 @@ static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, + u16 ref_nr, struct list_head *dev_list, void *private, bool master) { @@ -5587,7 +5599,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj = __netdev_find_adj(adj_dev, dev_list); if (adj) { - adj->ref_nr++; + adj->ref_nr += ref_nr; return 0; } @@ -5597,7 +5609,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->ref_nr = 1; + adj->ref_nr = ref_nr; adj->private = private; dev_hold(adj_dev); @@ -5636,6 +5648,7 @@ free_adj: static void __netdev_adjacent_dev_remove(struct net_device *dev, struct net_device *adj_dev, + u16 ref_nr, struct list_head *dev_list) { struct netdev_adjacent *adj; @@ -5648,10 +5661,10 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, BUG(); } - if (adj->ref_nr > 1) { - pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, - adj->ref_nr-1); - adj->ref_nr--; + if (adj->ref_nr > ref_nr) { + pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, + ref_nr, adj->ref_nr-ref_nr); + adj->ref_nr -= ref_nr; return; } @@ -5670,21 +5683,22 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev, static int __netdev_adjacent_dev_link_lists(struct net_device *dev, struct net_device *upper_dev, + u16 ref_nr, struct list_head *up_list, struct list_head *down_list, void *private, bool master) { int ret; - ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, - master); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, + private, master); if (ret) return ret; - ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, - false); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, + private, false); if (ret) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); return ret; } @@ -5692,9 +5706,10 @@ static int __netdev_adjacent_dev_link_lists(struct net_device *dev, } static int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + u16 ref_nr) { - return __netdev_adjacent_dev_link_lists(dev, upper_dev, + return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower, NULL, false); @@ -5702,17 +5717,19 @@ static int __netdev_adjacent_dev_link(struct net_device *dev, static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, struct net_device *upper_dev, + u16 ref_nr, struct list_head *up_list, struct list_head *down_list) { - __netdev_adjacent_dev_remove(dev, upper_dev, up_list); - __netdev_adjacent_dev_remove(upper_dev, dev, down_list); + __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); } static void __netdev_adjacent_dev_unlink(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + u16 ref_nr) { - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + __netdev_adjacent_dev_unlink_lists(dev, 
upper_dev, ref_nr, &dev->all_adj_list.upper, &upper_dev->all_adj_list.lower); } @@ -5721,17 +5738,17 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, struct net_device *upper_dev, void *private, bool master) { - int ret = __netdev_adjacent_dev_link(dev, upper_dev); + int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); if (ret) return ret; - ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower, private, master); if (ret) { - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); return ret; } @@ -5741,8 +5758,8 @@ static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, struct net_device *upper_dev) { - __netdev_adjacent_dev_unlink(dev, upper_dev); - __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + __netdev_adjacent_dev_unlink(dev, upper_dev, 1); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, &dev->adj_list.upper, &upper_dev->adj_list.lower); } @@ -5795,7 +5812,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { pr_debug("Interlinking %s with %s, non-neighbour\n", i->dev->name, j->dev->name); - ret = __netdev_adjacent_dev_link(i->dev, j->dev); + ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); if (ret) goto rollback_mesh; } @@ -5805,7 +5822,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { pr_debug("linking %s's upper device %s with %s\n", upper_dev->name, i->dev->name, dev->name); - ret = __netdev_adjacent_dev_link(dev, i->dev); + ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); if (ret) goto rollback_upper_mesh; } @@ -5814,7 +5831,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, list_for_each_entry(i, &dev->all_adj_list.lower, list) { pr_debug("linking %s's lower device %s with %s\n", dev->name, i->dev->name, upper_dev->name); - ret = __netdev_adjacent_dev_link(i->dev, upper_dev); + ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); if (ret) goto rollback_lower_mesh; } @@ -5832,7 +5849,7 @@ rollback_lower_mesh: list_for_each_entry(i, &dev->all_adj_list.lower, list) { if (i == to_i) break; - __netdev_adjacent_dev_unlink(i->dev, upper_dev); + __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); } i = NULL; @@ -5842,7 +5859,7 @@ rollback_upper_mesh: list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { if (i == to_i) break; - __netdev_adjacent_dev_unlink(dev, i->dev); + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); } i = j = NULL; @@ -5854,7 +5871,7 @@ rollback_mesh: list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { if (i == to_i && j == to_j) break; - __netdev_adjacent_dev_unlink(i->dev, j->dev); + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); } if (i == to_i) break; @@ -5934,16 +5951,16 @@ void netdev_upper_dev_unlink(struct net_device *dev, */ list_for_each_entry(i, &dev->all_adj_list.lower, list) list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(i->dev, j->dev); + __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); /* remove also the devices itself from lower/upper device * list */ list_for_each_entry(i, &dev->all_adj_list.lower, list) - __netdev_adjacent_dev_unlink(i->dev, upper_dev); + 
__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) - __netdev_adjacent_dev_unlink(dev, i->dev); + __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, &changeupper_info.info); @@ -6723,8 +6740,8 @@ static void rollback_registered_many(struct list_head *head) unlist_netdevice(dev); dev->reg_state = NETREG_UNREGISTERING; - on_each_cpu(flush_backlog, dev, 1); } + flush_all_backlogs(); synchronize_net(); @@ -7641,6 +7658,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->ptype_all); INIT_LIST_HEAD(&dev->ptype_specific); +#ifdef CONFIG_NET_SCHED + hash_init(dev->qdisc_hash); +#endif dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -8286,8 +8306,11 @@ static int __init net_dev_init(void) */ for_each_possible_cpu(i) { + struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); + INIT_WORK(flush, flush_backlog); + skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); INIT_LIST_HEAD(&sd->poll_list); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d6b3b579560d..72cfb0c61125 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -105,7 +105,7 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) return skb; } -static struct genl_multicast_group dropmon_mcgrps[] = { +static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", }, }; diff --git a/net/core/filter.c b/net/core/filter.c index cb06aceb512a..00351cdf7d0c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -94,14 +94,13 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) { - return skb_get_poff((struct sk_buff *)(unsigned long) ctx); + return skb_get_poff(skb); } -static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -120,9 +119,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { - struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; struct nlattr *nla; if (skb_is_nonlinear(skb)) @@ -145,7 +143,7 @@ static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return 0; } -static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +BPF_CALL_0(__get_raw_cpu_id) { return raw_smp_processor_id(); } @@ -233,9 +231,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); - BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ @@ -1350,17 +1347,26 @@ 
struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int __bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + return skb_ensure_writable(skb, write_len); +} + static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err; + int err = __bpf_try_make_writable(skb, write_len); - err = skb_ensure_writable(skb, write_len); bpf_compute_data_end(skb); - return err; } +static int bpf_try_make_head_writable(struct sk_buff *skb) +{ + return bpf_try_make_writable(skb, skb_headlen(skb)); +} + static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) @@ -1373,12 +1379,9 @@ static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } -static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, + const void *, from, u32, len, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; - void *from = (void *) (long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) @@ -1413,12 +1416,9 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, + void *, to, u32, len) { - const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1; - unsigned int offset = (unsigned int) r2; - void *to = (void *)(unsigned long) r3; - unsigned int len = (unsigned int) r4; void *ptr; if (unlikely(offset > 0xffff)) @@ -1446,10 +1446,31 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_STACK_SIZE, }; -static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return bpf_try_make_writable(skb, len ? 
: skb_headlen(skb)); +} + +static const struct bpf_func_proto bpf_skb_pull_data_proto = { + .func = bpf_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) @@ -1491,12 +1512,11 @@ static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, + u64, from, u64, to, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; - unsigned int offset = (unsigned int) r2; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | @@ -1544,12 +1564,11 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; -static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) +BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, + __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); - u64 diff_size = from_size + to_size; - __be32 *from = (__be32 *) (long) r1; - __be32 *to = (__be32 *) (long) r3; + u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: @@ -1575,6 +1594,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, + .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_STACK, .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, @@ -1583,6 +1603,26 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +{ + /* The interface is to be used in combination with bpf_csum_diff() + * for direct packet writes. csum rotation for alignment as well + * as emulating csum_sub() can be done from the eBPF program. 
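+ * Illustrative flow from a tc/eBPF program: rewrite bytes via direct
+ * packet access, compute the delta with bpf_csum_diff(), then pass
+ * that delta here so a CHECKSUM_COMPLETE skb->csum stays consistent.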
+ */ + if (skb->ip_summed == CHECKSUM_COMPLETE) + return (skb->csum = csum_add(skb->csum, csum)); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_csum_update_proto = { + .func = bpf_csum_update, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -1607,10 +1647,11 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } -static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; struct net_device *dev; + struct sk_buff *clone; + int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; @@ -1619,14 +1660,25 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!dev)) return -EINVAL; - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) + clone = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!clone)) return -ENOMEM; - bpf_push_mac_rcsum(skb); + /* For direct write, we need to keep the invariant that the skbs + * we're dealing with need to be uncloned. Should uncloning fail + * here, we need to free the just generated clone to unclone once + * again. + */ + ret = bpf_try_make_head_writable(skb); + if (unlikely(ret)) { + kfree_skb(clone); + return -ENOMEM; + } + + bpf_push_mac_rcsum(clone); return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1645,7 +1697,7 @@ struct redirect_info { static DEFINE_PER_CPU(struct redirect_info, redirect_info); -static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); @@ -1684,9 +1736,9 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { - return task_get_classid((struct sk_buff *) (unsigned long) r1); + return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { @@ -1696,9 +1748,9 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { - return dst_tclassid((struct sk_buff *) (unsigned long) r1); + return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { @@ -1708,14 +1760,14 @@ static const struct bpf_func_proto bpf_get_route_realm_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. 
*/ - return skb_get_hash((struct sk_buff *) (unsigned long) r1); + return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { @@ -1725,10 +1777,25 @@ static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) +{ + /* After all direct packet write, this can be used once for + * triggering a lazy recalc on next skb_get_hash() invocation. + */ + skb_clear_hash(skb); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_invalid_proto = { + .func = bpf_set_hash_invalid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, + u16, vlan_tci) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 vlan_proto = (__force __be16) r2; int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && @@ -1753,9 +1820,8 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { }; EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); -static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; int ret; bpf_push_mac_rcsum(skb); @@ -1930,10 +1996,9 @@ static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) return -ENOTSUPP; } -static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, + u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - __be16 proto = (__force __be16) r2; int ret; if (unlikely(flags)) @@ -1970,14 +2035,11 @@ static const struct bpf_func_proto bpf_skb_change_proto_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 pkt_type = r2; - /* We only allow a restricted subset to be changed for now. 
*/ - if (unlikely(skb->pkt_type > PACKET_OTHERHOST || - pkt_type > PACKET_OTHERHOST)) + if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || + !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; @@ -1992,19 +2054,100 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 __bpf_skb_min_len(const struct sk_buff *skb) +{ + u32 min_len = skb_network_offset(skb); + + if (skb_transport_header_was_set(skb)) + min_len = skb_transport_offset(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) + min_len = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + return min_len; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev->mtu + skb->dev->hard_header_len; +} + +static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + unsigned int old_len = skb->len; + int ret; + + ret = __skb_grow_rcsum(skb, new_len); + if (!ret) + memset(skb->data + old_len, 0, new_len - old_len); + return ret; +} + +static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +{ + return __skb_trim_rcsum(skb, new_len); +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + u32 max_len = __bpf_skb_max_len(skb); + u32 min_len = __bpf_skb_min_len(skb); + int ret; + + if (unlikely(flags || new_len > max_len || new_len < min_len)) + return -EINVAL; + if (skb->encapsulation) + return -ENOTSUPP; + + /* The basic idea of this helper is that it's performing the + * needed work to either grow or trim an skb, and eBPF program + * rewrites the rest via helpers like bpf_skb_store_bytes(), + * bpf_lX_csum_replace() and others rather than passing a raw + * buffer here. This one is a slow path helper and intended + * for replies with control messages. + * + * Like in bpf_skb_change_proto(), we want to keep this rather + * minimal and without protocol specifics so that we are able + * to separate concerns as in bpf_skb_store_bytes() should only + * be the one responsible for writing buffers. + * + * It's really expected to be a slow path operation here for + * control message replies, so we're implicitly linearizing, + * uncloning and drop offloads from the skb by this. 
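+ * Sketch of the expected usage: bpf_skb_change_tail(skb, new_len, 0)
+ * to grow (the new tail is zero-padded) or trim the skb, followed by
+ * bpf_skb_store_bytes() to fill in the actual reply payload.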
+ */ + ret = __bpf_try_make_writable(skb, skb->len); + if (!ret) { + if (new_len > skb->len) + ret = bpf_skb_grow_rcsum(skb, new_len); + else if (new_len < skb->len) + ret = bpf_skb_trim_rcsum(skb, new_len); + if (!ret && skb_is_gso(skb)) + skb_gso_reset(skb); + } + + bpf_compute_data_end(skb); + return ret; +} + +static const struct bpf_func_proto bpf_skb_change_tail_proto = { + .func = bpf_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + bool bpf_helper_changes_skb_data(void *func) { - if (func == bpf_skb_vlan_push) - return true; - if (func == bpf_skb_vlan_pop) - return true; - if (func == bpf_skb_store_bytes) - return true; - if (func == bpf_skb_change_proto) - return true; - if (func == bpf_l3_csum_replace) - return true; - if (func == bpf_l4_csum_replace) + if (func == bpf_skb_vlan_push || + func == bpf_skb_vlan_pop || + func == bpf_skb_store_bytes || + func == bpf_skb_change_proto || + func == bpf_skb_change_tail || + func == bpf_skb_pull_data || + func == bpf_l3_csum_replace || + func == bpf_l4_csum_replace) return true; return false; @@ -2023,13 +2166,10 @@ static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, return 0; } -static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4, - u64 meta_size) +BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) { - struct sk_buff *skb = (struct sk_buff *)(long) r1; - struct bpf_map *map = (struct bpf_map *)(long) r2; u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; - void *meta = (void *)(long) r4; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; @@ -2056,10 +2196,9 @@ static unsigned short bpf_tunnel_key_af(u64 flags) return flags & BPF_F_TUNINFO_IPV6 ? 
AF_INET6 : AF_INET; } -static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, + u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; @@ -2124,10 +2263,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; @@ -2162,10 +2299,9 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { static struct metadata_dst __percpu *md_dst; -static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, + const struct bpf_tunnel_key *, from, u32, size, u64, flags) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; @@ -2183,7 +2319,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) */ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); - from = (struct bpf_tunnel_key *)compat; + from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; @@ -2233,10 +2369,9 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, + const u8 *, from, u32, size) { - struct sk_buff *skb = (struct sk_buff *) (long) r1; - u8 *from = (u8 *) (long) r2; struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); @@ -2282,28 +2417,24 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) } } -#ifdef CONFIG_SOCK_CGROUP_DATA -static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, + u32, idx) { - struct sk_buff *skb = (struct sk_buff *)(long)r1; - struct bpf_map *map = (struct bpf_map *)(long)r2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; - u32 i = (u32)r3; - sk = skb->sk; + sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; - - if (unlikely(i >= array->map.max_entries)) + if (unlikely(idx >= array->map.max_entries)) return -E2BIG; - cgrp = READ_ONCE(array->ptrs[i]); + cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; - return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp); + return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { @@ -2314,7 +2445,38 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; -#endif + +static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, + unsigned long off, unsigned long len) +{ + memcpy(dst_buff, 
src_buff + off, len); + return 0; +} + +BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, + u64, flags, void *, meta, u64, meta_size) +{ + u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; + + if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) + return -EINVAL; + if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) + return -EFAULT; + + return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, + bpf_xdp_copy); +} + +static const struct bpf_func_proto bpf_xdp_event_output_proto = { + .func = bpf_xdp_event_output, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_STACK, + .arg5_type = ARG_CONST_STACK_SIZE, +}; static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) @@ -2350,8 +2512,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; + case BPF_FUNC_csum_update: + return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -2368,6 +2534,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: @@ -2382,14 +2550,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; + case BPF_FUNC_set_hash_invalid: + return &bpf_set_hash_invalid_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; -#ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; -#endif default: return sk_filter_func_proto(func_id); } @@ -2398,7 +2566,14 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id) { - return sk_filter_func_proto(func_id); + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_xdp_event_output_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; + default: + return sk_filter_func_proto(func_id); + } } static bool __is_valid_access(int off, int size, enum bpf_access_type type) @@ -2438,6 +2613,45 @@ static bool sk_filter_is_valid_access(int off, int size, return __is_valid_access(off, size, type); } +static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) 
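+ * In other words, the generated prologue below checks the cloned bit
+ * and, for possible clones, calls bpf_skb_pull_data(skb, 0) to unclone
+ * the head before any direct packet write; on failure it returns
+ * TC_ACT_SHOT instead of entering the program.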
+ */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return TC_ACT_SHOT; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) @@ -2475,7 +2689,7 @@ static bool __is_valid_xdp_access(int off, int size, return false; if (off % size != 0) return false; - if (size != 4) + if (size != sizeof(__u32)) return false; return true; @@ -2506,10 +2720,10 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; @@ -2556,7 +2770,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), dst_reg, src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); @@ -2597,7 +2811,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, dst_reg, src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); prog->cb_access = 1; @@ -2621,7 +2835,7 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, break; case offsetof(struct __sk_buff, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), dst_reg, src_reg, offsetof(struct sk_buff, data)); break; @@ -2630,8 +2844,8 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ctx_off -= offsetof(struct __sk_buff, data_end); ctx_off += offsetof(struct sk_buff, cb); ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), - dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, + ctx_off); break; case offsetof(struct __sk_buff, tc_index): @@ -2657,6 +2871,31 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf, + struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + default: + return sk_filter_convert_ctx_access(type, dst_reg, src_reg, + ctx_off, insn_buf, prog); + } + + return insn - insn_buf; +} + static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn_buf, @@ -2666,12 +2905,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, switch (ctx_off) { case offsetof(struct xdp_md, data): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), dst_reg, src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): - *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)), + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), dst_reg, src_reg, offsetof(struct xdp_buff, data_end)); break; @@ -2683,13 +2922,14 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .convert_ctx_access = tc_cls_act_convert_ctx_access, + .gen_prologue = tc_cls_act_prologue, }; static const struct bpf_verifier_ops xdp_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 52742a02814f..1a7b80f73376 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -6,6 +6,8 @@ #include <linux/if_vlan.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/gre.h> +#include <net/pptp.h> #include <linux/igmp.h> #include 
<linux/icmp.h> #include <linux/sctp.h> @@ -116,13 +118,16 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_vlan *key_vlan; struct flow_dissector_key_keyid *key_keyid; + bool skip_vlan = false; u8 ip_proto = 0; bool ret = false; if (!data) { data = skb->data; - proto = skb->protocol; + proto = skb_vlan_tag_present(skb) ? + skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); } @@ -241,23 +246,45 @@ ipv6: case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan; - struct vlan_hdr _vlan; - vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); - if (!vlan) - goto out_bad; + if (skb_vlan_tag_present(skb)) + proto = skb->protocol; + + if (!skb_vlan_tag_present(skb) || + proto == cpu_to_be16(ETH_P_8021Q) || + proto == cpu_to_be16(ETH_P_8021AD)) { + struct vlan_hdr _vlan; + vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), + data, hlen, &_vlan); + if (!vlan) + goto out_bad; + proto = vlan->h_vlan_encapsulated_proto; + nhoff += sizeof(*vlan); + if (skip_vlan) + goto again; + } + + skip_vlan = true; if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { - key_tags = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID, + FLOW_DISSECTOR_KEY_VLAN)) { + key_vlan = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLAN, target_container); - key_tags->vlan_id = skb_vlan_tag_get_id(skb); + if (skb_vlan_tag_present(skb)) { + key_vlan->vlan_id = skb_vlan_tag_get_id(skb); + key_vlan->vlan_priority = + (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT); + } else { + key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & + VLAN_VID_MASK; + key_vlan->vlan_priority = + (ntohs(vlan->h_vlan_TCI) & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + } } - proto = vlan->h_vlan_encapsulated_proto; - nhoff += sizeof(*vlan); goto again; } case htons(ETH_P_PPP_SES): { @@ -338,32 +365,42 @@ mpls: ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { - struct gre_hdr { - __be16 flags; - __be16 proto; - } *hdr, _hdr; + struct gre_base_hdr *hdr, _hdr; + u16 gre_ver; + int offset = 0; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) goto out_bad; - /* - * Only look inside GRE if version zero and no - * routing - */ - if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) break; - proto = hdr->proto; - nhoff += 4; + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + break; + + proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + break; + } + + offset += sizeof(struct gre_base_hdr); + if (hdr->flags & GRE_CSUM) - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->csum) + + sizeof(((struct gre_full_hdr *)0)->reserved1); + if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; - keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), data, hlen, &_keyid); - if (!keyid) goto out_bad; @@ -372,32 +409,65 @@ ip_proto_again: key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); - key_keyid->keyid = *keyid; + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + 
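/* Stand-alone sketch of the offset arithmetic in the GRE hunk above:
 * instead of bumping nhoff by literal 4s, the dissector now sums the
 * sizes of the optional fields that the GRE flag bits announce. Plain,
 * compilable C; the flag values are the host-order equivalents of the
 * __be16 constants in <net/gre.h>.
 */
#include <stdint.h>

#define GRE_F_CSUM 0x8000u
#define GRE_F_KEY  0x2000u
#define GRE_F_SEQ  0x1000u

static unsigned int gre_opt_len(uint16_t flags)
{
	unsigned int off = 4;	/* sizeof(struct gre_base_hdr) */

	if (flags & GRE_F_CSUM)
		off += 4;	/* csum + reserved1 */
	if (flags & GRE_F_KEY)
		off += 4;	/* key (PPTP: payload length + call id) */
	if (flags & GRE_F_SEQ)
		off += 4;	/* sequence number */
	return off;
}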
key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } - nhoff += 4; + offset += sizeof(((struct gre_full_hdr *)0)->key); } + if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) + offset += sizeof(((struct pptp_gre_header *)0)->seq); + + if (gre_ver == 0) { + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff + offset, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + goto out_bad; + proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + hlen = (nhoff + offset); + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *)0)->ack); + + ppp_hdr = skb_header_pointer(skb, nhoff + offset, + sizeof(_ppp_hdr), _ppp_hdr); + if (!ppp_hdr) goto out_bad; - proto = eth->h_proto; - nhoff += sizeof(*eth); - - /* Cap headers that we access via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. - */ - if (NET_IP_ALIGN) - hlen = nhoff; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; } + nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; @@ -874,8 +944,8 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .offset = offsetof(struct flow_keys, ports), }, { - .key_id = FLOW_DISSECTOR_KEY_VLANID, - .offset = offsetof(struct flow_keys, tags), + .key_id = FLOW_DISSECTOR_KEY_VLAN, + .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 669ecc9f884e..e5f84c26ba1a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -251,6 +251,41 @@ drop: } EXPORT_SYMBOL(lwtunnel_output); +int lwtunnel_xmit(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; + int ret = -EINVAL; + + if (!dst) + goto drop; + + lwtstate = dst->lwtstate; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->xmit)) + ret = ops->xmit(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_xmit); + int lwtunnel_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index cf26e04c4046..2ae929f9bd06 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1148,7 +1148,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } else goto out; } else { - if (lladdr == neigh->ha && new == NUD_STALE) + if (lladdr == neigh->ha && new == NUD_STALE && + !(flags & NEIGH_UPDATE_F_ADMIN)) new = old; } } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 
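/* Sketch: a minimal encap backend that the new lwtunnel_xmit() above
 * would dispatch to. lwtunnel_encap_add_ops() and the ops structure are
 * existing kernel API (the ->xmit member is what this series wires up);
 * the foo_* names and the encap type in the registration comment are
 * illustrative only.
 */
static int foo_lwt_xmit(struct sk_buff *skb)
{
	/* push the tunnel header here and hand the skb onward;
	 * returning -EOPNOTSUPP would make lwtunnel_xmit() drop it
	 */
	return 0;
}

static const struct lwtunnel_encap_ops foo_encap_ops = {
	.xmit = foo_lwt_xmit,
	/* .build_state, .destroy_state, .output, .input, ... */
};

/* registration: lwtunnel_encap_add_ops(&foo_encap_ops, LWTUNNEL_ENCAP_IP); */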
2c2eb1b629b1..42bdda0e616b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -37,6 +37,8 @@ struct net init_net = { }; EXPORT_SYMBOL(init_net); +static bool init_net_initialized; + #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; @@ -213,31 +215,29 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - unsigned long flags; bool alloc; int id; - spin_lock_irqsave(&net->nsid_lock, flags); + spin_lock_bh(&net->nsid_lock); alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } -EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ int peernet2id(struct net *net, struct net *peer) { - unsigned long flags; int id; - spin_lock_irqsave(&net->nsid_lock, flags); + spin_lock_bh(&net->nsid_lock); id = __peernet2id(net, peer); - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); return id; } +EXPORT_SYMBOL(peernet2id); /* This function returns true is the peer netns has an id assigned into the * current netns. @@ -249,18 +249,17 @@ bool peernet_has_id(struct net *net, struct net *peer) struct net *get_net_ns_by_id(struct net *net, int id) { - unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); - spin_lock_irqsave(&net->nsid_lock, flags); + spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); return peer; @@ -404,17 +403,17 @@ static void cleanup_net(struct work_struct *work) for_each_net(tmp) { int id; - spin_lock_irq(&tmp->nsid_lock); + spin_lock_bh(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_irq(&tmp->nsid_lock); + spin_unlock_bh(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id); } - spin_lock_irq(&net->nsid_lock); + spin_lock_bh(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_irq(&net->nsid_lock); + spin_unlock_bh(&net->nsid_lock); } rtnl_unlock(); @@ -531,7 +530,7 @@ static struct pernet_operations __net_initdata net_ns_ops = { .exit = net_ns_net_exit, }; -static struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { +static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_NONE] = { .type = NLA_UNSPEC }, [NETNSA_NSID] = { .type = NLA_S32 }, [NETNSA_PID] = { .type = NLA_U32 }, @@ -542,7 +541,6 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; - unsigned long flags; struct net *peer; int nsid, err; @@ -563,15 +561,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - spin_lock_irqsave(&net->nsid_lock, flags); + spin_lock_bh(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -693,11 +691,10 @@ static int 
rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) .idx = 0, .s_idx = cb->args[0], }; - unsigned long flags; - spin_lock_irqsave(&net->nsid_lock, flags); + spin_lock_bh(&net->nsid_lock); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_irqrestore(&net->nsid_lock, flags); + spin_unlock_bh(&net->nsid_lock); cb->args[0] = net_cb.idx; return skb->len; @@ -750,6 +747,8 @@ static int __init net_ns_init(void) if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); + init_net_initialized = true; + rtnl_lock(); list_add_tail_rcu(&init_net.list, &net_namespace_list); rtnl_unlock(); @@ -811,15 +810,24 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + if (!init_net_initialized) { + list_add_tail(&ops->list, list); + return 0; + } + return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - LIST_HEAD(net_exit_list); - list_add(&init_net.exit_list, &net_exit_list); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + if (!init_net_initialized) { + list_del(&ops->list); + } else { + LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); + ops_exit_list(ops, &net_exit_list); + ops_free_list(ops, &net_exit_list); + } } #endif /* CONFIG_NET_NS */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bbd118b19aef..5219a9e2127a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2286,7 +2286,7 @@ out: static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); + pkt_dev->pkt_overhead = 0; pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2777,13 +2777,13 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, - struct pktgen_dev *pkt_dev, - unsigned int extralen) + struct pktgen_dev *pkt_dev) { + unsigned int extralen = LL_RESERVED_SPACE(dev); struct sk_buff *skb = NULL; - unsigned int size = pkt_dev->cur_pkt_size + 64 + extralen + - pkt_dev->pkt_overhead; + unsigned int size; + size = pkt_dev->cur_pkt_size + 64 + extralen + pkt_dev->pkt_overhead; if (pkt_dev->flags & F_NODE) { int node = pkt_dev->node >= 0 ? 
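/* Usage sketch for the init_net_initialized change above: with
 * CONFIG_NET_NS=n, a subsystem registering from early init no longer has
 * its ->init() run against a not-yet-set-up init_net; the ops are queued
 * and ops_init() runs once net_ns_init() has finished. The foo_* names
 * are illustrative.
 */
static int __net_init foo_net_init(struct net *net)
{
	return 0;	/* allocate per-netns state here */
}

static void __net_exit foo_net_exit(struct net *net)
{
}

static struct pernet_operations foo_net_ops = {
	.init = foo_net_init,
	.exit = foo_net_exit,
};

/* register_pernet_subsys(&foo_net_ops); -- now safe at any init stage */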
pkt_dev->node : numa_node_id(); @@ -2796,8 +2796,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + /* the caller pre-fetches from skb->data and reserves for the mac hdr */ if (likely(skb)) - skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb_reserve(skb, extralen - 16); return skb; } @@ -2830,16 +2831,14 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - datalen = (odev->hard_header_len + 16) & ~0xf; - - skb = pktgen_alloc_skb(odev, pkt_dev, datalen); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; } prefetchw(skb->data); - skb_reserve(skb, datalen); + skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ eth = (__u8 *) skb_push(skb, 14); @@ -2959,7 +2958,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); queue_map = pkt_dev->cur_queue_map; - skb = pktgen_alloc_skb(odev, pkt_dev, 16); + skb = pktgen_alloc_skb(odev, pkt_dev); if (!skb) { sprintf(pkt_dev->result, "No memory"); return NULL; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 189cc78c77eb..b06d2f46b83e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -704,6 +704,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + if (!user_features) + continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; @@ -841,7 +843,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, size += nla_total_size(num_vfs * sizeof(struct nlattr)); size += num_vfs * (nla_total_size(sizeof(struct ifla_vf_mac)) + - nla_total_size(sizeof(struct ifla_vf_vlan)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct nlattr)) + + nla_total_size(MAX_VLAN_LIST_LEN * + sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + @@ -1109,14 +1114,15 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct nlattr *vfinfo) { struct ifla_vf_rss_query_en vf_rss_query_en; + struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; + struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; - struct nlattr *vf, *vfstats; struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; @@ -1133,11 +1139,14 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; + /* VLAN Protocol by default is 802.1Q */ + ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; vf_mac.vf = vf_vlan.vf = + vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = @@ -1148,6 +1157,9 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; + vf_vlan_info.vlan = ivi.vlan; + vf_vlan_info.qos = ivi.qos; + vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; @@ -1156,10 
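/* Worked example for the pktgen hunks above (illustrative numbers): on a
 * plain Ethernet device LL_RESERVED_SPACE(dev) rounds the 14-byte header
 * up to 16, so pktgen_alloc_skb() now reserves 16 - 16 = 0 bytes and the
 * caller reserves 16 itself before pushing the Ethernet header:
 *
 *	size = cur_pkt_size + 64 + 16 + pkt_overhead;
 *	skb_reserve(skb, 16 - 16);	// allocator
 *	skb_reserve(skb, 16);		// fill_packet_ipv4()
 *	eth = skb_push(skb, 14);	// leaves the IP header 2-byte aligned
 *
 * The resulting layout is unchanged; the point appears to be that the
 * link-layer headroom is no longer counted twice (once via pkt_overhead,
 * once via the explicit extra length).
 */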
+1168,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start(skb, IFLA_VF_INFO); - if (!vf) { - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vf) + goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), @@ -1175,17 +1185,23 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) - return -EMSGSIZE; + goto nla_put_vf_failure; + vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST); + if (!vfvlanlist) + goto nla_put_vf_failure; + if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), + &vf_vlan_info)) { + nla_nest_cancel(skb, vfvlanlist); + goto nla_put_vf_failure; + } + nla_nest_end(skb, vfvlanlist); memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start(skb, IFLA_VF_STATS); - if (!vfstats) { - nla_nest_cancel(skb, vf); - nla_nest_cancel(skb, vfinfo); - return -EMSGSIZE; - } + if (!vfstats) + goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, @@ -1197,11 +1213,19 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) - return -EMSGSIZE; + vf_stats.multicast, IFLA_VF_STATS_PAD)) { + nla_nest_cancel(skb, vfstats); + goto nla_put_vf_failure; + } nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); return 0; + +nla_put_vf_failure: + nla_nest_cancel(skb, vf); +nla_put_vfinfo_failure: + nla_nest_cancel(skb, vfinfo); + return -EMSGSIZE; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) @@ -1446,6 +1470,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) }, [IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) }, + [IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED }, [IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) }, [IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) }, [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, @@ -1702,7 +1727,37 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, - ivv->qos); + ivv->qos, + htons(ETH_P_8021Q)); + if (err < 0) + return err; + } + + if (tb[IFLA_VF_VLAN_LIST]) { + struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; + struct nlattr *attr; + int rem, len = 0; + + err = -EOPNOTSUPP; + if (!ops->ndo_set_vf_vlan) + return err; + + nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { + if (nla_type(attr) != IFLA_VF_VLAN_INFO || + nla_len(attr) < NLA_HDRLEN) { + return -EINVAL; + } + if (len >= MAX_VLAN_LIST_LEN) + return -EOPNOTSUPP; + ivvl[len] = nla_data(attr); + + len++; + } + if (len == 0) + return -EINVAL; + + err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, + ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } @@ -3066,7 +3121,7 @@ static int nlmsg_populate_fdb(struct 
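/* Sketch of the unwind pattern the rtnl_fill_vfinfo() hunk above converts
 * to: shared labels cancel the innermost open nest first, then the outer
 * one, so every failure path rewinds exactly what it started. The nla_*
 * calls are the real kernel netlink API; the attribute numbers are
 * illustrative.
 */
static int fill_example(struct sk_buff *skb, struct nlattr *outer)
{
	struct nlattr *inner;

	inner = nla_nest_start(skb, 1);
	if (!inner)
		goto cancel_outer;
	if (nla_put_u32(skb, 2, 42))
		goto cancel_inner;
	nla_nest_end(skb, inner);
	return 0;

cancel_inner:
	nla_nest_cancel(skb, inner);
cancel_outer:
	nla_nest_cancel(skb, outer);
	return -EMSGSIZE;
}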
sk_buff *skb, seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[0]) + if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -3093,19 +3148,18 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx) + int *idx) { int err; netif_addr_lock_bh(dev); - err = nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->uc); + err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; - nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); + nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); - cb->args[1] = err; - return idx; + return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -3118,9 +3172,13 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) const struct net_device_ops *cops = NULL; struct ifinfomsg *ifm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); + struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int idx = 0; + int h, s_h; + int idx = 0, s_idx; + int err = 0; + int fidx = 0; if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy) == 0) { @@ -3138,49 +3196,71 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - cb->args[1] = 0; - for_each_netdev(net, dev) { - if (brport_idx && (dev->ifindex != brport_idx)) - continue; + s_h = cb->args[0]; + s_idx = cb->args[1]; - if (!br_idx) { /* user did not specify a specific bridge */ - if (dev->priv_flags & IFF_BRIDGE_PORT) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { - } else { - if (dev != br_dev && - !(dev->priv_flags & IFF_BRIDGE_PORT)) + if (brport_idx && (dev->ifindex != brport_idx)) continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !(dev->priv_flags & IFF_EBRIDGE)) - continue; + if (!br_idx) { /* user did not specify a specific bridge */ + if (dev->priv_flags & IFF_BRIDGE_PORT) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; + } + } else { + if (dev != br_dev && + !(dev->priv_flags & IFF_BRIDGE_PORT)) + continue; - cops = ops; - } + if (br_dev != netdev_master_upper_dev_get(dev) && + !(dev->priv_flags & IFF_EBRIDGE)) + continue; + cops = ops; + } - if (dev->priv_flags & IFF_BRIDGE_PORT) { - if (cops && cops->ndo_fdb_dump) - idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, - idx); - } - if (cb->args[1] == -EMSGSIZE) - break; + if (idx < s_idx) + goto cont; - if (dev->netdev_ops->ndo_fdb_dump) - idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, - idx); - else - idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); - if (cb->args[1] == -EMSGSIZE) - break; + if (dev->priv_flags & IFF_BRIDGE_PORT) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, + br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + goto out; + } + } + + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, + dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, + &fidx); + if (err == -EMSGSIZE) + goto out; + + cops = NULL; - cops = NULL; + /* reset fdb offset to 0 for rest of the interfaces */ + cb->args[2] = 0; + fidx = 0; +cont: + idx++; + } } - cb->args[0] = idx; +out: + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = fidx; + return skb->len; } @@ -3550,6 +3630,91 @@ 
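/* The rtnl_fdb_dump() rework above replaces one flat resume index with a
 * three-part cursor saved in the netlink callback, so a dump that
 * overflows one skb restarts at the right bucket, device and entry:
 *
 *	cb->args[0] = h;	// hash bucket in net->dev_index_head[]
 *	cb->args[1] = idx;	// device position within that bucket
 *	cb->args[2] = fidx;	// fdb entry offset within that device
 *
 * Correspondingly, ndo_fdb_dump() implementations now take int *idx and
 * return an error code instead of the running index.
 */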
static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) (!idxattr || idxattr == attrid); } +#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) +static int rtnl_get_offload_stats_attr_size(int attr_id) +{ + switch (attr_id) { + case IFLA_OFFLOAD_XSTATS_CPU_HIT: + return sizeof(struct rtnl_link_stats64); + } + + return 0; +} + +static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, + int *prividx) +{ + struct nlattr *attr = NULL; + int attr_id, size; + void *attr_data; + int err; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return -ENODATA; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (attr_id < *prividx) + continue; + + size = rtnl_get_offload_stats_attr_size(attr_id); + if (!size) + continue; + + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + + attr = nla_reserve_64bit(skb, attr_id, size, + IFLA_OFFLOAD_XSTATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + attr_data = nla_data(attr); + memset(attr_data, 0, size); + err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, + attr_data); + if (err) + goto get_offload_stats_failure; + } + + if (!attr) + return -ENODATA; + + *prividx = 0; + return 0; + +nla_put_failure: + err = -EMSGSIZE; +get_offload_stats_failure: + *prividx = attr_id; + return err; +} + +static int rtnl_get_offload_stats_size(const struct net_device *dev) +{ + int nla_size = 0; + int attr_id; + int size; + + if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && + dev->netdev_ops->ndo_get_offload_stats)) + return 0; + + for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; + attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { + if (!dev->netdev_ops->ndo_has_offload_stats(attr_id)) + continue; + size = rtnl_get_offload_stats_attr_size(attr_id); + nla_size += nla_total_size_64bit(size); + } + + if (nla_size != 0) + nla_size += nla_total_size(0); + + return nla_size; +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, @@ -3559,6 +3724,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; + int err; ASSERT_RTNL(); @@ -3587,8 +3753,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS); @@ -3612,8 +3776,6 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { - int err; - *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start(skb, IFLA_STATS_LINK_XSTATS_SLAVE); @@ -3628,6 +3790,24 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, + *idxattr)) { + *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; + attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); + if (!attr) + goto nla_put_failure; + + err = rtnl_get_offload_stats(skb, dev, prividx); + if (err == -ENODATA) + nla_nest_cancel(skb, attr); + else + nla_nest_end(skb, attr); + + if (err && err != -ENODATA) + goto nla_put_failure; + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3642,10 +3822,6 @@ 
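/* Driver-side sketch for the new IFLA_STATS_LINK_OFFLOAD_XSTATS nest
 * above. The two ndo signatures match the calls made from
 * rtnl_get_offload_stats(); the foo_* bodies are hypothetical.
 */
static bool foo_has_offload_stats(int attr_id)
{
	return attr_id == IFLA_OFFLOAD_XSTATS_CPU_HIT;
}

static int foo_get_offload_stats(int attr_id, const struct net_device *dev,
				 void *sp)
{
	if (attr_id != IFLA_OFFLOAD_XSTATS_CPU_HIT)
		return -EINVAL;
	/* fill *(struct rtnl_link_stats64 *)sp with counters for packets
	 * that hit the CPU instead of being forwarded in hardware
	 */
	return 0;
}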
nla_put_failure: return -EMSGSIZE; } -static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { - [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, -}; - static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { @@ -3685,6 +3861,9 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, } } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) + size += rtnl_get_offload_stats_size(dev); + return size; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b68fa1..cbd19d250947 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2445,6 +2445,25 @@ void skb_queue_purge(struct sk_buff_head *list) EXPORT_SYMBOL(skb_queue_purge); /** + * skb_rbtree_purge - empty a skb rbtree + * @root: root of the rbtree to empty + * + * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from + * the list and one reference dropped. This function does not take + * any lock. Synchronization should be handled by the caller (e.g., TCP + * out-of-order queue is protected by the socket lock). + */ +void skb_rbtree_purge(struct rb_root *root) +{ + struct sk_buff *skb, *next; + + rbtree_postorder_for_each_entry_safe(skb, next, root, rbnode) + kfree_skb(skb); + + *root = RB_ROOT; +} + +/** * skb_queue_head - queue a buffer at the list head * @list: list to use * @newsk: buffer to queue @@ -3078,11 +3097,31 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); - /* GSO partial only requires that we trim off any excess that - * doesn't fit into an MSS sized block, so take care of that - * now. - */ - if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + if (sg && csum && (mss != GSO_BY_FRAGS)) { + if (!(features & NETIF_F_GSO_PARTIAL)) { + struct sk_buff *iter; + + if (!list_skb || + !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) + goto normal; + + /* Split the buffer at the frag_list pointer. + * This is based on the assumption that all + * buffers in the chain excluding the last + * containing the same amount of data. + */ + skb_walk_frags(head_skb, iter) { + if (skb_headlen(iter)) + goto normal; + + len -= iter->len; + } + } + + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ partial_segs = len / mss; if (partial_segs > 1) mss *= partial_segs; @@ -3090,6 +3129,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } +normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3281,21 +3321,29 @@ perform_csum_check: */ segs->prev = tail; - /* Update GSO info on first skb in partial sequence. */ if (partial_segs) { + struct sk_buff *iter; int type = skb_shinfo(head_skb)->gso_type; + unsigned short gso_size = skb_shinfo(head_skb)->gso_size; /* Update type to add partial and then remove dodgy if set */ - type |= SKB_GSO_PARTIAL; + type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; type &= ~SKB_GSO_DODGY; /* Update GSO info and prepare to start updating headers on * our way back down the stack of protocols. 
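/* Usage sketch for skb_rbtree_purge() above: TCP's out-of-order queue
 * (converted to an rbtree elsewhere in this series) is emptied under the
 * socket lock with a single call:
 *
 *	skb_rbtree_purge(&tp->out_of_order_queue);
 *
 * rbtree_postorder_for_each_entry_safe() visits children before their
 * parent, so each node can be freed without any rebalancing work.
 */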
*/ - skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; - skb_shinfo(segs)->gso_segs = partial_segs; - skb_shinfo(segs)->gso_type = type; - SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + for (iter = segs; iter; iter = iter->next) { + skb_shinfo(iter)->gso_size = gso_size; + skb_shinfo(iter)->gso_segs = partial_segs; + skb_shinfo(iter)->gso_type = type; + SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset; + } + + if (tail->len - doffset <= gso_size) + skb_shinfo(tail)->gso_size = 0; + else if (tail != segs) + skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); } /* Following permits correct backpressure, for protocols @@ -4474,17 +4522,24 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len) } EXPORT_SYMBOL(skb_ensure_writable); -/* remove VLAN header from packet and update csum accordingly. */ -static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) +/* remove VLAN header from packet and update csum accordingly. + * expects a non skb_vlan_tag_present skb with a vlan tag payload + */ +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr; - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = skb_ensure_writable(skb, VLAN_ETH_HLEN); if (unlikely(err)) - goto pull; + return err; skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); @@ -4501,12 +4556,14 @@ static int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); -pull: - __skb_pull(skb, offset); return err; } +EXPORT_SYMBOL(__skb_vlan_pop); +/* Pop a vlan tag either from hwaccel or from payload. + * Expects skb->data at mac header. + */ int skb_vlan_pop(struct sk_buff *skb) { u16 vlan_tci; @@ -4516,9 +4573,7 @@ int skb_vlan_pop(struct sk_buff *skb) if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (unlikely(!eth_type_vlan(skb->protocol))) return 0; err = __skb_vlan_pop(skb, &vlan_tci); @@ -4526,9 +4581,7 @@ int skb_vlan_pop(struct sk_buff *skb) return err; } /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) + if (likely(!eth_type_vlan(skb->protocol))) return 0; vlan_proto = skb->protocol; @@ -4541,29 +4594,30 @@ int skb_vlan_pop(struct sk_buff *skb) } EXPORT_SYMBOL(skb_vlan_pop); +/* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). + * Expects skb->data at mac header. + */ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { if (skb_vlan_tag_present(skb)) { - unsigned int offset = skb->data - skb_mac_header(skb); + int offset = skb->data - skb_mac_header(skb); int err; - /* __vlan_insert_tag expect skb->data pointing to mac header. 
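/* Worked example for the tail-segment fixup above (illustrative
 * numbers): with gso_size 1400 and a trailing segment carrying 3000
 * payload bytes past the headers, DIV_ROUND_UP(3000, 1400) = 3
 * sub-segments are recorded in gso_segs; a tail that fits inside a
 * single MSS gets gso_size = 0, i.e. it is no longer marked GSO at all.
 */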
- * So change skb->data before calling it and change back to - * original position later - */ - __skb_push(skb, offset); + if (WARN_ONCE(offset, + "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n", + offset)) { + return -EINVAL; + } + err = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); - if (err) { - __skb_pull(skb, offset); + if (err) return err; - } skb->protocol = skb->vlan_proto; skb->mac_len += VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - __skb_pull(skb, offset); } __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); return 0; diff --git a/net/core/sock.c b/net/core/sock.c index fd7b41edf1ce..038e660ef844 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1315,24 +1315,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) -{ - unsigned long nulls1, nulls2; - - nulls1 = offsetof(struct sock, __sk_common.skc_node.next); - nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); - if (nulls1 > nulls2) - swap(nulls1, nulls2); - - if (nulls1 != 0) - memset((char *)sk, 0, nulls1); - memset((char *)sk + nulls1 + sizeof(void *), 0, - nulls2 - nulls1 - sizeof(void *)); - memset((char *)sk + nulls2 + sizeof(void *), 0, - size - nulls2 - sizeof(void *)); -} -EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); - static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family) { @@ -1344,12 +1326,8 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); if (!sk) return sk; - if (priority & __GFP_ZERO) { - if (prot->clear_sk) - prot->clear_sk(sk, prot->obj_size); - else - sk_prot_clear_nulls(sk, prot->obj_size); - } + if (priority & __GFP_ZERO) + sk_prot_clear_nulls(sk, prot->obj_size); } else sk = kmalloc(prot->obj_size, priority); diff --git a/net/core/stream.c b/net/core/stream.c index 159516a11b7e..1086c8b280a8 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -43,7 +43,6 @@ void sk_stream_write_space(struct sock *sk) rcu_read_unlock(); } } -EXPORT_SYMBOL(sk_stream_write_space); /** * sk_stream_wait_connect - Wait for a socket to get into the connected state diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index ff7736f7ff42..96e47c539bee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -38,4 +38,7 @@ config NET_DSA_TAG_EDSA config NET_DSA_TAG_TRAILER bool +config NET_DSA_TAG_QCA + bool + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 8af4ded70f1c..a3380ed0e0be 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -7,3 +7,4 @@ dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o +dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7e68bc6bc853..a6902c1e2f28 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -54,6 +54,9 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #ifdef CONFIG_NET_DSA_TAG_BRCM [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops, #endif +#ifdef CONFIG_NET_DSA_TAG_QCA + [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, +#endif [DSA_TAG_PROTO_NONE] = &none_ops, }; @@ -61,27 +64,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { static DEFINE_MUTEX(dsa_switch_drivers_mutex); static LIST_HEAD(dsa_switch_drivers); -void register_switch_driver(struct dsa_switch_driver *drv) +void 
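/* Caller sketch for the reworked skb_vlan_pop()/skb_vlan_push() above:
 * skb->data must already sit at the mac header, so a tc action running
 * at ingress (data at the network header) now brackets the call itself,
 * roughly as below (checksum fixups via skb_push_rcsum()/skb_pull_rcsum()
 * omitted for brevity):
 */
if (skb_at_tc_ingress(skb))
	skb_push(skb, skb->mac_len);	/* back to the mac header */
err = skb_vlan_pop(skb);		/* or skb_vlan_push(skb, proto, tci) */
if (skb_at_tc_ingress(skb))
	skb_pull(skb, skb->mac_len);	/* mac_len reflects the pop */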
register_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&drv->list, &dsa_switch_drivers); + list_add_tail(&ops->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(register_switch_driver); -void unregister_switch_driver(struct dsa_switch_driver *drv) +void unregister_switch_driver(struct dsa_switch_ops *ops) { mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&drv->list); + list_del_init(&ops->list); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(unregister_switch_driver); -static struct dsa_switch_driver * +static struct dsa_switch_ops * dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, const char **_name, void **priv) { - struct dsa_switch_driver *ret; + struct dsa_switch_ops *ret; struct list_head *list; const char *name; @@ -90,13 +93,13 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, mutex_lock(&dsa_switch_drivers_mutex); list_for_each(list, &dsa_switch_drivers) { - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; - drv = list_entry(list, struct dsa_switch_driver, list); + ops = list_entry(list, struct dsa_switch_ops, list); - name = drv->probe(parent, host_dev, sw_addr, priv); + name = ops->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { - ret = drv; + ret = ops; break; } } @@ -117,7 +120,7 @@ static ssize_t temp1_input_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp(ds, &temp); + ret = ds->ops->get_temp(ds, &temp); if (ret < 0) return ret; @@ -131,7 +134,7 @@ static ssize_t temp1_max_show(struct device *dev, struct dsa_switch *ds = dev_get_drvdata(dev); int temp, ret; - ret = ds->drv->get_temp_limit(ds, &temp); + ret = ds->ops->get_temp_limit(ds, &temp); if (ret < 0) return ret; @@ -149,7 +152,7 @@ static ssize_t temp1_max_store(struct device *dev, if (ret < 0) return ret; - ret = ds->drv->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); + ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); if (ret < 0) return ret; @@ -164,7 +167,7 @@ static ssize_t temp1_max_alarm_show(struct device *dev, bool alarm; int ret; - ret = ds->drv->get_temp_alarm(ds, &alarm); + ret = ds->ops->get_temp_alarm(ds, &alarm); if (ret < 0) return ret; @@ -184,15 +187,15 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct dsa_switch *ds = dev_get_drvdata(dev); - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; umode_t mode = attr->mode; if (index == 1) { - if (!drv->get_temp_limit) + if (!ops->get_temp_limit) mode = 0; - else if (!drv->set_temp_limit) + else if (!ops->set_temp_limit) mode &= ~S_IWUSR; - } else if (index == 2 && !drv->get_temp_alarm) { + } else if (index == 2 && !ops->get_temp_alarm) { mode = 0; } return mode; @@ -228,8 +231,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, genphy_config_init(phydev); genphy_read_status(phydev); - if (ds->drv->adjust_link) - ds->drv->adjust_link(ds, port, phydev); + if (ds->ops->adjust_link) + ds->ops->adjust_link(ds, port, phydev); } return 0; @@ -303,7 +306,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_switch_driver *drv = ds->drv; + struct dsa_switch_ops *ops = ds->ops; struct dsa_switch_tree *dst = ds->dst; struct dsa_chip_data *cd = ds->cd; bool 
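/* Sketch of what a switch driver registers after this rename: the same
 * structure, now called dsa_switch_ops, with the tagging protocol coming
 * from a callback instead of a struct field (see the get_tag_protocol
 * hunks below). The foo_* names are illustrative.
 */
static enum dsa_tag_protocol foo_get_tag_protocol(struct dsa_switch *ds)
{
	return DSA_TAG_PROTO_QCA;
}

static struct dsa_switch_ops foo_switch_ops = {
	.get_tag_protocol	= foo_get_tag_protocol,
	/* .probe, .setup, .phy_read, .phy_write, ... */
};

/* register_switch_driver(&foo_switch_ops); */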
valid_name_found = false; @@ -354,7 +357,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - dst->tag_ops = dsa_resolve_tag_protocol(drv->tag_protocol); + enum dsa_tag_protocol tag_protocol; + + tag_protocol = ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { ret = PTR_ERR(dst->tag_ops); goto out; @@ -368,15 +374,17 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) /* * Do basic register setup. */ - ret = drv->setup(ds); + ret = ops->setup(ds); if (ret < 0) goto out; - ret = drv->set_addr(ds, dst->master_netdev->dev_addr); - if (ret < 0) - goto out; + if (ops->set_addr) { + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); + if (ret < 0) + goto out; + } - if (!ds->slave_mii_bus && drv->phy_read) { + if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); if (!ds->slave_mii_bus) { ret = -ENOMEM; @@ -423,7 +431,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * register with hardware monitoring subsystem. * Treat registration error as non-fatal and ignore it. */ - if (drv->get_temp) { + if (ops->get_temp) { const char *netname = netdev_name(dst->master_netdev); char hname[IFNAMSIZ + 1]; int i, j; @@ -454,7 +462,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { struct dsa_chip_data *cd = dst->pd->chip + index; - struct dsa_switch_driver *drv; + struct dsa_switch_ops *ops; struct dsa_switch *ds; int ret; const char *name; @@ -463,8 +471,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. */ - drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); - if (drv == NULL) { + ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); + if (!ops) { netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", index); return ERR_PTR(-EINVAL); @@ -483,7 +491,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->dst = dst; ds->index = index; ds->cd = cd; - ds->drv = drv; + ds->ops = ops; ds->priv = priv; ds->dev = parent; @@ -538,12 +546,12 @@ static void dsa_switch_destroy(struct dsa_switch *ds) ds->dsa_port_mask |= ~(1 << port); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } #ifdef CONFIG_PM_SLEEP -static int dsa_switch_suspend(struct dsa_switch *ds) +int dsa_switch_suspend(struct dsa_switch *ds) { int i, ret = 0; @@ -557,18 +565,19 @@ static int dsa_switch_suspend(struct dsa_switch *ds) return ret; } - if (ds->drv->suspend) - ret = ds->drv->suspend(ds); + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); return ret; } +EXPORT_SYMBOL_GPL(dsa_switch_suspend); -static int dsa_switch_resume(struct dsa_switch *ds) +int dsa_switch_resume(struct dsa_switch *ds) { int i, ret = 0; - if (ds->drv->resume) - ret = ds->drv->resume(ds); + if (ds->ops->resume) + ret = ds->ops->resume(ds); if (ret) return ret; @@ -585,6 +594,7 @@ static int dsa_switch_resume(struct dsa_switch *ds) return 0; } +EXPORT_SYMBOL_GPL(dsa_switch_resume); #endif /* platform driver init and cleanup *****************************************/ @@ -1086,7 +1096,6 @@ static int dsa_resume(struct device *d) static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); static const struct of_device_id dsa_of_match_table[] = { - { .compatible = "brcm,bcm7445-switch-v4.0" 
}, { .compatible = "marvell,dsa", }, {} }; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f30bad9678f0..f8a7d9aab437 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -294,25 +294,23 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) int err; /* Initialize ds->phys_mii_mask before registering the slave MDIO bus - * driver and before drv->setup() has run, since the switch drivers and + * driver and before ops->setup() has run, since the switch drivers and * the slave MDIO bus driver rely on these values for probing PHY * devices or not */ ds->phys_mii_mask = ds->enabled_port_mask; - err = ds->drv->setup(ds); + err = ds->ops->setup(ds); if (err < 0) return err; - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; - - err = ds->drv->set_addr(ds, dst->master_netdev->dev_addr); - if (err < 0) - return err; + if (ds->ops->set_addr) { + err = ds->ops->set_addr(ds, dst->master_netdev->dev_addr); + if (err < 0) + return err; + } - if (!ds->slave_mii_bus && ds->drv->phy_read) { + if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) return -ENOMEM; @@ -374,7 +372,7 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct dsa_switch *ds) dsa_user_port_unapply(port, index, ds); } - if (ds->slave_mii_bus && ds->drv->phy_read) + if (ds->slave_mii_bus && ds->ops->phy_read) mdiobus_unregister(ds->slave_mii_bus); } @@ -443,6 +441,7 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -465,7 +464,8 @@ static int dsa_cpu_parse(struct device_node *port, u32 index, dst->cpu_port = index; } - dst->tag_ops = dsa_resolve_tag_protocol(ds->drv->tag_protocol); + tag_protocol = ds->ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); return PTR_ERR(dst->tag_ops); @@ -541,7 +541,7 @@ static int dsa_parse_ports_dn(struct device_node *ports, struct dsa_switch *ds) ds->ports[reg].dn = port; - /* Initialize enabled_port_mask now for drv->setup() + /* Initialize enabled_port_mask now for ops->setup() * to have access to a correct value, just like what * net/dsa/dsa.c::dsa_switch_setup_one does. 
*/ diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 00077a9c97f4..6cfd7388834e 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -81,5 +81,7 @@ extern const struct dsa_device_ops trailer_netdev_ops; /* tag_brcm.c */ extern const struct dsa_device_ops brcm_netdev_ops; +/* tag_qca.c */ +extern const struct dsa_device_ops qca_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fc9196745225..6b1282c006b1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -28,7 +28,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_read(ds, addr, reg); + return ds->ops->phy_read(ds, addr, reg); return 0xffff; } @@ -38,7 +38,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) - return ds->drv->phy_write(ds, addr, reg, val); + return ds->ops->phy_write(ds, addr, reg, val); return 0; } @@ -69,6 +69,30 @@ static inline bool dsa_port_is_bridged(struct dsa_slave_priv *p) return !!p->bridge_dev; } +static void dsa_port_set_stp_state(struct dsa_switch *ds, int port, u8 state) +{ + struct dsa_port *dp = &ds->ports[port]; + + if (ds->ops->port_stp_state_set) + ds->ops->port_stp_state_set(ds, port, state); + + if (ds->ops->port_fast_age) { + /* Fast age FDB entries or flush appropriate forwarding database + * for the given port, if we are moving it from Learning or + * Forwarding state, to Disabled or Blocking or Listening state. + */ + + if ((dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING) && + (state == BR_STATE_DISABLED || + state == BR_STATE_BLOCKING || + state == BR_STATE_LISTENING)) + ds->ops->port_fast_age(ds, port); + } + + dp->stp_state = state; +} + static int dsa_slave_open(struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -98,14 +122,13 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - if (ds->drv->port_enable) { - err = ds->drv->port_enable(ds, p->port, p->phy); + if (ds->ops->port_enable) { + err = ds->ops->port_enable(ds, p->port, p->phy); if (err) goto clear_promisc; } - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, stp_state); + dsa_port_set_stp_state(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -144,11 +167,10 @@ static int dsa_slave_close(struct net_device *dev) if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) dev_uc_del(master, dev->dev_addr); - if (ds->drv->port_disable) - ds->drv->port_disable(ds, p->port, p->phy); + if (ds->ops->port_disable) + ds->ops->port_disable(ds, p->port, p->phy); - if (ds->drv->port_stp_state_set) - ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); + dsa_port_set_stp_state(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -209,13 +231,13 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) + if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add) return -EOPNOTSUPP; - return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); + return ds->ops->port_vlan_prepare(ds, p->port, vlan, trans); } - ds->drv->port_vlan_add(ds, p->port, vlan, trans); + ds->ops->port_vlan_add(ds, p->port, vlan, trans); return 0; } @@ -226,10 +248,10 @@ static int dsa_slave_port_vlan_del(struct net_device *dev, struct dsa_slave_priv *p = 
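/* The fast-age rule in dsa_port_set_stp_state() above, as a matrix: the
 * port's forwarding database is flushed only when leaving a state in
 * which addresses were being learned for a state in which they are not.
 *
 *	from \ to     DISABLED  BLOCKING  LISTENING  LEARNING  FORWARDING
 *	LEARNING        age       age       age         -          -
 *	FORWARDING      age       age       age         -          -
 *	other states     -         -         -          -          -
 */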
netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (!ds->drv->port_vlan_del) + if (!ds->ops->port_vlan_del) return -EOPNOTSUPP; - return ds->drv->port_vlan_del(ds, p->port, vlan); + return ds->ops->port_vlan_del(ds, p->port, vlan); } static int dsa_slave_port_vlan_dump(struct net_device *dev, @@ -239,8 +261,8 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_vlan_dump) - return ds->drv->port_vlan_dump(ds, p->port, vlan, cb); + if (ds->ops->port_vlan_dump) + return ds->ops->port_vlan_dump(ds, p->port, vlan, cb); return -EOPNOTSUPP; } @@ -253,13 +275,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) { - if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + if (!ds->ops->port_fdb_prepare || !ds->ops->port_fdb_add) return -EOPNOTSUPP; - return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + return ds->ops->port_fdb_prepare(ds, p->port, fdb, trans); } - ds->drv->port_fdb_add(ds, p->port, fdb, trans); + ds->ops->port_fdb_add(ds, p->port, fdb, trans); return 0; } @@ -271,8 +293,8 @@ static int dsa_slave_port_fdb_del(struct net_device *dev, struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, fdb); + if (ds->ops->port_fdb_del) + ret = ds->ops->port_fdb_del(ds, p->port, fdb); return ret; } @@ -284,8 +306,52 @@ static int dsa_slave_port_fdb_dump(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->drv->port_fdb_dump) - return ds->drv->port_fdb_dump(ds, p->port, fdb, cb); + if (ds->ops->port_fdb_dump) + return ds->ops->port_fdb_dump(ds, p->port, fdb, cb); + + return -EOPNOTSUPP; +} + +static int dsa_slave_port_mdb_add(struct net_device *dev, + const struct switchdev_obj_port_mdb *mdb, + struct switchdev_trans *trans) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add) + return -EOPNOTSUPP; + + return ds->ops->port_mdb_prepare(ds, p->port, mdb, trans); + } + + ds->ops->port_mdb_add(ds, p->port, mdb, trans); + + return 0; +} + +static int dsa_slave_port_mdb_del(struct net_device *dev, + const struct switchdev_obj_port_mdb *mdb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->ops->port_mdb_del) + return ds->ops->port_mdb_del(ds, p->port, mdb); + + return -EOPNOTSUPP; +} + +static int dsa_slave_port_mdb_dump(struct net_device *dev, + struct switchdev_obj_port_mdb *mdb, + switchdev_obj_dump_cb_t *cb) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (ds->ops->port_mdb_dump) + return ds->ops->port_mdb_dump(ds, p->port, mdb, cb); return -EOPNOTSUPP; } @@ -308,9 +374,9 @@ static int dsa_slave_stp_state_set(struct net_device *dev, struct dsa_switch *ds = p->parent; if (switchdev_trans_ph_prepare(trans)) - return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP; + return ds->ops->port_stp_state_set ? 
 0 : -EOPNOTSUPP;
-	ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state);
+	dsa_port_set_stp_state(ds, p->port, attr->u.stp_state);
 	return 0;
 }
@@ -326,8 +392,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev,
 	if (switchdev_trans_ph_prepare(trans))
 		return 0;
-	if (ds->drv->port_vlan_filtering)
-		return ds->drv->port_vlan_filtering(ds, p->port,
+	if (ds->ops->port_vlan_filtering)
+		return ds->ops->port_vlan_filtering(ds, p->port,
 						    attr->u.vlan_filtering);
 	return 0;
@@ -365,8 +431,8 @@ static int dsa_slave_ageing_time(struct net_device *dev,
 	ds->ports[p->port].ageing_time = ageing_time;
 	ageing_time = dsa_fastest_ageing_time(ds, ageing_time);
-	if (ds->drv->set_ageing_time)
-		return ds->drv->set_ageing_time(ds, ageing_time);
+	if (ds->ops->set_ageing_time)
+		return ds->ops->set_ageing_time(ds, ageing_time);
 	return 0;
 }
@@ -412,6 +478,10 @@ static int dsa_slave_port_obj_add(struct net_device *dev,
 					     SWITCHDEV_OBJ_PORT_FDB(obj),
 					     trans);
 		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = dsa_slave_port_mdb_add(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+					     trans);
+		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		err = dsa_slave_port_vlan_add(dev,
 					      SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -435,6 +505,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev,
 		err = dsa_slave_port_fdb_del(dev,
 					     SWITCHDEV_OBJ_PORT_FDB(obj));
 		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = dsa_slave_port_mdb_del(dev, SWITCHDEV_OBJ_PORT_MDB(obj));
+		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		err = dsa_slave_port_vlan_del(dev,
 					      SWITCHDEV_OBJ_PORT_VLAN(obj));
@@ -459,6 +532,10 @@ static int dsa_slave_port_obj_dump(struct net_device *dev,
 					      SWITCHDEV_OBJ_PORT_FDB(obj),
 					      cb);
 		break;
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		err = dsa_slave_port_mdb_dump(dev, SWITCHDEV_OBJ_PORT_MDB(obj),
+					      cb);
+		break;
 	case SWITCHDEV_OBJ_ID_PORT_VLAN:
 		err = dsa_slave_port_vlan_dump(dev,
 					       SWITCHDEV_OBJ_PORT_VLAN(obj),
@@ -481,8 +558,8 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
 	p->bridge_dev = br;
-	if (ds->drv->port_bridge_join)
-		ret = ds->drv->port_bridge_join(ds, p->port, br);
+	if (ds->ops->port_bridge_join)
+		ret = ds->ops->port_bridge_join(ds, p->port, br);
 	return ret == -EOPNOTSUPP ? 0 : ret;
 }
@@ -493,16 +570,15 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev)
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->port_bridge_leave)
-		ds->drv->port_bridge_leave(ds, p->port);
+	if (ds->ops->port_bridge_leave)
+		ds->ops->port_bridge_leave(ds, p->port);
 	p->bridge_dev = NULL;
 	/* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
 	 * so allow it to be in BR_STATE_FORWARDING to be kept functional
 	 */
-	if (ds->drv->port_stp_state_set)
-		ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING);
+	dsa_port_set_stp_state(ds, p->port, BR_STATE_FORWARDING);
 }
 static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -605,8 +681,8 @@ static int dsa_slave_get_regs_len(struct net_device *dev)
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->get_regs_len)
-		return ds->drv->get_regs_len(ds, p->port);
+	if (ds->ops->get_regs_len)
+		return ds->ops->get_regs_len(ds, p->port);
 	return -EOPNOTSUPP;
 }
@@ -617,8 +693,8 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->get_regs)
-		ds->drv->get_regs(ds, p->port, regs, _p);
+	if (ds->ops->get_regs)
+		ds->ops->get_regs(ds, p->port, regs, _p);
 }
 static int dsa_slave_nway_reset(struct net_device *dev)
@@ -651,8 +727,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev)
 	if (ds->cd && ds->cd->eeprom_len)
 		return ds->cd->eeprom_len;
-	if (ds->drv->get_eeprom_len)
-		return ds->drv->get_eeprom_len(ds);
+	if (ds->ops->get_eeprom_len)
+		return ds->ops->get_eeprom_len(ds);
 	return 0;
 }
@@ -663,8 +739,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->get_eeprom)
-		return ds->drv->get_eeprom(ds, eeprom, data);
+	if (ds->ops->get_eeprom)
+		return ds->ops->get_eeprom(ds, eeprom, data);
 	return -EOPNOTSUPP;
 }
@@ -675,8 +751,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev,
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->set_eeprom)
-		return ds->drv->set_eeprom(ds, eeprom, data);
+	if (ds->ops->set_eeprom)
+		return ds->ops->set_eeprom(ds, eeprom, data);
 	return -EOPNOTSUPP;
 }
@@ -694,8 +770,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
 		strncpy(data + len, "tx_bytes", len);
 		strncpy(data + 2 * len, "rx_packets", len);
 		strncpy(data + 3 * len, "rx_bytes", len);
-		if (ds->drv->get_strings != NULL)
-			ds->drv->get_strings(ds, p->port, data + 4 * len);
+		if (ds->ops->get_strings)
+			ds->ops->get_strings(ds, p->port, data + 4 * len);
 	}
 }
@@ -714,8 +790,8 @@ static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev,
 		dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data);
 	}
-	if (ds->drv->get_ethtool_stats)
-		ds->drv->get_ethtool_stats(ds, cpu_port, data + count);
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, cpu_port, data + count);
 }
 static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
@@ -727,8 +803,8 @@ static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset)
 	if (dst->master_ethtool_ops.get_sset_count)
 		count += dst->master_ethtool_ops.get_sset_count(dev, sset);
-	if (sset == ETH_SS_STATS && ds->drv->get_sset_count)
-		count += ds->drv->get_sset_count(ds);
+	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds);
 	return count;
 }
@@ -755,14 +831,14 @@ static void dsa_cpu_port_get_strings(struct net_device *dev,
 		dst->master_ethtool_ops.get_strings(dev, stringset, data);
 	}
-	if (stringset == ETH_SS_STATS && ds->drv->get_strings) {
+	if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
 		ndata = data + mcount * len;
 		/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
 		 * the output after to prepend our CPU port prefix we
 		 * constructed earlier
 		 */
-		ds->drv->get_strings(ds, cpu_port, ndata);
-		count = ds->drv->get_sset_count(ds);
+		ds->ops->get_strings(ds, cpu_port, ndata);
+		count = ds->ops->get_sset_count(ds);
 		for (i = 0; i < count; i++) {
 			memmove(ndata + (i * len + sizeof(pfx)),
 				ndata + i * len, len - sizeof(pfx));
@@ -782,8 +858,8 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev,
 	data[1] = dev->stats.tx_bytes;
 	data[2] = dev->stats.rx_packets;
 	data[3] = dev->stats.rx_bytes;
-	if (ds->drv->get_ethtool_stats != NULL)
-		ds->drv->get_ethtool_stats(ds, p->port, data + 4);
+	if (ds->ops->get_ethtool_stats)
+		ds->ops->get_ethtool_stats(ds, p->port, data + 4);
 }
 static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
@@ -795,8 +871,8 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 	int count;
 	count = 4;
-	if (ds->drv->get_sset_count != NULL)
-		count += ds->drv->get_sset_count(ds);
+	if (ds->ops->get_sset_count)
+		count += ds->ops->get_sset_count(ds);
 	return count;
 }
@@ -809,8 +885,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 	struct dsa_slave_priv *p = netdev_priv(dev);
 	struct dsa_switch *ds = p->parent;
-	if (ds->drv->get_wol)
-		ds->drv->get_wol(ds, p->port, w);
+	if (ds->ops->get_wol)
+		ds->ops->get_wol(ds, p->port, w);
 }
 static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
@@ -819,8 +895,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
 	struct dsa_switch *ds = p->parent;
 	int ret = -EOPNOTSUPP;
-	if (ds->drv->set_wol)
-		ret = ds->drv->set_wol(ds, p->port, w);
+	if (ds->ops->set_wol)
+		ret = ds->ops->set_wol(ds, p->port, w);
 	return ret;
 }
@@ -831,10 +907,10 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
 	struct dsa_switch *ds = p->parent;
 	int ret;
-	if (!ds->drv->set_eee)
+	if (!ds->ops->set_eee)
 		return -EOPNOTSUPP;
-	ret = ds->drv->set_eee(ds, p->port, p->phy, e);
+	ret = ds->ops->set_eee(ds, p->port, p->phy, e);
 	if (ret)
 		return ret;
@@ -850,10 +926,10 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
 	struct dsa_switch *ds = p->parent;
 	int ret;
-	if (!ds->drv->get_eee)
+	if (!ds->ops->get_eee)
 		return -EOPNOTSUPP;
-	ret = ds->drv->get_eee(ds, p->port, e);
+	ret = ds->ops->get_eee(ds, p->port, e);
 	if (ret)
 		return ret;
@@ -988,8 +1064,8 @@ static void dsa_slave_adjust_link(struct net_device *dev)
 		p->old_pause = p->phy->pause;
 	}
-	if (ds->drv->adjust_link && status_changed)
-		ds->drv->adjust_link(ds, p->port, p->phy);
+	if (ds->ops->adjust_link && status_changed)
+		ds->ops->adjust_link(ds, p->port, p->phy);
 	if (status_changed)
 		phy_print_status(p->phy);
@@ -1004,8 +1080,8 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
 	if (dev) {
 		p = netdev_priv(dev);
 		ds = p->parent;
-		if (ds->drv->fixed_link_update)
-			ds->drv->fixed_link_update(ds, p->port, status);
+		if (ds->ops->fixed_link_update)
+			ds->ops->fixed_link_update(ds, p->port, status);
 	}
 	return 0;
@@ -1062,8 +1138,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
 		phy_dn = port_dn;
 	}
-	if (ds->drv->get_phy_flags)
-		phy_flags = ds->drv->get_phy_flags(ds, p->port);
+	if (ds->ops->get_phy_flags)
+		phy_flags = ds->ops->get_phy_flags(ds, p->port);
 	if (phy_dn) {
 		int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn);
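The hunks above are part of the tree-wide rename of the DSA callback table from ds->drv to ds->ops. A minimal sketch of how a switch driver populates that table under the new name (the example_* functions are illustrative placeholders, not code from this merge; every callback is optional, which is why the core guards each call):

#include <net/dsa.h>

/* Sketch only: the driver fills one dsa_switch_ops table and the DSA
 * core invokes it through ds->ops->... after this rename.
 */
static int example_get_sset_count(struct dsa_switch *ds)
{
	return 3;	/* number of per-port hardware counters exported */
}

static struct dsa_switch_ops example_switch_ops = {
	.get_sset_count	= example_get_sset_count,
	/* .get_strings, .get_ethtool_stats, .adjust_link, ... */
};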
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
new file mode 100644
index 000000000000..0c90cacee7aa
--- /dev/null
+++ b/net/dsa/tag_qca.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/etherdevice.h>
+#include "dsa_priv.h"
+
+#define QCA_HDR_LEN	2
+#define QCA_HDR_VERSION	0x2
+
+#define QCA_HDR_RECV_VERSION_MASK	GENMASK(15, 14)
+#define QCA_HDR_RECV_VERSION_S		14
+#define QCA_HDR_RECV_PRIORITY_MASK	GENMASK(13, 11)
+#define QCA_HDR_RECV_PRIORITY_S		11
+#define QCA_HDR_RECV_TYPE_MASK		GENMASK(10, 6)
+#define QCA_HDR_RECV_TYPE_S		6
+#define QCA_HDR_RECV_FRAME_IS_TAGGED	BIT(3)
+#define QCA_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
+
+#define QCA_HDR_XMIT_VERSION_MASK	GENMASK(15, 14)
+#define QCA_HDR_XMIT_VERSION_S		14
+#define QCA_HDR_XMIT_PRIORITY_MASK	GENMASK(13, 11)
+#define QCA_HDR_XMIT_PRIORITY_S		11
+#define QCA_HDR_XMIT_CONTROL_MASK	GENMASK(10, 8)
+#define QCA_HDR_XMIT_CONTROL_S		8
+#define QCA_HDR_XMIT_FROM_CPU		BIT(7)
+#define QCA_HDR_XMIT_DP_BIT_MASK	GENMASK(6, 0)
+
+static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	u16 *phdr, hdr;
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	if (skb_cow_head(skb, 0) < 0)
+		goto out_free;
+
+	skb_push(skb, QCA_HDR_LEN);
+
+	memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
+	phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
+
+	/* Set the version field, and set destination port information */
+	hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
+		QCA_HDR_XMIT_FROM_CPU |
+		BIT(p->port);
+
+	*phdr = htons(hdr);
+
+	return skb;
+
+out_free:
+	kfree_skb(skb);
+	return NULL;
+}
+
+static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct dsa_switch_tree *dst = dev->dsa_ptr;
+	struct dsa_switch *ds;
+	u8 ver;
+	int port;
+	__be16 *phdr, hdr;
+
+	if (unlikely(!dst))
+		goto out_drop;
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (!skb)
+		goto out;
+
+	if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
+		goto out_drop;
+
+	/* The QCA header is added by the switch between src addr and Ethertype
+	 * At this point, skb->data points to ethertype so header should be
+	 * right before
+	 */
+	phdr = (__be16 *)(skb->data - 2);
+	hdr = ntohs(*phdr);
+
+	/* Make sure the version is correct */
+	ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+	if (unlikely(ver != QCA_HDR_VERSION))
+		goto out_drop;
+
+	/* Remove QCA tag and recalculate checksum */
+	skb_pull_rcsum(skb, QCA_HDR_LEN);
+	memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
+		ETH_HLEN - QCA_HDR_LEN);
+
+	/* This protocol doesn't support cascading multiple switches so it's
+	 * safe to assume the switch is first in the tree
+	 */
+	ds = dst->ds[0];
+	if (!ds)
+		goto out_drop;
+
+	/* Get source port information */
+	port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+	if (!ds->ports[port].netdev)
+		goto out_drop;
+
+	/* Update skb & forward the frame accordingly */
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = ds->ports[port].netdev;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	skb->dev->stats.rx_packets++;
+	skb->dev->stats.rx_bytes += skb->len;
+
+	netif_receive_skb(skb);
+
+	return 0;
+
out_drop:
+	kfree_skb(skb);
out:
+	return 0;
+}
+
+const struct dsa_device_ops qca_netdev_ops = {
+	.xmit	= qca_tag_xmit,
+	.rcv	= qca_tag_rcv,
+};
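For reference, the 2-byte tag layout encoded by the QCA_HDR_RECV_* masks above can be exercised in isolation. This stand-alone sketch (not part of the patch; the sample header value is made up) decodes the version, tagged bit, and source port from a received header:

#include <stdint.h>
#include <stdio.h>

/* Field layout taken from the QCA_HDR_RECV_* masks: version in bits
 * 15-14, "frame is tagged" in bit 3, source port in bits 2-0.
 */
int main(void)
{
	uint16_t hdr = 0x800C;			/* version 2, tagged, port 4 */
	uint8_t ver  = (hdr >> 14) & 0x3;	/* QCA_HDR_RECV_VERSION */
	int tagged   = (hdr >> 3) & 0x1;	/* QCA_HDR_RECV_FRAME_IS_TAGGED */
	int port     = hdr & 0x7;		/* QCA_HDR_RECV_SOURCE_PORT */

	printf("version=%u tagged=%d source_port=%d\n", ver, tagged, port);
	return 0;
}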
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
 	D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
 	delay gradients." In Networking 2011. Preprint:
 	http://goo.gl/No3vdg
+config TCP_CONG_BBR
+	tristate "BBR TCP"
+	default n
+	---help---
+
+	BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+	maximize network utilization and minimize queues. It builds an explicit
+	model of the bottleneck delivery rate and path round-trip propagation
+	delay. It tolerates packet loss and delay unrelated to congestion. It
+	can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
+	coexist with flows that use loss-based congestion control, and can
+	operate with shallow buffers, deep buffers, bufferbloat, policers, or
+	AQM schemes that do not provide a delay signal. It requires the fq
+	("Fair Queue") pacing packet scheduler.
+
 choice
 	prompt "Default TCP congestion control"
 	default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
 	config DEFAULT_CDG
 		bool "CDG" if TCP_CONG_CDG=y
+	config DEFAULT_BBR
+		bool "BBR" if TCP_CONG_BBR=y
+
 	config DEFAULT_RENO
 		bool "Reno"
 endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
 	 inet_timewait_sock.o inet_connection_sock.o \
 	 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	 tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
-	 tcp_recovery.o \
+	 tcp_rate.o tcp_recovery.o \
 	 tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	 udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	 fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
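Once built, BBR is selected like any other congestion control module. A minimal userspace sketch (assuming a kernel built with TCP_CONG_BBR and the fq qdisc installed on the egress interface, as the help text requires):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	/* Ask the kernel to use BBR on this socket; this fails if
	 * tcp_bbr is neither built in nor loaded as a module. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bbr",
		       strlen("bbr")) < 0)
		perror("setsockopt(TCP_CONGESTION)");

	close(fd);
	return 0;
}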
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55513e654d79..1effc986739e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -211,24 +211,19 @@ int inet_listen(struct socket *sock, int backlog)
 	 * we can only allow the backlog to be adjusted.
 	 */
 	if (old_state != TCP_LISTEN) {
-		/* Check special setups for testing purpose to enable TFO w/o
-		 * requiring TCP_FASTOPEN sockopt.
+		/* Enable TFO w/o requiring TCP_FASTOPEN socket option.
 		 * Note that only TCP sockets (SOCK_STREAM) will reach here.
-		 * Also fastopenq may already been allocated because this
-		 * socket was in TCP_LISTEN state previously but was
-		 * shutdown() (rather than close()).
+		 * Also the fastopen backlog may already have been set via
+		 * the option because the socket was in TCP_LISTEN state
+		 * previously but was shutdown() rather than close().
 		 */
-		if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 &&
+		if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
+		    (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
 		    !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
-			if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
-				fastopen_queue_tune(sk, backlog);
-			else if ((sysctl_tcp_fastopen &
-				  TFO_SERVER_WO_SOCKOPT2) != 0)
-				fastopen_queue_tune(sk,
-				    ((uint)sysctl_tcp_fastopen) >> 16);
-
+			fastopen_queue_tune(sk, backlog);
 			tcp_fastopen_init_key_once(true);
 		}
+
 		err = inet_csk_listen_start(sk, backlog);
 		if (err)
 			goto out;
@@ -921,6 +916,8 @@ const struct proto_ops inet_stream_ops = {
 	.mmap		   = sock_no_mmap,
 	.sendpage	   = inet_sendpage,
 	.splice_read	   = tcp_splice_read,
+	.read_sock	   = tcp_read_sock,
+	.peek_len	   = tcp_peek_len,
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_sock_common_setsockopt,
 	.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1195,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
 struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 				 netdev_features_t features)
 {
-	bool udpfrag = false, fixedid = false, encap;
+	bool udpfrag = false, fixedid = false, gso_partial, encap;
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
 	const struct net_offload *ops;
 	unsigned int offset = 0;
@@ -1248,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	if (IS_ERR_OR_NULL(segs))
 		goto out;
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	skb = segs;
 	do {
 		iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1262,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 			iph->id = htons(id);
 			id += skb_shinfo(skb)->gso_segs;
 		}
-		tot_len = skb_shinfo(skb)->gso_size +
-			  SKB_GSO_CB(skb)->data_offset +
-			  skb->head - (unsigned char *)iph;
+
+		if (gso_partial)
+			tot_len = skb_shinfo(skb)->gso_size +
+				  SKB_GSO_CB(skb)->data_offset +
+				  skb->head - (unsigned char *)iph;
+		else
+			tot_len = skb->len - nhoff;
 	} else {
 		if (!fixedid)
 			iph->id = htons(id++);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1b25daf8c7f1..c3b80478226e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -93,9 +93,6 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
 		return NULL;
 	switch (id) {
-	case RT_TABLE_LOCAL:
-		rcu_assign_pointer(net->ipv4.fib_local, tb);
-		break;
 	case RT_TABLE_MAIN:
 		rcu_assign_pointer(net->ipv4.fib_main, tb);
 		break;
@@ -137,9 +134,6 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
 {
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	switch (new->tb_id) {
-	case RT_TABLE_LOCAL:
-		rcu_assign_pointer(net->ipv4.fib_local, new);
-		break;
 	case RT_TABLE_MAIN:
 		rcu_assign_pointer(net->ipv4.fib_main, new);
 		break;
@@ -188,26 +182,13 @@ static void fib_flush(struct net *net)
 		struct fib_table *tb;
 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
-			flushed += fib_table_flush(tb);
+			flushed += fib_table_flush(net, tb);
 	}
 	if (flushed)
 		rt_cache_flush(net);
 }
-void fib_flush_external(struct net *net)
-{
-	struct fib_table *tb;
-	struct hlist_head *head;
-	unsigned int h;
-
-	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
-		head = &net->ipv4.fib_table_hash[h];
-		hlist_for_each_entry(tb, head, tb_hlist)
-			fib_table_flush_external(tb);
-	}
-}
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
@@ -596,13 +577,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (cmd == SIOCDELRT) {
 			tb = fib_get_table(net, cfg.fc_table);
 			if (tb)
-				err = fib_table_delete(tb, &cfg);
+				err = fib_table_delete(net, tb, &cfg);
 			else
 				err = -ESRCH;
 		} else {
 			tb = fib_new_table(net, cfg.fc_table);
 			if (tb)
-				err = fib_table_insert(tb, &cfg);
+				err = fib_table_insert(net, tb, &cfg);
 			else
 				err = -ENOBUFS;
 		}
@@ -725,7 +706,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		goto errout;
 	}
-	err = fib_table_delete(tb, &cfg);
+	err = fib_table_delete(net, tb, &cfg);
 errout:
 	return err;
 }
@@ -747,7 +728,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
 		goto errout;
 	}
-	err = fib_table_insert(tb, &cfg);
+	err = fib_table_insert(net, tb, &cfg);
 errout:
 	return err;
 }
@@ -834,9 +815,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 	cfg.fc_scope = RT_SCOPE_HOST;
 	if (cmd == RTM_NEWROUTE)
-		fib_table_insert(tb, &cfg);
+		fib_table_insert(net, tb, &cfg);
 	else
-		fib_table_delete(tb, &cfg);
+		fib_table_delete(net, tb, &cfg);
 }
 void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -1250,7 +1231,6 @@ static void ip_fib_net_exit(struct net *net)
 	rtnl_lock();
 #ifdef CONFIG_IP_MULTIPLE_TABLES
-	RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
 	RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
 	RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
 #endif
@@ -1261,7 +1241,7 @@ static void ip_fib_net_exit(struct net *net)
 		hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
 			hlist_del(&tb->tb_hlist);
-			fib_table_flush(tb);
+			fib_table_flush(net, tb);
 			fib_free_table(tb);
 		}
 	}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..2e50062f642d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
 	};
 	int err;
+	/* update flow if oif or iif point to device enslaved to l3mdev */
+	l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
 	err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
 #ifdef CONFIG_IP_ROUTE_CLASSID
 	if (arg.rule)
@@ -161,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net)
 	return NULL;
 }
+static int call_fib_rule_notifiers(struct net *net,
+				   enum fib_event_type event_type)
+{
+	struct fib_notifier_info info;
+
+	return call_fib_notifiers(net, event_type, &info);
+}
+
 static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
 	FRA_GENERIC_POLICY,
 	[FRA_FLOW]	= { .type = NLA_U32 },
@@ -217,7 +228,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 		rule4->tos = frh->tos;
 	net->ipv4.fib_has_custom_rules = true;
-	fib_flush_external(rule->fr_net);
+	call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
 	err = 0;
 errout:
@@ -239,7 +250,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
 	net->ipv4.fib_has_custom_rules = true;
-	fib_flush_external(rule->fr_net);
+	call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
 errout:
 	return err;
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e9f56225e53f..388d3e21629b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1580,7 +1580,8 @@ static bool fib_good_nh(const struct fib_nh *nh)
 		rcu_read_lock_bh();
-		n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
+		n = __ipv4_neigh_lookup_noref(nh->nh_dev,
+					      (__force u32)nh->nh_gw);
 		if (n)
 			state = n->nud_state;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..31cef3602585 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/vmalloc.h>
+#include <linux/notifier.h>
 #include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
@@ -80,10 +81,47 @@
 #include <net/tcp.h>
 #include <net/sock.h>
 #include <net/ip_fib.h>
-#include <net/switchdev.h>
 #include <trace/events/fib.h>
 #include "fib_lookup.h"
+static BLOCKING_NOTIFIER_HEAD(fib_chain);
+
+int register_fib_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&fib_chain, nb);
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+		       struct fib_notifier_info *info)
+{
+	info->net = net;
+	return blocking_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+				    enum fib_event_type event_type, u32 dst,
+				    int dst_len, struct fib_info *fi,
+				    u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+	struct fib_entry_notifier_info info = {
+		.dst = dst,
+		.dst_len = dst_len,
+		.fi = fi,
+		.tos = tos,
+		.type = type,
+		.tb_id = tb_id,
+		.nlflags = nlflags,
+	};
+	return call_fib_notifiers(net, event_type, &info.info);
+}
+
 #define MAX_STAT_DEPTH 32
 #define KEYLENGTH	(8*sizeof(t_key))
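With the switchdev-specific calls replaced by this notifier chain, a forwarding-offload driver subscribes to route changes instead of being called directly. A rough sketch of a consumer, using only the symbols introduced above (the example_* names are hypothetical):

#include <linux/notifier.h>
#include <net/ip_fib.h>

/* Sketch: how an offloading driver might consume the new FIB
 * notifications in place of the removed switchdev_fib_ipv4_*() calls.
 */
static int example_fib_event(struct notifier_block *nb,
			     unsigned long event, void *ptr)
{
	struct fib_entry_notifier_info *fen_info = ptr;

	switch (event) {
	case FIB_EVENT_ENTRY_ADD:
		/* program fen_info->dst / fen_info->dst_len into hardware */
		break;
	case FIB_EVENT_ENTRY_DEL:
		/* remove the corresponding route from hardware */
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_fib_nb = {
	.notifier_call = example_fib_event,
};

/* in the driver's init path: register_fib_notifier(&example_fib_nb); */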
@@ -1076,12 +1114,13 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 }
 /* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+		     struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct fib_alias *fa, *new_fa;
 	struct key_vector *l, *tp;
-	unsigned int nlflags = 0;
+	u16 nlflags = NLM_F_EXCL;
 	struct fib_info *fi;
 	u8 plen = cfg->fc_dst_len;
 	u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1165,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			if (cfg->fc_nlflags & NLM_F_EXCL)
 				goto out;
+			nlflags &= ~NLM_F_EXCL;
+
 			/* We have 2 goals:
 			 * 1. Find exact match for type, scope, fib_info to avoid
 			 * duplicate routes
@@ -1151,6 +1192,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 				struct fib_info *fi_drop;
 				u8 state;
+				nlflags |= NLM_F_REPLACE;
 				fa = fa_first;
 				if (fa_match) {
 					if (fa == fa_match)
@@ -1172,17 +1214,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 				new_fa->tb_id = tb->tb_id;
 				new_fa->fa_default = -1;
-				err = switchdev_fib_ipv4_add(key, plen, fi,
-							     new_fa->fa_tos,
-							     cfg->fc_type,
-							     cfg->fc_nlflags,
-							     tb->tb_id);
-				if (err) {
-					switchdev_fib_ipv4_abort(fi);
-					kmem_cache_free(fn_alias_kmem, new_fa);
-					goto out;
-				}
-
 				hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 				alias_free_mem_rcu(fa);
@@ -1190,8 +1221,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 				fib_release_info(fi_drop);
 				if (state & FA_S_ACCESSED)
 					rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+				call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+							 key, plen, fi,
+							 new_fa->fa_tos, cfg->fc_type,
+							 tb->tb_id, cfg->fc_nlflags);
 				rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
-					  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+					  tb->tb_id, &cfg->fc_nlinfo, nlflags);
 				goto succeeded;
 			}
@@ -1203,7 +1239,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 				goto out;
 			if (cfg->fc_nlflags & NLM_F_APPEND)
-				nlflags = NLM_F_APPEND;
+				nlflags |= NLM_F_APPEND;
 			else
 				fa = fa_first;
 		}
@@ -1211,6 +1247,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	if (!(cfg->fc_nlflags & NLM_F_CREATE))
 		goto out;
+	nlflags |= NLM_F_CREATE;
 	err = -ENOBUFS;
 	new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
 	if (!new_fa)
@@ -1224,30 +1261,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	new_fa->tb_id = tb->tb_id;
 	new_fa->fa_default = -1;
-	/* (Optionally) offload fib entry to switch hardware. */
-	err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
-				     cfg->fc_nlflags, tb->tb_id);
-	if (err) {
-		switchdev_fib_ipv4_abort(fi);
-		goto out_free_new_fa;
-	}
-
 	/* Insert new entry to the list. */
 	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
 	if (err)
-		goto out_sw_fib_del;
+		goto out_free_new_fa;
 	if (!plen)
 		tb->tb_num_default++;
 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
+	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+				 cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
 		  &cfg->fc_nlinfo, nlflags);
 succeeded:
 	return 0;
-out_sw_fib_del:
-	switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
 out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
@@ -1486,7 +1515,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
 }
 /* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+		     struct fib_config *cfg)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	struct fib_alias *fa, *fa_to_delete;
@@ -1539,9 +1569,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 	if (!fa_to_delete)
 		return -ESRCH;
-	switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
-			       cfg->fc_type, tb->tb_id);
-
+	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+				 fa_to_delete->fa_info, tos, cfg->fc_type,
+				 tb->tb_id, 0);
 	rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
 		  &cfg->fc_nlinfo, 0);
@@ -1730,82 +1760,8 @@ out:
 	return NULL;
 }
-/* Caller must hold RTNL */
-void fib_table_flush_external(struct fib_table *tb)
-{
-	struct trie *t = (struct trie *)tb->tb_data;
-	struct key_vector *pn = t->kv;
-	unsigned long cindex = 1;
-	struct hlist_node *tmp;
-	struct fib_alias *fa;
-
-	/* walk trie in reverse order */
-	for (;;) {
-		unsigned char slen = 0;
-		struct key_vector *n;
-
-		if (!(cindex--)) {
-			t_key pkey = pn->key;
-
-			/* cannot resize the trie vector */
-			if (IS_TRIE(pn))
-				break;
-
-			/* resize completed node */
-			pn = resize(t, pn);
-			cindex = get_index(pkey, pn);
-
-			continue;
-		}
-
-		/* grab the next available node */
-		n = get_child(pn, cindex);
-		if (!n)
-			continue;
-
-		if (IS_TNODE(n)) {
-			/* record pn and cindex for leaf walking */
-			pn = n;
-			cindex = 1ul << n->bits;
-
-			continue;
-		}
-
-		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
-			struct fib_info *fi = fa->fa_info;
-
-			/* if alias was cloned to local then we just
-			 * need to remove the local copy from main
-			 */
-			if (tb->tb_id != fa->tb_id) {
-				hlist_del_rcu(&fa->fa_list);
-				alias_free_mem_rcu(fa);
-				continue;
-			}
-
-			/* record local slen */
-			slen = fa->fa_slen;
-
-			if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
-				continue;
-
-			switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
-					       fi, fa->fa_tos, fa->fa_type,
-					       tb->tb_id);
-		}
-
-		/* update leaf slen */
-		n->slen = slen;
-
-		if (hlist_empty(&n->leaf)) {
-			put_child_root(pn, n->key, NULL);
-			node_free(n);
-		}
-	}
-}
-
 /* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
 {
 	struct trie *t = (struct trie *)tb->tb_data;
 	struct key_vector *pn = t->kv;
@@ -1854,9 +1810,11 @@ int fib_table_flush(struct fib_table *tb)
 			continue;
 		}
-		switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
-				       fi, fa->fa_tos, fa->fa_type,
-				       tb->tb_id);
+		call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+					 n->key,
+					 KEYLENGTH - fa->fa_slen,
+					 fi, fa->fa_tos, fa->fa_type,
+					 tb->tb_id, 0);
 		hlist_del_rcu(&fa->fa_list);
 		fib_release_info(fa->fa_info);
 		alias_free_mem_rcu(fa);
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 321d57f825ce..cf50f7e2b012 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -631,7 +631,7 @@ static struct genl_family fou_nl_family = {
 	.netnsok	= true,
 };
-static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
+static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
 	[FOU_ATTR_PORT]		= { .type = NLA_U16, },
 	[FOU_ATTR_AF]		= { .type = NLA_U8, },
 	[FOU_ATTR_IPPROTO]	= { .type = NLA_U8, },
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..96e0efecefa6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	__be16 protocol = skb->protocol;
 	u16 mac_len = skb->mac_len;
 	int gre_offset, outer_hlen;
-	bool need_csum, ufo;
+	bool need_csum, ufo, gso_partial;
 	if (!skb->encapsulation)
 		goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		goto out;
 	}
+	gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
 	outer_hlen = skb_tnl_header_len(skb);
 	gre_offset = outer_hlen - tnl_hlen;
 	skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 		greh = (struct gre_base_hdr *)skb_transport_header(skb);
 		pcsum = (__sum16 *)(greh + 1);
-		if (skb_is_gso(skb)) {
+		if (gso_partial) {
 			unsigned int partial_adj;
 			/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 9b4ca87f70ba..606cc3e85d2b 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -472,6 +472,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 			continue;
 		}
+		/* Based on RFC3376 5.1. Should not send source-list change
+		 * records when there is a filter mode change.
+		 */
+		if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
+		     (!gdeleted && pmc->crcount)) &&
+		    (type == IGMPV3_ALLOW_NEW_SOURCES ||
+		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
+			goto decrease_sf_crcount;
+
 		/* clear marks on query responses */
 		if (isquery)
 			psf->sf_gsresp = 0;
@@ -499,6 +508,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
 		    scount++; stotal++;
 		if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
 		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+decrease_sf_crcount:
 			psf->sf_crcount--;
 			if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
 				if (psf_prev)
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 38c2c47fe0e8..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -45,6 +45,7 @@ struct inet_diag_entry {
 	u16 family;
 	u16 userlocks;
 	u32 ifindex;
+	u32 mark;
 };
 static DEFINE_MUTEX(inet_diag_table_mutex);
@@ -98,6 +99,7 @@ static size_t inet_sk_attr_size(void)
 	+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
 	+ nla_total_size(1) /* INET_DIAG_TOS */
 	+ nla_total_size(1) /* INET_DIAG_TCLASS */
+	+ nla_total_size(4) /* INET_DIAG_MARK */
 	+ nla_total_size(sizeof(struct inet_diag_meminfo))
 	+ nla_total_size(sizeof(struct inet_diag_msg))
 	+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -108,7 +110,8 @@ static size_t inet_sk_attr_size(void)
 int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 			     struct inet_diag_msg *r, int ext,
-			     struct user_namespace *user_ns)
+			     struct user_namespace *user_ns,
+			     bool net_admin)
 {
 	const struct inet_sock *inet = inet_sk(sk);
@@ -135,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+		goto errout;
+
 	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
 	r->idiag_inode = sock_i_ino(sk);
@@ -148,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      struct sk_buff *skb, const struct inet_diag_req_v2 *req,
 		      struct user_namespace *user_ns, u32 portid, u32 seq,
-		      u16 nlmsg_flags, const struct nlmsghdr *unlh)
+		      u16 nlmsg_flags, const struct nlmsghdr *unlh,
+		      bool net_admin)
 {
 	const struct tcp_congestion_ops *ca_ops;
 	const struct inet_diag_handler *handler;
@@ -174,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 	r->idiag_timer = 0;
 	r->idiag_retrans = 0;
-	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
 		goto errout;
 	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -273,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
 			      const struct inet_diag_req_v2 *req,
 			      struct user_namespace *user_ns,
 			      u32 portid, u32 seq, u16 nlmsg_flags,
-			      const struct nlmsghdr *unlh)
+			      const struct nlmsghdr *unlh,
+			      bool net_admin)
 {
-	return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
-				 user_ns, portid, seq, nlmsg_flags, unlh);
+	return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+				 portid, seq, nlmsg_flags, unlh, net_admin);
 }
 static int inet_twsk_diag_fill(struct sock *sk,
@@ -318,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
 static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 			      u32 portid, u32 seq, u16 nlmsg_flags,
-			      const struct nlmsghdr *unlh)
+			      const struct nlmsghdr *unlh, bool net_admin)
 {
+	struct request_sock *reqsk = inet_reqsk(sk);
 	struct inet_diag_msg *r;
 	struct nlmsghdr *nlh;
 	long tmo;
@@ -333,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 	inet_diag_msg_common_fill(r, sk);
 	r->idiag_state = TCP_SYN_RECV;
 	r->idiag_timer = 1;
-	r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+	r->idiag_retrans = reqsk->num_retrans;
 	BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
 		     offsetof(struct sock, sk_cookie));
@@ -345,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
 	r->idiag_uid = 0;
 	r->idiag_inode = 0;
+	if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+				     inet_rsk(reqsk)->ir_mark))
+		return -EMSGSIZE;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 }
@@ -353,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			const struct inet_diag_req_v2 *r,
 			struct user_namespace *user_ns,
 			u32 portid, u32 seq, u16 nlmsg_flags,
-			const struct nlmsghdr *unlh)
+			const struct nlmsghdr *unlh, bool net_admin)
 {
 	if (sk->sk_state == TCP_TIME_WAIT)
 		return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -361,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	if (sk->sk_state == TCP_NEW_SYN_RECV)
 		return inet_req_diag_fill(sk, skb, portid, seq,
-					  nlmsg_flags, unlh);
+					  nlmsg_flags, unlh, net_admin);
 	return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
-				  nlmsg_flags, unlh);
+				  nlmsg_flags, unlh, net_admin);
 }
 struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -434,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
 	err = sk_diag_fill(sk, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
-			   nlh->nlmsg_seq, 0, nlh);
+			   nlh->nlmsg_seq, 0, nlh,
+			   netlink_net_capable(in_skb, CAP_NET_ADMIN));
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		nlmsg_free(rep);
@@ -580,6 +594,14 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
 			yes = 0;
 			break;
 		}
+		case INET_DIAG_BC_MARK_COND: {
+			struct inet_diag_markcond *cond;
+
+			cond = (struct inet_diag_markcond *)(op + 1);
+			if ((entry->mark & cond->mask) != cond->mark)
+				yes = 0;
+			break;
+		}
 		}
 		if (yes) {
@@ -624,6 +646,12 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
 	entry.dport = ntohs(inet->inet_dport);
 	entry.ifindex = sk->sk_bound_dev_if;
 	entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
+	if (sk_fullsock(sk))
+		entry.mark = sk->sk_mark;
+	else if (sk->sk_state == TCP_NEW_SYN_RECV)
+		entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+	else
+		entry.mark = 0;
 	return inet_diag_bc_run(bc, &entry);
 }
@@ -706,10 +734,25 @@ static bool valid_port_comparison(const struct inet_diag_bc_op *op,
 	return true;
 }
-static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
+static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
+			   int *min_len)
+{
+	*min_len += sizeof(struct inet_diag_markcond);
+	return len >= *min_len;
+}
+
+static int inet_diag_bc_audit(const struct nlattr *attr,
+			      const struct sk_buff *skb)
 {
-	const void *bc = bytecode;
-	int len = bytecode_len;
+	bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+	const void *bytecode, *bc;
+	int bytecode_len, len;
+
+	if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+		return -EINVAL;
+
+	bytecode = bc = nla_data(attr);
+	len = bytecode_len = nla_len(attr);
 	while (len > 0) {
 		int min_len = sizeof(struct inet_diag_bc_op);
@@ -732,6 +775,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
 			if (!valid_port_comparison(bc, len, &min_len))
 				return -EINVAL;
 			break;
+		case INET_DIAG_BC_MARK_COND:
+			if (!net_admin)
+				return -EPERM;
+			if (!valid_markcond(bc, len, &min_len))
+				return -EINVAL;
+			break;
 		case INET_DIAG_BC_AUTO:
 		case INET_DIAG_BC_JMP:
 		case INET_DIAG_BC_NOP:
@@ -760,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
 			      struct sk_buff *skb,
 			      struct netlink_callback *cb,
 			      const struct inet_diag_req_v2 *r,
-			      const struct nlattr *bc)
+			      const struct nlattr *bc,
+			      bool net_admin)
 {
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
 	return inet_csk_diag_fill(sk, skb, r,
 				  sk_user_ns(NETLINK_CB(cb->skb).sk),
 				  NETLINK_CB(cb->skb).portid,
-				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+				  cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+				  net_admin);
 }
 static void twsk_build_assert(void)
@@ -804,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 	struct net *net = sock_net(skb->sk);
 	int i, num, s_i, s_num;
 	u32 idiag_states = r->idiag_states;
+	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -844,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 				    cb->args[3] > 0)
 					goto next_listen;
-				if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+				if (inet_csk_diag_dump(sk, skb, cb, r,
+						       bc, net_admin) < 0) {
 					spin_unlock_bh(&ilb->lock);
 					goto done;
 				}
@@ -912,7 +965,7 @@ skip_listen_ht:
 					    sk_user_ns(NETLINK_CB(cb->skb).sk),
 					    NETLINK_CB(cb->skb).portid,
 					    cb->nlh->nlmsg_seq, NLM_F_MULTI,
-					    cb->nlh);
+					    cb->nlh, net_admin);
 			if (res < 0) {
 				spin_unlock_bh(lock);
 				goto done;
@@ -1020,13 +1073,13 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		if (nlmsg_attrlen(nlh, hdrlen)) {
 			struct nlattr *attr;
+			int err;
 			attr = nlmsg_find_attr(nlh, hdrlen,
 					       INET_DIAG_REQ_BYTECODE);
-			if (!attr ||
-			    nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
-			    inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
-				return -EINVAL;
+			err = inet_diag_bc_audit(attr, skb);
+			if (err)
+				return err;
 		}
 		{
 			struct netlink_dump_control c = {
@@ -1051,13 +1104,13 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
 	    h->nlmsg_flags & NLM_F_DUMP) {
 		if (nlmsg_attrlen(h, hdrlen)) {
 			struct nlattr *attr;
+			int err;
 			attr = nlmsg_find_attr(h, hdrlen,
 					       INET_DIAG_REQ_BYTECODE);
-			if (!attr ||
-			    nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
-			    inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
-				return -EINVAL;
+			err = inet_diag_bc_audit(attr, skb);
+			if (err)
+				return err;
 		}
 		{
			struct netlink_dump_control c = {
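The new INET_DIAG_BC_MARK_COND opcode carries a struct inet_diag_markcond payload directly after the generic bytecode op, mirroring the "(op + 1)" access in inet_diag_bc_run() above. A sketch of how a dumper such as ss might append the condition to its filter bytecode; buffer management and the exact jump-offset policy are simplified assumptions here, not taken from this patch:

#include <linux/inet_diag.h>
#include <stddef.h>

/* Sketch: append a mark-match op to an inet_diag bytecode buffer.
 * "yes" is the length of this op (where to continue on match);
 * "no" is the offset to jump to on mismatch.
 */
static size_t put_mark_cond(void *buf, __u32 mark, __u32 mask)
{
	struct inet_diag_bc_op *op = buf;
	struct inet_diag_markcond *cond = (void *)(op + 1);

	op->code = INET_DIAG_BC_MARK_COND;
	op->yes  = sizeof(*op) + sizeof(*cond);	/* next op on match */
	op->no   = op->yes + 4;			/* reject target on mismatch */

	cond->mark = mark;
	cond->mask = mask;

	return sizeof(*op) + sizeof(*cond);
}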
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
 	ipgre_err(skb, info, &tpi);
 }
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
-	return (__force __be64)((__force u32)key);
-#else
-	return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
-	return (__force __be32)x;
-#else
-	return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
 {
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 		__be64 tun_id;
 		flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
-		tun_id = key_to_tunnel_id(tpi->key);
+		tun_id = key32_to_tunnel_id(tpi->key);
 		tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
 		if (!tun_dst)
 			return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
 	gre_build_header(skb, tunnel_hlen, flags, proto,
-			 tunnel_id_to_key(tun_info->key.tun_id), 0);
+			 tunnel_id_to_key32(tun_info->key.tun_id), 0);
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index dde37fb340bf..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -73,6 +73,7 @@
 #include <net/icmp.h>
 #include <net/checksum.h>
 #include <net/inetpeer.h>
+#include <net/lwtunnel.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_bridge.h>
@@ -98,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 	iph->tot_len = htons(skb->len);
 	ip_send_check(iph);
+
+	/* if egress device is enslaved to an L3 master device pass the
+	 * skb to its handler for processing
+	 */
+	skb = l3mdev_ip_out(sk, skb);
+	if (unlikely(!skb))
+		return 0;
+
 	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
 		       net, sk, skb, NULL, skb_dst(skb)->dev,
 		       dst_output);
@@ -197,6 +206,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 		skb = skb2;
 	}
+	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+		int res = lwtunnel_xmit(skb);
+
+		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+			return res;
+	}
+
 	rcu_read_lock_bh();
 	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
@@ -482,7 +498,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
 	to->ipvs_property = from->ipvs_property;
 #endif
 	skb_copy_secmark(to, from);
@@ -1566,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	}
 	oif = arg->bound_dev_if;
-	if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
-		oif = skb->skb_iif;
+	oif = oif ? : skb->skb_iif;
 	flowi4_init_output(&fl4, oif,
 			   IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..af4919792b6a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			ipc->ttl = val;
 			break;
 		case IP_TOS:
-			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+			if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+				val = *(int *)CMSG_DATA(cmsg);
+			else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+				val = *(u8 *)CMSG_DATA(cmsg);
+			else
 				return -EINVAL;
-			val = *(int *)CMSG_DATA(cmsg);
 			if (val < 0 || val > 255)
 				return -EINVAL;
 			ipc->tos = val;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
 #include <net/udp.h>
+#include <net/dst_metadata.h>
 #if IS_ENABLED(CONFIG_IPV6)
 #include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 	return 0;
 }
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	u32 headroom = sizeof(struct iphdr);
+	struct ip_tunnel_info *tun_info;
+	const struct ip_tunnel_key *key;
+	const struct iphdr *inner_iph;
+	struct rtable *rt;
+	struct flowi4 fl4;
+	__be16 df = 0;
+	u8 tos, ttl;
+
+	tun_info = skb_tunnel_info(skb);
+	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+		     ip_tunnel_info_af(tun_info) != AF_INET))
+		goto tx_error;
+	key = &tun_info->key;
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+	tos = key->tos;
+	if (tos == 1) {
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = inner_iph->tos;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+	}
+	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+			 RT_TOS(tos), tunnel->parms.link);
+	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+		goto tx_error;
+	rt = ip_route_output_key(tunnel->net, &fl4);
+	if (IS_ERR(rt)) {
+		dev->stats.tx_carrier_errors++;
+		goto tx_error;
+	}
+	if (rt->dst.dev == dev) {
+		ip_rt_put(rt);
+		dev->stats.collisions++;
+		goto tx_error;
+	}
+	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+	ttl = key->ttl;
+	if (ttl == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			ttl = inner_iph->ttl;
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+		else
+			ttl = ip4_dst_hoplimit(&rt->dst);
+	}
+	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+		df = htons(IP_DF);
+	else if (skb->protocol == htons(ETH_P_IP))
+		df = inner_iph->frag_off & htons(IP_DF);
+	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+	if (headroom > dev->needed_headroom)
+		dev->needed_headroom = headroom;
+
+	if (skb_cow_head(skb, dev->needed_headroom)) {
+		ip_rt_put(rt);
+		goto tx_dropped;
+	}
+	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+	return;
+tx_error:
+	dev->stats.tx_errors++;
+	goto kfree;
+tx_dropped:
+	dev->stats.tx_dropped++;
+kfree:
+	kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, u8 protocol)
 {
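The ip_sockglue.c hunk above makes IP_TOS usable as a one-byte ancillary option in addition to the traditional int form. A userspace sketch of sending a single datagram with a per-packet TOS using the new u8 payload (the helper name is illustrative):

#include <netinet/in.h>
#include <netinet/ip.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send "buf" with a per-packet TOS of 0x10 using the one-byte cmsg
 * form accepted after this change (an int-sized payload also works). */
static ssize_t send_with_tos(int fd, const void *buf, size_t len,
			     const struct sockaddr_in *dst)
{
	char cbuf[CMSG_SPACE(sizeof(unsigned char))];
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst, .msg_namelen = sizeof(*dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = IPPROTO_IP;
	cm->cmsg_type = IP_TOS;
	cm->cmsg_len = CMSG_LEN(sizeof(unsigned char));
	*(unsigned char *)CMSG_DATA(cm) = 0x10;

	return sendmsg(fd, &msg, 0);
}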
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..777bc1883870 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	skb_scrub_packet(skb, xnet);
-	skb_clear_hash(skb);
+	skb_clear_hash_if_not_l4(skb);
 	skb_dst_set(skb, &rt->dst);
 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 1d71c40eaaf3..071a785c65eb 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -85,7 +85,6 @@
 /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
 #define CONF_OPEN_RETRIES	2	/* (Re)open devices twice */
 #define CONF_SEND_RETRIES	6	/* Send six requests per open */
-#define CONF_INTER_TIMEOUT	(HZ)	/* Inter-device timeout: 1 second */
 #define CONF_BASE_TIMEOUT	(HZ*2)	/* Initial timeout: 2 seconds */
 #define CONF_TIMEOUT_RANDOM	(HZ)	/* Maximum amount of randomization */
 #define CONF_TIMEOUT_MULT	*7/4	/* Rate of timeout growth */
@@ -188,7 +187,7 @@ struct ic_device {
 };
 static struct ic_device *ic_first_dev __initdata;	/* List of open device */
-static struct net_device *ic_dev __initdata;		/* Selected device */
+static struct ic_device *ic_dev __initdata;		/* Selected device */
 static bool __init ic_is_init_dev(struct net_device *dev)
 {
@@ -307,7 +306,7 @@ static void __init ic_close_devs(void)
 	while ((d = next)) {
 		next = d->next;
 		dev = d->dev;
-		if (dev != ic_dev && !netdev_uses_dsa(dev)) {
+		if ((!ic_dev || dev != ic_dev->dev) && !netdev_uses_dsa(dev)) {
 			pr_debug("IP-Config: Downing %s\n", dev->name);
 			dev_change_flags(dev, d->flags);
 		}
@@ -372,7 +371,7 @@ static int __init ic_setup_if(void)
 	int err;
 	memset(&ir, 0, sizeof(ir));
-	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->dev->name);
 	set_sockaddr(sin, ic_myaddr, 0);
 	if ((err = ic_devinet_ioctl(SIOCSIFADDR, &ir)) < 0) {
 		pr_err("IP-Config: Unable to set interface address (%d)\n",
@@ -396,7 +395,7 @@ static int __init ic_setup_if(void)
 	 * out, we'll try to muddle along.
 	 */
 	if (ic_dev_mtu != 0) {
-		strcpy(ir.ifr_name, ic_dev->name);
+		strcpy(ir.ifr_name, ic_dev->dev->name);
 		ir.ifr_mtu = ic_dev_mtu;
 		if ((err = ic_dev_ioctl(SIOCSIFMTU, &ir)) < 0)
 			pr_err("IP-Config: Unable to set interface mtu to %d (%d)\n",
@@ -568,7 +567,7 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		goto drop_unlock;
 	/* We have a winner! */
-	ic_dev = dev;
+	ic_dev = d;
 	if (ic_myaddr == NONE)
 		ic_myaddr = tip;
 	ic_servaddr = sip;
@@ -655,8 +654,6 @@ static struct packet_type bootp_packet_type __initdata = {
 	.func =	ic_bootp_recv,
 };
-static __be32 ic_dev_xid;		/* Device under configuration */
-
 /*
  *  Initialize DHCP/BOOTP extension fields in the request.
  */
@@ -666,14 +663,14 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
 #ifdef IPCONFIG_DHCP
 static void __init
-ic_dhcp_init_options(u8 *options)
+ic_dhcp_init_options(u8 *options, struct ic_device *d)
 {
 	u8 mt = ((ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST);
 	u8 *e = options;
 	int len;
-	pr_debug("DHCP: Sending message type %d\n", mt);
+	pr_debug("DHCP: Sending message type %d (%s)\n", mt, d->dev->name);
 	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
 	e += 4;
@@ -857,7 +854,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
 	/* add DHCP options or BOOTP extensions */
 #ifdef IPCONFIG_DHCP
 	if (ic_proto_enabled & IC_USE_DHCP)
-		ic_dhcp_init_options(b->exten);
+		ic_dhcp_init_options(b->exten, d);
 	else
 #endif
 		ic_bootp_init_ext(b->exten);
@@ -1033,14 +1030,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	/* Is it a reply to our BOOTP request? */
 	if (b->op != BOOTP_REPLY ||
 	    b->xid != d->xid) {
-		net_err_ratelimited("DHCP/BOOTP: Reply not for us, op[%x] xid[%x]\n",
-				    b->op, b->xid);
-		goto drop_unlock;
-	}
-
-	/* Is it a reply for the device we are configuring? */
-	if (b->xid != ic_dev_xid) {
-		net_err_ratelimited("DHCP/BOOTP: Ignoring delayed packet\n");
+		net_err_ratelimited("DHCP/BOOTP: Reply not for us on %s, op[%x] xid[%x]\n",
+				    d->dev->name, b->op, b->xid);
 		goto drop_unlock;
 	}
@@ -1075,7 +1066,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		}
 	}
-	pr_debug("DHCP: Got message type %d\n", mt);
+	pr_debug("DHCP: Got message type %d (%s)\n", mt, d->dev->name);
 	switch (mt) {
 	case DHCPOFFER:
@@ -1130,7 +1121,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	}
 	/* We have a winner! */
-	ic_dev = dev;
+	ic_dev = d;
 	ic_myaddr = b->your_ip;
 	ic_servaddr = b->server_ip;
 	ic_addrservaddr = b->iph.saddr;
@@ -1225,9 +1216,6 @@ static int __init ic_dynamic(void)
 	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
 	for (;;) {
 #ifdef IPCONFIG_BOOTP
-		/* Track the device we are configuring */
-		ic_dev_xid = d->xid;
-
 		if (do_bootp && (d->able & IC_BOOTP))
 			ic_bootp_send_if(d, jiffies - start_jiffies);
 #endif
@@ -1236,15 +1224,19 @@ static int __init ic_dynamic(void)
 			ic_rarp_send_if(d);
 #endif
-		jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
-		while (time_before(jiffies, jiff) && !ic_got_reply)
-			schedule_timeout_uninterruptible(1);
+		if (!d->next) {
+			jiff = jiffies + timeout;
+			while (time_before(jiffies, jiff) && !ic_got_reply)
+				schedule_timeout_uninterruptible(1);
+		}
 #ifdef IPCONFIG_DHCP
 		/* DHCP isn't done until we get a DHCPACK. */
 		if ((ic_got_reply & IC_BOOTP) &&
 		    (ic_proto_enabled & IC_USE_DHCP) &&
 		    ic_dhcp_msgtype != DHCPACK) {
 			ic_got_reply = 0;
+			/* continue on device that got the reply */
+			d = ic_dev;
 			pr_cont(",");
 			continue;
 		}
@@ -1487,7 +1479,7 @@ static int __init ip_auto_config(void)
 #endif /* IPCONFIG_DYNAMIC */
 	} else {
 		/* Device selected manually or only one device -> use it */
-		ic_dev = ic_first_dev->dev;
+		ic_dev = ic_first_dev;
 	}
 	addr = root_nfs_parse_addr(root_server_path);
@@ -1501,14 +1493,6 @@ static int __init ip_auto_config(void)
 		return -1;
 	/*
-	 * Close all network devices except the device we've
-	 * autoconfigured and set up routes.
-	 */
-	ic_close_devs();
-	if (ic_setup_if() < 0 || ic_setup_routes() < 0)
-		return -1;
-
-	/*
 	 * Record which protocol was actually used.
 	 */
 #ifdef IPCONFIG_DYNAMIC
@@ -1522,7 +1506,7 @@ static int __init ip_auto_config(void)
 	pr_info("IP-Config: Complete:\n");
 	pr_info("     device=%s, hwaddr=%*phC, ipaddr=%pI4, mask=%pI4, gw=%pI4\n",
-		ic_dev->name, ic_dev->addr_len, ic_dev->dev_addr,
+		ic_dev->dev->name, ic_dev->dev->addr_len, ic_dev->dev->dev_addr,
 		&ic_myaddr, &ic_netmask, &ic_gateway);
 	pr_info("     host=%s, domain=%s, nis-domain=%s\n",
 		utsname()->nodename, ic_domain, utsname()->domainname);
@@ -1542,7 +1526,18 @@ static int __init ip_auto_config(void)
 	pr_cont("\n");
 #endif /* !SILENT */
-	return 0;
+	/*
+	 * Close all network devices except the device we've
+	 * autoconfigured and set up routes.
+	 */
+	if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+		err = -1;
+	else
+		err = 0;
+
+	ic_close_devs();
+
+	return err;
 }
 late_initcall(ip_auto_config);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4ae3f8e6c6cc..c9392589c415 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -115,6 +115,7 @@
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/dst_metadata.h>
 static bool log_ecn_error = true;
 module_param(log_ecn_error, bool, 0644);
@@ -193,6 +194,7 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	struct metadata_dst *tun_dst = NULL;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
@@ -216,7 +218,12 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 			tpi = &ipip_tpi;
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
-		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
+		if (tunnel->collect_md) {
+			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+			if (!tun_dst)
+				return 0;
+		}
+		return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 	}
 	return -1;
@@ -270,7 +277,10 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
 	skb_set_inner_ipproto(skb, ipproto);
-	ip_tunnel_xmit(skb, dev, tiph, ipproto);
+	if (tunnel->collect_md)
+		ip_md_tunnel_xmit(skb, dev, ipproto);
+	else
+		ip_tunnel_xmit(skb, dev, tiph, ipproto);
 	return NETDEV_TX_OK;
 tx_error:
@@ -380,13 +390,14 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
 }
 static void ipip_netlink_parms(struct nlattr *data[],
-			       struct ip_tunnel_parm *parms)
+			       struct ip_tunnel_parm *parms, bool *collect_md)
 {
 	memset(parms, 0, sizeof(*parms));
 	parms->iph.version = 4;
 	parms->iph.protocol = IPPROTO_IPIP;
 	parms->iph.ihl = 5;
+	*collect_md = false;
 	if (!data)
 		return;
@@ -414,6 +425,9 @@ static void ipip_netlink_parms(struct nlattr *data[],
 	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
 		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_IPTUN_COLLECT_METADATA])
+		*collect_md = true;
 }
 /* This function returns true when ENCAP attributes are present in the nl msg */
@@ -453,18 +467,18 @@ static bool ipip_netlink_encap_parms(struct nlattr *data[],
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[])
 {
+	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
-		struct ip_tunnel *t = netdev_priv(dev);
 		int err = ip_tunnel_encap_setup(t, &ipencap);
 		if (err < 0)
 			return err;
 	}
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &t->collect_md);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
@@ -473,6 +487,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 {
 	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	bool collect_md;
 	if (ipip_netlink_encap_parms(data, &ipencap)) {
 		struct ip_tunnel *t = netdev_priv(dev);
@@ -482,7 +497,9 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
-	ipip_netlink_parms(data, &p);
+	ipip_netlink_parms(data, &p, &collect_md);
+	if (collect_md)
+		return -EINVAL;
 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
@@ -516,6 +533,8 @@ static size_t ipip_get_size(const struct net_device *dev)
 		nla_total_size(2) +
 		/* IFLA_IPTUN_ENCAP_DPORT */
 		nla_total_size(2) +
+		/* IFLA_IPTUN_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
@@ -544,6 +563,9 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 			tunnel->encap.flags))
 		goto nla_put_failure;
+	if (tunnel->collect_md)
+		if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA))
+			goto nla_put_failure;
 	return 0;
 nla_put_failure:
@@ -562,6 +584,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
 	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
+	[IFLA_IPTUN_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index c187c60e3e0c..d613309e3e5d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -25,17 +25,6 @@ config NF_CONNTRACK_IPV4
 	  To compile it as a module, choose M here.  If unsure, say N.
-config NF_CONNTRACK_PROC_COMPAT
-	bool "proc/sysctl compatibility with old connection tracking"
-	depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
-	default y
-	help
-	  This option enables /proc and sysctl compatibility with the old
-	  layer 3 dependent connection tracking. This is needed to keep
-	  old programs that have not been adapted to the new names working.
-
-	  If unsure, say Y.
-
 if NF_TABLES
 config NF_TABLES_IPV4
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 87b073da14c9..853328f8fd05 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,11 +4,6 @@
 # objects for l3 independent conntrack
 nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
-ifeq ($(CONFIG_PROC_FS),y)
-nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
-endif
-endif
 # connection tracking
 obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f993545a3373..7c00ce90adb8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -156,7 +156,7 @@ static struct nf_loginfo trace_loginfo = {
 	.u = {
 		.log = {
 			.level = 4,
-			.logflags = NF_LOG_MASK,
+			.logflags = NF_LOG_DEFAULT_MASK,
 		},
 	},
 };
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index ae1a71a97132..713c09a74b90 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -110,7 +110,7 @@ static unsigned int ipv4_helper(void *priv,
 	if (!help)
 		return NF_ACCEPT;
-	/* rcu_read_lock()ed by nf_hook_slow */
+	/* rcu_read_lock()ed by nf_hook_thresh */
 	helper = rcu_dereference(help->helper);
 	if (!helper)
 		return NF_ACCEPT;
@@ -202,47 +202,6 @@ static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
 	},
 };
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-static int log_invalid_proto_min = 0;
-static int log_invalid_proto_max = 255;
-
-static struct ctl_table ip_ct_sysctl_table[] = {
-	{
-		.procname	= "ip_conntrack_max",
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "ip_conntrack_count",
-		.maxlen		= sizeof(int),
-		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "ip_conntrack_buckets",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0444,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "ip_conntrack_checksum",
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "ip_conntrack_log_invalid",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &log_invalid_proto_min,
-		.extra2		= &log_invalid_proto_max,
-	},
-	{ }
-};
-#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
-
 /* Fast function for those who don't want to parse /proc (and I don't
    blame them). */
 /* Reversing the socket's dst/src point of view gives us the reply
@@ -350,20 +309,6 @@ static struct nf_sockopt_ops so_getorigdst = {
 static int ipv4_init_net(struct net *net)
 {
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-	struct nf_ip_net *in = &net->ct.nf_ct_proto;
-	in->ctl_table = kmemdup(ip_ct_sysctl_table,
-				sizeof(ip_ct_sysctl_table),
-				GFP_KERNEL);
-	if (!in->ctl_table)
-		return -ENOMEM;
-
-	in->ctl_table[0].data = &nf_conntrack_max;
-	in->ctl_table[1].data = &net->ct.count;
-	in->ctl_table[2].data = &nf_conntrack_htable_size;
-	in->ctl_table[3].data = &net->ct.sysctl_checksum;
-	in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
-#endif
 	return 0;
 }
@@ -380,9 +325,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
 	.nlattr_to_tuple = ipv4_nlattr_to_tuple,
 	.nla_policy	 = ipv4_nla_policy,
 #endif
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-	.ctl_table_path  = "net/ipv4/netfilter",
-#endif
 	.init_net	 = ipv4_init_net,
 	.me		 = THIS_MODULE,
 };
@@ -492,16 +434,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 		goto cleanup_icmpv4;
 	}
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-	ret = nf_conntrack_ipv4_compat_init();
-	if (ret < 0)
-		goto cleanup_proto;
-#endif
 	return ret;
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
- cleanup_proto:
-	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
-#endif
 cleanup_icmpv4:
 	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
 cleanup_udp4:
@@ -520,9 +453,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 static void __exit nf_conntrack_l3proto_ipv4_fini(void)
 {
 	synchronize_net();
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
-	nf_conntrack_ipv4_compat_fini();
-#endif
 	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
 	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
 	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
deleted file mode 100644
index 63923710f325..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ /dev/null
@@ -1,492 +0,0 @@
-/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/percpu.h>
-#include <linux/security.h>
-#include <net/net_namespace.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_expect.h>
-#include <net/netfilter/nf_conntrack_acct.h>
-#include <linux/rculist_nulls.h>
-#include <linux/export.h>
-
-struct ct_iter_state {
-	struct seq_net_private p;
-	struct hlist_nulls_head *hash;
-	unsigned int htable_size;
-	unsigned int bucket;
-};
-
-static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
-{
-	struct ct_iter_state *st = seq->private;
-	struct hlist_nulls_node *n;
-
-	for (st->bucket = 0;
-	     st->bucket < st->htable_size;
-	     st->bucket++) {
-		n = rcu_dereference(
-			hlist_nulls_first_rcu(&st->hash[st->bucket]));
-		if (!is_a_nulls(n))
-			return n;
-	}
-	return NULL;
-}
-
-static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
-					    struct hlist_nulls_node *head)
-{
-	struct ct_iter_state *st = seq->private;
-
-	head = rcu_dereference(hlist_nulls_next_rcu(head));
-	while (is_a_nulls(head)) {
-		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= st->htable_size)
-				return NULL;
-		}
-		head = rcu_dereference(
-			hlist_nulls_first_rcu(&st->hash[st->bucket]));
-	}
-	return head;
-}
-
-static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct hlist_nulls_node *head = ct_get_first(seq);
-
-	if (head)
-		while (pos && (head = ct_get_next(seq, head)))
-			pos--;
-	return pos ? NULL : head;
-}
-
-static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
-{
-	struct ct_iter_state *st = seq->private;
-
-	rcu_read_lock();
-
-	nf_conntrack_get_ht(&st->hash, &st->htable_size);
-	return ct_get_idx(seq, *pos);
-}
-
-static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	(*pos)++;
-	return ct_get_next(s, v);
-}
-
-static void ct_seq_stop(struct seq_file *s, void *v)
-	__releases(RCU)
-{
-	rcu_read_unlock();
-}
-
-#ifdef CONFIG_NF_CONNTRACK_SECMARK
-static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
-	int ret;
-	u32 len;
-	char *secctx;
-
-	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
-	if (ret)
-		return;
-
-	seq_printf(s, "secctx=%s ", secctx);
-
-	security_release_secctx(secctx, len);
-}
-#else
-static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
-{
-}
-#endif
-
-static bool ct_seq_should_skip(const struct nf_conn *ct,
-			       const struct net *net,
-			       const struct nf_conntrack_tuple_hash *hash)
-{
-	/* we only want to print DIR_ORIGINAL */
-	if (NF_CT_DIRECTION(hash))
-		return true;
-
-	if (nf_ct_l3num(ct) != AF_INET)
-		return true;
-
-	if (!net_eq(nf_ct_net(ct), net))
-		return true;
-
-	return false;
-}
-
-static int ct_seq_show(struct seq_file *s, void *v)
-{
-	struct nf_conntrack_tuple_hash *hash = v;
-	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
-	const struct nf_conntrack_l3proto *l3proto;
-	const struct nf_conntrack_l4proto *l4proto;
-	int ret = 0;
-
-	NF_CT_ASSERT(ct);
-	if (ct_seq_should_skip(ct, seq_file_net(s), hash))
-		return 0;
-
-	if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
-		return 0;
-
-	/* check if we raced w.
object reuse */ - if (!nf_ct_is_confirmed(ct) || - ct_seq_should_skip(ct, seq_file_net(s), hash)) - goto release; - - l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); - NF_CT_ASSERT(l3proto); - l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - NF_CT_ASSERT(l4proto); - - ret = -ENOSPC; - seq_printf(s, "%-8s %u %ld ", - l4proto->name, nf_ct_protonum(ct), - timer_pending(&ct->timeout) - ? (long)(ct->timeout.expires - jiffies)/HZ : 0); - - if (l4proto->print_conntrack) - l4proto->print_conntrack(s, ct); - - if (seq_has_overflowed(s)) - goto release; - - print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - l3proto, l4proto); - - if (seq_has_overflowed(s)) - goto release; - - if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL)) - goto release; - - if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) - seq_printf(s, "[UNREPLIED] "); - - print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, - l3proto, l4proto); - - if (seq_has_overflowed(s)) - goto release; - - if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) - goto release; - - if (test_bit(IPS_ASSURED_BIT, &ct->status)) - seq_printf(s, "[ASSURED] "); - -#ifdef CONFIG_NF_CONNTRACK_MARK - seq_printf(s, "mark=%u ", ct->mark); -#endif - - ct_show_secctx(s, ct); - - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); - - if (seq_has_overflowed(s)) - goto release; - - ret = 0; -release: - nf_ct_put(ct); - return ret; -} - -static const struct seq_operations ct_seq_ops = { - .start = ct_seq_start, - .next = ct_seq_next, - .stop = ct_seq_stop, - .show = ct_seq_show -}; - -static int ct_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ct_seq_ops, - sizeof(struct ct_iter_state)); -} - -static const struct file_operations ct_file_ops = { - .owner = THIS_MODULE, - .open = ct_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -/* expects */ -struct ct_expect_iter_state { - struct seq_net_private p; - unsigned int bucket; -}; - -static struct hlist_node *ct_expect_get_first(struct seq_file *seq) -{ - struct ct_expect_iter_state *st = seq->private; - struct hlist_node *n; - - for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference( - hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); - if (n) - return n; - } - return NULL; -} - -static struct hlist_node *ct_expect_get_next(struct seq_file *seq, - struct hlist_node *head) -{ - struct ct_expect_iter_state *st = seq->private; - - head = rcu_dereference(hlist_next_rcu(head)); - while (head == NULL) { - if (++st->bucket >= nf_ct_expect_hsize) - return NULL; - head = rcu_dereference( - hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); - } - return head; -} - -static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_node *head = ct_expect_get_first(seq); - - if (head) - while (pos && (head = ct_expect_get_next(seq, head))) - pos--; - return pos ? 
NULL : head; -} - -static void *exp_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - rcu_read_lock(); - return ct_expect_get_idx(seq, *pos); -} - -static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - return ct_expect_get_next(seq, v); -} - -static void exp_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static int exp_seq_show(struct seq_file *s, void *v) -{ - struct nf_conntrack_expect *exp; - const struct hlist_node *n = v; - - exp = hlist_entry(n, struct nf_conntrack_expect, hnode); - - if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) - return 0; - - if (exp->tuple.src.l3num != AF_INET) - return 0; - - if (exp->timeout.function) - seq_printf(s, "%ld ", timer_pending(&exp->timeout) - ? (long)(exp->timeout.expires - jiffies)/HZ : 0); - else - seq_printf(s, "- "); - - seq_printf(s, "proto=%u ", exp->tuple.dst.protonum); - - print_tuple(s, &exp->tuple, - __nf_ct_l3proto_find(exp->tuple.src.l3num), - __nf_ct_l4proto_find(exp->tuple.src.l3num, - exp->tuple.dst.protonum)); - seq_putc(s, '\n'); - - return 0; -} - -static const struct seq_operations exp_seq_ops = { - .start = exp_seq_start, - .next = exp_seq_next, - .stop = exp_seq_stop, - .show = exp_seq_show -}; - -static int exp_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &exp_seq_ops, - sizeof(struct ct_expect_iter_state)); -} - -static const struct file_operations ip_exp_file_ops = { - .owner = THIS_MODULE, - .open = exp_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - int cpu; - - if (*pos == 0) - return SEQ_START_TOKEN; - - for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return per_cpu_ptr(net->ct.stat, cpu); - } - - return NULL; -} - -static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct net *net = seq_file_net(seq); - int cpu; - - for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { - if (!cpu_possible(cpu)) - continue; - *pos = cpu+1; - return per_cpu_ptr(net->ct.stat, cpu); - } - - return NULL; -} - -static void ct_cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int ct_cpu_seq_show(struct seq_file *seq, void *v) -{ - struct net *net = seq_file_net(seq); - unsigned int nr_conntracks = atomic_read(&net->ct.count); - const struct ip_conntrack_stat *st = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); - return 0; - } - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " - "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - nr_conntracks, - st->searched, - st->found, - st->new, - st->invalid, - st->ignore, - st->delete, - st->delete_list, - st->insert, - st->insert_failed, - st->drop, - st->early_drop, - st->error, - - st->expect_new, - st->expect_create, - st->expect_delete, - st->search_restart - ); - return 0; -} - -static const struct seq_operations ct_cpu_seq_ops = { - .start = ct_cpu_seq_start, - .next = ct_cpu_seq_next, - .stop = ct_cpu_seq_stop, - .show = ct_cpu_seq_show, -}; - -static int ct_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &ct_cpu_seq_ops, - sizeof(struct seq_net_private)); -} - -static const struct file_operations 
ct_cpu_seq_fops = { - .owner = THIS_MODULE, - .open = ct_cpu_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -static int __net_init ip_conntrack_net_init(struct net *net) -{ - struct proc_dir_entry *proc, *proc_exp, *proc_stat; - - proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops); - if (!proc) - goto err1; - - proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net, - &ip_exp_file_ops); - if (!proc_exp) - goto err2; - - proc_stat = proc_create("ip_conntrack", S_IRUGO, - net->proc_net_stat, &ct_cpu_seq_fops); - if (!proc_stat) - goto err3; - return 0; - -err3: - remove_proc_entry("ip_conntrack_expect", net->proc_net); -err2: - remove_proc_entry("ip_conntrack", net->proc_net); -err1: - return -ENOMEM; -} - -static void __net_exit ip_conntrack_net_exit(struct net *net) -{ - remove_proc_entry("ip_conntrack", net->proc_net_stat); - remove_proc_entry("ip_conntrack_expect", net->proc_net); - remove_proc_entry("ip_conntrack", net->proc_net); -} - -static struct pernet_operations ip_conntrack_net_ops = { - .init = ip_conntrack_net_init, - .exit = ip_conntrack_net_exit, -}; - -int __init nf_conntrack_ipv4_compat_init(void) -{ - return register_pernet_subsys(&ip_conntrack_net_ops); -} - -void __exit nf_conntrack_ipv4_compat_fini(void) -{ - unregister_pernet_subsys(&ip_conntrack_net_ops); -} diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index c567e1b5d799..d075b3cf2400 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -149,7 +149,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's @@ -327,17 +327,6 @@ static struct ctl_table icmp_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table icmp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_icmp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -355,40 +344,14 @@ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_icmp_net *in) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, - sizeof(icmp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &in->timeout; -#endif -#endif - return 0; -} - static int icmp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_icmp_net *in = icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; - ret = icmp_kmemdup_compat_sysctl_table(pn, in); - if (ret < 0) - return ret; - - ret = icmp_kmemdup_sysctl_table(pn, in); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - - return ret; + return icmp_kmemdup_sysctl_table(pn, in); } static struct nf_proto_net *icmp_get_net_proto(struct net *net) diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ceb187308120..cf986e1c7bbd 100644 --- 
a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -74,21 +74,19 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, nf_conntrack_get(skb->nfct); #endif /* - * If we are in PREROUTING/INPUT, the checksum must be recalculated - * since the length could have changed as a result of defragmentation. - * - * We also decrease the TTL to mitigate potential loops between two - * hosts. + * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential + * loops between two hosts. * * Set %IP_DF so that the original source is notified of a potentially * decreased MTU on the clone route. IPv6 does this too. + * + * IP header checksum will be recalculated at ip_local_out. */ iph = ip_hdr(skb); iph->frag_off |= htons(IP_DF); if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_IN) --iph->ttl; - ip_send_check(iph); if (nf_dup_ipv4_route(net, skb, gw, oif)) { __this_cpu_write(nf_skb_duplicated, true); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index e7ad950cf9ef..b24795e2ee6d 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -62,7 +62,7 @@ static void dump_arp_packet(struct nf_log_buf *m, /* If it's for Ethernet and the lengths are OK, then log the ARP * payload. */ - if (ah->ar_hrd != htons(1) || + if (ah->ar_hrd != htons(ARPHRD_ETHER) || ah->ar_hln != ETH_ALEN || ah->ar_pln != sizeof(__be32)) return; @@ -111,8 +111,7 @@ static struct nf_logger nf_arp_logger __read_mostly = { static int __net_init nf_log_arp_net_init(struct net *net) { - nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); - return 0; + return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); } static void __net_exit nf_log_arp_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 076aadda0473..856648966f4c 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -29,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -46,7 +46,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { @@ -76,7 +76,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, if (ntohs(ih->frag_off) & IP_OFFSET) nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((logflags & XT_LOG_IPOPT) && + if ((logflags & NF_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { const unsigned char *op; unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; @@ -250,7 +250,7 @@ static void dump_ipv4_packet(struct nf_log_buf *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff) + if ((logflags & NF_LOG_UID) && !iphoff) nf_log_dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ @@ -282,7 +282,7 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; - if (!(logflags & XT_LOG_MACDECODE)) + if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { @@ -347,8 +347,7 @@ static struct nf_logger nf_ip_logger __read_mostly = { 
static int __net_init nf_log_ipv4_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); } static void __net_exit nf_log_ipv4_net_exit(struct net *net) diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index 9414923f1e15..edf05002d674 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -88,8 +88,8 @@ gre_manip_pkt(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { - const struct gre_hdr *greh; - struct gre_hdr_pptp *pgreh; + const struct gre_base_hdr *greh; + struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ @@ -97,18 +97,19 @@ gre_manip_pkt(struct sk_buff *skb, return false; greh = (void *)skb->data + hdroff; - pgreh = (struct gre_hdr_pptp *)greh; + pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; - switch (greh->version) { - case GRE_VERSION_1701: + + switch (greh->flags & GRE_VERSION) { + case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; - case GRE_VERSION_PPTP: + case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index cd84d4295a20..805c8ddfe860 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -21,7 +21,7 @@ nft_do_chain_arp(void *priv, { struct nft_pktinfo pkt; - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); return nft_do_chain(&pkt, priv); } @@ -80,7 +80,10 @@ static int __init nf_tables_arp_init(void) { int ret; - nft_register_chain_type(&filter_arp); + ret = nft_register_chain_type(&filter_arp); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_arp_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_arp); diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index e44ba3b12fbb..2840a29b2e04 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -103,7 +103,10 @@ static int __init nf_tables_ipv4_init(void) { int ret; - nft_register_chain_type(&filter_ipv4); + ret = nft_register_chain_type(&filter_ipv4); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_ipv4); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9f665b63a927..7143ca1a6af9 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -46,6 +46,8 @@ #include <net/sock.h> #include <net/raw.h> +#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX) + /* * Report socket allocation statistics [mea@utu.fi] */ @@ -257,6 +259,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS), SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND), SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED), + SNMP_MIB_ITEM("TCPMD5Failure", LINUX_MIB_TCPMD5FAILURE), SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", 
LINUX_MIB_SACKSHIFTFALLBACK), @@ -355,22 +358,22 @@ static void icmp_put(struct seq_file *seq) atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs; seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); seq_puts(seq, " OutMsgs OutErrors"); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS)); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); - for (i = 0; icmpmibmap[i].name != NULL; i++) + for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); } @@ -378,14 +381,16 @@ static void icmp_put(struct seq_file *seq) /* * Called from the PROCfs module. This outputs /proc/net/snmp. */ -static int snmp_seq_show(struct seq_file *seq, void *v) +static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { - int i; struct net *net = seq->private; + u64 buff64[IPSTATS_MIB_MAX]; + int i; - seq_puts(seq, "Ip: Forwarding DefaultTTL"); + memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); - for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_puts(seq, "Ip: Forwarding DefaultTTL"); + for (i = 0; snmp4_ipstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", @@ -393,57 +398,77 @@ static int snmp_seq_show(struct seq_file *seq, void *v) net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); - for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) - seq_printf(seq, " %llu", - snmp_fold_field64(net->mib.ip_statistics, - snmp4_ipstats_list[i].entry, - offsetof(struct ipstats_mib, syncp))); + snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; snmp4_ipstats_list[i].name; i++) + seq_printf(seq, " %llu", buff64[i]); - icmp_put(seq); /* RFC 2011 compatibility */ - icmpmsg_put(seq); + return 0; +} + +static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) +{ + unsigned long buff[TCPUDP_MIB_MAX]; + struct net *net = seq->private; + int i; + + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name != NULL; i++) + for (i = 0; snmp4_tcp_list[i].name; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { + snmp_get_cpu_field_batch(buff, snmp4_tcp_list, + net->mib.tcp_statistics); + for (i = 0; snmp4_tcp_list[i].name; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) - seq_printf(seq, " %ld", - snmp_fold_field(net->mib.tcp_statistics, - snmp4_tcp_list[i].entry)); + seq_printf(seq, " %ld", buff[i]); else - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.tcp_statistics, - snmp4_tcp_list[i].entry)); + seq_printf(seq, " %lu", buff[i]); } + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + + 
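/* Editor's note: snmp_get_cpu_field_batch(), added elsewhere in this
 * merge, folds every counter in the item list during a single sweep over
 * the CPUs, roughly (editor's sketch of the helper, not this file's code):
 *
 *	for_each_possible_cpu(c)
 *		for (i = 0; itemlist[i].name; i++)
 *			buff[i] += snmp_get_cpu_field(mib, c, itemlist[i].entry);
 *
 * The old per-item snmp_fold_field() walked all CPUs once per counter.
 * The number of additions is the same, but each CPU's per-cpu MIB block
 * is now read in one pass, which is what makes /proc/net/snmp cheaper.
 */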
snmp_get_cpu_field_batch(buff, snmp4_udp_list, + net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); - seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.udp_statistics, - snmp4_udp_list[i].entry)); + for (i = 0; snmp4_udp_list[i].name; i++) + seq_printf(seq, " %lu", buff[i]); + + memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) + snmp_get_cpu_field_batch(buff, snmp4_udp_list, + net->mib.udplite_statistics); + for (i = 0; snmp4_udp_list[i].name; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); - seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name != NULL; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.udplite_statistics, - snmp4_udp_list[i].entry)); + for (i = 0; snmp4_udp_list[i].name; i++) + seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); return 0; } +static int snmp_seq_show(struct seq_file *seq, void *v) +{ + snmp_seq_show_ipstats(seq, v); + + icmp_put(seq); /* RFC 2011 compatibility */ + icmpmsg_put(seq); + + snmp_seq_show_tcp_udp(seq, v); + + return 0; +} + static int snmp_seq_open(struct inode *inode, struct file *file) { return single_open_net(inode, file, snmp_seq_show); @@ -468,21 +493,21 @@ static int netstat_seq_show(struct seq_file *seq, void *v) struct net *net = seq->private; seq_puts(seq, "TcpExt:"); - for (i = 0; snmp4_net_list[i].name != NULL; i++) + for (i = 0; snmp4_net_list[i].name; i++) seq_printf(seq, " %s", snmp4_net_list[i].name); seq_puts(seq, "\nTcpExt:"); - for (i = 0; snmp4_net_list[i].name != NULL; i++) + for (i = 0; snmp4_net_list[i].name; i++) seq_printf(seq, " %lu", snmp_fold_field(net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipextstats_list[i].name; i++) seq_printf(seq, " %s", snmp4_ipextstats_list[i].name); seq_puts(seq, "\nIpExt:"); - for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + for (i = 0; snmp4_ipextstats_list[i].name; i++) seq_printf(seq, " %llu", snmp_fold_field64(net->mib.ip_statistics, snmp4_ipextstats_list[i].entry, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 438f50c1a676..90a85c955872 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -606,12 +606,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); - if (!saddr && ipc.oif) { - err = l3mdev_get_saddr(net, ipc.oif, &fl4); - if (err < 0) - goto done; - } - if (!inet->hdrincl) { rfv.msg = msg; rfv.hlen = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 62c3ed0b7556..f2be689a6c85 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1252,7 +1252,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = 576; } - return min_t(unsigned int, mtu, IP_MAX_MTU); + mtu = min_t(unsigned int, mtu, IP_MAX_MTU); + + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) @@ -1835,7 +1837,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. 
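 * Editor's note: the ipv4_mtu() change above folds lightweight-tunnel
 * encapsulation overhead into the reported path MTU:
 *
 *	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
 *	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 *
 * lwtunnel_headroom() is 0 when the dst carries no lwt encap state, so
 * plain routes are unaffected; with, say, an 8-byte encap (an illustrative
 * figure, not from this patch) a 1500-byte MTU is reported as 1492, so
 * upper layers build packets that still fit after encapsulation.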
*/ fl4.flowi4_oif = 0; - fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev); + fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; @@ -2022,7 +2024,9 @@ static struct rtable *__mkroute_output(const struct fib_result *res, return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) - if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + if (ipv4_is_loopback(fl4->saddr) && + !(dev_out->flags & IFF_LOOPBACK) && + !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) @@ -2152,7 +2156,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, unsigned int flags = 0; struct fib_result res; struct rtable *rth; - int master_idx; int orig_oif; int err = -ENETUNREACH; @@ -2162,9 +2165,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, orig_oif = fl4->flowi4_oif; - master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif); - if (master_idx) - fl4->flowi4_oif = master_idx; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? @@ -2248,10 +2248,6 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } - - rth = l3mdev_get_rtable(dev_out, fl4); - if (rth) - goto out; } if (!fl4->daddr) { @@ -2269,8 +2265,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, if (err) { res.fi = NULL; res.table = NULL; - if (fl4->flowi4_oif && - !netif_index_is_l3_master(net, fl4->flowi4_oif)) { + if (fl4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2306,7 +2301,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, else fl4->saddr = fl4->daddr; } - dev_out = net->loopback_dev; + + /* L3 master device is the loopback for that domain */ + dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; @@ -2582,9 +2579,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; - if (netif_index_is_l3_master(net, fl4.flowi4_oif)) - fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF; - if (iif) { struct net_device *dev; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ffbb218de520..f253e5019d22 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -380,14 +380,14 @@ void tcp_init_sock(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - __skb_queue_head_init(&tp->out_of_order_queue); + tp->out_of_order_queue = RB_ROOT; tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); INIT_LIST_HEAD(&tp->tsq_node); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - tp->rtt_min[0].rtt = ~0U; + minmax_reset(&tp->rtt_min, tcp_time_stamp, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control @@ -396,6 +396,9 @@ void tcp_init_sock(struct sock *sk) */ tp->snd_cwnd = TCP_INIT_CWND; + /* There's a bubble in the pipe until at least the first ACK. */ + tp->app_limited = ~0U; + /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. 
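 * Editor's note: minmax_reset() above comes from the windowed min/max
 * estimator added in this merge (lib/win_minmax.c); BBR's bandwidth
 * filter below uses the max flavor. Illustrative usage, assuming
 * linux/win_minmax.h and names as in that header:
 *
 *	struct minmax rtt_min;
 *
 *	minmax_reset(&rtt_min, tcp_time_stamp, ~0U);     (seed: worst case)
 *	minmax_running_min(&rtt_min, win, now, rtt_us);  (each RTT sample)
 *	cur = minmax_get(&rtt_min);                      (windowed minimum)
 *
 * The filter keeps the best sample plus two runner-ups, so the estimate
 * can age out within the window without rescanning old samples.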
*/ @@ -1014,23 +1017,40 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); + + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + res = do_tcp_sendpages(sk, page, offset, size, flags); release_sock(sk); return res; } EXPORT_SYMBOL(tcp_sendpage); -static inline int select_size(const struct sock *sk, bool sg) +/* Do not bother using a page frag for very small frames. + * But use this heuristic only for the first skb in write queue. + * + * Having no payload in skb->head allows better SACK shifting + * in tcp_shift_skb_data(), reducing sack/rack overhead, because + * write queue has less skbs. + * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB. + * This also speeds up tso_fragment(), since it wont fallback + * to tcp_fragment(). + */ +static int linear_payload_sz(bool first_skb) +{ + if (first_skb) + return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + return 0; +} + +static int select_size(const struct sock *sk, bool sg, bool first_skb) { const struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sg) { if (sk_can_gso(sk)) { - /* Small frames wont use a full page: - * Payload will immediately follow tcp header. - */ - tmp = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER); + tmp = linear_payload_sz(first_skb); } else { int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); @@ -1101,6 +1121,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + tcp_rate_check_app_limited(sk); /* is sending application-limited? */ + /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. @@ -1161,6 +1183,8 @@ restart: } if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { + bool first_skb; + new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. @@ -1172,10 +1196,11 @@ new_segment: process_backlog = false; goto restart; } + first_skb = skb_queue_empty(&sk->sk_write_queue); skb = sk_stream_alloc_skb(sk, - select_size(sk, sg), + select_size(sk, sg, first_skb), sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + first_skb); if (!skb) goto wait_for_memory; @@ -1570,6 +1595,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); +int tcp_peek_len(struct socket *sock) +{ + return tcp_inq(sock->sk); +} +EXPORT_SYMBOL(tcp_peek_len); + /* * This routine copies from a sock struct into the user buffer. * @@ -2237,7 +2268,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; @@ -2681,7 +2712,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); - u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp, intv; unsigned int start; int notsent_bytes; u64 rate64; @@ -2771,6 +2802,15 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_in = tp->data_segs_in; info->tcpi_data_segs_out = tp->data_segs_out; + + info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
1 : 0; + rate = READ_ONCE(tp->rate_delivered); + intv = READ_ONCE(tp->rate_interval_us); + if (rate && intv) { + rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; + do_div(rate64, intv); + put_unaligned(rate64, &info->tcpi_delivery_rate); + } } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3092,23 +3132,6 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void) } EXPORT_SYMBOL(tcp_get_md5sig_pool); -int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, - const struct tcphdr *th) -{ - struct scatterlist sg; - struct tcphdr hdr; - - /* We are not allowed to change tcphdr, make a local copy */ - memcpy(&hdr, th, sizeof(hdr)); - hdr.check = 0; - - /* options aren't included in the hash */ - sg_init_one(&sg, &hdr, sizeof(hdr)); - ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr)); - return crypto_ahash_update(hp->md5_req); -} -EXPORT_SYMBOL(tcp_md5_hash_header); - int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, const struct sk_buff *skb, unsigned int header_len) { @@ -3255,11 +3278,12 @@ static void __init tcp_init_mem(void) void __init tcp_init(void) { - unsigned long limit; int max_rshare, max_wshare, cnt; + unsigned long limit; unsigned int i; - sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > + FIELD_SIZEOF(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c new file mode 100644 index 000000000000..0ea66c2c9344 --- /dev/null +++ b/net/ipv4/tcp_bbr.c @@ -0,0 +1,896 @@ +/* Bottleneck Bandwidth and RTT (BBR) congestion control + * + * BBR congestion control computes the sending rate based on the delivery + * rate (throughput) estimated from ACKs. In a nutshell: + * + * On each ACK, update our model of the network path: + * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) + * min_rtt = windowed_min(rtt, 10 seconds) + * pacing_rate = pacing_gain * bottleneck_bandwidth + * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) + * + * The core algorithm does not react directly to packet losses or delays, + * although BBR may adjust the size of next send per ACK when loss is + * observed, or adjust the sending rate if it estimates there is a + * traffic policer, in order to keep the drop rate reasonable. + * + * BBR is described in detail in: + * "BBR: Congestion-Based Congestion Control", + * Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh, + * Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016. + * + * There is a public e-mail list for discussing BBR development and testing: + * https://groups.google.com/forum/#!forum/bbr-dev + * + * NOTE: BBR *must* be used with the fq qdisc ("man tc-fq") with pacing enabled, + * since pacing is integral to the BBR design and implementation. + * BBR without pacing would not function properly, and may incur unnecessary + * high packet loss rates. + */ +#include <linux/module.h> +#include <net/tcp.h> +#include <linux/inet_diag.h> +#include <linux/inet.h> +#include <linux/random.h> +#include <linux/win_minmax.h> + +/* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. + * Since the minimum window is >=4 packets, the lower bound isn't + * an issue. The upper bound isn't an issue with existing technologies. 
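 * Editor's note, checking the unit arithmetic above (1500-byte packets,
 * as the comment assumes):
 *
 *	1 unit = 2^-24 pkt/usec = 10^6 / 2^24 pkt/sec ~= 0.0596 pkt/sec
 *	       ~= 0.0596 * 1500 bytes * 8 ~= 715 bit/sec
 *	u32 max ~= 4.29e9 units ~= 256 Mpkt/sec ~= 3.07 Tbit/sec
 *
 * so the quoted 715 bps to ~3 Tbps range is exactly the span of a u32 in
 * this fixed-point format.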
+ */ +#define BW_SCALE 24 +#define BW_UNIT (1 << BW_SCALE) + +#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */ +#define BBR_UNIT (1 << BBR_SCALE) + +/* BBR has the following modes for deciding how fast to send: */ +enum bbr_mode { + BBR_STARTUP, /* ramp up sending rate rapidly to fill pipe */ + BBR_DRAIN, /* drain any queue created during startup */ + BBR_PROBE_BW, /* discover, share bw: pace around estimated bw */ + BBR_PROBE_RTT, /* cut cwnd to min to probe min_rtt */ +}; + +/* BBR congestion control block */ +struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ + struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ + u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + struct skb_mstamp cycle_mstamp; /* time of this cycle phase start */ + u32 mode:3, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ + packet_conservation:1, /* use packet conservation? */ + restore_cwnd:1, /* decided to revert cwnd to old value */ + round_start:1, /* start of packet-timed tx->ack round? */ + tso_segs_goal:7, /* segments we want in each skb we send */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ + unused:5, + lt_is_sampling:1, /* taking long-term ("LT") samples now? */ + lt_rtt_cnt:7, /* round trips in long-term interval */ + lt_use_bw:1; /* use lt_bw as our bw estimate? */ + u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ + u32 lt_last_delivered; /* LT intvl start: tp->delivered */ + u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ + u32 lt_last_lost; /* LT intvl start: tp->lost */ + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_cnt:3, /* number of rounds without large bw gains */ + cycle_idx:3, /* current index in pacing_gain cycle array */ + unused_b:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ +}; + +#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ + +/* Window length of bw filter (in rounds): */ +static const int bbr_bw_rtts = CYCLE_LEN + 2; +/* Window length of min_rtt filter (in sec): */ +static const u32 bbr_min_rtt_win_sec = 10; +/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ +static const u32 bbr_probe_rtt_mode_ms = 200; +/* Skip TSO below the following bandwidth (bits/sec): */ +static const int bbr_min_tso_rate = 1200000; + +/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ +static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +/* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */ +static const int bbr_cwnd_gain = BBR_UNIT * 2; +/* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ +static const int bbr_pacing_gain[] = { + BBR_UNIT * 5 / 4, /* probe 
for more available bw */ + BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ + BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ + BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ +}; +/* Randomize the starting gain cycling phase over N phases: */ +static const u32 bbr_cycle_rand = 7; + +/* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet + * needs at least 4 packets in flight: + */ +static const u32 bbr_cwnd_min_target = 4; + +/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* If bw has increased significantly (1.25x), there may be more bw available: */ +static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; +/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ +static const u32 bbr_full_bw_cnt = 3; + +/* "long-term" ("LT") bandwidth estimator parameters... */ +/* The minimum number of rounds in an LT bw sampling interval: */ +static const u32 bbr_lt_intvl_min_rtts = 4; +/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +static const u32 bbr_lt_loss_thresh = 50; +/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +static const u32 bbr_lt_bw_diff = 4000 / 8; +/* If we estimate we're policed, use lt_bw for this many round trips: */ +static const u32 bbr_lt_bw_max_rtts = 48; + +/* Do we estimate that STARTUP filled the pipe? */ +static bool bbr_full_bw_reached(const struct sock *sk) +{ + const struct bbr *bbr = inet_csk_ca(sk); + + return bbr->full_bw_cnt >= bbr_full_bw_cnt; +} + +/* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ +static u32 bbr_max_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return minmax_get(&bbr->bw); +} + +/* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ +static u32 bbr_bw(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); +} + +/* Return rate in bytes per second, optionally with a gain. + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +{ + rate *= tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache); + rate *= gain; + rate >>= BBR_SCALE; + rate *= USEC_PER_SEC; + return rate >> BW_SCALE; +} + +/* Pace using current bw estimate and a gain factor. In order to help drive the + * network toward lower queues while maintaining high utilization and low + * latency, the average pacing rate aims to be slightly (~1%) lower than the + * estimated bandwidth. This is an important aspect of the design. In this + * implementation this slightly lower pacing rate is achieved implicitly by not + * including link-layer headers in the packet size used for the pacing rate. + */ +static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + sk->sk_pacing_rate = rate; +} + +/* Return count of segments we want in the skbs we send, or 0 for default. 
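 * Editor's note, a worked pass through bbr_rate_bytes_per_sec() above
 * with illustrative numbers: bw = 0.1 pkt/usec, mtu = 1500 bytes,
 * gain = BBR_UNIT (1.0):
 *
 *	rate  = 0.1 * 2^24          ~= 1677722  (bw in fixed-point units)
 *	rate *= 1500                ~= 2.52e9   (bytes/usec << BW_SCALE)
 *	rate  = rate * 256 >> 8      = unchanged for gain 1.0
 *	rate *= USEC_PER_SEC        ~= 2.52e15
 *	rate >>= BW_SCALE           ~= 1.5e8 bytes/sec  (= 1.2 Gbit/s)
 *
 * matching the expected 0.1 pkt/usec * 1500 bytes = 150 Mbyte/s, and
 * showing why the multiply order matters: the largest intermediate value,
 * ~2.52e15, still fits comfortably in a u64.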
*/ +static u32 bbr_tso_segs_goal(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return bbr->tso_segs_goal; +} + +static void bbr_set_tso_segs_goal(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 min_segs; + + min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; + bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), + 0x7FU); +} + +/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +static void bbr_save_cwnd(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) + bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ + bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); +} + +static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (event == CA_EVENT_TX_START && tp->app_limited) { + bbr->idle_restart = 1; + /* Avoid pointless buffer overflows: pace at est. bw if we don't + * need more speed (we're restarting from idle and app-limited). + */ + if (bbr->mode == BBR_PROBE_BW) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + } +} + +/* Find target cwnd. Right-size the cwnd based on min RTT and the + * estimated bottleneck bandwidth: + * + * cwnd = bw * min_rtt * gain = BDP * gain + * + * The key factor, gain, controls the amount of queue. While a small gain + * builds a smaller queue, it becomes more vulnerable to noise in RTT + * measurements (e.g., delayed ACKs or other ACK compression effects). This + * noise may cause BBR to under-estimate the rate. + * + * To achieve full performance in high-speed paths, we budget enough cwnd to + * fit full-sized skbs in-flight on both end hosts to fully utilize the path: + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine + * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because + * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ +static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd; + u64 w; + + /* If we've never had a valid RTT sample, cap cwnd at the initial + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which + * case we need to slow-start up toward something safe: TCP_INIT_CWND. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ + return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + + w = (u64)bw * bbr->min_rtt_us; + + /* Apply a gain to the given value, then remove the BW_SCALE shift. */ + cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; + + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd += 3 * bbr->tso_segs_goal; + + /* Reduce delayed ACKs by rounding up cwnd to the next even number. 
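 * Editor's note, a worked example of the whole calculation (illustrative
 * path: 10 Mbit/s, 1500-byte packets, min_rtt = 40 ms, gain = 2.0, i.e.
 * bbr_cwnd_gain, and tso_segs_goal = 2):
 *
 *	bw    = 10e6 / 8 / 1500  ~= 833 pkt/sec ~= 0.000833 pkt/usec
 *	BDP   = bw * min_rtt     ~= 33.3 packets
 *	cwnd  = ceil(BDP * 2.0)   = 67
 *	cwnd += 3 * 2             = 73  (headroom for end-system skbs)
 *	cwnd  = next even number  = 74 packets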
*/ + cwnd = (cwnd + 1) & ~1U; + + return cwnd; +} + +/* An optimization in BBR to reduce losses: On the first round of recovery, we + * follow the packet conservation principle: send P packets per P packets acked. + * After that, we slow-start and send at most 2*P packets per P packets acked. + * After recovery finishes, or upon undo, we restore the cwnd we had when + * recovery started (capped by the target cwnd based on estimated BDP). + * + * TODO(ycheng/ncardwell): implement a rate-based approach. + */ +static bool bbr_set_cwnd_to_recover_or_restore( + struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; + u32 cwnd = tp->snd_cwnd; + + /* An ACK for P pkts should release at most 2*P packets. We do this + * in two steps. First, here we deduct the number of lost packets. + * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. + */ + if (rs->losses > 0) + cwnd = max_t(s32, cwnd - rs->losses, 1); + + if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { + /* Starting 1st round of Recovery, so do packet conservation. */ + bbr->packet_conservation = 1; + bbr->next_rtt_delivered = tp->delivered; /* start round now */ + /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ + cwnd = tcp_packets_in_flight(tp) + acked; + } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { + /* Exiting loss recovery; restore cwnd saved before recovery. */ + bbr->restore_cwnd = 1; + bbr->packet_conservation = 0; + } + bbr->prev_ca_state = state; + + if (bbr->restore_cwnd) { + /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ + cwnd = max(cwnd, bbr->prior_cwnd); + bbr->restore_cwnd = 0; + } + + if (bbr->packet_conservation) { + *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); + return true; /* yes, using packet conservation */ + } + *new_cwnd = cwnd; + return false; +} + +/* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. + */ +static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + u32 acked, u32 bw, int gain) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cwnd = 0, target_cwnd = 0; + + if (!acked) + return; + + if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) + goto done; + + /* If we're below target cwnd, slow start cwnd toward target cwnd. */ + target_cwnd = bbr_target_cwnd(sk, bw, gain); + if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ + cwnd = min(cwnd + acked, target_cwnd); + else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) + cwnd = cwnd + acked; + cwnd = max(cwnd, bbr_cwnd_min_target); + +done: + tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ + tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); +} + +/* End cycle phase if it's time and/or we hit the phase's in-flight target. 
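 * Editor's note, tracing bbr_set_cwnd_to_recover_or_restore() above with
 * illustrative numbers. Before the ACK: snd_cwnd = 20, 15 packets in
 * flight, and this ACK moves the socket into TCP_CA_Recovery; it reports
 * acked = 2 and rs->losses = 1:
 *
 *	cwnd = max(20 - 1, 1)                    = 19  (deduct the loss)
 *	entering Recovery: cwnd = 15 + 2         = 17, conservation on
 *	conservation: new_cwnd = max(17, 15 + 2) = 17
 *
 * so an ACK for 2 packets releases at most 2 new packets, while the
 * pre-recovery cwnd of 20 (saved via bbr_save_cwnd()) is restored once
 * recovery ends.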
*/ +static bool bbr_is_next_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_full_length = + skb_mstamp_us_delta(&tp->delivered_mstamp, &bbr->cycle_mstamp) > + bbr->min_rtt_us; + u32 inflight, bw; + + /* The pacing_gain of 1.0 paces at the estimated bw to try to fully + * use the pipe without increasing the queue. + */ + if (bbr->pacing_gain == BBR_UNIT) + return is_full_length; /* just use wall clock time */ + + inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + bw = bbr_max_bw(sk); + + /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at + * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is + * small (e.g. on a LAN). We do not persist if packets are lost, since + * a path with small buffers may not hold that much. + */ + if (bbr->pacing_gain > BBR_UNIT) + return is_full_length && + (rs->losses || /* perhaps pacing_gain*BDP won't fit */ + inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain)); + + /* A pacing_gain < 1.0 tries to drain extra queue we added if bw + * probing didn't find more bw. If inflight falls to match BDP then we + * estimate queue is drained; persisting would underutilize the pipe. + */ + return is_full_length || + inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT); +} + +static void bbr_advance_cycle_phase(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); + bbr->cycle_mstamp = tp->delivered_mstamp; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; +} + +/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw && + bbr_is_next_cycle_phase(sk, rs)) + bbr_advance_cycle_phase(sk); +} + +static void bbr_reset_startup_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_STARTUP; + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; +} + +static void bbr_reset_probe_bw_mode(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->mode = BBR_PROBE_BW; + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = bbr_cwnd_gain; + bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); + bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +} + +static void bbr_reset_mode(struct sock *sk) +{ + if (!bbr_full_bw_reached(sk)) + bbr_reset_startup_mode(sk); + else + bbr_reset_probe_bw_mode(sk); +} + +/* Start a new long-term sampling interval. */ +static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_last_stamp = tp->delivered_mstamp.stamp_jiffies; + bbr->lt_last_delivered = tp->delivered; + bbr->lt_last_lost = tp->lost; + bbr->lt_rtt_cnt = 0; +} + +/* Completely reset long-term bandwidth sampling. */ +static void bbr_reset_lt_bw_sampling(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->lt_bw = 0; + bbr->lt_use_bw = 0; + bbr->lt_is_sampling = false; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 diff; + + if (bbr->lt_bw) { /* do we have bw from a previous interval? 
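 * Editor's note on the randomized phase pick in bbr_reset_probe_bw_mode()
 * above: prandom_u32_max(bbr_cycle_rand = 7) returns r in [0, 6], so
 * cycle_idx = CYCLE_LEN - 1 - r lands in [1, 7], and the immediate
 * bbr_advance_cycle_phase() moves to (cycle_idx + 1) & 7, one of
 * {2..7, 0}. PROBE_BW therefore never (re)starts in phase 1, the 3/4
 * drain phase, which only ever follows the 5/4 probe phase at index 0.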
*/ + /* Is new bw close to the lt_bw from the previous interval? */ + diff = abs(bw - bbr->lt_bw); + if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || + (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= + bbr_lt_bw_diff)) { + /* All criteria are met; estimate we're policed. */ + bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ + bbr->lt_use_bw = 1; + bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ + bbr->lt_rtt_cnt = 0; + return; + } + } + bbr->lt_bw = bw; + bbr_reset_lt_bw_sampling_interval(sk); +} + +/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of + * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and + * explicitly models their policed rate, to reduce unnecessary losses. We + * estimate that we're policed if we see 2 consecutive sampling intervals with + * consistent throughput and high packet loss. If we think we're being policed, + * set lt_bw to the "long-term" average delivery rate from those 2 intervals. + */ +static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 lost, delivered; + u64 bw; + s32 t; + + if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ + if (bbr->mode == BBR_PROBE_BW && bbr->round_start && + ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { + bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ + bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ + } + return; + } + + /* Wait for the first loss before sampling, to let the policer exhaust + * its tokens and estimate the steady-state rate allowed by the policer. + * Starting samples earlier includes bursts that over-estimate the bw. + */ + if (!bbr->lt_is_sampling) { + if (!rs->losses) + return; + bbr_reset_lt_bw_sampling_interval(sk); + bbr->lt_is_sampling = true; + } + + /* To avoid underestimates, reset sampling if we run out of data. */ + if (rs->is_app_limited) { + bbr_reset_lt_bw_sampling(sk); + return; + } + + if (bbr->round_start) + bbr->lt_rtt_cnt++; /* count round trips in this interval */ + if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) + return; /* sampling interval needs to be longer */ + if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { + bbr_reset_lt_bw_sampling(sk); /* interval is too long */ + return; + } + + /* End sampling interval when a packet is lost, so we estimate the + * policer tokens were exhausted. Stopping the sampling before the + * tokens are exhausted under-estimates the policed rate. + */ + if (!rs->losses) + return; + + /* Calculate packets lost and delivered in sampling interval. */ + lost = tp->lost - bbr->lt_last_lost; + delivered = tp->delivered - bbr->lt_last_delivered; + /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ + if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) + return; + + /* Find average delivery rate in this sampling interval. */ + t = (s32)(tp->delivered_mstamp.stamp_jiffies - bbr->lt_last_stamp); + if (t < 1) + return; /* interval is less than one jiffy, so wait */ + t = jiffies_to_usecs(t); + /* Interval long enough for jiffies_to_usecs() to return a bogus 0? 
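
The loss-rate test above is done without a division; as a worked example, assuming bbr_lt_loss_thresh = 50 (i.e. 50/256, roughly 20%) as set near the top of this file:

#include <stdio.h>

#define BBR_SCALE 8

int main(void)
{
        unsigned int lt_loss_thresh = 50;          // assumed module constant
        unsigned int delivered = 1000, lost = 250; // 25% loss in the interval

        // lost/delivered >= thresh/2^BBR_SCALE, rearranged to avoid division:
        if ((lost << BBR_SCALE) >= lt_loss_thresh * delivered)
                printf("lossy enough: close the sampling interval\n");
        else
                printf("keep sampling\n");
        return 0;
}
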
*/ + if (t < 1) { + bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ + return; + } + bw = (u64)delivered * BW_UNIT; + do_div(bw, t); + bbr_lt_bw_interval_done(sk, bw); +} + +/* Estimate the bandwidth based on how fast packets are delivered */ +static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->round_start = 0; + if (rs->delivered < 0 || rs->interval_us <= 0) + return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ + if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + bbr->next_rtt_delivered = tp->delivered; + bbr->rtt_cnt++; + bbr->round_start = 1; + bbr->packet_conservation = 0; + } + + bbr_lt_bw_sampling(sk, rs); + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. + */ + bw = (u64)rs->delivered * BW_UNIT; + do_div(bw, rs->interval_us); + + /* If this sample is application-limited, it is likely to have a very + * low delivered count that represents application behavior rather than + * the available network rate. Such a sample could drag down estimated + * bw, causing needless slow-down. Thus, to continue to send at the + * last measured network rate, we filter out app-limited samples unless + * they describe the path bw at least as well as our bw model. + * + * So the goal during app-limited phase is to proceed with the best + * network rate no matter how long. We automatically leave this + * phase when app writes faster than the network can deliver :) + */ + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { + /* Incorporate new sample into our max bw filter. */ + minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + } +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh; + + if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) + return; + + bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; + if (bbr_max_bw(sk) >= bw_thresh) { + bbr->full_bw = bbr_max_bw(sk); + bbr->full_bw_cnt = 0; + return; + } + ++bbr->full_bw_cnt; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. 
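
To make the STARTUP-exit numbers concrete: assuming bbr_full_bw_thresh = BBR_UNIT * 5 / 4 and bbr_full_bw_cnt = 3 as set near the top of this file, the pipe is deemed full once the max bw estimate fails to grow by 25% for three consecutive non-app-limited rounds:

#include <stdio.h>

#define BBR_SCALE 8

int main(void)
{
        unsigned long long full_bw = 1000;                       // baseline
        unsigned int full_bw_thresh = (1 << BBR_SCALE) * 5 / 4;  // 320
        unsigned long long bw_thresh = full_bw * full_bw_thresh >> BBR_SCALE;

        // a new max-bw sample must reach 1250 to reset full_bw_cnt; three
        // rounds below that and bbr_full_bw_reached() becomes true
        printf("need bw >= %llu to keep STARTUP going\n", bw_thresh);
        return 0;
}
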
*/ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ + bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) + bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +} + +/* The goal of PROBE_RTT mode is to have BBR flows cooperatively and + * periodically drain the bottleneck queue, to converge to measure the true + * min_rtt (unloaded propagation delay). This allows the flows to keep queues + * small (reducing queuing delay and packet loss) and achieve fairness among + * BBR flows. + * + * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, + * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed + * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * re-enter the previous mode. BBR uses 200ms to approximately bound the + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * + * Note that flows need only pay 2% if they are busy sending over the last 10 + * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have + * natural silences or low-rate periods within 10 seconds where the rate is low + * enough for long enough to drain its queue in the bottleneck. We pick up + * these min RTT measurements opportunistically with our min_rtt filter. :-) + */ +static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool filter_expired; + + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + filter_expired = after(tcp_time_stamp, + bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + if (rs->rtt_us >= 0 && + (rs->rtt_us <= bbr->min_rtt_us || filter_expired)) { + bbr->min_rtt_us = rs->rtt_us; + bbr->min_rtt_stamp = tcp_time_stamp; + } + + if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; + } + + if (bbr->mode == BBR_PROBE_RTT) { + /* Ignore low rate samples during this mode. */ + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
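
The "roughly 2%" above is simply the PROBE_RTT duty cycle; a quick check of the bound, using the bbr_probe_rtt_mode_ms = 200 and bbr_min_rtt_win_sec = 10 values described:

#include <stdio.h>

int main(void)
{
        unsigned int probe_rtt_ms = 200;    // time spent capped at 4 packets
        unsigned int min_rtt_win_sec = 10;  // min_rtt filter window

        double penalty = (double)probe_rtt_ms / (min_rtt_win_sec * 1000);
        printf("worst-case throughput penalty ~= %.0f%%\n", penalty * 100);
        return 0;
}
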
*/ + if (!bbr->probe_rtt_done_stamp && + tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + bbr->probe_rtt_done_stamp = tcp_time_stamp + + msecs_to_jiffies(bbr_probe_rtt_mode_ms); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { + if (bbr->round_start) + bbr->probe_rtt_round_done = 1; + if (bbr->probe_rtt_round_done && + after(tcp_time_stamp, bbr->probe_rtt_done_stamp)) { + bbr->min_rtt_stamp = tcp_time_stamp; + bbr->restore_cwnd = 1; /* snap to prior_cwnd */ + bbr_reset_mode(sk); + } + } + } + bbr->idle_restart = 0; +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +{ + bbr_update_bw(sk, rs); + bbr_update_cycle_phase(sk, rs); + bbr_check_full_bw_reached(sk, rs); + bbr_check_drain(sk, rs); + bbr_update_min_rtt(sk, rs); +} + +static void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw; + + bbr_update_model(sk, rs); + + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_tso_segs_goal(sk); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + bbr->prior_cwnd = 0; + bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ + bbr->rtt_cnt = 0; + bbr->next_rtt_delivered = 0; + bbr->prev_ca_state = TCP_CA_Open; + bbr->packet_conservation = 0; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_time_stamp; + + minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + + /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); + sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ + bbr_set_pacing_rate(sk, bw, bbr_high_gain); + + bbr->restore_cwnd = 0; + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp.v64 = 0; + bbr->cycle_idx = 0; + bbr_reset_lt_bw_sampling(sk); + bbr_reset_startup_mode(sk); +} + +static u32 bbr_sndbuf_expand(struct sock *sk) +{ + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; +} + +/* In theory BBR does not need to undo the cwnd since it does not + * always reduce cwnd on losses (see bbr_main()). Keep it for now. + */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_cwnd; +} + +/* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
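
bbr_init() above seeds the pacing rate from the initial cwnd and srtt before any bw samples exist. A rough userspace rendering of that arithmetic, assuming bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1 (about 2.89, i.e. 2/ln 2) from earlier in this file:

#include <stdio.h>

#define BBR_SCALE 8
#define BBR_UNIT (1 << BBR_SCALE)

int main(void)
{
        unsigned long long cwnd = 10, mss = 1448;  // init cwnd, typical MSS
        unsigned long long srtt_us = 100000;       // 100 ms
        unsigned long long high_gain = BBR_UNIT * 2885 / 1000 + 1;  // ~2.89

        // bytes/sec = high_gain * (cwnd * mss / srtt)
        unsigned long long rate =
                (cwnd * mss * 1000000ULL / srtt_us) * high_gain >> BBR_SCALE;
        printf("initial pacing rate ~= %llu bytes/sec\n", rate);
        return 0;
}
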
*/ +static u32 bbr_ssthresh(struct sock *sk) +{ + bbr_save_cwnd(sk); + return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ +} + +static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, + union tcp_cc_info *info) +{ + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw = bbr_bw(sk); + + bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; + memset(&info->bbr, 0, sizeof(info->bbr)); + info->bbr.bbr_bw_lo = (u32)bw; + info->bbr.bbr_bw_hi = (u32)(bw >> 32); + info->bbr.bbr_min_rtt = bbr->min_rtt_us; + info->bbr.bbr_pacing_gain = bbr->pacing_gain; + info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + *attr = INET_DIAG_BBRINFO; + return sizeof(info->bbr); + } + return 0; +} + +static void bbr_set_state(struct sock *sk, u8 new_state) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { + struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; + bbr->full_bw = 0; + bbr->round_start = 1; /* treat RTO like end of a round */ + bbr_lt_bw_sampling(sk, &rs); + } +} + +static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { + .flags = TCP_CONG_NON_RESTRICTED, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, + .tso_segs_goal = bbr_tso_segs_goal, + .get_info = bbr_get_info, + .set_state = bbr_set_state, +}; + +static int __init bbr_register(void) +{ + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_bbr_cong_ops); +} + +static void __exit bbr_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_bbr_cong_ops); +} + +module_init(bbr_register); +module_exit(bbr_unregister); + +MODULE_AUTHOR("Van Jacobson <vanj@google.com>"); +MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>"); +MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>"); +MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 03725b294286..35b280361cb2 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -56,7 +56,7 @@ MODULE_PARM_DESC(use_shadow, "use shadow window heuristic"); module_param(use_tolerance, bool, 0644); MODULE_PARM_DESC(use_tolerance, "use loss tolerance heuristic"); -struct minmax { +struct cdg_minmax { union { struct { s32 min; @@ -74,10 +74,10 @@ enum cdg_state { }; struct cdg { - struct minmax rtt; - struct minmax rtt_prev; - struct minmax *gradients; - struct minmax gsum; + struct cdg_minmax rtt; + struct cdg_minmax rtt_prev; + struct cdg_minmax *gradients; + struct cdg_minmax gsum; bool gfilled; u8 tail; u8 state; @@ -353,7 +353,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - struct minmax *gradients; + struct cdg_minmax *gradients; switch (ev) { case CA_EVENT_CWND_RESTART: diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 882caa4e72bc..1294af4e0127 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -69,7 +69,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid) { + if (!ca->ssthresh || 
!(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a756b8749a26..a27b9c0e27c0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -289,6 +289,7 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; @@ -309,7 +310,8 @@ static void tcp_sndbuf_expand(struct sock *sk) * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to POLLOUT) */ - sndmem = 2 * nr_segs * per_mss; + sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; + sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); @@ -899,12 +901,29 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } +/* Sum the number of packets on the wire we have marked as lost. + * There are two cases we care about here: + * a) Packet hasn't been marked lost (nor retransmitted), + * and this is the first loss. + * b) Packet has been marked both lost and retransmitted, + * and this means we think it was lost again. + */ +static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) +{ + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (!(sacked & TCPCB_LOST) || + ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) + tp->lost += tcp_skb_pcount(skb); +} + static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) { if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tcp_verify_retransmit_hint(tp, skb); tp->lost_out += tcp_skb_pcount(skb); + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } } @@ -913,6 +932,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) { tcp_verify_retransmit_hint(tp, skb); + tcp_sum_lost(tp, skb); if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; @@ -1094,6 +1114,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct rate_sample *rate; int flag; }; @@ -1261,6 +1282,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1311,6 +1333,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); + if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp.v64)) + TCP_SKB_CB(prev)->tx.delivered_mstamp.v64 = 0; + tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1540,6 +1565,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, dup_sack, tcp_skb_pcount(skb), &skb->skb_mstamp); + tcp_rate_skb_delivered(sk, skb, state->rate); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1622,8 +1648,10 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); - if (found_dup_sack) + if (found_dup_sack) { state->flag |= 
FLAG_DSACKING_ACK; + tp->delivered++; /* A spurious retransmission is delivered */ + } /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1890,6 +1918,7 @@ void tcp_enter_loss(struct sock *sk) struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ + bool mark_lost; /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1923,8 +1952,12 @@ void tcp_enter_loss(struct sock *sk) if (skb == tcp_send_head(sk)) break; + mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || + is_reneg); + if (mark_lost) + tcp_sum_lost(tp, skb); TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { + if (mark_lost) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -2502,6 +2535,9 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + if (inet_csk(sk)->icsk_ca_ops->cong_control) + return; + /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { @@ -2878,67 +2914,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, *rexmit = REXMIT_LOST; } -/* Kathleen Nichols' algorithm for tracking the minimum value of - * a data stream over some fixed time interval. (E.g., the minimum - * RTT over the past five minutes.) It uses constant space and constant - * time per update yet almost always delivers the same minimum as an - * implementation that has to keep all the data in the window. - * - * The algorithm keeps track of the best, 2nd best & 3rd best min - * values, maintaining an invariant that the measurement time of the - * n'th best >= n-1'th best. It also makes sure that the three values - * are widely separated in the time window since that bounds the worse - * case error when that data is monotonically increasing over the window. - * - * Upon getting a new min, we can forget everything earlier because it - * has no value - the new min is <= everything else in the window by - * definition and it's the most recent. So we restart fresh on every new min - * and overwrites 2nd & 3rd choices. The same property holds for 2nd & 3rd - * best. - */ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us) { - const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ; - struct rtt_meas *m = tcp_sk(sk)->rtt_min; - struct rtt_meas rttm = { - .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1), - .ts = now, - }; - u32 elapsed; - - /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */ - if (unlikely(rttm.rtt <= m[0].rtt)) - m[0] = m[1] = m[2] = rttm; - else if (rttm.rtt <= m[1].rtt) - m[1] = m[2] = rttm; - else if (rttm.rtt <= m[2].rtt) - m[2] = rttm; - - elapsed = now - m[0].ts; - if (unlikely(elapsed > wlen)) { - /* Passed entire window without a new min so make 2nd choice - * the new min & 3rd choice the new 2nd. So forth and so on. - */ - m[0] = m[1]; - m[1] = m[2]; - m[2] = rttm; - if (now - m[0].ts > wlen) { - m[0] = m[1]; - m[1] = rttm; - if (now - m[0].ts > wlen) - m[0] = rttm; - } - } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) { - /* Passed a quarter of the window without a new min so - * take 2nd choice from the 2nd quarter of the window. 
- */ - m[2] = m[1] = rttm; - } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) { - /* Passed half the window without a new min so take the 3rd - * choice from the last half of the window. - */ - m[2] = rttm; - } + struct tcp_sock *tp = tcp_sk(sk); + u32 wlen = sysctl_tcp_min_rtt_wlen * HZ; + + minmax_running_min(&tp->rtt_min, wlen, tcp_time_stamp, + rtt_us ? : jiffies_to_usecs(1)); } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, @@ -3101,10 +3083,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, + struct skb_mstamp *now) { const struct inet_connection_sock *icsk = inet_csk(sk); - struct skb_mstamp first_ackt, last_ackt, now; + struct skb_mstamp first_ackt, last_ackt; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3136,7 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, acked_pcount = tcp_tso_acked(sk, skb); if (!acked_pcount) break; - fully_acked = false; } else { /* Speedup tcp_unlink_write_queue() and next loop */ @@ -3172,6 +3154,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->packets_out -= acked_pcount; pkts_acked += acked_pcount; + tcp_rate_skb_delivered(sk, skb, sack->rate); /* Initial outgoing SYN's get put onto the write_queue * just like anything else we transmit. It is not @@ -3204,16 +3187,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - skb_mstamp_get(&now); if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) { - seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + seq_rtt_us = skb_mstamp_us_delta(now, &first_ackt); + ca_rtt_us = skb_mstamp_us_delta(now, &last_ackt); } if (sack->first_sackt.v64) { - sack_rtt_us = skb_mstamp_us_delta(&now, &sack->first_sackt); - ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt); + sack_rtt_us = skb_mstamp_us_delta(now, &sack->first_sackt); + ca_rtt_us = skb_mstamp_us_delta(now, &sack->last_sackt); } - + sack->rate->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet, or -1 */ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, ca_rtt_us); @@ -3241,7 +3223,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); } else if (skb && rtt_update && sack_rtt_us >= 0 && - sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { + sack_rtt_us > skb_mstamp_us_delta(now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3332,8 +3314,15 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) * information. All transmission or retransmission are delayed afterwards. 
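
tcp_update_rtt_min() above now delegates to the windowed min/max filter this series adds in lib/win_minmax.c. To show the shape of that API before moving on: a toy, single-sample windowed minimum (the real filter keeps the best three samples to bound the error; this sketch is only illustrative):

#include <stdio.h>

struct toy_minmax { unsigned int t, v; };

static unsigned int toy_running_min(struct toy_minmax *m, unsigned int win,
                                    unsigned int t, unsigned int meas)
{
        if (meas <= m->v || t - m->t > win) {  // new min, or window expired
                m->t = t;
                m->v = meas;
        }
        return m->v;
}

int main(void)
{
        struct toy_minmax m = { .t = 0, .v = ~0U };
        unsigned int rtt[] = { 40, 35, 50, 60 };

        for (unsigned int now = 0; now < 4; now++)  // prints 40 35 35 35
                printf("t=%u min=%u\n", now,
                       toy_running_min(&m, 10, now, rtt[now]));
        return 0;
}
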
*/ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, - int flag) + int flag, const struct rate_sample *rs) { + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->cong_control) { + icsk->icsk_ca_ops->cong_control(sk, rs); + return; + } + if (tcp_in_cwnd_reduction(sk)) { /* Reduce cwnd if state mandates */ tcp_cwnd_reduction(sk, acked_sacked, flag); @@ -3578,17 +3567,21 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_sacktag_state sack_state; + struct rate_sample rs = { .prior_delivered = 0 }; u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - u32 prior_delivered = tp->delivered; + u32 delivered = tp->delivered; + u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + struct skb_mstamp now; sack_state.first_sackt.v64 = 0; + sack_state.rate = &rs; /* We very likely will need to access write queue head. */ prefetchw(sk->sk_write_queue.next); @@ -3611,6 +3604,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; + skb_mstamp_get(&now); + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3621,6 +3616,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) } prior_fackets = tp->fackets_out; + rs.prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3676,7 +3672,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state); + &sack_state, &now); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3693,7 +3689,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + lost = tp->lost - lost; /* freshly marked lost */ + tcp_rate_gen(sk, delivered, lost, &now, &rs); + tcp_cong_control(sk, ack, delivered, flag, &rs); tcp_xmit_recovery(sk, rexmit); return 1; @@ -4107,7 +4106,7 @@ void tcp_fin(struct sock *sk) /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_mem_reclaim(sk); @@ -4267,7 +4266,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) int this_sack; /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
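
tcp_cong_control() above hands full cwnd and pacing control to any module that sets the new cong_control hook, passing it the rate_sample assembled during tcp_ack(). Combined with the relaxed registration check in tcp_cong.c, a hypothetical minimal module (a sketch, not part of this patch) needs only:

#include <net/tcp.h>

static void noop_cong_control(struct sock *sk, const struct rate_sample *rs)
{
        // full control: this hook now owns snd_cwnd and sk_pacing_rate
}

static u32 noop_ssthresh(struct sock *sk)
{
        return TCP_INFINITE_SSTHRESH;  // opt out of conventional ssthresh
}

static u32 noop_undo_cwnd(struct sock *sk)
{
        return tcp_sk(sk)->snd_cwnd;
}

static struct tcp_congestion_ops noop_cc __read_mostly = {
        .name         = "noop_example",
        .owner        = THIS_MODULE,
        .cong_control = noop_cong_control,  // satisfies cong_avoid||cong_control
        .ssthresh     = noop_ssthresh,      // still required by the check
        .undo_cwnd    = noop_undo_cwnd,
};

It would be registered as usual with tcp_register_congestion_control(&noop_cc).
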
*/ - if (skb_queue_empty(&tp->out_of_order_queue)) { + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tp->rx_opt.num_sacks = 0; return; } @@ -4343,10 +4342,13 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; + bool fin, fragstolen, eaten; struct sk_buff *skb, *tail; - bool fragstolen, eaten; + struct rb_node *p; - while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + p = rb_first(&tp->out_of_order_queue); + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4356,9 +4358,10 @@ static void tcp_ofo_queue(struct sock *sk) dsack_high = TCP_SKB_CB(skb)->end_seq; tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + p = rb_next(p); + rb_erase(&skb->rbnode, &tp->out_of_order_queue); - __skb_unlink(skb, &tp->out_of_order_queue); - if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { SOCK_DEBUG(sk, "ofo packet was already received\n"); tcp_drop(sk, skb); continue; @@ -4370,12 +4373,19 @@ static void tcp_ofo_queue(struct sock *sk) tail = skb_peek_tail(&sk->sk_receive_queue); eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); + fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) __skb_queue_tail(&sk->sk_receive_queue, skb); - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - tcp_fin(sk); - if (eaten) + else kfree_skb_partial(skb, fragstolen); + + if (unlikely(fin)) { + tcp_fin(sk); + /* tcp_fin() purges tp->out_of_order_queue, + * so we must end this loop right now. + */ + break; + } } } @@ -4391,12 +4401,9 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, skb, size)) { + while (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - - if (!sk_rmem_schedule(sk, skb, size)) - return -1; } } return 0; @@ -4405,8 +4412,10 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + struct rb_node **p, *q, *parent; struct sk_buff *skb1; u32 seq, end_seq; + bool fragstolen; tcp_ecn_check_ce(tp, skb); @@ -4421,88 +4430,92 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", - tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + tp->rcv_nxt, seq, end_seq); - skb1 = skb_peek_tail(&tp->out_of_order_queue); - if (!skb1) { + p = &tp->out_of_order_queue.rb_node; + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. 
*/ if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; - tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; - tp->selective_acks[0].end_seq = - TCP_SKB_CB(skb)->end_seq; + tp->selective_acks[0].start_seq = seq; + tp->selective_acks[0].end_seq = end_seq; } - __skb_queue_head(&tp->out_of_order_queue, skb); + rb_link_node(&skb->rbnode, NULL, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); + tp->ooo_last_skb = skb; goto end; } - seq = TCP_SKB_CB(skb)->seq; - end_seq = TCP_SKB_CB(skb)->end_seq; - - if (seq == TCP_SKB_CB(skb1)->end_seq) { - bool fragstolen; - - if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); - } else { - tcp_grow_window(sk, skb); - kfree_skb_partial(skb, fragstolen); - skb = NULL; - } - - if (!tp->rx_opt.num_sacks || - tp->selective_acks[0].end_seq != seq) - goto add_sack; - - /* Common case: data arrive in order after hole. */ - tp->selective_acks[0].end_seq = end_seq; - goto end; - } - - /* Find place to insert this segment. */ - while (1) { - if (!after(TCP_SKB_CB(skb1)->seq, seq)) - break; - if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) { - skb1 = NULL; - break; - } - skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1); - } - - /* Do skb overlap to previous one? */ - if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { - if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - /* All the bits are present. Drop. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); - skb = NULL; - tcp_dsack_set(sk, seq, end_seq); - goto add_sack; + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. + */ + if (tcp_try_coalesce(sk, tp->ooo_last_skb, skb, &fragstolen)) { +coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); + skb = NULL; + goto add_sack; + } + /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ + if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { + parent = &tp->ooo_last_skb->rbnode; + p = &parent->rb_right; + goto insert; + } + + /* Find place to insert this segment. Handle overlaps on the way. */ + parent = NULL; + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(seq, TCP_SKB_CB(skb1)->seq)) { + p = &parent->rb_left; + continue; } - if (after(seq, TCP_SKB_CB(skb1)->seq)) { - /* Partial overlap. */ - tcp_dsack_set(sk, seq, - TCP_SKB_CB(skb1)->end_seq); - } else { - if (skb_queue_is_first(&tp->out_of_order_queue, - skb1)) - skb1 = NULL; - else - skb1 = skb_queue_prev( - &tp->out_of_order_queue, - skb1); + if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + /* skb's seq == skb1's seq and skb covers skb1. + * Replace skb1 with skb. 
+ */ + rb_replace_node(&skb1->rbnode, &skb->rbnode, + &tp->out_of_order_queue); + tcp_dsack_extend(sk, + TCP_SKB_CB(skb1)->seq, + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); + __kfree_skb(skb1); + goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { + goto coalesce_done; } + p = &parent->rb_right; } - if (!skb1) - __skb_queue_head(&tp->out_of_order_queue, skb); - else - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); +insert: + /* Insert segment into RB tree. */ + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); - /* And clean segments covered by new one as whole. */ - while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) { - skb1 = skb_queue_next(&tp->out_of_order_queue, skb); +merge_right: + /* Remove other segments covered by skb. */ + while ((q = rb_next(&skb->rbnode)) != NULL) { + skb1 = rb_entry(q, struct sk_buff, rbnode); if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; @@ -4511,12 +4524,15 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq); break; } - __skb_unlink(skb1, &tp->out_of_order_queue); + rb_erase(&skb1->rbnode, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } + /* If there is no skb after us, we are the last_skb ! */ + if (!q) + tp->ooo_last_skb = skb; add_sack: if (tcp_is_sack(tp)) @@ -4653,13 +4669,13 @@ queue_and_out: if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); - if (!skb_queue_empty(&tp->out_of_order_queue)) { + if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_ofo_queue(sk); /* RFC2581. 4.2. SHOULD send immediate ACK, when * gap in queue is filled. */ - if (skb_queue_empty(&tp->out_of_order_queue)) + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) inet_csk(sk)->icsk_ack.pingpong = 0; } @@ -4713,48 +4729,76 @@ drop: tcp_data_queue_ofo(sk, skb); } +static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) +{ + if (list) + return !skb_queue_is_last(list, skb) ? skb->next : NULL; + + return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); +} + static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *list) + struct sk_buff_head *list, + struct rb_root *root) { - struct sk_buff *next = NULL; + struct sk_buff *next = tcp_skb_next(skb, list); - if (!skb_queue_is_last(list, skb)) - next = skb_queue_next(list, skb); + if (list) + __skb_unlink(skb, list); + else + rb_erase(&skb->rbnode, root); - __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_entry(parent, struct sk_buff, rbnode); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * - * If tail is NULL, this means until the end of the list. + * If tail is NULL, this means until the end of the queue. 
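
Since the call sites above switch from the sk_buff_head API to explicit rbtree walking, for orientation: an in-order traversal of the new queue uses the standard rbtree helpers, e.g. this hypothetical helper mirroring tcp_ofo_queue()'s pattern:

static void ofo_walk_example(struct tcp_sock *tp)
{
        struct rb_node *p;
        struct sk_buff *skb;

        for (p = rb_first(&tp->out_of_order_queue); p; p = rb_next(p)) {
                skb = rb_entry(p, struct sk_buff, rbnode);
                // skbs are visited in ascending TCP_SKB_CB(skb)->seq order
                (void)skb;  // placeholder for per-skb processing
        }
}
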
* * Segments with FIN/SYN are not collapsed (only because this * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff_head *list, - struct sk_buff *head, struct sk_buff *tail, - u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) { - struct sk_buff *skb, *n; + struct sk_buff *skb = head, *n; + struct sk_buff_head tmp; bool end_of_skbs; /* First, check that queue is collapsible and find - * the point where collapsing can be useful. */ - skb = head; + * the point where collapsing can be useful. + */ restart: - end_of_skbs = true; - skb_queue_walk_from_safe(list, skb, n) { - if (skb == tail) - break; + for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { + n = tcp_skb_next(skb, list); + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb) break; goto restart; @@ -4772,13 +4816,10 @@ restart: break; } - if (!skb_queue_is_last(list, skb)) { - struct sk_buff *next = skb_queue_next(list, skb); - if (next != tail && - TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { - end_of_skbs = false; - break; - } + if (n && n != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { + end_of_skbs = false; + break; } /* Decided to skip this, advance start seq. */ @@ -4788,17 +4829,22 @@ restart: (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; + __skb_queue_head_init(&tmp); + while (before(start, end)) { int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) - return; + break; memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_queue_before(list, skb, nskb); + if (list) + __skb_queue_before(list, skb, nskb); + else + __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. */ @@ -4816,14 +4862,17 @@ restart: start += size; } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { - skb = tcp_collapse_one(sk, skb, list); + skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) - return; + goto end; } } } +end: + skb_queue_walk_safe(&tmp, skb, n) + tcp_rbtree_insert(root, skb); } /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs @@ -4832,70 +4881,86 @@ restart: static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); - struct sk_buff *head; + struct sk_buff *skb, *head; + struct rb_node *p; u32 start, end; - if (!skb) + p = rb_first(&tp->out_of_order_queue); + skb = rb_entry_safe(p, struct sk_buff, rbnode); +new_range: + if (!skb) { + p = rb_last(&tp->out_of_order_queue); + /* Note: it is possible that p is NULL here. We do not + * use rb_entry_safe(), as ooo_last_skb is valid only + * if rbtree is not empty. 
+ */ + tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); return; - + } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; - head = skb; - - for (;;) { - struct sk_buff *next = NULL; - if (!skb_queue_is_last(&tp->out_of_order_queue, skb)) - next = skb_queue_next(&tp->out_of_order_queue, skb); - skb = next; + for (head = skb;;) { + skb = tcp_skb_next(skb, NULL); - /* Segment is terminated when we see gap or when - * we are at the end of all the queue. */ + /* Range is terminated when we see a gap or when + * we are at the queue end. + */ if (!skb || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, &tp->out_of_order_queue, + tcp_collapse(sk, NULL, &tp->out_of_order_queue, head, skb, start, end); - head = skb; - if (!skb) - break; - /* Start new segment */ + goto new_range; + } + + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) end = TCP_SKB_CB(skb)->end_seq; - } else { - if (before(TCP_SKB_CB(skb)->seq, start)) - start = TCP_SKB_CB(skb)->seq; - if (after(TCP_SKB_CB(skb)->end_seq, end)) - end = TCP_SKB_CB(skb)->end_seq; - } } } /* - * Purge the out-of-order queue. - * Return true if queue was pruned. + * Clean the out-of-order queue to make room. + * We drop packets with high sequence numbers to: + * 1) Leave a chance for holes to be filled. + * 2) Not add too big latencies if thousands of packets sit there. + * (But if application shrinks SO_RCVBUF, we could still end up + * freeing whole queue here) + * + * Return true if queue has shrunk. */ static bool tcp_prune_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - bool res = false; + struct rb_node *node, *prev; - if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); - __skb_queue_purge(&tp->out_of_order_queue); + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(&tp->rx_opt); + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); + tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); sk_mem_reclaim(sk); - res = true; - } - return res; + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !tcp_under_memory_pressure(sk)) + break; + node = prev; + } while (node); + tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + return true; } /* Reduce allocated memory if we can, trying to get @@ -4920,7 +4985,7 @@ static int tcp_prune_queue(struct sock *sk) tcp_collapse_ofo_queue(sk); if (!skb_queue_empty(&sk->sk_receive_queue)) - tcp_collapse(sk, &sk->sk_receive_queue, + tcp_collapse(sk, &sk->sk_receive_queue, NULL, skb_peek(&sk->sk_receive_queue), NULL, tp->copied_seq, tp->rcv_nxt); @@ -5025,7 +5090,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. 
*/ - (ofo_possible && skb_peek(&tp->out_of_order_queue))) { + (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { /* Then ack it now */ tcp_send_ack(sk); } else { @@ -5926,7 +5991,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } else tcp_init_metrics(sk); - tcp_update_pacing_rate(sk); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_update_pacing_rate(sk); /* Prevent spurious tcp_cwnd_restart() on first data packet */ tp->lsndtime = tcp_time_stamp; @@ -6259,6 +6325,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); + inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent; /* Note: tcp_v6_init_req() might override ir_iif for link locals */ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7158d4f8dae4..7ac37c314312 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1175,6 +1175,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), @@ -1195,7 +1196,6 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); } @@ -1537,6 +1537,34 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_prequeue); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; + + /* Only socket owner can try to collapse/prune rx queues + * to reduce memory overhead, so add a little headroom here. + * Few sockets backlog are possibly concurrently non empty. + */ + limit += 64*1024; + + /* In case all data was pulled from skb frags (in __pskb_pull_tail()), + * we can fix skb->truesize to its real value to avoid future drops. + * This is valid because skb is not yet charged to the socket. + * It has been noticed pure SACK packets were sometimes dropped + * (if cooked by drivers without copybreak feature). + */ + if (!skb->data_len) + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + + if (unlikely(sk_add_backlog(sk, skb, limit))) { + bh_unlock_sock(sk); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + return true; + } + return false; +} +EXPORT_SYMBOL(tcp_add_backlog); + /* * From tcp_input.c */ @@ -1608,6 +1636,7 @@ process: sk = req->rsk_listener; if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } @@ -1666,10 +1695,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); @@ -1818,7 +1844,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_write_queue_purge(sk); /* Cleans up our, hopefully empty, out_of_order_queue. 
*/ - __skb_queue_purge(&tp->out_of_order_queue); + skb_rbtree_purge(&tp->out_of_order_queue); #ifdef CONFIG_TCP_MD5SIG /* Clean up the MD5 key list, if any */ diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b617826e2477..bf1f3b2b29d1 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -751,7 +751,7 @@ static struct genl_family tcp_metrics_nl_family = { .netnsok = true, }; -static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { +static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, .len = sizeof(struct in6_addr), }, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4b95ec4ed2c8..6234ebaa7db1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -464,7 +464,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->srtt_us = 0; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); - newtp->rtt_min[0].rtt = ~0U; + minmax_reset(&newtp->rtt_min, tcp_time_stamp, ~0U); newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; @@ -487,8 +487,10 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; + /* There's a bubble in the pipe until at least the first ACK. */ + newtp->app_limited = ~0U; + tcp_init_xmit_timers(newsk); - __skb_queue_head_init(&newtp->out_of_order_queue); newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5c5964962d0c..bc68da38ea86 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -90,12 +90,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, goto out; } - /* GSO partial only requires splitting the frame into an MSS - * multiple and possibly a remainder. So update the mss now. - */ - if (features & NETIF_F_GSO_PARTIAL) - mss = skb->len - (skb->len % mss); - copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ @@ -108,6 +102,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; + /* GSO partial and frag_list segmentation only requires splitting + * the frame into an MSS multiple and possibly a remainder, both + * cases return a GSO skb. So update the mss now. + */ + if (skb_is_gso(segs)) + mss *= skb_shinfo(segs)->gso_segs; + delta = htonl(oldlen + (thlen + mss)); skb = segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d48d5571e62a..896e9dfbdb5c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -734,9 +734,16 @@ static void tcp_tsq_handler(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | - TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->lost_out > tp->retrans_out && + tp->snd_cwnd > tcp_packets_in_flight(tp)) + tcp_xmit_retransmit_queue(sk); + + tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle, 0, GFP_ATOMIC); + } } /* * One tasklet per cpu tries to send more skbs. 
@@ -918,6 +925,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_mstamp_get(&skb->skb_mstamp); TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; + tcp_rate_skb_sent(sk, skb); if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1213,6 +1221,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, tcp_set_skb_tso_segs(skb, mss_now); tcp_set_skb_tso_segs(buff, mss_now); + /* Update delivered info for the new segment */ + TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx; + /* If this packet has been sent out already, we must * adjust the various packet counters. */ @@ -1358,6 +1369,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss) } return mtu; } +EXPORT_SYMBOL(tcp_mss_to_mtu); /* MTU probing init per socket */ void tcp_mtup_init(struct sock *sk) @@ -1545,7 +1557,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) +u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1557,10 +1570,23 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now) * This preserves ACK clocking and is consistent * with tcp_tso_should_defer() heuristic. */ - segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs); + segs = max_t(u32, bytes / mss_now, min_tso_segs); return min_t(u32, segs, sk->sk_gso_max_segs); } +EXPORT_SYMBOL(tcp_tso_autosize); + +/* Return the number of segments we want in the skb we are transmitting. + * See if congestion control module wants to decide; otherwise, autosize. + */ +static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + + return tso_segs ? : + tcp_tso_autosize(sk, mss_now, sysctl_tcp_min_tso_segs); +} /* Returns the portion of skb which can be sent right away */ static unsigned int tcp_mss_split_point(const struct sock *sk, @@ -2022,6 +2048,39 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } +/* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * (These limits are doubled for retransmits) + * This allows for : + * - better RTT estimation and ACK scheduling + * - faster recovery + * - high rates + * Alas, some drivers / subsystems require a fair amount + * of queued bytes to ensure line rate. + * One example is wifi aggregation (802.11 AMPDU) + */ +static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, + unsigned int factor) +{ + unsigned int limit; + + limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); + limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); + limit <<= factor; + + if (atomic_read(&sk->sk_wmem_alloc) > limit) { + set_bit(TSQ_THROTTLED, &tcp_sk(sk)->tsq_flags); + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + */ + smp_mb__after_atomic(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + return true; + } + return false; +} + /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. 
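
tcp_small_queue_check() above caps what a flow may park in the qdisc/device queues. A worked example of the limit, assuming a ~1 Gbit/s pacing rate (the sysctl_tcp_limit_output_bytes clamp is omitted from this sketch):

#include <stdio.h>

int main(void)
{
        unsigned long pacing_rate = 125000000;   // bytes/sec, ~1 Gbit/s
        unsigned long truesize = 65536;          // one large TSO skb
        unsigned long limit = pacing_rate >> 10; // ~1 ms of data: 122070

        if (limit < 2 * truesize)
                limit = 2 * truesize;            // at least two skbs: 131072

        // tcp_write_xmit() passes factor 0; the retransmit path passes 1,
        // doubling the allowance:
        printf("data path: %lu bytes, rexmit path: %lu bytes\n",
               limit << 0, limit << 1);
        return 0;
}
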
@@ -2059,7 +2118,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } - max_segs = tcp_tso_autosize(sk, mss_now); + max_segs = tcp_tso_segs(sk, mss_now); while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -2108,29 +2167,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; - /* TCP Small Queues : - * Control number of packets in qdisc/devices to two packets / or ~1 ms. - * This allows for : - * - better RTT estimation and ACK scheduling - * - faster recovery - * - high rates - * Alas, some drivers / subsystems require a fair amount - * of queued bytes to ensure line rate. - * One example is wifi aggregation (802.11 AMPDU) - */ - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10); - limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes); - - if (atomic_read(&sk->sk_wmem_alloc) > limit) { - set_bit(TSQ_THROTTLED, &tp->tsq_flags); - /* It is possible TX completion already happened - * before we set TSQ_THROTTLED, so we must - * test again the condition. - */ - smp_mb__after_atomic(); - if (atomic_read(&sk->sk_wmem_alloc) > limit) - break; - } + if (tcp_small_queue_check(sk, skb, 0)) + break; if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; @@ -2777,9 +2815,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } - max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); + max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; + __u8 sacked; int segs; if (skb == tcp_send_head(sk)) @@ -2791,6 +2829,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) segs = tp->snd_cwnd - tcp_packets_in_flight(tp); if (segs <= 0) return; + sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() has aggregated large skbs, * we need to make sure not to send too big TSO packets */ @@ -2830,6 +2869,9 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; + if (tcp_small_queue_check(sk, skb, 1)) + return; + if (tcp_retransmit_skb(sk, skb, segs)) return; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c new file mode 100644 index 000000000000..9be1581a5a08 --- /dev/null +++ b/net/ipv4/tcp_rate.c @@ -0,0 +1,186 @@ +#include <net/tcp.h> + +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. 
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
new file mode 100644
index 000000000000..9be1581a5a08
--- /dev/null
+++ b/net/ipv4/tcp_rate.c
@@ -0,0 +1,186 @@
+#include <net/tcp.h>
+
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ *    send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ *    ack_rate  = #pkts_delivered/(last_ack_time - first_ack_time)
+ *    bw = min(send_rate, ack_rate)
+ *
+ * Note that the estimator essentially measures the goodput, not necessarily
+ * the network bottleneck link rate, when the sending or receiving is limited
+ * by other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* In general we need to start delivery rate samples from the
+	 * time we received the most recent ACK, to ensure we include
+	 * the full time the network needs to deliver all in-flight
+	 * packets. If there are no packets in flight yet, then we
+	 * know that any ACKs after now indicate that the network was
+	 * able to deliver those packets completely in the sampling
+	 * interval between now and the next ACK.
+	 *
+	 * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+	 * because the latter is a guess based on RTO and loss-marking
+	 * heuristics. We don't want spurious RTOs or loss markings to cause
+	 * a spuriously small time interval, causing a spuriously high
+	 * bandwidth estimate.
+	 */
+	if (!tp->packets_out) {
+		tp->first_tx_mstamp  = skb->skb_mstamp;
+		tp->delivered_mstamp = skb->skb_mstamp;
+	}
+
+	TCP_SKB_CB(skb)->tx.first_tx_mstamp  = tp->first_tx_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+	TCP_SKB_CB(skb)->tx.delivered        = tp->delivered;
+	TCP_SKB_CB(skb)->tx.is_app_limited   = tp->app_limited ? 1 : 0;
+}
+
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the highest prior_delivered count.
+ */
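The bw = min(send_rate, ack_rate) rule above is worth a worked example before the delivery functions that follow. A standalone sketch (not kernel code; the microsecond timestamps are hypothetical) of the arithmetic tcp_rate_gen() performs; note that taking the longer of the two phase durations is equivalent to taking the smaller of the two rates:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t delivered = 10;	/* packets (s)acked by this sample */
	uint64_t mss = 1448;

	/* send phase: last_snd_time - first_snd_time */
	uint64_t snd_us = 8000;
	/* ack phase: last_ack_time - first_ack_time; ACK compression can
	 * make this unrealistically short */
	uint64_t ack_us = 2000;

	uint64_t interval_us = snd_us > ack_us ? snd_us : ack_us;
	uint64_t bw = delivered * mss * 1000000ULL / interval_us;

	/* the 2000 us ack phase alone would suggest ~7.2 MB/s; the send
	 * phase caps the estimate at ~1.81 MB/s */
	printf("bw ~= %llu bytes/sec\n", (unsigned long long)bw);
	return 0;
}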
+void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+			    struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+	if (!scb->tx.delivered_mstamp.v64)
+		return;
+
+	if (!rs->prior_delivered ||
+	    after(scb->tx.delivered, rs->prior_delivered)) {
+		rs->prior_delivered  = scb->tx.delivered;
+		rs->prior_mstamp     = scb->tx.delivered_mstamp;
+		rs->is_app_limited   = scb->tx.is_app_limited;
+		rs->is_retrans       = scb->sacked & TCPCB_RETRANS;
+
+		/* Find the duration of the "send phase" of this window: */
+		rs->interval_us      = skb_mstamp_us_delta(
+						&skb->skb_mstamp,
+						&scb->tx.first_tx_mstamp);
+
+		/* Record send time of most recently ACKed packet: */
+		tp->first_tx_mstamp  = skb->skb_mstamp;
+	}
+	/* Mark off the skb delivered once it's sacked to avoid being
+	 * used again when it's cumulatively acked. For acked packets
+	 * we don't need to reset since it'll be freed soon.
+	 */
+	if (scb->sacked & TCPCB_SACKED_ACKED)
+		scb->tx.delivered_mstamp.v64 = 0;
+}
+
+/* Update the connection delivery information and generate a rate sample. */
+void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+		  struct skb_mstamp *now, struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 snd_us, ack_us;
+
+	/* Clear app limited if bubble is acked and gone. */
+	if (tp->app_limited && after(tp->delivered, tp->app_limited))
+		tp->app_limited = 0;
+
+	/* TODO: there are multiple places throughout tcp_ack() to get
+	 * current time. Refactor the code using a new "tcp_acktag_state"
+	 * to carry current time, flags, stats like "tcp_sacktag_state".
+	 */
+	if (delivered)
+		tp->delivered_mstamp = *now;
+
+	rs->acked_sacked = delivered;	/* freshly ACKed or SACKed */
+	rs->losses = lost;		/* freshly marked lost */
+	/* Return an invalid sample if no timing information is available. */
+	if (!rs->prior_mstamp.v64) {
+		rs->delivered = -1;
+		rs->interval_us = -1;
+		return;
+	}
+	rs->delivered = tp->delivered - rs->prior_delivered;
+
+	/* Model sending data and receiving ACKs as separate pipeline phases
+	 * for a window. Usually the ACK phase is longer, but with ACK
+	 * compression the send phase can be longer. To be safe we use the
+	 * longer phase.
+	 */
+	snd_us = rs->interval_us;				/* send phase */
+	ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp);	/* ack phase */
+	rs->interval_us = max(snd_us, ack_us);
+
+	/* Normally we expect interval_us >= min-rtt.
+	 * Note that rate may still be over-estimated when a spuriously
+	 * retransmitted skb was first (s)acked because "interval_us"
+	 * is under-estimated (up to an RTT). However continuously
+	 * measuring the delivery rate during loss recovery is crucial
+	 * for connections that suffer heavy or prolonged losses.
+	 */
+	if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+		if (!rs->is_retrans)
+			pr_debug("tcp rate: %ld %d %u %u %u\n",
+				 rs->interval_us, rs->delivered,
+				 inet_csk(sk)->icsk_ca_state,
+				 tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+		rs->interval_us = -1;
+		return;
+	}
+
+	/* Record the last non-app-limited or the highest app-limited bw */
+	if (!rs->is_app_limited ||
+	    ((u64)rs->delivered * tp->rate_interval_us >=
+	     (u64)tp->rate_delivered * rs->interval_us)) {
+		tp->rate_delivered = rs->delivered;
+		tp->rate_interval_us = rs->interval_us;
+		tp->rate_app_limited = rs->is_app_limited;
+	}
+}
+
+/* If a gap is detected between sends, mark the socket application-limited. */
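The "last non-app-limited or the highest app-limited bw" bookkeeping at the end of tcp_rate_gen() compares two rates without dividing. A standalone sketch (not kernel code; the counts and intervals are hypothetical) of that cross-multiplication:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

int main(void)
{
	/* stored sample: 8 pkts over 10000 us (800 pkts/s) */
	uint32_t rate_delivered = 8;
	uint64_t rate_interval_us = 10000;
	/* new app-limited sample: 10 pkts over 9000 us (~1111 pkts/s) */
	uint32_t delivered = 10;
	uint64_t interval_us = 9000;

	/* delivered/interval_us >= rate_delivered/rate_interval_us,
	 * rearranged so no division (or floating point) is needed */
	bool keep_new = (uint64_t)delivered * rate_interval_us >=
			(uint64_t)rate_delivered * interval_us;

	printf("replace stored sample: %s\n", keep_new ? "yes" : "no");
	return 0;
}

Keeping only the fastest app-limited sample means an application pause can never drag the stored rate estimate downward.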
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (/* We have less than one packet to send. */
+	    tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+	    /* Nothing in sending host's qdisc queues or NIC tx queue. */
+	    sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+	    /* We are not limited by CWND. */
+	    tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+	    /* All lost packets have been retransmitted. */
+	    tp->lost_out <= tp->retrans_out)
+		tp->app_limited =
+			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f712b411f6ed..3ea1cf804748 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -192,6 +192,8 @@ static int tcp_write_timeout(struct sock *sk)
 		if (tp->syn_data && icsk->icsk_retransmits == 1)
 			NET_INC_STATS(sock_net(sk),
 				      LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+	} else if (!tp->syn_data && !tp->syn_fastopen) {
+		sk_rethink_txhash(sk);
 	}
 	retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
 	syn_set = true;
@@ -213,6 +215,8 @@ static int tcp_write_timeout(struct sock *sk)
 			tcp_mtu_probing(icsk, sk);

 			dst_negative_advice(sk);
+		} else {
+			sk_rethink_txhash(sk);
 		}

 		retry_until = net->ipv4.sysctl_tcp_retries2;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5fdcb8d108d4..7d96dc2d3d08 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,6 +114,7 @@
 #include <net/busy_poll.h>
 #include "udp_impl.h"
 #include <net/sock_reuseport.h>
+#include <net/addrconf.h>

 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -1020,12 +1021,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				   flow_flags,
 				   faddr, saddr, dport, inet->inet_sport);

-		if (!saddr && ipc.oif) {
-			err = l3mdev_get_saddr(net, ipc.oif, fl4);
-			if (err < 0)
-				goto out;
-		}
-
 		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
 		rt = ip_route_output_flow(net, fl4, sk);
 		if (IS_ERR(rt)) {
@@ -2192,6 +2187,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 }
 EXPORT_SYMBOL(udp_poll);

+int udp_abort(struct sock *sk, int err)
+{
+	lock_sock(sk);
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+	udp_disconnect(sk, 0);
+
+	release_sock(sk);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(udp_abort);
+
 struct proto udp_prot = {
 	.name		   = "UDP",
 	.owner		   = THIS_MODULE,
@@ -2221,7 +2230,7 @@ struct proto udp_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
-	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+	.diag_destroy	   = udp_abort,
 };
 EXPORT_SYMBOL(udp_prot);

diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 3d5ccf4b1412..9a89c10a55f0 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -20,7 +20,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 			struct netlink_callback *cb,
 			const struct inet_diag_req_v2 *req,
-			struct nlattr *bc)
+			struct nlattr *bc, bool net_admin)
 {
 	if (!inet_diag_bc_sk(bc, sk))
 		return 0;
@@ -28,7 +28,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
 	return inet_sk_diag_fill(sk, NULL, skb, req,
 			sk_user_ns(NETLINK_CB(cb->skb).sk),
 			NETLINK_CB(cb->skb).portid,
-			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+			cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
 }

 static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
@@ -76,7 +76,8 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
 	err = inet_sk_diag_fill(sk, NULL, rep, req,
 			   sk_user_ns(NETLINK_CB(in_skb).sk),
 			   NETLINK_CB(in_skb).portid,
-			   nlh->nlmsg_seq, 0, nlh);
+
nlh->nlmsg_seq, 0, nlh, + netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); @@ -97,6 +98,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); int num, s_num, slot, s_slot; @@ -132,7 +134,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc) < 0) { + if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } @@ -165,12 +167,88 @@ static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_wqueue = sk_wmem_alloc_get(sk); } +#ifdef CONFIG_INET_DIAG_DESTROY +static int __udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req, + struct udp_table *tbl) +{ + struct net *net = sock_net(in_skb->sk); + struct sock *sk; + int err; + + rcu_read_lock(); + + if (req->sdiag_family == AF_INET) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[0], req->id.idiag_dport, + req->id.idiag_src[0], req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); +#if IS_ENABLED(CONFIG_IPV6) + else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = __udp4_lib_lookup(net, + req->id.idiag_dst[3], req->id.idiag_dport, + req->id.idiag_src[3], req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); + + else + sk = __udp6_lib_lookup(net, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if, tbl, NULL); + } +#endif + else { + rcu_read_unlock(); + return -EINVAL; + } + + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + + rcu_read_unlock(); + + if (!sk) + return -ENOENT; + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_put(sk); + return -ENOENT; + } + + err = sock_diag_destroy(sk, ECONNABORTED); + + sock_put(sk); + + return err; +} + +static int udp_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udp_table); +} + +static int udplite_diag_destroy(struct sk_buff *in_skb, + const struct inet_diag_req_v2 *req) +{ + return __udp_diag_destroy(in_skb, req, &udplite_table); +} + +#endif + static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udp_diag_destroy, +#endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -192,6 +270,9 @@ static const struct inet_diag_handler udplite_diag_handler = { .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, +#ifdef CONFIG_INET_DIAG_DESTROY + .destroy = udplite_diag_destroy, +#endif }; static int __init udp_diag_init(void) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 81f253b6ff36..f9333c963607 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo; + bool 
remcsum, need_csum, offload_csum, ufo, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -88,6 +88,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, goto out; } + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; @@ -117,7 +119,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (skb_is_gso(skb)) { + if (gso_partial) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 2eea073e27ef..af817158d830 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -60,7 +60,6 @@ struct proto udplite_prot = { .compat_setsockopt = compat_udp_setsockopt, .compat_getsockopt = compat_udp_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, }; EXPORT_SYMBOL(udplite_prot); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 41f5b504a782..6a7ff6957535 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -112,7 +112,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) int oif = 0; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2f1f5d439788..cbd9343751a2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -112,6 +112,27 @@ static inline u32 cstamp_delta(unsigned long cstamp) return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } +static inline s32 rfc3315_s14_backoff_init(s32 irt) +{ + /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ + u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + do_div(tmp, 1000000); + return (s32)tmp; +} + +static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) +{ + /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ + u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + do_div(tmp, 1000000); + if ((s32)tmp > mrt) { + /* multiply 'maximum retransmission time' by 0.9 .. 
1.1 */ + tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + do_div(tmp, 1000000); + } + return (s32)tmp; +} + #ifdef CONFIG_SYSCTL static int addrconf_sysctl_register(struct inet6_dev *idev); static void addrconf_sysctl_unregister(struct inet6_dev *idev); @@ -187,6 +208,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -232,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .dad_transmits = 1, .rtr_solicits = MAX_RTR_SOLICITATIONS, .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL, .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, @@ -3687,7 +3710,7 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) { write_unlock(&idev->lock); if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) ndisc_send_rs(dev, &lladdr, @@ -3696,11 +3719,13 @@ static void addrconf_rs_timer(unsigned long data) goto put; write_lock(&idev->lock); + idev->rs_interval = rfc3315_s14_backoff_update( + idev->rs_interval, idev->cnf.rtr_solicit_max_interval); /* The wait after the last probe can be shorter */ addrconf_mod_rs_timer(idev, (idev->rs_probes == idev->cnf.rtr_solicits) ? idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); + idev->rs_interval); } else { /* * Note: we do not support deprecated "all on-link" @@ -3949,7 +3974,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp); send_rs = send_mld && ipv6_accept_ra(ifp->idev) && - ifp->idev->cnf.rtr_solicits > 0 && + ifp->idev->cnf.rtr_solicits != 0 && (dev->flags&IFF_LOOPBACK) == 0; read_unlock_bh(&ifp->idev->lock); @@ -3971,10 +3996,11 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) write_lock_bh(&ifp->idev->lock); spin_lock(&ifp->lock); + ifp->idev->rs_interval = rfc3315_s14_backoff_init( + ifp->idev->cnf.rtr_solicit_interval); ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_rs_timer(ifp->idev, - ifp->idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval); spin_unlock(&ifp->lock); write_unlock_bh(&ifp->idev->lock); } @@ -4891,6 +4917,8 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; array[DEVCONF_RTR_SOLICIT_INTERVAL] = jiffies_to_msecs(cnf->rtr_solicit_interval); + array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] = + jiffies_to_msecs(cnf->rtr_solicit_max_interval); array[DEVCONF_RTR_SOLICIT_DELAY] = jiffies_to_msecs(cnf->rtr_solicit_delay); array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; @@ -4961,18 +4989,18 @@ static inline size_t inet6_if_nlmsg_size(void) } static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, - int items, int bytes) + int bytes) { int i; - int pad = bytes - sizeof(u64) * items; + int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX; BUG_ON(pad < 0); /* Use put_unaligned() because stats may not be aligned for u64. 
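Stepping back to the RFC 3315 section 14 backoff helpers introduced above (rfc3315_s14_backoff_init/update) and their use in addrconf_rs_timer(): the resulting router solicitation schedule is easy to simulate. A standalone sketch (not kernel code; rand() stands in for prandom_u32(), and the 4 s initial interval and 1 h cap are assumptions matching the apparent defaults of rtr_solicit_interval and the new rtr_solicit_max_interval):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* multiply v by a random factor in [lo_ppm, lo_ppm + 200000] ppm,
 * i.e. 0.9..1.1 for lo_ppm = 900000 and 1.9..2.1 for lo_ppm = 1900000 */
static int64_t mul_frac(int64_t lo_ppm, int64_t v)
{
	return (lo_ppm + rand() % 200001) * v / 1000000;
}

int main(void)
{
	int64_t irt = 4000;			/* initial retrans time, ms */
	int64_t mrt = 3600000;			/* maximum retrans time, ms */
	int64_t rt  = mul_frac(900000, irt);	/* init: 0.9..1.1 * IRT */

	for (int i = 1; i <= 12; i++) {
		printf("solicit %2d after ~%lld ms\n", i, (long long)rt);
		rt = mul_frac(1900000, rt);	/* update: 1.9..2.1 * RT */
		if (rt > mrt)
			rt = mul_frac(900000, mrt); /* cap: 0.9..1.1 * MRT */
	}
	return 0;
}

The interval roughly doubles each round with jitter, then settles near the cap, which is what lets rtr_solicits go negative ("retry forever") without flooding the link.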
*/ - put_unaligned(items, &stats[0]); - for (i = 1; i < items; i++) + put_unaligned(ICMP6_MIB_MAX, &stats[0]); + for (i = 1; i < ICMP6_MIB_MAX; i++) put_unaligned(atomic_long_read(&mib[i]), &stats[i]); - memset(&stats[items], 0, pad); + memset(&stats[ICMP6_MIB_MAX], 0, pad); } static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, @@ -5005,7 +5033,7 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: - __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); + __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes); break; } } @@ -5099,7 +5127,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) return -EINVAL; if (!ipv6_accept_ra(idev)) return -EINVAL; - if (idev->cnf.rtr_solicits <= 0) + if (idev->cnf.rtr_solicits == 0) return -EINVAL; write_lock_bh(&idev->lock); @@ -5128,8 +5156,10 @@ update_lft: if (update_rs) { idev->if_flags |= IF_RS_SENT; + idev->rs_interval = rfc3315_s14_backoff_init( + idev->cnf.rtr_solicit_interval); idev->rs_probes = 1; - addrconf_mod_rs_timer(idev, idev->cnf.rtr_solicit_interval); + addrconf_mod_rs_timer(idev, idev->rs_interval); } /* Well, that's kinda nasty ... */ @@ -5467,20 +5497,6 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, } static -int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table lctl; - int min_hl = 1, max_hl = 255; - - lctl = *ctl; - lctl.extra1 = &min_hl; - lctl.extra2 = &max_hl; - - return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos); -} - -static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -5713,6 +5729,9 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static const int one = 1; +static const int two_five_five = 255; + static const struct ctl_table addrconf_sysctl[] = { { .procname = "forwarding", @@ -5726,7 +5745,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.hop_limit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = addrconf_sysctl_hop_limit, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&one, + .extra2 = (void *)&two_five_five, }, { .procname = "mtu", @@ -5778,6 +5799,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec_jiffies, }, { + .procname = "router_solicitation_max_interval", + .data = &ipv6_devconf.rtr_solicit_max_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { .procname = "router_solicitation_delay", .data = &ipv6_devconf.rtr_solicit_delay, .maxlen = sizeof(int), @@ -6044,8 +6072,14 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, for (i = 0; table[i].data; i++) { table[i].data += (char *)p - (char *)&ipv6_devconf; - table[i].extra1 = idev; /* embedded; no ref */ - table[i].extra2 = net; + /* If one of these is already set, then it is not safe to + * overwrite either of them: this makes proc_dointvec_minmax + * usable. 
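The hop_limit handler removal above works because proc_dointvec_minmax() reads its bounds from extra1/extra2, which is also why the per-netns registration loop must now leave non-NULL extra pointers alone. A standalone sketch (not kernel code; the struct and helper are hypothetical stand-ins) of that clamping contract:

#include <stdio.h>

/* hypothetical stand-in for the few ctl_table fields that matter here */
struct fake_ctl_table {
	int *data;
	const int *extra1;	/* minimum, e.g. &one */
	const int *extra2;	/* maximum, e.g. &two_five_five */
};

static int write_minmax(struct fake_ctl_table *t, int val)
{
	if ((t->extra1 && val < *t->extra1) ||
	    (t->extra2 && val > *t->extra2))
		return -1;	/* proc_dointvec_minmax() returns -EINVAL */
	*t->data = val;
	return 0;
}

int main(void)
{
	static const int one = 1, two_five_five = 255;
	int hop_limit = 64;
	struct fake_ctl_table t = { &hop_limit, &one, &two_five_five };
	int rc;

	rc = write_minmax(&t, 300);	/* out of range, rejected */
	printf("write 300: %d (hop_limit=%d)\n", rc, hop_limit);
	rc = write_minmax(&t, 128);	/* in range, accepted */
	printf("write 128: %d (hop_limit=%d)\n", rc, hop_limit);
	return 0;
}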
+ */ + if (!table[i].extra1 && !table[i].extra2) { + table[i].extra1 = idev; /* embedded; no ref */ + table[i].extra2 = net; + } } snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b454055ba625..46ad699937fd 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -545,6 +545,8 @@ const struct proto_ops inet6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, + .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 5857c1fc8b67..eea23b57c6a5 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -38,6 +38,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, .flags = FIB_LOOKUP_NOREF, }; + /* update flow if oif or iif point to device enslaved to l3mdev */ + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index ec9efbcdad35..aba0998ddbfb 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -172,6 +172,5 @@ static void __exit ila_fini(void) module_init(ila_init); module_exit(ila_fini); -MODULE_ALIAS_RTNL_LWT(ILA); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index c8314c6b6154..e50c27a93e17 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -51,7 +51,7 @@ drop: return -EINVAL; } -static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { +static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e6eca5fdf4c9..e604013dd814 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -128,7 +128,7 @@ static struct genl_family ila_nl_family = { .parallel_ops = true, }; -static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { +static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 771be1fa4176..ef5485204522 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -743,6 +743,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + u16 nlflags = NLM_F_EXCL; int err; ins = &fn->leaf; @@ -759,6 +760,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; + + nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; @@ -856,6 +859,7 @@ next_iter: pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: + nlflags |= NLM_F_CREATE; err = fib6_commit_metrics(&rt->dst, mxc); if (err) return err; @@ -864,7 +868,7 @@ add: *ins = rt; rt->rt6i_node = fn; atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & 
RTN_RTINFO)) { diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index edc3daab354e..d7d6d3ae0b3b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -61,12 +61,12 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_GRE_HASH_SIZE_SHIFT 5 +#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) static int ip6gre_net_id __read_mostly; struct ip6gre_net { - struct ip6_tnl __rcu *tunnels[4][HASH_SIZE]; + struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; struct net_device *fb_tunnel_dev; }; @@ -96,12 +96,12 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); will match fallback tunnel. */ -#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(HASH_SIZE - 1)) +#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1)) static u32 HASH_ADDR(const struct in6_addr *addr) { u32 hash = ipv6_addr_hash(addr); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT); } #define tunnels_r_l tunnels[3] @@ -1086,7 +1086,7 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (prio = 0; prio < 4; prio++) { int h; - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; t = rtnl_dereference(ign->tunnels[prio][h]); @@ -1238,7 +1238,7 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); if (data[IFLA_GRE_FLOWINFO]) - parms->flowinfo = nla_get_u32(data[IFLA_GRE_FLOWINFO]); + parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 22e90e56b5a9..e7bfd55899a3 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int offset = 0; bool encap, udpfrag; int nhoff; + bool gso_partial; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); @@ -101,9 +102,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (IS_ERR(segs)) goto out; + gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); + for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) + if (gso_partial) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1dfc402d9ad1..6001e781164e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -56,6 +56,7 @@ #include <net/checksum.h> #include <linux/mroute6.h> #include <net/l3mdev.h> +#include <net/lwtunnel.h> static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -104,6 +105,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } } + if (lwtunnel_xmit_redirect(dst->lwtstate)) { + int res = lwtunnel_xmit(skb); + + if (res < 0 || res == LWTUNNEL_XMIT_DONE) + return res; + } + rcu_read_lock_bh(); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); @@ -228,6 +236,14 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, if ((skb->len <= mtu) || 
skb->ignore_df || skb_is_gso(skb)) { IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUT, skb->len); + + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out((struct sock *)sk, skb); + if (unlikely(!skb)) + return 0; + /* hooks should never assume socket lock is held. * we promote our socket to non const */ @@ -910,13 +926,6 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, int err; int flags = 0; - if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif && - (!*dst || !(*dst)->error)) { - err = l3mdev_get_saddr6(net, sk, fl6); - if (err) - goto out_err; - } - /* The correct way to handle this would be to do * ip6_route_get_saddr, and then ip6_route_output; however, * the route-specific preferred source forces the @@ -1008,7 +1017,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, out_err_release: dst_release(*dst); *dst = NULL; -out_err: + if (err == -ENETUNREACH) IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); return err; @@ -1054,8 +1063,6 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; - if (!fl6->flowi6_oif) - fl6->flowi6_oif = l3mdev_fib_oif(dst->dev); return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 888543debe4e..6a66adba0c22 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -57,6 +57,7 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/dst_metadata.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -64,8 +65,8 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_TUNNEL_HASH_SIZE_SHIFT 5 +#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); @@ -75,7 +76,7 @@ static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); @@ -87,9 +88,10 @@ struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; + struct ip6_tnl __rcu *collect_md_tun; }; static struct net_device_stats *ip6_get_stats(struct net_device *dev) @@ -166,6 +168,10 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ return t; } + t = rcu_dereference(ip6n->collect_md_tun); + if (t) + return t; + t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; @@ -209,6 +215,8 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); + if (t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } @@ -224,6 +232,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; + if 
(t->parms.collect_md) + rcu_assign_pointer(ip6n->collect_md_tun, NULL); + for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { @@ -829,6 +840,9 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -865,6 +879,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); @@ -881,7 +896,12 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; - ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + if (t->parms.collect_md) { + tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0); + if (!tun_dst) + return 0; + } + ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } @@ -1012,8 +1032,16 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, int mtu; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; + u8 hop_limit; int err = -1; + if (t->parms.collect_md) { + hop_limit = skb_tunnel_info(skb)->key.ttl; + goto route_lookup; + } else { + hop_limit = t->parms.hop_limit; + } + /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { struct in6_addr *addr6; @@ -1043,6 +1071,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, goto tx_err_link_failure; if (!dst) { +route_lookup: dst = ip6_route_output(net, NULL, fl6); if (dst->error) @@ -1053,6 +1082,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, dst = NULL; goto tx_err_link_failure; } + if (t->parms.collect_md && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl6->daddr, 0, &fl6->saddr)) + goto tx_err_link_failure; ndst = dst; } @@ -1071,7 +1104,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, } if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) + if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; @@ -1111,8 +1144,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, skb = new_skb; } - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + if (t->parms.collect_md) { + if (t->encap.type != TUNNEL_ENCAP_NONE) + goto tx_err_dst_release; + } else { + if (!fl6->flowi6_mark && ndst) + dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); + } skb_dst_set(skb, dst); if (encap_limit >= 0) { @@ -1137,7 +1175,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; @@ -1170,19 +1208,34 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (tproto != IPPROTO_IPIP && tproto != 0) return -1; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; + dsfield = ipv4_get_dsfield(iph); - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + if 
(t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; - dsfield = ipv4_get_dsfield(iph); + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) + return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + encap_limit = t->parms.encap_limit; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) - & IPV6_TCLASS_MASK; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPIP; + + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT) + & IPV6_TCLASS_MASK; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1220,29 +1273,47 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) ip6_tnl_addr_conflict(t, ipv6h)) return -1; - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); + dsfield = ipv6_get_dsfield(ipv6h); + + if (t->parms.collect_md) { + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; + key = &tun_info->key; + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; + fl6.daddr = key->u.ipv6.dst; + fl6.flowlabel = key->label; + } else { + offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); + fl6.flowi6_proto = IPPROTO_IPV6; - dsfield = ipv6_get_dsfield(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + fl6.flowlabel |= (*(__be32 *)ipv6h & IPV6_TCLASS_MASK); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) + fl6.flowi6_mark = skb->mark; + } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; @@ -1741,6 +1812,10 @@ static int ip6_tnl_dev_init(struct net_device *dev) if (err) return err; ip6_tnl_link_config(t); + if (t->parms.collect_md) 
{ + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } @@ -1811,6 +1886,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); + + if (data[IFLA_IPTUN_COLLECT_METADATA]) + parms->collect_md = true; } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -1850,6 +1928,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); + struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *nt, *t; struct ip_tunnel_encap ipencap; @@ -1864,9 +1943,14 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, ip6_tnl_netlink_parms(data, &nt->parms); - t = ip6_tnl_locate(net, &nt->parms, 0); - if (!IS_ERR(t)) - return -EEXIST; + if (nt->parms.collect_md) { + if (rtnl_dereference(ip6n->collect_md_tun)) + return -EEXIST; + } else { + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) + return -EEXIST; + } return ip6_tnl_create2(dev); } @@ -1890,6 +1974,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], return err; } ip6_tnl_netlink_parms(data, &p); + if (p.collect_md) + return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { @@ -1937,6 +2023,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -1955,16 +2043,15 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) goto nla_put_failure; - if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, - tunnel->encap.type) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, - tunnel->encap.sport) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, - tunnel->encap.dport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, - tunnel->encap.flags)) + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; + if (parm->collect_md) + if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) + goto nla_put_failure; return 0; nla_put_failure: @@ -1992,6 +2079,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { @@ -2033,7 +2121,7 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, &list); - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 5bd3afdcc771..8a02ca8a11af 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -50,14 +50,14 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> -#define HASH_SIZE_SHIFT 5 -#define HASH_SIZE (1 << HASH_SIZE_SHIFT) +#define IP6_VTI_HASH_SIZE_SHIFT 5 +#define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) static u32 HASH(const struct in6_addr *addr1, 
const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); - return hash_32(hash, HASH_SIZE_SHIFT); + return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT); } static int vti6_dev_init(struct net_device *dev); @@ -69,7 +69,7 @@ struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; @@ -1051,7 +1051,7 @@ static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n) struct ip6_tnl *t; LIST_HEAD(list); - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, &list); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d64ee7e83664..75c1fc54f188 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1739,6 +1739,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, continue; } + /* Based on RFC3810 6.1. Should not send source-list change + * records when there is a filter mode change. + */ + if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) || + (!gdeleted && pmc->mca_crcount)) && + (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) + goto decrease_sf_crcount; + /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; @@ -1766,6 +1775,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, scount++; stotal++; if ((type == MLD2_ALLOW_NEW_SOURCES || type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { +decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index fe65cdc28a45..d8e671457d10 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -67,7 +67,6 @@ #include <net/flow.h> #include <net/ip6_checksum.h> #include <net/inet_common.h> -#include <net/l3mdev.h> #include <linux/proc_fs.h> #include <linux/netfilter.h> @@ -457,11 +456,9 @@ static void ndisc_send_skb(struct sk_buff *skb, if (!dst) { struct flowi6 fl6; - int oif = l3mdev_fib_oif(skb->dev); + int oif = skb->dev->ifindex; icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { kfree_skb(skb); @@ -1538,7 +1535,6 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) int rd_len; u8 ha_buf[MAX_ADDR_LEN], *ha = NULL, ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; - int oif = l3mdev_fib_oif(dev); bool ret; if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { @@ -1555,10 +1551,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT, - &saddr_buf, &ipv6_hdr(skb)->saddr, oif); - - if (oif != skb->dev->ifindex) - fl6.flowi6_flags |= FLOWI_FLAG_L3MDEV_SRC; + &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 552fac2f390a..55aacea24396 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -190,7 +190,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, 
}, }; diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 1aa5848764a7..963ee3848675 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -115,7 +115,7 @@ static unsigned int ipv6_helper(void *priv, help = nfct_help(ct); if (!help) return NF_ACCEPT; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (!helper) return NF_ACCEPT; diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 660bc10c7a9c..f5a61bc3ec2b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -165,7 +165,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ inproto = __nf_ct_l4proto_find(PF_INET6, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 8dd869642f45..57d86066a13b 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -30,7 +30,7 @@ static struct nf_loginfo default_loginfo = { .u = { .log = { .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -52,7 +52,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; else - logflags = NF_LOG_MASK; + logflags = NF_LOG_DEFAULT_MASK; ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { @@ -84,7 +84,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, } /* Max length: 48 "OPT (...) 
" */ - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, "OPT ( "); switch (currenthdr) { @@ -121,7 +121,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: if (fragment) { - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ")"); return; } @@ -129,7 +129,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, break; /* Max Length */ case IPPROTO_AH: - if (logflags & XT_LOG_IPOPT) { + if (logflags & NF_LOG_IPOPT) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; @@ -161,7 +161,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, hdrlen = (hp->hdrlen+2)<<2; break; case IPPROTO_ESP: - if (logflags & XT_LOG_IPOPT) { + if (logflags & NF_LOG_IPOPT) { struct ip_esp_hdr _esph; const struct ip_esp_hdr *eh; @@ -194,7 +194,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); return; } - if (logflags & XT_LOG_IPOPT) + if (logflags & NF_LOG_IPOPT) nf_log_buf_add(m, ") "); currenthdr = hp->nexthdr; @@ -277,7 +277,7 @@ static void dump_ipv6_packet(struct nf_log_buf *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse) + if ((logflags & NF_LOG_UID) && recurse) nf_log_dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ @@ -295,7 +295,7 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m, if (info->type == NF_LOG_TYPE_LOG) logflags = info->u.log.logflags; - if (!(logflags & XT_LOG_MACDECODE)) + if (!(logflags & NF_LOG_MACDECODE)) goto fallback; switch (dev->type) { @@ -379,8 +379,7 @@ static struct nf_logger nf_ip6_logger __read_mostly = { static int __net_init nf_log_ipv6_net_init(struct net *net) { - nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); - return 0; + return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); } static void __net_exit nf_log_ipv6_net_exit(struct net *net) diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 30b22f4dff55..d6e4ba5de916 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -22,9 +22,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, { struct nft_pktinfo pkt; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); return nft_do_chain(&pkt, priv); } @@ -102,7 +100,10 @@ static int __init nf_tables_ipv6_init(void) { int ret; - nft_register_chain_type(&filter_ipv6); + ret = nft_register_chain_type(&filter_ipv6); + if (ret < 0) + return ret; + ret = register_pernet_subsys(&nf_tables_ipv6_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_ipv6); diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 2535223ba956..f2727475895e 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -33,9 +33,7 @@ static unsigned int nf_route_table_hook(void *priv, u32 mark, flowlabel; int err; - /* malformed packet, drop it */ - if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0) - return NF_DROP; + nft_set_pktinfo_ipv6(&pkt, skb, state); /* save source/dest address, mark, hoplimit, flowlabel, priority */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 462f2a76b5c2..7cca8ac66fe9 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -148,6 +148,13 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff 
*skb) ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 679253d0af84..cc8e3ae9ca73 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -30,6 +30,11 @@ #include <net/transp_v6.h> #include <net/ipv6.h> +#define MAX4(a, b, c, d) \ + max_t(u32, max_t(u32, a, b), max_t(u32, c, d)) +#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \ + IPSTATS_MIB_MAX, ICMP_MIB_MAX) + static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; @@ -191,25 +196,34 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, const struct snmp_mib *itemlist) { + unsigned long buff[SNMP_MIB_MAX]; int i; - unsigned long val; - for (i = 0; itemlist[i].name; i++) { - val = pcpumib ? - snmp_fold_field(pcpumib, itemlist[i].entry) : - atomic_long_read(smib + itemlist[i].entry); - seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, val); + if (pcpumib) { + memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + + snmp_get_cpu_field_batch(buff, itemlist, pcpumib); + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", + itemlist[i].name, buff[i]); + } else { + for (i = 0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, + atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, const struct snmp_mib *itemlist, size_t syncpoff) { + u64 buff64[SNMP_MIB_MAX]; int i; + memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + + snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); for (i = 0; itemlist[i].name; i++) - seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, - snmp_fold_field64(mib, itemlist[i].entry, syncpoff)); + seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } static int snmp6_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 590dd1f7746f..54404f08efcc 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -653,6 +653,13 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, if (err) goto error_fault; + /* if egress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_out(sk, skb); + if (unlikely(!skb)) + return 0; + IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 269218aacbea..bdbc38e8bf29 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table * return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); } -static struct dst_entry *ip6_route_input_lookup(struct net *net, - struct net_device *dev, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_input_lookup(struct net *net, + struct net_device *dev, + struct flowi6 *fl6, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); } +EXPORT_SYMBOL_GPL(ip6_route_input_lookup); void 
ip6_route_input(struct sk_buff *skb) { @@ -1164,7 +1165,7 @@ void ip6_route_input(struct sk_buff *skb) int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { - .flowi6_iif = l3mdev_fib_oif(skb->dev), + .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), @@ -1188,12 +1189,15 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; bool any_src; - dst = l3mdev_get_rt6_dst(net, fl6); - if (dst) - return dst; + if (rt6_need_strict(&fl6->daddr)) { + struct dst_entry *dst; + + dst = l3mdev_link_scope_lookup(net, fl6); + if (dst) + return dst; + } fl6->flowi6_iif = LOOPBACK_IFINDEX; @@ -1604,7 +1608,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) rcu_read_unlock(); out: - return min_t(unsigned int, mtu, IP6_MAX_MTU); + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } static struct dst_entry *icmp6_dst_gc_list; @@ -2565,8 +2571,16 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, { u32 tb_id; struct net *net = dev_net(idev->dev); - struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, - DST_NOCOUNT); + struct net_device *dev = net->loopback_dev; + struct rt6_info *rt; + + /* use L3 Master device as loopback for host routes if device + * is enslaved and address is not link local or multicast + */ + if (!rt6_need_strict(addr)) + dev = l3mdev_master_dev_rcu(idev->dev) ? : dev; + + rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); if (!rt) return ERR_PTR(-ENOMEM); @@ -3347,11 +3361,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) } else { fl6.flowi6_oif = oif; - if (netif_index_is_l3_master(net, oif)) { - fl6.flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | - FLOWI_FLAG_SKIP_NH_OIF; - } - rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 182b6a9be29d..b1cdf8009d29 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -62,7 +62,7 @@ For comments look at net/ipv4/ip_gre.c --ANK */ -#define HASH_SIZE 16 +#define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; @@ -78,9 +78,9 @@ static struct rtnl_link_ops sit_link_ops __read_mostly; static int sit_net_id __read_mostly; struct sit_net { - struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; - struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; @@ -1126,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } #endif -bool ipip6_valid_ip_proto(u8 ipproto) +static bool ipip6_valid_ip_proto(u8 ipproto) { return ipproto == IPPROTO_IPV6 || ipproto == IPPROTO_IPIP || @@ -1783,7 +1783,7 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (prio = 1; prio < 4; prio++) { int h; - for (h = 0; h < HASH_SIZE; h++) { + for (h = 0; h < IP6_SIT_HASH_SIZE; h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 94f4f89d73e7..54cf7197c7ab 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -671,6 
+671,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n", genhash ? "failed" : "mismatch", &ip6h->saddr, ntohs(th->source), @@ -817,12 +818,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else { - if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) - oif = skb->skb_iif; - - fl6.flowi6_oif = oif; - } + else + fl6.flowi6_oif = oif ? : skb->skb_iif; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; @@ -1415,6 +1412,7 @@ process: sk = req->rsk_listener; tcp_v6_fill_cb(skb, hdr, th); if (tcp_v6_inbound_md5_hash(sk, skb)) { + sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } @@ -1471,10 +1469,7 @@ process: if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) ret = tcp_v6_do_rcv(sk, skb); - } else if (unlikely(sk_add_backlog(sk, skb, - sk->sk_rcvbuf + sk->sk_sndbuf))) { - bh_unlock_sock(sk); - __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); + } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } bh_unlock_sock(sk); @@ -1868,17 +1863,6 @@ void tcp6_proc_exit(struct net *net) } #endif -static void tcp_v6_clear_sk(struct sock *sk, int size) -{ - struct inet_sock *inet = inet_sk(sk); - - /* we do not want to clear pinet6 field, because of RCU lookups */ - sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); - - size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); - memset(&inet->pinet6 + 1, 0, size); -} - struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, @@ -1920,7 +1904,6 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif - .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 19ac3a1c308d..9aa7c1c7a9ce 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1424,17 +1424,6 @@ void udp6_proc_exit(struct net *net) } #endif /* CONFIG_PROC_FS */ -void udp_v6_clear_sk(struct sock *sk, int size) -{ - struct inet_sock *inet = inet_sk(sk); - - /* we do not want to clear pinet6 field, because of RCU lookups */ - sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); - - size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); - memset(&inet->pinet6 + 1, 0, size); -} - /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { @@ -1465,7 +1454,7 @@ struct proto udpv6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = udp_v6_clear_sk, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 0682c031ccdc..f6eb1ab34f4b 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -29,8 +29,6 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); void udpv6_destroy_sock(struct sock *sk); -void udp_v6_clear_sk(struct sock *sk, int size); - #ifdef CONFIG_PROC_FS int udp6_seq_show(struct seq_file *seq, void *v); #endif diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index fd6ef414899b..47d0d2b87106 100644 --- a/net/ipv6/udplite.c +++ 
b/net/ipv6/udplite.c @@ -55,7 +55,6 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 70a86adad875..e0f71c01d728 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -134,7 +134,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; if (skb_dst(skb)) - oif = l3mdev_fib_oif(skb_dst(skb)->dev); + oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ccc244406fb9..391c3cbd2eed 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -845,9 +845,6 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) if (sock->state != SS_UNCONNECTED) goto out; - if ((sk = sock->sk) == NULL) - goto out; - err = -EOPNOTSUPP; if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && (sk->sk_type != SOCK_DGRAM)) diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 5db94d940ecc..87fca36e6c47 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -3,6 +3,7 @@ config AF_KCM tristate "KCM sockets" depends on INET select BPF_SYSCALL + select STREAM_PARSER ---help--- KCM (Kernel Connection Multiplexor) sockets provide a method for multiplexing messages of a message based application diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 16c2e03bd388..bf75c9231cca 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -155,8 +155,8 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, - psock->stats.rx_msgs, - psock->stats.rx_bytes, + psock->strp.stats.rx_msgs, + psock->strp.stats.rx_bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, @@ -170,14 +170,27 @@ static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, if (psock->tx_stopped) seq_puts(seq, "TxStop "); - if (psock->rx_stopped) + if (psock->strp.rx_stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); - if (psock->ready_rx_msg) - seq_puts(seq, "RdyRx "); + if (!psock->strp.rx_paused && !psock->ready_rx_msg) { + if (psock->sk->sk_receive_queue.qlen) { + if (psock->strp.rx_need_bytes) + seq_printf(seq, "RxWait=%u ", + psock->strp.rx_need_bytes); + else + seq_printf(seq, "RxWait "); + } + } else { + if (psock->strp.rx_paused) + seq_puts(seq, "RxPause "); + + if (psock->ready_rx_msg) + seq_puts(seq, "RdyRx "); + } seq_puts(seq, "\n"); } @@ -275,6 +288,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; + struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; @@ -282,20 +296,28 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); + memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); + aggregate_strp_stats(&knet->aggregate_strp_stats, + &strp_stats); list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { 
spin_lock_bh(&mux->lock); aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); - list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_strp_stats(&mux->aggregate_strp_stats, + &strp_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); + save_strp_stats(&psock->strp, &strp_stats); + } + spin_unlock_bh(&mux->lock); } @@ -328,7 +350,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) mux_stats.rx_ready_drops); seq_printf(seq, - "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", @@ -337,6 +359,8 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "Reserved", "Unreserved", "RX-Aborts", + "RX-Intr", + "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", @@ -345,20 +369,22 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v) "TX-Aborts"); seq_printf(seq, - "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", - psock_stats.rx_msgs, - psock_stats.rx_bytes, + strp_stats.rx_msgs, + strp_stats.rx_bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, - psock_stats.rx_aborts, - psock_stats.rx_mem_fail, - psock_stats.rx_need_more_hdr, - psock_stats.rx_bad_hdr_len, - psock_stats.rx_msg_too_big, - psock_stats.rx_msg_timeouts, + strp_stats.rx_aborts, + strp_stats.rx_interrupted, + strp_stats.rx_unrecov_intr, + strp_stats.rx_mem_fail, + strp_stats.rx_need_more_hdr, + strp_stats.rx_bad_hdr_len, + strp_stats.rx_msg_too_big, + strp_stats.rx_msg_timeouts, psock_stats.tx_aborts); return 0; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 411693288648..b7f869a85ab7 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1,3 +1,13 @@ +/* + * Kernel Connection Multiplexor + * + * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. 
+ */ + #include <linux/bpf.h> #include <linux/errno.h> #include <linux/errqueue.h> @@ -17,7 +27,6 @@ #include <net/kcm.h> #include <net/netns/generic.h> #include <net/sock.h> -#include <net/tcp.h> #include <uapi/linux/kcm.h> unsigned int kcm_net_id; @@ -36,38 +45,12 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) return (struct kcm_tx_msg *)skb->cb; } -static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) -{ - return (struct kcm_rx_msg *)((void *)skb->cb + - offsetof(struct qdisc_skb_cb, data)); -} - static void report_csk_error(struct sock *csk, int err) { csk->sk_err = EPIPE; csk->sk_error_report(csk); } -/* Callback lock held */ -static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, - struct sk_buff *skb) -{ - struct sock *csk = psock->sk; - - /* Unrecoverable error in receive */ - - del_timer(&psock->rx_msg_timer); - - if (psock->rx_stopped) - return; - - psock->rx_stopped = 1; - KCM_STATS_INCR(psock->stats.rx_aborts); - - /* Report an error on the lower socket */ - report_csk_error(csk, err); -} - static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, bool wakeup_kcm) { @@ -110,12 +93,13 @@ static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, static void kcm_update_rx_mux_stats(struct kcm_mux *mux, struct kcm_psock *psock) { - KCM_STATS_ADD(mux->stats.rx_bytes, - psock->stats.rx_bytes - psock->saved_rx_bytes); + STRP_STATS_ADD(mux->stats.rx_bytes, + psock->strp.stats.rx_bytes - + psock->saved_rx_bytes); mux->stats.rx_msgs += - psock->stats.rx_msgs - psock->saved_rx_msgs; - psock->saved_rx_msgs = psock->stats.rx_msgs; - psock->saved_rx_bytes = psock->stats.rx_bytes; + psock->strp.stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->strp.stats.rx_msgs; + psock->saved_rx_bytes = psock->strp.stats.rx_bytes; } static void kcm_update_tx_mux_stats(struct kcm_mux *mux, @@ -168,11 +152,11 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) */ list_del(&psock->psock_ready_list); psock->ready_rx_msg = NULL; - /* Commit clearing of ready_rx_msg for queuing work */ smp_mb(); - queue_work(kcm_wq, &psock->rx_work); + strp_unpause(&psock->strp); + strp_check_rcv(&psock->strp); } /* Buffer limit is okay now, add to ready list */ @@ -286,6 +270,7 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, if (list_empty(&mux->kcm_rx_waiters)) { psock->ready_rx_msg = head; + strp_pause(&psock->strp); list_add_tail(&psock->psock_ready_list, &mux->psocks_ready); spin_unlock_bh(&mux->rx_lock); @@ -354,346 +339,60 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_unlock_bh(&mux->rx_lock); } -static void kcm_start_rx_timer(struct kcm_psock *psock) -{ - if (psock->sk->sk_rcvtimeo) - mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); -} - -/* Macro to invoke filter function. 
*/ -#define KCM_RUN_FILTER(prog, ctx) \ - (*prog->bpf_func)(ctx, prog->insnsi) - -/* Lower socket lock held */ -static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, - unsigned int orig_offset, size_t orig_len) -{ - struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; - struct kcm_rx_msg *rxm; - struct kcm_sock *kcm; - struct sk_buff *head, *skb; - size_t eaten = 0, cand_len; - ssize_t extra; - int err; - bool cloned_orig = false; - - if (psock->ready_rx_msg) - return 0; - - head = psock->rx_skb_head; - if (head) { - /* Message already in progress */ - - rxm = kcm_rx_msg(head); - if (unlikely(rxm->early_eaten)) { - /* Already some number of bytes on the receive sock - * data saved in rx_skb_head, just indicate they - * are consumed. - */ - eaten = orig_len <= rxm->early_eaten ? - orig_len : rxm->early_eaten; - rxm->early_eaten -= eaten; - - return eaten; - } - - if (unlikely(orig_offset)) { - /* Getting data with a non-zero offset when a message is - * in progress is not expected. If it does happen, we - * need to clone and pull since we can't deal with - * offsets in the skbs for a message expect in the head. - */ - orig_skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!orig_skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - if (!pskb_pull(orig_skb, orig_offset)) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - kfree_skb(orig_skb); - desc->error = -ENOMEM; - return 0; - } - cloned_orig = true; - orig_offset = 0; - } - - if (!psock->rx_skb_nextp) { - /* We are going to append to the frags_list of head. - * Need to unshare the frag_list. - */ - err = skb_unclone(head, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - return 0; - } - - if (unlikely(skb_shinfo(head)->frag_list)) { - /* We can't append to an sk_buff that already - * has a frag_list. We create a new head, point - * the frag_list of that to the old head, and - * then are able to use the old head->next for - * appending to the message. - */ - if (WARN_ON(head->next)) { - desc->error = -EINVAL; - return 0; - } - - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - return 0; - } - skb->len = head->len; - skb->data_len = head->len; - skb->truesize = head->truesize; - *kcm_rx_msg(skb) = *kcm_rx_msg(head); - psock->rx_skb_nextp = &head->next; - skb_shinfo(skb)->frag_list = head; - psock->rx_skb_head = skb; - head = skb; - } else { - psock->rx_skb_nextp = - &skb_shinfo(head)->frag_list; - } - } - } - - while (eaten < orig_len) { - /* Always clone since we will consume something */ - skb = skb_clone(orig_skb, GFP_ATOMIC); - if (!skb) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = -ENOMEM; - break; - } - - cand_len = orig_len - eaten; - - head = psock->rx_skb_head; - if (!head) { - head = skb; - psock->rx_skb_head = head; - /* Will set rx_skb_nextp on next packet if needed */ - psock->rx_skb_nextp = NULL; - rxm = kcm_rx_msg(head); - memset(rxm, 0, sizeof(*rxm)); - rxm->offset = orig_offset + eaten; - } else { - /* Unclone since we may be appending to an skb that we - * already share a frag_list with. 
- */ - err = skb_unclone(skb, GFP_ATOMIC); - if (err) { - KCM_STATS_INCR(psock->stats.rx_mem_fail); - desc->error = err; - break; - } - - rxm = kcm_rx_msg(head); - *psock->rx_skb_nextp = skb; - psock->rx_skb_nextp = &skb->next; - head->data_len += skb->len; - head->len += skb->len; - head->truesize += skb->truesize; - } - - if (!rxm->full_len) { - ssize_t len; - - len = KCM_RUN_FILTER(psock->bpf_prog, head); - - if (!len) { - /* Need more header to determine length */ - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - rxm->accum_len += cand_len; - eaten += cand_len; - KCM_STATS_INCR(psock->stats.rx_need_more_hdr); - WARN_ON(eaten != orig_len); - break; - } else if (len > psock->sk->sk_rcvbuf) { - /* Message length exceeds maximum allowed */ - KCM_STATS_INCR(psock->stats.rx_msg_too_big); - desc->error = -EMSGSIZE; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EMSGSIZE, head); - break; - } else if (len <= (ssize_t)head->len - - skb->len - rxm->offset) { - /* Length must be into new skb (and also - * greater than zero) - */ - KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); - desc->error = -EPROTO; - psock->rx_skb_head = NULL; - kcm_abort_rx_psock(psock, EPROTO, head); - break; - } - - rxm->full_len = len; - } - - extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; - - if (extra < 0) { - /* Message not complete yet. */ - if (rxm->full_len - rxm->accum_len > - tcp_inq(psock->sk)) { - /* Don't have the whole messages in the socket - * buffer. Set psock->rx_need_bytes to wait for - * the rest of the message. Also, set "early - * eaten" since we've already buffered the skb - * but don't consume yet per tcp_read_sock. - */ - - if (!rxm->accum_len) { - /* Start RX timer for new message */ - kcm_start_rx_timer(psock); - } - - psock->rx_need_bytes = rxm->full_len - - rxm->accum_len; - rxm->accum_len += cand_len; - rxm->early_eaten = cand_len; - KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); - desc->count = 0; /* Stop reading socket */ - break; - } - rxm->accum_len += cand_len; - eaten += cand_len; - WARN_ON(eaten != orig_len); - break; - } - - /* Positive extra indicates ore bytes than needed for the - * message - */ - - WARN_ON(extra > cand_len); - - eaten += (cand_len - extra); - - /* Hurray, we have a new message! */ - del_timer(&psock->rx_msg_timer); - psock->rx_skb_head = NULL; - KCM_STATS_INCR(psock->stats.rx_msgs); - -try_queue: - kcm = reserve_rx_kcm(psock, head); - if (!kcm) { - /* Unable to reserve a KCM, message is held in psock. 
*/ - break; - } - - if (kcm_queue_rcv_skb(&kcm->sk, head)) { - /* Should mean socket buffer full */ - unreserve_rx_kcm(psock, false); - goto try_queue; - } - } - - if (cloned_orig) - kfree_skb(orig_skb); - - KCM_STATS_ADD(psock->stats.rx_bytes, eaten); - - return eaten; -} - -/* Called with lock held on lower socket */ -static int psock_tcp_read_sock(struct kcm_psock *psock) -{ - read_descriptor_t desc; - - desc.arg.data = psock; - desc.error = 0; - desc.count = 1; /* give more than one skb per call */ - - /* sk should be locked here, so okay to do tcp_read_sock */ - tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); - - unreserve_rx_kcm(psock, true); - - return desc.error; -} - /* Lower sock lock held */ -static void psock_tcp_data_ready(struct sock *sk) +static void psock_data_ready(struct sock *sk) { struct kcm_psock *psock; read_lock_bh(&sk->sk_callback_lock); psock = (struct kcm_psock *)sk->sk_user_data; - if (unlikely(!psock || psock->rx_stopped)) - goto out; + if (likely(psock)) + strp_data_ready(&psock->strp); - if (psock->ready_rx_msg) - goto out; - - if (psock->rx_need_bytes) { - if (tcp_inq(sk) >= psock->rx_need_bytes) - psock->rx_need_bytes = 0; - else - goto out; - } - - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); - -out: read_unlock_bh(&sk->sk_callback_lock); } -static void do_psock_rx_work(struct kcm_psock *psock) +/* Called with lower sock held */ +static void kcm_rcv_strparser(struct strparser *strp, struct sk_buff *skb) { - read_descriptor_t rd_desc; - struct sock *csk = psock->sk; - - /* We need the read lock to synchronize with psock_tcp_data_ready. We - * need the socket lock for calling tcp_read_sock. - */ - lock_sock(csk); - read_lock_bh(&csk->sk_callback_lock); - - if (unlikely(csk->sk_user_data != psock)) - goto out; - - if (unlikely(psock->rx_stopped)) - goto out; - - if (psock->ready_rx_msg) - goto out; - - rd_desc.arg.data = psock; + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct kcm_sock *kcm; - if (psock_tcp_read_sock(psock) == -ENOMEM) - queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); +try_queue: + kcm = reserve_rx_kcm(psock, skb); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock and strp + * is paused. + */ + return; + } -out: - read_unlock_bh(&csk->sk_callback_lock); - release_sock(csk); + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } } -static void psock_rx_work(struct work_struct *w) +static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb) { - do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + struct bpf_prog *prog = psock->bpf_prog; + + return (*prog->bpf_func)(skb, prog->insnsi); } -static void psock_rx_delayed_work(struct work_struct *w) +static int kcm_read_sock_done(struct strparser *strp, int err) { - do_psock_rx_work(container_of(w, struct kcm_psock, - rx_delayed_work.work)); + struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp); + + unreserve_rx_kcm(psock, true); + + return err; } -static void psock_tcp_state_change(struct sock *sk) +static void psock_state_change(struct sock *sk) { /* TCP only does a POLLIN for a half close. 
Do a POLLHUP here * since application will normally not poll with POLLIN @@ -703,7 +402,7 @@ static void psock_tcp_state_change(struct sock *sk) report_csk_error(sk, EPIPE); } -static void psock_tcp_write_space(struct sock *sk) +static void psock_write_space(struct sock *sk) { struct kcm_psock *psock; struct kcm_mux *mux; @@ -714,14 +413,13 @@ static void psock_tcp_write_space(struct sock *sk) psock = (struct kcm_psock *)sk->sk_user_data; if (unlikely(!psock)) goto out; - mux = psock->mux; spin_lock_bh(&mux->lock); /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm) + if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -1412,7 +1110,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, struct kcm_sock *kcm = kcm_sk(sk); int err = 0; long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int copied = 0; struct sk_buff *skb; @@ -1426,7 +1124,7 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1482,7 +1180,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, struct sock *sk = sock->sk; struct kcm_sock *kcm = kcm_sk(sk); long timeo; - struct kcm_rx_msg *rxm; + struct strp_rx_msg *rxm; int err = 0; ssize_t copied; struct sk_buff *skb; @@ -1499,7 +1197,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, /* Okay, have a message on the receive queue */ - rxm = kcm_rx_msg(skb); + rxm = strp_rx_msg(skb); if (len > rxm->full_len) len = rxm->full_len; @@ -1675,15 +1373,6 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->rx_lock); } -static void kcm_rx_msg_timeout(unsigned long arg) -{ - struct kcm_psock *psock = (struct kcm_psock *)arg; - - /* Message assembly timed out */ - KCM_STATS_INCR(psock->stats.rx_msg_timeouts); - kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); -} - static int kcm_attach(struct socket *sock, struct socket *csock, struct bpf_prog *prog) { @@ -1693,19 +1382,13 @@ static int kcm_attach(struct socket *sock, struct socket *csock, struct kcm_psock *psock = NULL, *tpsock; struct list_head *head; int index = 0; - - if (csock->ops->family != PF_INET && - csock->ops->family != PF_INET6) - return -EINVAL; + struct strp_callbacks cb; + int err; csk = csock->sk; if (!csk) return -EINVAL; - /* Only support TCP for now */ - if (csk->sk_protocol != IPPROTO_TCP) - return -EINVAL; - psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; @@ -1714,11 +1397,16 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->sk = csk; psock->bpf_prog = prog; - setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, - (unsigned long)psock); + cb.rcv_msg = kcm_rcv_strparser; + cb.abort_parser = NULL; + cb.parse_msg = kcm_parse_func_strparser; + cb.read_sock_done = kcm_read_sock_done; - INIT_WORK(&psock->rx_work, psock_rx_work); - INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + err = strp_init(&psock->strp, csk, &cb); + if (err) { + kmem_cache_free(kcm_psockp, psock); + return err; + } sock_hold(csk); @@ -1727,9 +1415,9 @@ static int kcm_attach(struct socket *sock, struct socket *csock, psock->save_write_space = csk->sk_write_space; psock->save_state_change = csk->sk_state_change; csk->sk_user_data = psock; - csk->sk_data_ready = psock_tcp_data_ready; - csk->sk_write_space = 
psock_tcp_write_space; - csk->sk_state_change = psock_tcp_state_change; + csk->sk_data_ready = psock_data_ready; + csk->sk_write_space = psock_write_space; + csk->sk_state_change = psock_state_change; write_unlock_bh(&csk->sk_callback_lock); /* Finished initialization, now add the psock to the MUX. */ @@ -1751,7 +1439,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, spin_unlock_bh(&mux->lock); /* Schedule RX work in case there are already bytes queued */ - queue_work(kcm_wq, &psock->rx_work); + strp_check_rcv(&psock->strp); return 0; } @@ -1791,6 +1479,8 @@ static void kcm_unattach(struct kcm_psock *psock) struct sock *csk = psock->sk; struct kcm_mux *mux = psock->mux; + lock_sock(csk); + /* Stop getting callbacks from TCP socket. After this there should * be no way to reserve a kcm for this psock. */ @@ -1799,7 +1489,7 @@ static void kcm_unattach(struct kcm_psock *psock) csk->sk_data_ready = psock->save_data_ready; csk->sk_write_space = psock->save_write_space; csk->sk_state_change = psock->save_state_change; - psock->rx_stopped = 1; + strp_stop(&psock->strp); if (WARN_ON(psock->rx_kcm)) { write_unlock_bh(&csk->sk_callback_lock); @@ -1822,18 +1512,17 @@ static void kcm_unattach(struct kcm_psock *psock) write_unlock_bh(&csk->sk_callback_lock); - del_timer_sync(&psock->rx_msg_timer); - cancel_work_sync(&psock->rx_work); - cancel_delayed_work_sync(&psock->rx_delayed_work); + /* Call strp_done without sock lock */ + release_sock(csk); + strp_done(&psock->strp); + lock_sock(csk); bpf_prog_put(psock->bpf_prog); - kfree_skb(psock->rx_skb_head); - psock->rx_skb_head = NULL; - spin_lock_bh(&mux->lock); aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + save_strp_stats(&psock->strp, &mux->aggregate_strp_stats); KCM_STATS_INCR(mux->stats.psock_unattach); @@ -1876,6 +1565,8 @@ no_reserved: fput(csk->sk_socket->file); kmem_cache_free(kcm_psockp, psock); } + + release_sock(csk); } static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) @@ -1916,6 +1607,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) spin_unlock_bh(&mux->lock); + /* Lower socket lock should already be held */ kcm_unattach(psock); err = 0; @@ -2073,6 +1765,8 @@ static void release_mux(struct kcm_mux *mux) aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &knet->aggregate_psock_stats); + aggregate_strp_stats(&mux->aggregate_strp_stats, + &knet->aggregate_strp_stats); list_del_rcu(&mux->kcm_mux_list); knet->count--; mutex_unlock(&knet->mutex); @@ -2152,6 +1846,13 @@ static int kcm_release(struct socket *sock) * it will just return. */ __skb_queue_purge(&sk->sk_write_queue); + + /* Set tx_stopped. This is checked when psock is bound to a kcm and we + * get a writespace callback. This prevents further work being queued + * from the callback (unbinding the psock occurs after canceling work).
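+ * (Editor's note, not part of the original comment: the callback guarded here is
+ * psock_write_space(), updated earlier in this patch to do
+ * if (kcm && !unlikely(kcm->tx_stopped)) queue_work(kcm_wq, &kcm->tx_work);
+ * so once tx_stopped is set, a racing writespace event cannot requeue TX work.)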
+ */ + kcm->tx_stopped = 1; + release_sock(sk); spin_lock_bh(&mux->lock); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 5871537af387..2599af6378e4 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -139,7 +139,7 @@ struct l2tp_session { void (*session_close)(struct l2tp_session *session); void (*ref)(struct l2tp_session *session); void (*deref)(struct l2tp_session *session); -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) void (*show)(struct seq_file *m, void *priv); #endif uint8_t priv[0]; /* private data */ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 57fc5a46ce06..965f7e344cef 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -121,7 +121,7 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, } -static struct net_device_ops l2tp_eth_netdev_ops = { +static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, @@ -195,7 +195,7 @@ static void l2tp_eth_delete(struct l2tp_session *session) } } -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -268,7 +268,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = l2tp_eth_show; #endif diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 1d02e8d20e56..bf3117771822 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -867,7 +867,7 @@ out: return skb->len; } -static struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { +static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = { [L2TP_ATTR_NONE] = { .type = NLA_UNSPEC, }, [L2TP_ATTR_PW_TYPE] = { .type = NLA_U16, }, [L2TP_ATTR_ENCAP_TYPE] = { .type = NLA_U16, }, diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 232cb92033e8..41d47bfda15c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -177,7 +177,7 @@ static int pppol2tp_recv_payload_hook(struct sk_buff *skb) if (!pskb_may_pull(skb, 2)) return 1; - if ((skb->data[0] == 0xff) && (skb->data[1] == 0x03)) + if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI)) skb_pull(skb, 2); return 0; @@ -282,7 +282,6 @@ static void pppol2tp_session_sock_put(struct l2tp_session *session) static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { - static const unsigned char ppph[2] = { 0xff, 0x03 }; struct sock *sk = sock->sk; struct sk_buff *skb; int error; @@ -312,7 +311,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + - sizeof(ppph) + total_len, + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess_tun; @@ -325,8 +324,8 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, skb_reserve(skb, uhlen); /* Add PPP header */ - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ @@ -369,7 +368,6 @@ error: 
*/ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { - static const u8 ppph[2] = { 0xff, 0x03 }; struct sock *sk = (struct sock *) chan->private; struct sock *sk_tun; struct l2tp_session *session; @@ -398,14 +396,14 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ - sizeof(ppph); /* PPP header */ + 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess_tun; /* Setup PPP header */ - __skb_push(skb, sizeof(ppph)); - skb->data[0] = ppph[0]; - skb->data[1] = ppph[1]; + __skb_push(skb, 2); + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb, session->hdr_len); @@ -440,7 +438,7 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); if (sock) { - inet_shutdown(sock, 2); + inet_shutdown(sock, SEND_SHUTDOWN); /* Don't let the session go away before our socket does */ l2tp_session_inc_refcount(session); } @@ -554,7 +552,7 @@ out: return error; } -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; @@ -725,7 +723,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, session->recv_skb = pppol2tp_recv; session->session_close = pppol2tp_session_close; -#if defined(CONFIG_L2TP_DEBUGFS) || defined(CONFIG_L2TP_DEBUGFS_MODULE) +#if IS_ENABLED(CONFIG_L2TP_DEBUGFS) session->show = pppol2tp_show; #endif diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index c4a1c3e84e12..8da86ceca33d 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -100,15 +100,14 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); /** - * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. 
Returns - * cached route for L3 master device if relevant - * to flow + * l3mdev_link_scope_lookup - IPv6 route lookup based on flow for link + * local and multicast addresses * @net: network namespace for device index lookup * @fl6: IPv6 flow struct for lookup */ -struct dst_entry *l3mdev_get_rt6_dst(struct net *net, - struct flowi6 *fl6) +struct dst_entry *l3mdev_link_scope_lookup(struct net *net, + struct flowi6 *fl6) { struct dst_entry *dst = NULL; struct net_device *dev; @@ -121,70 +120,15 @@ struct dst_entry *l3mdev_get_rt6_dst(struct net *net, dev = netdev_master_upper_dev_get_rcu(dev); if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_rt6_dst) - dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + dev->l3mdev_ops->l3mdev_link_scope_lookup) + dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6); rcu_read_unlock(); } return dst; } -EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); - -/** - * l3mdev_get_saddr - get source address for a flow based on an interface - * enslaved to an L3 master device - * @net: network namespace for device index lookup - * @ifindex: Interface index - * @fl4: IPv4 flow struct - */ - -int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) -{ - struct net_device *dev; - int rc = 0; - - if (ifindex) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, ifindex); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr) - rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr); - -int l3mdev_get_saddr6(struct net *net, const struct sock *sk, - struct flowi6 *fl6) -{ - struct net_device *dev; - int rc = 0; - - if (fl6->flowi6_oif) { - rcu_read_lock(); - - dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); - if (dev && netif_is_l3_slave(dev)) - dev = netdev_master_upper_dev_get_rcu(dev); - - if (dev && netif_is_l3_master(dev) && - dev->l3mdev_ops->l3mdev_get_saddr6) - rc = dev->l3mdev_ops->l3mdev_get_saddr6(dev, sk, fl6); - - rcu_read_unlock(); - } - - return rc; -} -EXPORT_SYMBOL_GPL(l3mdev_get_saddr6); +EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup); /** * l3mdev_fib_rule_match - Determine if flowi references an @@ -222,3 +166,38 @@ out: return rc; } + +void l3mdev_update_flow(struct net *net, struct flowi *fl) +{ + struct net_device *dev; + int ifindex; + + rcu_read_lock(); + + if (fl->flowi_oif) { + dev = dev_get_by_index_rcu(net, fl->flowi_oif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_oif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + goto out; + } + } + } + + if (fl->flowi_iif) { + dev = dev_get_by_index_rcu(net, fl->flowi_iif); + if (dev) { + ifindex = l3mdev_master_ifindex_rcu(dev); + if (ifindex) { + fl->flowi_iif = ifindex; + fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF; + } + } + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(l3mdev_update_flow); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8ae3ed97d95c..db916cf51ffe 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; -static long llc_ui_wait_for_conn(struct sock *sk, long timeout); +static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); @@ 
-551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout) return rc; } -static long llc_ui_wait_for_conn(struct sock *sk, long timeout) +static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT(wait); diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index afa94687d5e1..f6749dced021 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -304,10 +304,13 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, buf_size = IEEE80211_MAX_AMPDU_BUF; /* make sure the size doesn't exceed the maximum supported by the hw */ - if (buf_size > local->hw.max_rx_aggregation_subframes) - buf_size = local->hw.max_rx_aggregation_subframes; + if (buf_size > sta->sta.max_rx_aggregation_subframes) + buf_size = sta->sta.max_rx_aggregation_subframes; params.buf_size = buf_size; + ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n", + buf_size, sta->sta.addr); + /* examine state machine */ mutex_lock(&sta->ampdu_mlme.mtx); @@ -412,8 +415,10 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } end: - if (status == WLAN_STATUS_SUCCESS) + if (status == WLAN_STATUS_SUCCESS) { __set_bit(tid, sta->ampdu_mlme.agg_session_valid); + __clear_bit(tid, sta->ampdu_mlme.unexpected_agg); + } mutex_unlock(&sta->ampdu_mlme.mtx); end_no_lock: diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 543b1d4fc33d..fd6541f3ade3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2015-2016 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. */ @@ -39,7 +40,7 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, if (type == NL80211_IFTYPE_MONITOR && flags) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; } return wdev; @@ -73,8 +74,29 @@ static int ieee80211_change_iface(struct wiphy *wiphy, sdata->u.mgd.use_4addr = params->use_4addr; } - if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *monitor_sdata; + u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + monitor_sdata = rtnl_dereference(local->monitor_sdata); + if (monitor_sdata && + wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) { + memcpy(monitor_sdata->vif.bss_conf.mu_group.membership, + params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); + memcpy(monitor_sdata->vif.bss_conf.mu_group.position, + params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, + WLAN_USER_POSITION_LEN); + monitor_sdata->vif.mu_mimo_owner = true; + ieee80211_bss_info_change_notify(monitor_sdata, + BSS_CHANGED_MU_GROUPS); + + ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr, + params->macaddr); + } + + if (!flags) + return 0; if (ieee80211_sdata_running(sdata)) { u32 mask = MONITOR_FLAG_COOK_FRAMES | @@ -89,11 +111,11 @@ static int ieee80211_change_iface(struct wiphy *wiphy, * cooked_mntrs, monitor and all fif_* counters * reconfigure hardware */ - if ((*flags & mask) != (sdata->u.mntr_flags & mask)) + if ((*flags & mask) != (sdata->u.mntr.flags & mask)) return -EBUSY; ieee80211_adjust_monitor_flags(sdata, -1); - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); @@ -103,7 +125,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, * and 
ieee80211_do_open take care of "everything" * mentioned in the comment above. */ - sdata->u.mntr_flags = *flags; + sdata->u.mntr.flags = *flags; } } @@ -131,6 +153,149 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } +static int ieee80211_start_nan(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + int ret; + + mutex_lock(&sdata->local->chanctx_mtx); + ret = ieee80211_check_combinations(sdata, NULL, 0, 0); + mutex_unlock(&sdata->local->chanctx_mtx); + if (ret < 0) + return ret; + + ret = ieee80211_do_open(wdev, true); + if (ret) + return ret; + + ret = drv_start_nan(sdata->local, sdata, conf); + if (ret) + ieee80211_sdata_stop(sdata); + + sdata->u.nan.conf = *conf; + + return ret; +} + +static void ieee80211_stop_nan(struct wiphy *wiphy, + struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + drv_stop_nan(sdata->local, sdata); + ieee80211_sdata_stop(sdata); +} + +static int ieee80211_nan_change_conf(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, + u32 changes) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct cfg80211_nan_conf new_conf; + int ret = 0; + + if (sdata->vif.type != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!ieee80211_sdata_running(sdata)) + return -ENETDOWN; + + new_conf = sdata->u.nan.conf; + + if (changes & CFG80211_NAN_CONF_CHANGED_PREF) + new_conf.master_pref = conf->master_pref; + + if (changes & CFG80211_NAN_CONF_CHANGED_DUAL) + new_conf.dual = conf->dual; + + ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); + if (!ret) + sdata->u.nan.conf = new_conf; + + return ret; +} + +static int ieee80211_add_nan_func(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_func *nan_func) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + int ret; + + if (sdata->vif.type != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!ieee80211_sdata_running(sdata)) + return -ENETDOWN; + + spin_lock_bh(&sdata->u.nan.func_lock); + + ret = idr_alloc(&sdata->u.nan.function_inst_ids, + nan_func, 1, sdata->local->hw.max_nan_de_entries + 1, + GFP_ATOMIC); + spin_unlock_bh(&sdata->u.nan.func_lock); + + if (ret < 0) + return ret; + + nan_func->instance_id = ret; + + WARN_ON(nan_func->instance_id == 0); + + ret = drv_add_nan_func(sdata->local, sdata, nan_func); + if (ret) { + spin_lock_bh(&sdata->u.nan.func_lock); + idr_remove(&sdata->u.nan.function_inst_ids, + nan_func->instance_id); + spin_unlock_bh(&sdata->u.nan.func_lock); + } + + return ret; +} + +static struct cfg80211_nan_func * +ieee80211_find_nan_func_by_cookie(struct ieee80211_sub_if_data *sdata, + u64 cookie) +{ + struct cfg80211_nan_func *func; + int id; + + lockdep_assert_held(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) { + if (func->cookie == cookie) + return func; + } + + return NULL; +} + +static void ieee80211_del_nan_func(struct wiphy *wiphy, + struct wireless_dev *wdev, u64 cookie) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct cfg80211_nan_func *func; + u8 instance_id = 0; + + if (sdata->vif.type != NL80211_IFTYPE_NAN || + !ieee80211_sdata_running(sdata)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = ieee80211_find_nan_func_by_cookie(sdata, cookie); + if (func) + 
instance_id = func->instance_id; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + if (instance_id) + drv_del_nan_func(sdata->local, sdata, instance_id); +} + static int ieee80211_set_noack_map(struct wiphy *wiphy, struct net_device *dev, u16 noack_map) @@ -236,6 +401,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: @@ -2015,6 +2181,7 @@ static int ieee80211_scan(struct wiphy *wiphy, !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -2940,10 +3107,6 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, } chanctx = container_of(conf, struct ieee80211_chanctx, conf); - if (!chanctx) { - err = -EBUSY; - goto out; - } ch_switch.timestamp = 0; ch_switch.device_timestamp = 0; @@ -3360,6 +3523,63 @@ static int ieee80211_del_tx_ts(struct wiphy *wiphy, struct net_device *dev, return -ENOENT; } +void ieee80211_nan_func_terminated(struct ieee80211_vif *vif, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct cfg80211_nan_func *func; + u64 cookie; + + if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = idr_find(&sdata->u.nan.function_inst_ids, inst_id); + if (WARN_ON(!func)) { + spin_unlock_bh(&sdata->u.nan.func_lock); + return; + } + + cookie = func->cookie; + idr_remove(&sdata->u.nan.function_inst_ids, inst_id); + + spin_unlock_bh(&sdata->u.nan.func_lock); + + cfg80211_free_nan_func(func); + + cfg80211_nan_func_terminated(ieee80211_vif_to_wdev(vif), inst_id, + reason, cookie, gfp); +} +EXPORT_SYMBOL(ieee80211_nan_func_terminated); + +void ieee80211_nan_func_match(struct ieee80211_vif *vif, + struct cfg80211_nan_match_params *match, + gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct cfg80211_nan_func *func; + + if (WARN_ON(vif->type != NL80211_IFTYPE_NAN)) + return; + + spin_lock_bh(&sdata->u.nan.func_lock); + + func = idr_find(&sdata->u.nan.function_inst_ids, match->inst_id); + if (WARN_ON(!func)) { + spin_unlock_bh(&sdata->u.nan.func_lock); + return; + } + match->cookie = func->cookie; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + cfg80211_nan_match(ieee80211_vif_to_wdev(vif), match, gfp); +} +EXPORT_SYMBOL(ieee80211_nan_func_match); + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3445,4 +3665,9 @@ const struct cfg80211_ops mac80211_config_ops = { .set_ap_chanwidth = ieee80211_set_ap_chanwidth, .add_tx_ts = ieee80211_add_tx_ts, .del_tx_ts = ieee80211_del_tx_ts, + .start_nan = ieee80211_start_nan, + .stop_nan = ieee80211_stop_nan, + .nan_change_conf = ieee80211_nan_change_conf, + .add_nan_func = ieee80211_add_nan_func, + .del_nan_func = ieee80211_del_nan_func, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 74142d07ad31..e75cbf6ecc26 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -274,6 +274,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, ieee80211_get_max_required_bw(sdata)); break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_WDS: @@ -646,6 +647,9 @@ static int 
ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata, struct ieee80211_chanctx *curr_ctx = NULL; int ret = 0; + if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN)) + return -ENOTSUPP; + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, lockdep_is_held(&local->chanctx_mtx)); @@ -718,6 +722,7 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: continue; case NL80211_IFTYPE_STATION: if (!sdata->u.mgd.associated) @@ -980,6 +985,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata) case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: case NUM_NL80211_IFTYPES: WARN_ON(1); break; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2906c1004e1a..f56e2f487d09 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -71,138 +71,45 @@ DEBUGFS_READONLY_FILE(wep_iv, "%#08x", DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); -struct aqm_info { - struct ieee80211_local *local; - size_t size; - size_t len; - unsigned char buf[0]; -}; - -#define AQM_HDR_LEN 200 -#define AQM_HW_ENTRY_LEN 40 -#define AQM_TXQ_ENTRY_LEN 110 - -static int aqm_open(struct inode *inode, struct file *file) +static ssize_t aqm_read(struct file *file, + char __user *user_buf, + size_t count, + loff_t *ppos) { - struct ieee80211_local *local = inode->i_private; - struct ieee80211_sub_if_data *sdata; - struct sta_info *sta; - struct txq_info *txqi; + struct ieee80211_local *local = file->private_data; struct fq *fq = &local->fq; - struct aqm_info *info = NULL; + char buf[200]; int len = 0; - int i; - - if (!local->ops->wake_tx_queue) - return -EOPNOTSUPP; - - len += AQM_HDR_LEN; - len += 6 * AQM_HW_ENTRY_LEN; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - len += AQM_TXQ_ENTRY_LEN; - list_for_each_entry_rcu(sta, &local->sta_list, list) - len += AQM_TXQ_ENTRY_LEN * ARRAY_SIZE(sta->sta.txq); - rcu_read_unlock(); - - info = vmalloc(len); - if (!info) - return -ENOMEM; spin_lock_bh(&local->fq.lock); rcu_read_lock(); - file->private_data = info; - info->local = local; - info->size = len; - len = 0; - - len += scnprintf(info->buf + len, info->size - len, - "* hw\n" - "access name value\n" - "R fq_flows_cnt %u\n" - "R fq_backlog %u\n" - "R fq_overlimit %u\n" - "R fq_collisions %u\n" - "RW fq_limit %u\n" - "RW fq_quantum %u\n", - fq->flows_cnt, - fq->backlog, - fq->overlimit, - fq->collisions, - fq->limit, - fq->quantum); - - len += scnprintf(info->buf + len, - info->size - len, - "* vif\n" - "ifname addr ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - txqi = to_txq_info(sdata->vif.txq); - len += scnprintf(info->buf + len, info->size - len, - "%s %pM %u %u %u %u %u %u %u %u\n", - sdata->name, - sdata->vif.addr, - txqi->txq.ac, - txqi->tin.backlog_bytes, - txqi->tin.backlog_packets, - txqi->tin.flows, - txqi->tin.overlimit, - txqi->tin.collisions, - txqi->tin.tx_bytes, - txqi->tin.tx_packets); - } - - len += scnprintf(info->buf + len, - info->size - len, - "* sta\n" - "ifname addr tid ac backlog-bytes backlog-packets flows overlimit collisions tx-bytes tx-packets\n"); - - list_for_each_entry_rcu(sta, &local->sta_list, list) { - sdata = sta->sdata; - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - txqi = 
to_txq_info(sta->sta.txq[i]); - len += scnprintf(info->buf + len, info->size - len, - "%s %pM %d %d %u %u %u %u %u %u %u\n", - sdata->name, - sta->sta.addr, - txqi->txq.tid, - txqi->txq.ac, - txqi->tin.backlog_bytes, - txqi->tin.backlog_packets, - txqi->tin.flows, - txqi->tin.overlimit, - txqi->tin.collisions, - txqi->tin.tx_bytes, - txqi->tin.tx_packets); - } - } - - info->len = len; + len = scnprintf(buf, sizeof(buf), + "access name value\n" + "R fq_flows_cnt %u\n" + "R fq_backlog %u\n" + "R fq_overlimit %u\n" + "R fq_overmemory %u\n" + "R fq_collisions %u\n" + "R fq_memory_usage %u\n" + "RW fq_memory_limit %u\n" + "RW fq_limit %u\n" + "RW fq_quantum %u\n", + fq->flows_cnt, + fq->backlog, + fq->overmemory, + fq->overlimit, + fq->collisions, + fq->memory_usage, + fq->memory_limit, + fq->limit, + fq->quantum); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); - return 0; -} - -static int aqm_release(struct inode *inode, struct file *file) -{ - vfree(file->private_data); - return 0; -} - -static ssize_t aqm_read(struct file *file, - char __user *user_buf, - size_t count, - loff_t *ppos) -{ - struct aqm_info *info = file->private_data; - return simple_read_from_buffer(user_buf, count, ppos, - info->buf, info->len); + buf, len); } static ssize_t aqm_write(struct file *file, @@ -210,8 +117,7 @@ static ssize_t aqm_write(struct file *file, size_t count, loff_t *ppos) { - struct aqm_info *info = file->private_data; - struct ieee80211_local *local = info->local; + struct ieee80211_local *local = file->private_data; char buf[100]; size_t len; @@ -228,6 +134,8 @@ static ssize_t aqm_write(struct file *file, if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; + else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1) + return count; else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) return count; @@ -237,8 +145,7 @@ static ssize_t aqm_write(struct file *file, static const struct file_operations aqm_ops = { .write = aqm_write, .read = aqm_read, - .open = aqm_open, - .release = aqm_release, + .open = simple_open, .llseek = default_llseek, }; @@ -302,6 +209,7 @@ static const char *hw_flag_names[] = { FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), + FLAG(REPORTS_LOW_ACK), #undef FLAG }; @@ -428,7 +336,9 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); - DEBUGFS_ADD_MODE(aqm, 0600); + + if (local->ops->wake_tx_queue) + DEBUGFS_ADD_MODE(aqm, 0600); statsd = debugfs_create_dir("statistics", phyd); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index a5ba739cd2a7..bcec1240f41d 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -30,7 +30,7 @@ static ssize_t ieee80211_if_read( size_t count, loff_t *ppos, ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int)) { - char buf[70]; + char buf[200]; ssize_t ret = -EINVAL; read_lock(&dev_base_lock); @@ -486,6 +486,38 @@ static ssize_t ieee80211_if_fmt_num_buffered_multicast( } IEEE80211_IF_FILE_R(num_buffered_multicast); +static ssize_t ieee80211_if_fmt_aqm( + const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) +{ + struct ieee80211_local *local = sdata->local; + struct txq_info *txqi = to_txq_info(sdata->vif.txq); + int len; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + len = scnprintf(buf, + buflen, + "ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n" + "%u %u %u %u %u %u %u %u %u %u\n", + 
txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->cstats.drop_count, + txqi->cstats.ecn_mark, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + return len; +} +IEEE80211_IF_FILE_R(aqm); + /* IBSS attributes */ static ssize_t ieee80211_if_fmt_tsf( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -524,9 +556,15 @@ static ssize_t ieee80211_if_parse_tsf( ret = kstrtoull(buf, 10, &tsf); if (ret < 0) return ret; - if (tsf_is_delta) - tsf = drv_get_tsf(local, sdata) + tsf_is_delta * tsf; - if (local->ops->set_tsf) { + if (tsf_is_delta && local->ops->offset_tsf) { + drv_offset_tsf(local, sdata, tsf_is_delta * tsf); + wiphy_info(local->hw.wiphy, + "debugfs offset TSF by %018lld\n", + tsf_is_delta * tsf); + } else if (local->ops->set_tsf) { + if (tsf_is_delta) + tsf = drv_get_tsf(local, sdata) + + tsf_is_delta * tsf; drv_set_tsf(local, sdata, tsf); wiphy_info(local->hw.wiphy, "debugfs set TSF to %#018llx\n", tsf); @@ -618,6 +656,9 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz); DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); + + if (sdata->local->ops->wake_tx_queue) + DEBUGFS_ADD(aqm); } static void add_sta_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index fd334133ff45..a2fcdb47a0e6 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -133,6 +133,55 @@ static ssize_t sta_last_seq_ctrl_read(struct file *file, char __user *userbuf, } STA_OPS(last_seq_ctrl); +#define AQM_TXQ_ENTRY_LEN 130 + +static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct sta_info *sta = file->private_data; + struct ieee80211_local *local = sta->local; + size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1); + char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf; + struct txq_info *txqi; + ssize_t rv; + int i; + + if (!buf) + return -ENOMEM; + + spin_lock_bh(&local->fq.lock); + rcu_read_lock(); + + p += scnprintf(p, + bufsz+buf-p, + "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"); + + for (i = 0; i < IEEE80211_NUM_TIDS; i++) { + txqi = to_txq_info(sta->sta.txq[i]); + p += scnprintf(p, bufsz+buf-p, + "%d %d %u %u %u %u %u %u %u %u %u\n", + txqi->txq.tid, + txqi->txq.ac, + txqi->tin.backlog_bytes, + txqi->tin.backlog_packets, + txqi->tin.flows, + txqi->cstats.drop_count, + txqi->cstats.ecn_mark, + txqi->tin.overlimit, + txqi->tin.collisions, + txqi->tin.tx_bytes, + txqi->tin.tx_packets); + } + + rcu_read_unlock(); + spin_unlock_bh(&local->fq.lock); + + rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return rv; +} +STA_OPS(aqm); + static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -478,6 +527,9 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); DEBUGFS_ADD_COUNTER(tx_filtered, status_stats.filtered); + if (local->ops->wake_tx_queue) + DEBUGFS_ADD(aqm); + if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, sta->debugfs_dir, @@ -492,10 +544,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) void ieee80211_sta_debugfs_remove(struct sta_info *sta) { - struct ieee80211_local *local 
= sta->local; - struct ieee80211_sub_if_data *sdata = sta->sdata; - - drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index c258f1041d33..bb886e7db47f 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -62,7 +62,7 @@ int drv_add_interface(struct ieee80211_local *local, if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)))) + !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); @@ -215,6 +215,21 @@ void drv_set_tsf(struct ieee80211_local *local, trace_drv_return_void(local); } +void drv_offset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset) +{ + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_offset_tsf(local, sdata, offset); + if (local->ops->offset_tsf) + local->ops->offset_tsf(&local->hw, &sdata->vif, offset); + trace_drv_return_void(local); +} + void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index ba5fc1f01e53..09f77e4a8a79 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -162,7 +162,9 @@ static inline void drv_bss_info_changed(struct ieee80211_local *local, return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || - sdata->vif.type == NL80211_IFTYPE_MONITOR)) + sdata->vif.type == NL80211_IFTYPE_NAN || + (sdata->vif.type == NL80211_IFTYPE_MONITOR && + !sdata->vif.mu_mimo_owner))) return; if (!check_sdata_in_driver(sdata)) @@ -498,21 +500,6 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } - -static inline void drv_sta_remove_debugfs(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, - struct dentry *dir) -{ - might_sleep(); - - sdata = get_bss_sdata(sdata); - check_sdata_in_driver(sdata); - - if (local->ops->sta_remove_debugfs) - local->ops->sta_remove_debugfs(&local->hw, &sdata->vif, - sta, dir); -} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, @@ -582,6 +569,9 @@ u64 drv_get_tsf(struct ieee80211_local *local, void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf); +void drv_offset_tsf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset); void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); @@ -1088,13 +1078,13 @@ static inline void drv_leave_ibss(struct ieee80211_local *local, } static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, - struct ieee80211_sta *sta) + struct sta_info *sta) { u32 ret = 0; - trace_drv_get_expected_throughput(sta); - if (local->ops->get_expected_throughput) - ret = local->ops->get_expected_throughput(&local->hw, sta); + trace_drv_get_expected_throughput(&sta->sta); + if (local->ops->get_expected_throughput && sta->uploaded) + ret = local->ops->get_expected_throughput(&local->hw, &sta->sta); trace_drv_return_u32(local, ret); return ret; @@ -1179,4 +1169,83 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, local->ops->wake_tx_queue(&local->hw, &txq->txq); } 
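/*
 * A note on the TSF ops above: drv_offset_tsf() lets the hardware apply a
 * signed delta atomically, while the older get/set pair is inherently racy
 * because the TSF keeps running between the read and the write. A minimal
 * sketch of the calling pattern (the helper name is illustrative only;
 * mesh_sync_adjust_tbtt() further down uses the same logic):
 */
static void example_apply_tsf_delta(struct ieee80211_local *local,
				    struct ieee80211_sub_if_data *sdata,
				    s64 delta)
{
	if (local->ops->offset_tsf) {
		/* single atomic adjustment in the driver/hardware */
		drv_offset_tsf(local, sdata, delta);
	} else {
		u64 tsf = drv_get_tsf(local, sdata);

		/* -1ULL means the driver could not report the TSF */
		if (tsf != -1ULL)
			drv_set_tsf(local, sdata, tsf + delta);
	}
}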
+static inline int drv_start_nan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_start_nan(local, sdata, conf); + ret = local->ops->start_nan(&local->hw, &sdata->vif, conf); + trace_drv_return_int(local, ret); + return ret; +} + +static inline void drv_stop_nan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_stop_nan(local, sdata); + local->ops->stop_nan(&local->hw, &sdata->vif); + trace_drv_return_void(local); +} + +static inline int drv_nan_change_conf(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf, + u32 changes) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + if (!local->ops->nan_change_conf) + return -EOPNOTSUPP; + + trace_drv_nan_change_conf(local, sdata, conf, changes); + ret = local->ops->nan_change_conf(&local->hw, &sdata->vif, conf, + changes); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline int drv_add_nan_func(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_nan_func *nan_func) +{ + int ret; + + might_sleep(); + check_sdata_in_driver(sdata); + + if (!local->ops->add_nan_func) + return -EOPNOTSUPP; + + trace_drv_add_nan_func(local, sdata, nan_func); + ret = local->ops->add_nan_func(&local->hw, &sdata->vif, nan_func); + trace_drv_return_int(local, ret); + + return ret; +} + +static inline void drv_del_nan_func(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u8 instance_id) +{ + might_sleep(); + check_sdata_in_driver(sdata); + + trace_drv_del_nan_func(local, sdata, instance_id); + if (local->ops->del_nan_func) + local->ops->del_nan_func(&local->hw, &sdata->vif, instance_id); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f56d342c31b8..34c2add2c455 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -3,7 +3,7 @@ * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> - * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2013-2015 Intel Mobile Communications GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -86,6 +86,8 @@ struct ieee80211_local; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) +#define IEEE80211_MAX_NAN_INSTANCE_ID 255 + struct ieee80211_fragment_entry { struct sk_buff_head skb_list; unsigned long first_frag_time; @@ -813,17 +815,39 @@ enum txq_info_flags { * @def_flow: used as a fallback flow when a packet destined to @tin hashes to * a fq_flow which is already owned by a different tin * @def_cvars: codel vars for @def_flow + * @frags: used to keep fragments created after dequeue */ struct txq_info { struct fq_tin tin; struct fq_flow def_flow; struct codel_vars def_cvars; + struct codel_stats cstats; + struct sk_buff_head frags; unsigned long flags; /* keep last! 
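 * (struct ieee80211_txq ends in a driver-private area sized at runtime,
 * so it must remain the final member)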
*/ struct ieee80211_txq txq; }; +struct ieee80211_if_mntr { + u32 flags; + u8 mu_follow_addr[ETH_ALEN] __aligned(2); +}; + +/** + * struct ieee80211_if_nan - NAN state + * + * @conf: current NAN configuration + * @func_ids: a bitmap of available instance_id's + */ +struct ieee80211_if_nan { + struct cfg80211_nan_conf conf; + + /* protects function_inst_ids */ + spinlock_t func_lock; + struct idr function_inst_ids; +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -922,7 +946,8 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ibss ibss; struct ieee80211_if_mesh mesh; struct ieee80211_if_ocb ocb; - u32 mntr_flags; + struct ieee80211_if_mntr mntr; + struct ieee80211_if_nan nan; } u; #ifdef CONFIG_MAC80211_DEBUGFS @@ -1112,7 +1137,6 @@ struct ieee80211_local { struct fq fq; struct codel_vars *cvars; struct codel_params cparams; - struct codel_stats cstats; const struct ieee80211_ops *ops; @@ -1208,7 +1232,7 @@ struct ieee80211_local { spinlock_t tim_lock; unsigned long num_sta; struct list_head sta_list; - struct rhashtable sta_hash; + struct rhltable sta_hash; struct timer_list sta_cleanup; int sta_generation; @@ -1476,6 +1500,13 @@ static inline struct txq_info *to_txq_info(struct ieee80211_txq *txq) return container_of(txq, struct txq_info, txq); } +static inline bool txq_has_queue(struct ieee80211_txq *txq) +{ + struct txq_info *txqi = to_txq_info(txq); + + return !(skb_queue_empty(&txqi->frags) && !txqi->tin.backlog_packets); +} + static inline int ieee80211_bssid_match(const u8 *raddr, const u8 *addr) { return ether_addr_equal(raddr, addr) || diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b123a9e325b3..638ec0759078 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -43,6 +43,8 @@ * by either the RTNL, the iflist_mtx or RCU. 
*/ +static void ieee80211_iface_work(struct work_struct *work); + bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata) { struct ieee80211_chanctx_conf *chanctx_conf; @@ -188,7 +190,7 @@ static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, continue; if (iter->vif.type == NL80211_IFTYPE_MONITOR && - !(iter->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + !(iter->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; m = iter->vif.addr; @@ -217,7 +219,7 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) return -EBUSY; if (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) check_dup = false; ret = ieee80211_verify_mac(sdata, sa->sa_data, check_dup); @@ -325,6 +327,9 @@ static int ieee80211_check_queues(struct ieee80211_sub_if_data *sdata, int n_queues = sdata->local->hw.queues; int i; + if (iftype == NL80211_IFTYPE_NAN) + return 0; + if (iftype != NL80211_IFTYPE_P2P_DEVICE) { for (i = 0; i < IEEE80211_NUM_ACS; i++) { if (WARN_ON_ONCE(sdata->vif.hw_queue[i] == @@ -357,7 +362,7 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset) { struct ieee80211_local *local = sdata->local; - u32 flags = sdata->u.mntr_flags; + u32 flags = sdata->u.mntr.flags; #define ADJUST(_f, _s) do { \ if (flags & MONITOR_FLAG_##_f) \ @@ -448,6 +453,9 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + skb_queue_head_init(&sdata->skb_queue); + INIT_WORK(&sdata->work, ieee80211_iface_work); + return 0; } @@ -540,6 +548,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_NAN: /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: @@ -589,12 +598,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) } break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs++; break; } - if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { + if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { res = drv_add_interface(local, sdata); if (res) goto err_stop; @@ -641,7 +650,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } - if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); ieee80211_bss_info_change_notify(sdata, changed); @@ -655,6 +665,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: break; default: /* not reached */ @@ -787,6 +798,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, struct ps_data *ps; struct cfg80211_chan_def chandef; bool cancel_scan; + struct cfg80211_nan_func *func; clear_bit(SDATA_STATE_RUNNING, &sdata->state); @@ -926,7 +938,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) { + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { local->cooked_mntrs--; break; } @@ -939,6 +951,18 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_adjust_monitor_flags(sdata, -1); break; + case 
NL80211_IFTYPE_NAN: + /* clean all the functions */ + spin_lock_bh(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, i) { + idr_remove(&sdata->u.nan.function_inst_ids, i); + cfg80211_free_nan_func(func); + } + idr_destroy(&sdata->u.nan.function_inst_ids); + + spin_unlock_bh(&sdata->u.nan.func_lock); + break; case NL80211_IFTYPE_P2P_DEVICE: /* relies on synchronize_rcu() below */ RCU_INIT_POINTER(local->p2p_sdata, NULL); @@ -1012,7 +1036,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); - if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) break; /* fall through */ @@ -1444,12 +1468,17 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_MONITOR: sdata->dev->type = ARPHRD_IEEE80211_RADIOTAP; sdata->dev->netdev_ops = &ieee80211_monitorif_ops; - sdata->u.mntr_flags = MONITOR_FLAG_CONTROL | + sdata->u.mntr.flags = MONITOR_FLAG_CONTROL | MONITOR_FLAG_OTHER_BSS; break; case NL80211_IFTYPE_WDS: sdata->vif.bss_conf.bssid = NULL; break; + case NL80211_IFTYPE_NAN: + idr_init(&sdata->u.nan.function_inst_ids); + spin_lock_init(&sdata->u.nan.func_lock); + sdata->vif.bss_conf.bssid = sdata->vif.addr; + break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_DEVICE: sdata->vif.bss_conf.bssid = sdata->vif.addr; @@ -1717,7 +1746,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ASSERT_RTNL(); - if (type == NL80211_IFTYPE_P2P_DEVICE) { + if (type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN) { struct wireless_dev *wdev; sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index d00ea9b13f49..1075ac24c8c5 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -660,6 +660,9 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ieee80211_roc_setup(local); + local->hw.radiotap_timestamp.units_pos = -1; + local->hw.radiotap_timestamp.accuracy = -1; + return &local->hw; err_free: wiphy_free(wiphy); @@ -818,6 +821,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) !local->ops->tdls_recv_channel_switch)) return -EOPNOTSUPP; + if (WARN_ON(local->hw.wiphy->interface_modes & + BIT(NL80211_IFTYPE_NAN) && + (!local->ops->start_nan || !local->ops->stop_nan))) + return -EINVAL; + #ifdef CONFIG_PM if (hw->wiphy->wowlan && (!local->ops->suspend || !local->ops->resume)) return -EINVAL; @@ -1055,6 +1063,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->dynamic_ps_forced_timeout = -1; + if (!local->hw.max_nan_de_entries) + local->hw.max_nan_de_entries = IEEE80211_MAX_NAN_INSTANCE_ID; + result = ieee80211_wep_init(local); if (result < 0) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index faccef977670..b747c9645e43 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -326,22 +326,33 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, u32 tx_time, estimated_retx; u64 result; - if (sta->mesh->fail_avg >= 100) - return MAX_METRIC; + /* Try to get rate based on HW/SW RC algorithm. 
+ * Rate is returned in units of Kbps, correct this + * to comply with airtime calculation units + * Round up in case we get rate < 100Kbps + */ + rate = DIV_ROUND_UP(sta_get_expected_throughput(sta), 100); - sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); - rate = cfg80211_calculate_bitrate(&rinfo); - if (WARN_ON(!rate)) - return MAX_METRIC; + if (rate) { + err = 0; + } else { + if (sta->mesh->fail_avg >= 100) + return MAX_METRIC; - err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; + sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); + rate = cfg80211_calculate_bitrate(&rinfo); + if (WARN_ON(!rate)) + return MAX_METRIC; + + err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; + } /* bitrate is in units of 100 Kbps, while we need rate in units of * 1Mbps. This will be corrected on tx_time computation. */ tx_time = (device_constant + 10 * test_frame_len / rate); estimated_retx = ((1 << (2 * ARITH_SHIFT)) / (s_unit - err)); - result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT) ; + result = (tx_time * estimated_retx) >> (2 * ARITH_SHIFT); return (u32)result; } diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index 64bc22ad9496..faca22cd02b5 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -28,7 +28,7 @@ * could be, for instance, in case a neighbor is restarted and its TSF counter * reset. */ -#define TOFFSET_MAXIMUM_ADJUSTMENT 30000 /* 30 ms */ +#define TOFFSET_MAXIMUM_ADJUSTMENT 800 /* 0.8 ms */ struct sync_method { u8 method; @@ -70,9 +70,13 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) } spin_unlock_bh(&ifmsh->sync_offset_lock); - tsf = drv_get_tsf(local, sdata); - if (tsf != -1ULL) - drv_set_tsf(local, sdata, tsf + tsfdelta); + if (local->ops->offset_tsf) { + drv_offset_tsf(local, sdata, tsfdelta); + } else { + tsf = drv_get_tsf(local, sdata); + if (tsf != -1ULL) + drv_set_tsf(local, sdata, tsf + tsfdelta); + } } static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8d426f637f58..7486f2dab4ba 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1672,11 +1672,15 @@ __ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata) non_acm_ac++) if (!(sdata->wmm_acm & BIT(7 - 2 * non_acm_ac))) break; - /* The loop will result in using BK even if it requires - * admission control, such configuration makes no sense - * and we have to transmit somehow - the AC selection - * does the same thing. + /* Usually the loop will result in using BK even if it + * requires admission control, but such a configuration + * makes no sense and we have to transmit somehow - the + * AC selection does the same thing. + * If we started out trying to downgrade from BK, then + * the extra condition here might be needed. 
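+ * Clamp so the sdata->tx_conf[] lookup below stays within bounds.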
*/ + if (non_acm_ac >= IEEE80211_NUM_ACS) + non_acm_ac = IEEE80211_AC_BK; if (drv_conf_tx(local, sdata, ac, &sdata->tx_conf[non_acm_ac])) sdata_err(sdata, diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 55a9c5b94ce1..c3f610bba3fe 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -128,7 +128,8 @@ void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) if (!ieee80211_sdata_running(sdata)) continue; - if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) + if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || + sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) @@ -838,6 +839,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 00a43a70e1fc..28a3a0957c9e 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -178,8 +178,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) WARN_ON(!list_empty(&local->chanctx_list)); /* stop hardware - this must stop RX */ - if (local->open_count) - ieee80211_stop_device(local); + ieee80211_stop_device(local); suspend: local->suspended = true; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9dce3b157908..6175db385ba7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -180,6 +180,11 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 12; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + len = ALIGN(len, 8); + len += 12; + } + if (status->chains) { /* antenna and antenna signal fields */ len += 2 * hweight8(status->chains); @@ -447,6 +452,31 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos += 2; } + if (local->hw.radiotap_timestamp.units_pos >= 0) { + u16 accuracy = 0; + u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; + + rthdr->it_present |= + cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + + /* ensure 8 byte alignment */ + while ((pos - (u8 *)rthdr) & 7) + pos++; + + put_unaligned_le64(status->device_timestamp, pos); + pos += sizeof(u64); + + if (local->hw.radiotap_timestamp.accuracy >= 0) { + accuracy = local->hw.radiotap_timestamp.accuracy; + flags |= IEEE80211_RADIOTAP_TIMESTAMP_FLAG_ACCURACY; + } + put_unaligned_le16(accuracy, pos); + pos += sizeof(u16); + + *pos++ = local->hw.radiotap_timestamp.units_pos; + *pos++ = flags; + } + for_each_set_bit(chain, &chains, IEEE80211_MAX_CHAINS) { *pos++ = status->chain_signal[chain]; *pos++ = chain; @@ -485,6 +515,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, struct net_device *prev_dev = NULL; int present_fcs_len = 0; unsigned int rtap_vendor_space = 0; + struct ieee80211_mgmt *mgmt; + struct ieee80211_sub_if_data *monitor_sdata = + rcu_dereference(local->monitor_sdata); if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -567,7 +600,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (sdata->vif.type != NL80211_IFTYPE_MONITOR) continue; - if (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) continue; if (!ieee80211_sdata_running(sdata)) @@ -585,6 +618,23 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, ieee80211_rx_stats(sdata->dev, skb->len); } + mgmt = (void *)skb->data; + if (monitor_sdata && + 
skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN && + ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_VHT && + mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT && + is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) && + ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) { + struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC); + + if (mu_skb) { + mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&monitor_sdata->skb_queue, mu_skb); + ieee80211_queue_work(&local->hw, &monitor_sdata->work); + } + } + if (prev_dev) { skb->dev = prev_dev; netif_receive_skb(skb); @@ -1072,8 +1122,15 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); - if (!tid_agg_rx) + if (!tid_agg_rx) { + if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK && + !test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) && + !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) + ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, + WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP); goto dont_reorder; + } /* qos null data frames are excluded */ if (unlikely(hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_NULLFUNC))) @@ -1266,9 +1323,7 @@ static void sta_ps_start(struct sta_info *sta) return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - - if (txqi->tin.backlog_packets) + if (txq_has_queue(sta->sta.txq[tid])) set_bit(tid, &sta->txq_buffered_tids); else clear_bit(tid, &sta->txq_buffered_tids); @@ -2535,6 +2590,12 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid = le16_to_cpu(bar_data.control) >> 12; + if (!test_bit(tid, rx->sta->ampdu_mlme.agg_session_valid) && + !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) + ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, + WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_REQUIRE_SETUP); + tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) return RX_DROP_MONITOR; @@ -3147,7 +3208,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR || - !(sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) + !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)) continue; if (prev_dev) { @@ -3523,6 +3584,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) ieee80211_is_probe_req(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control) || ieee80211_is_beacon(hdr->frame_control); + case NL80211_IFTYPE_NAN: + /* Currently no frames on NAN interface are allowed */ + return false; default: break; } @@ -3940,7 +4004,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, __le16 fc; struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; - struct rhash_head *tmp; + struct rhlist_head *tmp; int err = 0; fc = ((struct ieee80211_hdr *)skb->data)->frame_control; @@ -3983,13 +4047,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, goto out; } else if (ieee80211_is_data(fc)) { struct sta_info *sta, *prev_sta; - const struct bucket_table *tbl; prev_sta = NULL; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr2, sta, tmp) { + for_each_sta_info(local, hdr->addr2, sta, tmp) { if (!prev_sta) { 
prev_sta = sta; continue; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 070b40f15850..23d8ac829279 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -420,7 +420,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, { struct ieee80211_local *local = hw_to_local(hw); - trace_api_scan_completed(local, info); + trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aa58df80ede0..78e9ecbc96e6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -67,12 +67,10 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ - .insecure_elasticity = true, /* Disable chain-length checks. */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, - .hashfn = sta_addr_hash, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; @@ -80,8 +78,8 @@ static const struct rhashtable_params sta_rht_params = { static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_remove_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_remove(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void __cleanup_single_sta(struct sta_info *sta) @@ -157,19 +155,22 @@ static void cleanup_single_sta(struct sta_info *sta) sta_info_free(local, sta); } +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr) +{ + return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); +} + /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold @@ -190,14 +191,11 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; rcu_read_lock(); - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); @@ -263,8 +261,8 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { - return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node, - sta_rht_params); + return rhltable_insert(&local->sta_hash, &sta->hash_node, + sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) @@ -340,6 +338,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); + sta->sta.max_rx_aggregation_subframes = + local->hw.max_rx_aggregation_subframes; + sta->local = local; sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; @@ -450,9 +451,9 @@ static int sta_info_insert_check(struct sta_info *sta) 
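/*
 * The rhashtable -> rhltable conversion above deserves a remark: an
 * rhltable chains all entries that share one key (here the MAC address,
 * since the same peer may have a station entry on several vifs), so the
 * custom sta_addr_hash() and the insecure_elasticity workaround can go
 * away. A minimal lookup sketch using the new for_each_sta_info()
 * (illustrative helper, RCU read lock held):
 */
static struct sta_info *example_first_sta(struct ieee80211_local *local,
					  const u8 *addr)
{
	struct rhlist_head *tmp;
	struct sta_info *sta;

	for_each_sta_info(local, addr, sta, tmp)
		return sta; /* first entry on this address's list */

	return NULL;
}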
is_multicast_ether_addr(sta->sta.addr))) return -EINVAL; - /* Strictly speaking this isn't necessary as we hold the mutex, but - * the rhashtable code can't really deal with that distinction. We - * do require the mutex for correctness though. + /* The RCU read lock is required by rhashtable due to + * asynchronous resize/rehash. We also require the mutex + * for correctness. */ rcu_read_lock(); lockdep_assert_held(&sdata->local->sta_mtx); @@ -687,7 +688,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } /* No need to do anything if the driver does all */ - if (ieee80211_hw_check(&local->hw, AP_LINK_PS)) + if (!local->ops->set_tim) return; if (sta->dead) @@ -1040,16 +1041,11 @@ static void sta_info_cleanup(unsigned long data) round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } -u32 sta_addr_hash(const void *key, u32 length, u32 seed) -{ - return jhash(key, ETH_ALEN, seed); -} - int sta_info_init(struct ieee80211_local *local) { int err; - err = rhashtable_init(&local->sta_hash, &sta_rht_params); + err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; @@ -1065,7 +1061,7 @@ int sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { del_timer_sync(&local->sta_cleanup); - rhashtable_destroy(&local->sta_hash); + rhltable_destroy(&local->sta_hash); } @@ -1135,17 +1131,14 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; - const struct bucket_table *tbl; - - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); /* * Just return a random station if localaddr is NULL * ... first in list. 
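 * (best effort: the order of entries in the hash table is arbitrary)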
*/ - for_each_sta_info(local, tbl, addr, sta, tmp) { + for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; @@ -1209,12 +1202,10 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) if (sta->sta.txq[0]) { for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[i]); - - if (!txqi->tin.backlog_packets) + if (!txq_has_queue(sta->sta.txq[i])) continue; - drv_wake_tx_queue(local, txqi); + drv_wake_tx_queue(local, to_txq_info(sta->sta.txq[i])); } } @@ -1645,10 +1636,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, return; for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { - struct txq_info *txqi = to_txq_info(sta->sta.txq[tid]); - if (!(driver_release_tids & BIT(tid)) || - txqi->tin.backlog_packets) + txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); @@ -2279,11 +2268,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); - /* check if the driver has a SW RC implementation */ - if (ref && ref->ops->get_expected_throughput) - thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); - else - thr = drv_get_expected_throughput(local, &sta->sta); + thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); @@ -2291,6 +2276,25 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) } } +u32 sta_get_expected_throughput(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct rate_control_ref *ref = NULL; + u32 thr = 0; + + if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) + ref = local->rate_ctrl; + + /* check if the driver has a SW RC implementation */ + if (ref && ref->ops->get_expected_throughput) + thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); + else + thr = drv_get_expected_throughput(local, sta); + + return thr; +} + unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 78b0ef32dddd..ed5fcb984a01 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -230,6 +230,8 @@ struct tid_ampdu_rx { * @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the * driver requested to close until the work for it runs * @agg_session_valid: bitmap indicating which TID has a rx BA session open on + * @unexpected_agg: bitmap indicating which TID already sent a delBA due to + * unexpected aggregation related frames outside a session * @work: work struct for starting/stopping aggregation * @tid_tx: aggregation info for Tx per TID * @tid_start_tx: sessions where start was requested @@ -244,6 +246,7 @@ struct sta_ampdu_mlme { unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; + unsigned long unexpected_agg[BITS_TO_LONGS(IEEE80211_NUM_TIDS)]; /* tx */ struct work_struct work; struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS]; @@ -452,7 +455,7 @@ struct sta_info { /* General information, mostly static */ struct list_head list, free_list; struct rcu_head rcu_head; - struct rhash_head hash_node; + struct rhlist_head hash_node; u8 addr[ETH_ALEN]; struct 
ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -635,6 +638,9 @@ rcu_dereference_protected_tid_tx(struct sta_info *sta, int tid) */ #define STA_INFO_CLEANUP_INTERVAL (10 * HZ) +struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, + const u8 *addr); + /* * Get a STA info, must be under RCU read lock. */ @@ -644,17 +650,9 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr); -u32 sta_addr_hash(const void *key, u32 length, u32 seed); - -#define _sta_bucket_idx(_tbl, _a) \ - rht_bucket_index(_tbl, sta_addr_hash(_a, ETH_ALEN, (_tbl)->hash_rnd)) - -#define for_each_sta_info(local, tbl, _addr, _sta, _tmp) \ - rht_for_each_entry_rcu(_sta, _tmp, tbl, \ - _sta_bucket_idx(tbl, _addr), \ - hash_node) \ - /* compare address and run code only if it matches */ \ - if (ether_addr_equal(_sta->addr, (_addr))) +#define for_each_sta_info(local, _addr, _sta, _tmp) \ + rhl_for_each_entry_rcu(_sta, _tmp, \ + sta_info_hash_lookup(local, _addr), hash_node) /* * Get STA info by index, BROKEN! @@ -712,6 +710,8 @@ void sta_set_rate_info_tx(struct sta_info *sta, struct rate_info *rinfo); void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo); +u32 sta_get_expected_throughput(struct sta_info *sta); + void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time); u8 sta_info_tx_streams(struct sta_info *sta); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a2a68269675d..ddf71c648cab 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -557,6 +557,12 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { + /* If driver relies on its own algorithm for station kickout, skip + * mac80211 packet loss mechanism. 
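+ * Such drivers advertise IEEE80211_HW_REPORTS_LOW_ACK, checked below.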
+ */ + if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) + return; + /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) @@ -709,7 +715,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, if (!ieee80211_sdata_running(sdata)) continue; - if ((sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES) && + if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; @@ -740,8 +746,8 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); __le16 fc; struct ieee80211_supported_band *sband; + struct rhlist_head *tmp; struct sta_info *sta; - struct rhash_head *tmp; int retry_count; int rates_idx; bool send_to_cooked; @@ -749,7 +755,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) struct ieee80211_bar *bar; int shift = 0; int tid = IEEE80211_NUM_TIDS; - const struct bucket_table *tbl; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); @@ -758,9 +763,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; - tbl = rht_dereference_rcu(local->sta_hash.tbl, &local->sta_hash); - - for_each_sta_info(local, tbl, hdr->addr1, sta, tmp) { + for_each_sta_info(local, hdr->addr1, sta, tmp) { /* skip wrong virtual interface */ if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) continue; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 77e4c53baefb..92a47afaa989 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -984,6 +984,32 @@ TRACE_EVENT(drv_set_tsf, ) ); +TRACE_EVENT(drv_offset_tsf, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + s64 offset), + + TP_ARGS(local, sdata, offset), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(s64, tsf_offset) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->tsf_offset = offset; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " tsf offset:%lld", + LOCAL_PR_ARG, VIF_PR_ARG, + (unsigned long long)__entry->tsf_offset + ) +); + DEFINE_EVENT(local_sdata_evt, drv_reset_tsf, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), @@ -1700,6 +1726,139 @@ TRACE_EVENT(drv_get_expected_throughput, ) ); +TRACE_EVENT(drv_start_nan, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf), + + TP_ARGS(local, sdata, conf), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, master_pref) + __field(u8, dual) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", master preference: %u, dual: %d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, + __entry->dual + ) +); + +TRACE_EVENT(drv_stop_nan, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + + TP_ARGS(local, sdata), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT, + LOCAL_PR_ARG, VIF_PR_ARG + ) +); + +TRACE_EVENT(drv_nan_change_conf, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct cfg80211_nan_conf *conf, + u32 changes), + + TP_ARGS(local, sdata, conf, changes), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, 
master_pref) + __field(u8, dual) + __field(u32, changes) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + __entry->changes = changes; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", master preference: %u, dual: %d, changes: 0x%x", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->master_pref, + __entry->dual, __entry->changes + ) +); + +TRACE_EVENT(drv_add_nan_func, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + const struct cfg80211_nan_func *func), + + TP_ARGS(local, sdata, func), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, type) + __field(u8, inst_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->type = func->type; + __entry->inst_id = func->instance_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", type: %u, inst_id: %u", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->type, __entry->inst_id + ) +); + +TRACE_EVENT(drv_del_nan_func, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + u8 instance_id), + + TP_ARGS(local, sdata, instance_id), + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u8, instance_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->instance_id = instance_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT + ", instance_id: %u", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->instance_id + ) +); + /* * Tracing for API calls that drivers call. */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 18b285e06bc8..1c56abc49627 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -796,36 +796,6 @@ static __le16 ieee80211_tx_next_seq(struct sta_info *sta, int tid) return ret; } -static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, - struct ieee80211_vif *vif, - struct ieee80211_sta *pubsta, - struct sk_buff *skb) -{ - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_txq *txq = NULL; - - if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || - (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) - return NULL; - - if (!ieee80211_is_data(hdr->frame_control)) - return NULL; - - if (pubsta) { - u8 tid = skb->priority & IEEE80211_QOS_CTL_TID_MASK; - - txq = pubsta->txq[tid]; - } else if (vif) { - txq = vif->txq; - } - - if (!txq) - return NULL; - - return to_txq_info(txq); -} - static ieee80211_tx_result debug_noinline ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { @@ -883,9 +853,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) tid = *qc & IEEE80211_QOS_CTL_TID_MASK; tx->sta->tx_stats.msdu[tid]++; - if (!ieee80211_get_txq(tx->local, info->control.vif, &tx->sta->sta, - tx->skb)) - hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); + hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); return TX_CONTINUE; } @@ -1274,6 +1242,36 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } +static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, + struct ieee80211_vif *vif, + struct ieee80211_sta *pubsta, + struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_txq *txq = NULL; + + if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) || + (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)) + return NULL; + + if (!ieee80211_is_data(hdr->frame_control)) + return NULL; + + if (pubsta) { + u8 tid = skb->priority & 
IEEE80211_QOS_CTL_TID_MASK; + + txq = pubsta->txq[tid]; + } else if (vif) { + txq = vif->txq; + } + + if (!txq) + return NULL; + + return to_txq_info(txq); +} + static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); @@ -1344,7 +1342,7 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); cparams = &local->cparams; - cstats = &local->cstats; + cstats = &txqi->cstats; if (flow == &txqi->def_flow) cvars = &txqi->def_cvars; @@ -1404,6 +1402,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, fq_tin_init(&txqi->tin); fq_flow_init(&txqi->def_flow); codel_vars_init(&txqi->def_cvars); + codel_stats_init(&txqi->cstats); + __skb_queue_head_init(&txqi->frags); txqi->txq.vif = &sdata->vif; @@ -1426,6 +1426,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, struct fq_tin *tin = &txqi->tin; fq_tin_reset(fq, tin, fq_skb_free_func); + ieee80211_purge_tx_queue(&local->hw, &txqi->frags); } int ieee80211_txq_setup_flows(struct ieee80211_local *local) @@ -1433,6 +1434,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) struct fq *fq = &local->fq; int ret; int i; + bool supp_vht = false; + enum nl80211_band band; if (!local->ops->wake_tx_queue) return 0; @@ -1441,8 +1444,24 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) if (ret) return ret; + /* + * If the hardware doesn't support VHT, it is safe to limit the maximum + * queue size. 4 Mbytes is 64 max-size aggregates in 802.11n. + */ + for (band = 0; band < NUM_NL80211_BANDS; band++) { + struct ieee80211_supported_band *sband; + + sband = local->hw.wiphy->bands[band]; + if (!sband) + continue; + + supp_vht = supp_vht || sband->vht_cap.vht_supported; + } + + if (!supp_vht) + fq->memory_limit = 4 << 20; /* 4 Mbytes */ + codel_params_init(&local->cparams); - codel_stats_init(&local->cstats); local->cparams.interval = MS2TIME(100); local->cparams.target = MS2TIME(20); local->cparams.ecn = true; @@ -1477,54 +1496,46 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) spin_unlock_bh(&fq->lock); } -struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, - struct ieee80211_txq *txq) +static bool ieee80211_queue_skb(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct sk_buff *skb) { - struct ieee80211_local *local = hw_to_local(hw); - struct txq_info *txqi = container_of(txq, struct txq_info, txq); - struct ieee80211_hdr *hdr; - struct sk_buff *skb = NULL; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct fq *fq = &local->fq; - struct fq_tin *tin = &txqi->tin; + struct ieee80211_vif *vif; + struct txq_info *txqi; + struct ieee80211_sta *pubsta; - spin_lock_bh(&fq->lock); + if (!local->ops->wake_tx_queue || + sdata->vif.type == NL80211_IFTYPE_MONITOR) + return false; - if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) - goto out; + if (sta && sta->uploaded) + pubsta = &sta->sta; + else + pubsta = NULL; - skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); - if (!skb) - goto out; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + sdata = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); - ieee80211_set_skb_vif(skb, txqi); + vif = &sdata->vif; + txqi = ieee80211_get_txq(local, vif, pubsta, skb); - hdr = (struct ieee80211_hdr *)skb->data; - if (txq->sta && ieee80211_is_data_qos(hdr->frame_control)) { - struct sta_info *sta = 
container_of(txq->sta, struct sta_info, - sta); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + if (!txqi) + return false; - hdr->seq_ctrl = ieee80211_tx_next_seq(sta, txq->tid); - if (test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags)) - info->flags |= IEEE80211_TX_CTL_AMPDU; - else - info->flags &= ~IEEE80211_TX_CTL_AMPDU; - } + info->control.vif = vif; -out: + spin_lock_bh(&fq->lock); + ieee80211_txq_enqueue(local, txqi, skb); spin_unlock_bh(&fq->lock); - if (skb && skb_has_frag_list(skb) && - !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { - if (skb_linearize(skb)) { - ieee80211_free_txskb(&local->hw, skb); - return NULL; - } - } + drv_wake_tx_queue(local, txqi); - return skb; + return true; } -EXPORT_SYMBOL(ieee80211_tx_dequeue); static bool ieee80211_tx_frags(struct ieee80211_local *local, struct ieee80211_vif *vif, @@ -1533,9 +1544,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, bool txpending) { struct ieee80211_tx_control control = {}; - struct fq *fq = &local->fq; struct sk_buff *skb, *tmp; - struct txq_info *txqi; unsigned long flags; skb_queue_walk_safe(skbs, skb, tmp) { @@ -1550,21 +1559,6 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, } #endif - txqi = ieee80211_get_txq(local, vif, sta, skb); - if (txqi) { - info->control.vif = vif; - - __skb_unlink(skb, skbs); - - spin_lock_bh(&fq->lock); - ieee80211_txq_enqueue(local, txqi, skb); - spin_unlock_bh(&fq->lock); - - drv_wake_tx_queue(local, txqi); - - continue; - } - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || (!txpending && !skb_queue_empty(&local->pending[q]))) { @@ -1648,7 +1642,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE) { + if (sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) { vif = &sdata->vif; break; } @@ -1685,10 +1679,13 @@ static bool __ieee80211_tx(struct ieee80211_local *local, /* * Invoke TX handlers, return 0 on success and non-zero if the * frame was dropped or queued. + * + * The handlers are split into an early and late part. The latter is everything + * that can be sensitive to reordering, and will be deferred to after packets + * are dequeued from the intermediate queues (when they are enabled). */ -static int invoke_tx_handlers(struct ieee80211_tx_data *tx) +static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); ieee80211_tx_result res = TX_DROP; #define CALL_TXH(txh) \ @@ -1706,6 +1703,31 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) if (!ieee80211_hw_check(&tx->local->hw, HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); + txh_done: + if (unlikely(res == TX_DROP)) { + I802_DEBUG_INC(tx->local->tx_handlers_drop); + if (tx->skb) + ieee80211_free_txskb(&tx->local->hw, tx->skb); + else + ieee80211_purge_tx_queue(&tx->local->hw, &tx->skbs); + return -1; + } else if (unlikely(res == TX_QUEUED)) { + I802_DEBUG_INC(tx->local->tx_handlers_queued); + return -1; + } + + return 0; +} + +/* + * Late handlers can be called while the sta lock is held. Handlers that can + * cause packets to be generated will cause deadlock! 
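+ * (a generated frame would re-enter the TX path and try to take the
+ * same lock again)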
+ */ +static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + ieee80211_tx_result res = TX_CONTINUE; + if (unlikely(info->flags & IEEE80211_TX_INTFL_RETRANSMISSION)) { __skb_queue_tail(&tx->skbs, tx->skb); tx->skb = NULL; @@ -1738,6 +1760,15 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) return 0; } +static int invoke_tx_handlers(struct ieee80211_tx_data *tx) +{ + int r = invoke_tx_handlers_early(tx); + + if (r) + return r; + return invoke_tx_handlers_late(tx); +} + bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct sk_buff *skb, int band, struct ieee80211_sta **sta) @@ -1812,7 +1843,13 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; - if (!invoke_tx_handlers(&tx)) + if (invoke_tx_handlers_early(&tx)) + return false; + + if (ieee80211_queue_skb(local, sdata, tx.sta, tx.skb)) + return true; + + if (!invoke_tx_handlers_late(&tx)) result = __ieee80211_tx(local, &tx.skbs, led_len, tx.sta, txpending); @@ -2268,15 +2305,9 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_STATION: if (sdata->wdev.wiphy->flags & WIPHY_FLAG_SUPPORTS_TDLS) { sta = sta_info_get(sdata, skb->data); - if (sta) { - bool tdls_peer, tdls_auth; - - tdls_peer = test_sta_flag(sta, - WLAN_STA_TDLS_PEER); - tdls_auth = test_sta_flag(sta, - WLAN_STA_TDLS_PEER_AUTH); - - if (tdls_peer && tdls_auth) { + if (sta && test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { + if (test_sta_flag(sta, + WLAN_STA_TDLS_PEER_AUTH)) { *sta_out = sta; return 0; } @@ -2288,8 +2319,7 @@ static int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, * after a TDLS sta is removed due to being * unreachable. */ - if (tdls_peer && !tdls_auth && - !ieee80211_is_tdls_setup(skb)) + if (!ieee80211_is_tdls_setup(skb)) return -EINVAL; } @@ -2339,7 +2369,6 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct mesh_path __maybe_unused *mppath = NULL, *mpath = NULL; const u8 *encaps_data; int encaps_len, skip_header_bytes; - int nh_pos, h_pos; bool wme_sta = false, authorized = false; bool tdls_peer; bool multicast; @@ -2645,13 +2674,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, encaps_len = 0; } - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - skb_pull(skb, skip_header_bytes); - nh_pos -= skip_header_bytes; - h_pos -= skip_header_bytes; - head_need = hdrlen + encaps_len + meshhdrlen - skb_headroom(skb); /* @@ -2677,18 +2700,12 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } } - if (encaps_data) { + if (encaps_data) memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); - nh_pos += encaps_len; - h_pos += encaps_len; - } #ifdef CONFIG_MAC80211_MESH - if (meshhdrlen > 0) { + if (meshhdrlen > 0) memcpy(skb_push(skb, meshhdrlen), &mesh_hdr, meshhdrlen); - nh_pos += meshhdrlen; - h_pos += meshhdrlen; - } #endif if (ieee80211_is_data_qos(fc)) { @@ -2704,15 +2721,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } else memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); - nh_pos += hdrlen; - h_pos += hdrlen; - - /* Update skb pointers to various headers since this modified frame - * is going to go through Linux networking code that may potentially - * need things like pointer to IP header. 
*/ skb_reset_mac_header(skb); - skb_set_network_header(skb, nh_pos); - skb_set_transport_header(skb, h_pos); info = IEEE80211_SKB_CB(skb); memset(info, 0, sizeof(*info)); @@ -3184,8 +3193,71 @@ out: return ret; } +/* + * Can be called while the sta lock is held. Anything that can cause packets to + * be generated will cause deadlock! + */ +static void ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, u8 pn_offs, + struct ieee80211_key *key, + struct sk_buff *skb) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr = (void *)skb->data; + u8 tid = IEEE80211_NUM_TIDS; + + if (key) + info->control.hw_key = &key->conf; + + ieee80211_tx_stats(skb->dev, skb->len); + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + *ieee80211_get_qos_ctl(hdr) = tid; + hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); + } else { + info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; + hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); + sdata->sequence_number += 0x10; + } + + if (skb_shinfo(skb)->gso_size) + sta->tx_stats.msdu[tid] += + DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); + else + sta->tx_stats.msdu[tid]++; + + info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + + /* statistics normally done by ieee80211_tx_h_stats (but that + * has to consider fragmentation, so is more complex) + */ + sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; + sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; + + if (pn_offs) { + u64 pn; + u8 *crypto_hdr = skb->data + pn_offs; + + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + pn = atomic64_inc_return(&key->conf.tx_pn); + crypto_hdr[0] = pn; + crypto_hdr[1] = pn >> 8; + crypto_hdr[4] = pn >> 16; + crypto_hdr[5] = pn >> 24; + crypto_hdr[6] = pn >> 32; + crypto_hdr[7] = pn >> 40; + break; + } + } +} + static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, - struct net_device *dev, struct sta_info *sta, + struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, struct sk_buff *skb) { @@ -3236,8 +3308,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return true; } - ieee80211_tx_stats(dev, skb->len + extra_head); - if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) return true; @@ -3266,24 +3336,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG | (tid_tx ? 
IEEE80211_TX_CTL_AMPDU : 0); - - if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { - *ieee80211_get_qos_ctl(hdr) = tid; - if (!ieee80211_get_txq(local, &sdata->vif, &sta->sta, skb)) - hdr->seq_ctrl = ieee80211_tx_next_seq(sta, tid); - } else { - info->flags |= IEEE80211_TX_CTL_ASSIGN_SEQ; - hdr->seq_ctrl = cpu_to_le16(sdata->sequence_number); - sdata->sequence_number += 0x10; - } - - if (skb_shinfo(skb)->gso_size) - sta->tx_stats.msdu[tid] += - DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); - else - sta->tx_stats.msdu[tid]++; - - info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT; __skb_queue_head_init(&tx.skbs); @@ -3293,9 +3346,6 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.sta = sta; tx.key = fast_tx->key; - if (fast_tx->key) - info->control.hw_key = &fast_tx->key->conf; - if (!ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { tx.skb = skb; r = ieee80211_tx_h_rate_ctrl(&tx); @@ -3309,31 +3359,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, } } - /* statistics normally done by ieee80211_tx_h_stats (but that - * has to consider fragmentation, so is more complex) - */ - sta->tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->tx_stats.packets[skb_get_queue_mapping(skb)]++; - - if (fast_tx->pn_offs) { - u64 pn; - u8 *crypto_hdr = skb->data + fast_tx->pn_offs; + if (ieee80211_queue_skb(local, sdata, sta, skb)) + return true; - switch (fast_tx->key->conf.cipher) { - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_CCMP_256: - case WLAN_CIPHER_SUITE_GCMP: - case WLAN_CIPHER_SUITE_GCMP_256: - pn = atomic64_inc_return(&fast_tx->key->conf.tx_pn); - crypto_hdr[0] = pn; - crypto_hdr[1] = pn >> 8; - crypto_hdr[4] = pn >> 16; - crypto_hdr[5] = pn >> 24; - crypto_hdr[6] = pn >> 32; - crypto_hdr[7] = pn >> 40; - break; - } - } + ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, + fast_tx->key, skb); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3344,6 +3374,94 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return true; } +struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct txq_info *txqi = container_of(txq, struct txq_info, txq); + struct ieee80211_hdr *hdr; + struct sk_buff *skb = NULL; + struct fq *fq = &local->fq; + struct fq_tin *tin = &txqi->tin; + struct ieee80211_tx_info *info; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + + spin_lock_bh(&fq->lock); + + if (test_bit(IEEE80211_TXQ_STOP, &txqi->flags)) + goto out; + + /* Make sure fragments stay together. */ + skb = __skb_dequeue(&txqi->frags); + if (skb) + goto out; + +begin: + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); + if (!skb) + goto out; + + ieee80211_set_skb_vif(skb, txqi); + + hdr = (struct ieee80211_hdr *)skb->data; + info = IEEE80211_SKB_CB(skb); + + memset(&tx, 0, sizeof(tx)); + __skb_queue_head_init(&tx.skbs); + tx.local = local; + tx.skb = skb; + tx.sdata = vif_to_sdata(info->control.vif); + + if (txq->sta) + tx.sta = container_of(txq->sta, struct sta_info, sta); + + /* + * The key can be removed while the packet was queued, so need to call + * this here to get the current key. 
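+ * If no usable key is found, the frame is dropped just below and the
+ * dequeue loop restarts.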
+ */ + r = ieee80211_tx_h_select_key(&tx); + if (r != TX_CONTINUE) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + + if (info->control.flags & IEEE80211_TX_CTRL_FAST_XMIT) { + struct sta_info *sta = container_of(txq->sta, struct sta_info, + sta); + u8 pn_offs = 0; + + if (tx.key && + (tx.key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) + pn_offs = ieee80211_hdrlen(hdr->frame_control); + + ieee80211_xmit_fast_finish(sta->sdata, sta, pn_offs, + tx.key, skb); + } else { + if (invoke_tx_handlers_late(&tx)) + goto begin; + + skb = __skb_dequeue(&tx.skbs); + + if (!skb_queue_empty(&tx.skbs)) + skb_queue_splice_tail(&tx.skbs, &txqi->frags); + } + + if (skb && skb_has_frag_list(skb) && + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) { + if (skb_linearize(skb)) { + ieee80211_free_txskb(&local->hw, skb); + goto begin; + } + } + +out: + spin_unlock_bh(&fq->lock); + + return skb; +} +EXPORT_SYMBOL(ieee80211_tx_dequeue); + void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags) @@ -3368,7 +3486,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, fast_tx = rcu_dereference(sta->fast_tx); if (fast_tx && - ieee80211_xmit_fast(sdata, dev, sta, fast_tx, skb)) + ieee80211_xmit_fast(sdata, sta, fast_tx, skb)) goto out; } @@ -4395,9 +4513,6 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ac = ieee802_1d_to_ac[tid & 7]; skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb_set_queue_mapping(skb, ac); skb->priority = tid; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 42bf0b6685e8..545c79a42a77 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -598,7 +598,7 @@ static void __iterate_interfaces(struct ieee80211_local *local, list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: - if (!(sdata->u.mntr_flags & MONITOR_FLAG_ACTIVE)) + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) continue; break; case NL80211_IFTYPE_AP_VLAN: @@ -1209,7 +1209,8 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, } if (sdata->vif.type != NL80211_IFTYPE_MONITOR && - sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE) { + sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + sdata->vif.type != NL80211_IFTYPE_NAN) { sdata->vif.bss_conf.qos = enable_qos; if (bss_notify) ieee80211_bss_info_change_notify(sdata, @@ -1748,6 +1749,46 @@ static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata) mutex_unlock(&local->sta_mtx); } +static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) +{ + struct cfg80211_nan_func *func, **funcs; + int res, id, i = 0; + + res = drv_start_nan(sdata->local, sdata, + &sdata->u.nan.conf); + if (WARN_ON(res)) + return res; + + funcs = kzalloc((sdata->local->hw.max_nan_de_entries + 1) * + sizeof(*funcs), GFP_KERNEL); + if (!funcs) + return -ENOMEM; + + /* Add all the functions: + * This is a little bit ugly. We need to call a potentially sleeping + * callback for each NAN function, so we can't hold the spinlock. 
+ */ + spin_lock_bh(&sdata->u.nan.func_lock); + + idr_for_each_entry(&sdata->u.nan.function_inst_ids, func, id) + funcs[i++] = func; + + spin_unlock_bh(&sdata->u.nan.func_lock); + + for (i = 0; funcs[i]; i++) { + res = drv_add_nan_func(sdata->local, sdata, funcs[i]); + if (WARN_ON(res)) + ieee80211_nan_func_terminated(&sdata->vif, + funcs[i]->instance_id, + NL80211_NAN_FUNC_TERM_REASON_ERROR, + GFP_KERNEL); + } + + kfree(funcs); + + return 0; +} + int ieee80211_reconfig(struct ieee80211_local *local) { struct ieee80211_hw *hw = &local->hw; @@ -1971,6 +2012,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_bss_info_change_notify(sdata, changed); } break; + case NL80211_IFTYPE_NAN: + res = ieee80211_reconfig_nan(sdata); + if (res < 0) { + ieee80211_handle_reconfig_failure(local); + return res; + } + break; case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: @@ -2555,7 +2603,6 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, if (need_basic && basic_rates & BIT(i)) basic = 0x80; - rate = sband->bitrates[i].bitrate; rate = DIV_ROUND_UP(sband->bitrates[i].bitrate, 5 * (1 << shift)); *pos++ = basic | (u8) rate; @@ -3394,11 +3441,18 @@ void ieee80211_txq_get_depth(struct ieee80211_txq *txq, unsigned long *byte_cnt) { struct txq_info *txqi = to_txq_info(txq); + u32 frag_cnt = 0, frag_bytes = 0; + struct sk_buff *skb; + + skb_queue_walk(&txqi->frags, skb) { + frag_cnt++; + frag_bytes += skb->len; + } if (frame_cnt) - *frame_cnt = txqi->tin.backlog_packets; + *frame_cnt = txqi->tin.backlog_packets + frag_cnt; if (byte_cnt) - *byte_cnt = txqi->tin.backlog_bytes; + *byte_cnt = txqi->tin.backlog_bytes + frag_bytes; } EXPORT_SYMBOL(ieee80211_txq_get_depth); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 7079cd32a7ad..06019dba4b10 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -663,6 +663,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); + dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 446e1300383e..4dcf6e18563a 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -101,11 +101,16 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, sdata->dev->stats.rx_bytes += skb->len; switch (mac_cb(skb)->type) { + case IEEE802154_FC_TYPE_BEACON: + case IEEE802154_FC_TYPE_ACK: + case IEEE802154_FC_TYPE_MAC_CMD: + goto fail; + case IEEE802154_FC_TYPE_DATA: return ieee802154_deliver_skb(skb); default: - pr_warn("ieee802154: bad frame received (type = %d)\n", - mac_cb(skb)->type); + pr_warn_ratelimited("ieee802154: bad frame received " + "(type = %d)\n", mac_cb(skb)->type); goto fail; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 5c161e7759b5..0e4334cbde17 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -961,9 +961,6 @@ static void mpls_ifdown(struct net_device *dev, int event) RCU_INIT_POINTER(nh->nh_dev, NULL); } endfor_nexthops(rt); } - - - return; } static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) @@ -997,8 +994,6 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) ACCESS_ONCE(rt->rt_nhn_alive) = alive; } - - return; } static int mpls_dev_notify(struct notifier_block *this, unsigned long event, diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 
732a5c17e986..bdfef6c3271a 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -1,9 +1,6 @@ #ifndef MPLS_INTERNAL_H #define MPLS_INTERNAL_H - -struct mpls_shim_hdr { - __be32 label_stack_entry; -}; +#include <net/mpls.h> struct mpls_entry_decoded { u32 label; @@ -93,11 +90,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -{ - return (struct mpls_shim_hdr *)skb_network_header(skb); -} - static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) { struct mpls_shim_hdr result; diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 2055e57ed1c3..b4da6d8e8632 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -23,32 +23,50 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + u16 mac_offset = skb->mac_header; netdev_features_t mpls_features; + u16 mac_len = skb->mac_len; __be16 mpls_protocol; + unsigned int mpls_hlen; + + skb_reset_network_header(skb); + mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); + if (unlikely(!pskb_may_pull(skb, mpls_hlen))) + goto out; /* Setup inner SKB. */ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; - /* Push back the mac header that skb_mac_gso_segment() has pulled. - * It will be re-pulled by the call to skb_mac_gso_segment() below - */ - __skb_push(skb, skb->mac_len); + __skb_pull(skb, mpls_hlen); + + skb->mac_len = 0; + skb_reset_mac_header(skb); /* Segment inner packet. */ mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); + if (IS_ERR_OR_NULL(segs)) { + skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset, + mac_len); + goto out; + } + skb = segs; + + mpls_hlen += mac_len; + do { + skb->mac_len = mac_len; + skb->protocol = mpls_protocol; + skb_reset_inner_network_header(skb); - /* Restore outer protocol. */ - skb->protocol = mpls_protocol; + __skb_push(skb, mpls_hlen); - /* Re-pull the mac header that the call to skb_mac_gso_segment() - * above pulled. It will be re-pushed after returning - * skb_mac_gso_segment(), an indirect caller of this function. 
- */ - __skb_pull(skb, skb->data - skb_mac_header(skb)); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + } while ((skb = skb->next)); +out: return segs; } diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 644a8da6d4bd..cf52cf30ac4b 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) return en->labels * sizeof(struct mpls_shim_hdr); } -static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int mpls_xmit(struct sk_buff *skb) { struct mpls_iptunnel_encap *tun_encap_info; struct mpls_shim_hdr *hdr; @@ -90,7 +90,11 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb_cow(skb, hh_len + new_header_size)) goto drop; + skb_set_inner_protocol(skb, skb->protocol); + skb_reset_inner_network_header(skb); + skb_push(skb, new_header_size); + skb_reset_network_header(skb); skb->dev = out_dev; @@ -115,7 +119,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) net_dbg_ratelimited("%s: packet transmission failed: %d\n", __func__, err); - return 0; + return LWTUNNEL_XMIT_DONE; drop: kfree_skb(skb); @@ -153,7 +157,8 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla, if (ret) goto errout; newts->type = LWTUNNEL_ENCAP_MPLS; - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; + newts->headroom = mpls_encap_size(tun_encap_info); *ts = newts; @@ -209,7 +214,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) static const struct lwtunnel_encap_ops mpls_iptun_ops = { .build_state = mpls_build_state, - .output = mpls_output, + .xmit = mpls_xmit, .fill_encap = mpls_fill_encap_info, .get_encap_size = mpls_encap_nlsize, .cmp_encap = mpls_encap_cmp, diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 33738c060547..13290a70fa71 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -170,6 +170,7 @@ struct ncsi_package; #define NCSI_PACKAGE_SHIFT 5 #define NCSI_PACKAGE_INDEX(c) (((c) >> NCSI_PACKAGE_SHIFT) & 0x7) +#define NCSI_RESERVED_CHANNEL 0x1f #define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) #define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) @@ -186,9 +187,15 @@ struct ncsi_channel { struct ncsi_channel_mode modes[NCSI_MODE_MAX]; struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; struct ncsi_channel_stats stats; - struct timer_list timer; /* Link monitor timer */ - bool enabled; /* Timer is enabled */ - unsigned int timeout; /* Times of timeout */ + struct { + struct timer_list timer; + bool enabled; + unsigned int state; +#define NCSI_CHANNEL_MONITOR_START 0 +#define NCSI_CHANNEL_MONITOR_RETRY 1 +#define NCSI_CHANNEL_MONITOR_WAIT 2 +#define NCSI_CHANNEL_MONITOR_WAIT_MAX 5 + } monitor; struct list_head node; struct list_head link; }; @@ -206,7 +213,8 @@ struct ncsi_package { struct ncsi_request { unsigned char id; /* Request ID - 0 to 255 */ bool used; /* Request that has been assigned */ - bool driven; /* Drive state machine */ + unsigned int flags; /* NCSI request property */ +#define NCSI_REQ_FLAG_EVENT_DRIVEN 1 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ struct sk_buff *cmd; /* Associated NCSI command packet */ struct sk_buff *rsp; /* Associated NCSI response packet */ @@ -258,6 +266,7 @@ struct ncsi_dev_priv { struct list_head packages; /* List of packages */ struct ncsi_request requests[256]; /* Request 
table */ unsigned int request_id; /* Last used request ID */ +#define NCSI_REQ_START_IDX 1 unsigned int pending_req_num; /* Number of pending requests */ struct ncsi_package *active_package; /* Currently handled package */ struct ncsi_channel *active_channel; /* Currently handled channel */ @@ -274,7 +283,7 @@ struct ncsi_cmd_arg { unsigned char package; /* Destination package ID */ unsigned char channel; /* Detination channel ID or 0x1f */ unsigned short payload; /* Command packet payload length */ - bool driven; /* Drive the state machine? */ + unsigned int req_flags; /* NCSI request properties */ union { unsigned char bytes[16]; /* Command packet specific data */ unsigned short words[8]; @@ -313,7 +322,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, unsigned char id, struct ncsi_package **np, struct ncsi_channel **nc); -struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven); +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, + unsigned int req_flags); void ncsi_free_request(struct ncsi_request *nr); struct ncsi_dev *ncsi_find_dev(struct net_device *dev); int ncsi_process_next_channel(struct ncsi_dev_priv *ndp); diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index d463468442ae..b41a6617d498 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -53,7 +53,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, struct ncsi_aen_lsc_pkt *lsc; struct ncsi_channel *nc; struct ncsi_channel_mode *ncm; - unsigned long old_data; + bool chained; + int state; + unsigned long old_data, data; unsigned long flags; /* Find the NCSI channel */ @@ -62,20 +64,27 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, return -ENODEV; /* Update the link status */ - ncm = &nc->modes[NCSI_MODE_LINK]; lsc = (struct ncsi_aen_lsc_pkt *)h; + + spin_lock_irqsave(&nc->lock, flags); + ncm = &nc->modes[NCSI_MODE_LINK]; old_data = ncm->data[2]; - ncm->data[2] = ntohl(lsc->status); + data = ntohl(lsc->status); + ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); - if (!((old_data ^ ncm->data[2]) & 0x1) || - !list_empty(&nc->link)) + + chained = !list_empty(&nc->link); + state = nc->state; + spin_unlock_irqrestore(&nc->lock, flags); + + if (!((old_data ^ data) & 0x1) || chained) return 0; - if (!(nc->state == NCSI_CHANNEL_INACTIVE && (ncm->data[2] & 0x1)) && - !(nc->state == NCSI_CHANNEL_ACTIVE && !(ncm->data[2] & 0x1))) + if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) && + !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1))) return 0; if (!(ndp->flags & NCSI_DEV_HWA) && - nc->state == NCSI_CHANNEL_ACTIVE) + state == NCSI_CHANNEL_ACTIVE) ndp->flags |= NCSI_DEV_RESHUFFLE; ncsi_stop_channel_monitor(nc); @@ -97,13 +106,21 @@ static int ncsi_aen_handler_cr(struct ncsi_dev_priv *ndp, if (!nc) return -ENODEV; + spin_lock_irqsave(&nc->lock, flags); if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_ACTIVE) + nc->state != NCSI_CHANNEL_ACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); return 0; + } + spin_unlock_irqrestore(&nc->lock, flags); ncsi_stop_channel_monitor(nc); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + spin_lock_irqsave(&ndp->lock, flags); - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 21057a8ceeac..db7083bfd476 100644 --- 
a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -272,7 +272,7 @@ static struct ncsi_request *ncsi_alloc_command(struct ncsi_cmd_arg *nca) struct sk_buff *skb; struct ncsi_request *nr; - nr = ncsi_alloc_request(ndp, nca->driven); + nr = ncsi_alloc_request(ndp, nca->req_flags); if (!nr) return NULL; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index ef017b871857..5e509e547c2d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -132,6 +132,7 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; struct ncsi_channel *nc; + unsigned long flags; nd->state = ncsi_dev_state_functional; if (force_down) { @@ -142,14 +143,21 @@ static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) nd->link_up = 0; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_ACTIVE) + nc->state != NCSI_CHANNEL_ACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); continue; + } if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { + spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; } + + spin_unlock_irqrestore(&nc->lock, flags); } } @@ -163,43 +171,55 @@ static void ncsi_channel_monitor(unsigned long data) struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_cmd_arg nca; - bool enabled; - unsigned int timeout; + bool enabled, chained; + unsigned int monitor_state; unsigned long flags; - int ret; + int state, ret; spin_lock_irqsave(&nc->lock, flags); - timeout = nc->timeout; - enabled = nc->enabled; + state = nc->state; + chained = !list_empty(&nc->link); + enabled = nc->monitor.enabled; + monitor_state = nc->monitor.state; spin_unlock_irqrestore(&nc->lock, flags); - if (!enabled || !list_empty(&nc->link)) + if (!enabled || chained) return; - if (nc->state != NCSI_CHANNEL_INACTIVE && - nc->state != NCSI_CHANNEL_ACTIVE) + if (state != NCSI_CHANNEL_INACTIVE && + state != NCSI_CHANNEL_ACTIVE) return; - if (!(timeout % 2)) { + switch (monitor_state) { + case NCSI_CHANNEL_MONITOR_START: + case NCSI_CHANNEL_MONITOR_RETRY: nca.ndp = ndp; nca.package = np->id; nca.channel = nc->id; nca.type = NCSI_PKT_CMD_GLS; - nca.driven = false; + nca.req_flags = 0; ret = ncsi_xmit_cmd(&nca); if (ret) { netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", ret); return; } - } - if (timeout + 1 >= 3) { + break; + case NCSI_CHANNEL_MONITOR_WAIT ... 
NCSI_CHANNEL_MONITOR_WAIT_MAX: + break; + default: if (!(ndp->flags & NCSI_DEV_HWA) && - nc->state == NCSI_CHANNEL_ACTIVE) + state == NCSI_CHANNEL_ACTIVE) { ncsi_report_link(ndp, true); + ndp->flags |= NCSI_DEV_RESHUFFLE; + } + + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); spin_lock_irqsave(&ndp->lock, flags); - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); ncsi_process_next_channel(ndp); @@ -207,10 +227,9 @@ static void ncsi_channel_monitor(unsigned long data) } spin_lock_irqsave(&nc->lock, flags); - nc->timeout = timeout + 1; - nc->enabled = true; + nc->monitor.state++; spin_unlock_irqrestore(&nc->lock, flags); - mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); + mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_start_channel_monitor(struct ncsi_channel *nc) @@ -218,12 +237,12 @@ void ncsi_start_channel_monitor(struct ncsi_channel *nc) unsigned long flags; spin_lock_irqsave(&nc->lock, flags); - WARN_ON_ONCE(nc->enabled); - nc->timeout = 0; - nc->enabled = true; + WARN_ON_ONCE(nc->monitor.enabled); + nc->monitor.enabled = true; + nc->monitor.state = NCSI_CHANNEL_MONITOR_START; spin_unlock_irqrestore(&nc->lock, flags); - mod_timer(&nc->timer, jiffies + HZ * (1 << (nc->timeout / 2))); + mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_stop_channel_monitor(struct ncsi_channel *nc) @@ -231,14 +250,14 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc) unsigned long flags; spin_lock_irqsave(&nc->lock, flags); - if (!nc->enabled) { + if (!nc->monitor.enabled) { spin_unlock_irqrestore(&nc->lock, flags); return; } - nc->enabled = false; + nc->monitor.enabled = false; spin_unlock_irqrestore(&nc->lock, flags); - del_timer_sync(&nc->timer); + del_timer_sync(&nc->monitor.timer); } struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -267,8 +286,9 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) nc->id = id; nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; - nc->enabled = false; - setup_timer(&nc->timer, ncsi_channel_monitor, (unsigned long)nc); + nc->monitor.enabled = false; + setup_timer(&nc->monitor.timer, + ncsi_channel_monitor, (unsigned long)nc); spin_lock_init(&nc->lock); INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) @@ -405,7 +425,8 @@ void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, * be same. Otherwise, the bogus response might be replied. So * the available IDs are allocated in round-robin fashion. 
*/ -struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) +struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, + unsigned int req_flags) { struct ncsi_request *nr = NULL; int i, limit = ARRAY_SIZE(ndp->requests); @@ -413,30 +434,31 @@ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, bool driven) /* Check if there is one available request until the ceiling */ spin_lock_irqsave(&ndp->lock, flags); - for (i = ndp->request_id; !nr && i < limit; i++) { + for (i = ndp->request_id; i < limit; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; - nr->driven = driven; - if (++ndp->request_id >= limit) - ndp->request_id = 0; + nr->flags = req_flags; + ndp->request_id = i + 1; + goto found; } /* Fail back to check from the starting cursor */ - for (i = 0; !nr && i < ndp->request_id; i++) { + for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; - nr->driven = driven; - if (++ndp->request_id >= limit) - ndp->request_id = 0; + nr->flags = req_flags; + ndp->request_id = i + 1; + goto found; } - spin_unlock_irqrestore(&ndp->lock, flags); +found: + spin_unlock_irqrestore(&ndp->lock, flags); return nr; } @@ -458,7 +480,7 @@ void ncsi_free_request(struct ncsi_request *nr) nr->cmd = NULL; nr->rsp = NULL; nr->used = false; - driven = nr->driven; + driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN); spin_unlock_irqrestore(&ndp->lock, flags); if (driven && cmd && --ndp->pending_req_num == 0) @@ -508,10 +530,11 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_cmd_arg nca; + unsigned long flags; int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_suspend: nd->state = ncsi_dev_state_suspend_select; @@ -527,7 +550,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) nca.package = np->id; if (nd->state == ncsi_dev_state_suspend_select) { nca.type = NCSI_PKT_CMD_SP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; if (ndp->flags & NCSI_DEV_HWA) nca.bytes[0] = 0; else @@ -544,7 +567,7 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_suspend_deselect; } else if (nd->state == ncsi_dev_state_suspend_deselect) { nca.type = NCSI_PKT_CMD_DP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; nd->state = ncsi_dev_state_suspend_done; } @@ -556,7 +579,9 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) break; case ncsi_dev_state_suspend_done: - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + spin_lock_irqsave(&nc->lock, flags); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); ncsi_process_next_channel(ndp); break; @@ -574,10 +599,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) struct ncsi_channel *nc = ndp->active_channel; struct ncsi_cmd_arg nca; unsigned char index; + unsigned long flags; int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_config: case ncsi_dev_state_config_sp: @@ -590,7 +616,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) else nca.bytes[0] = 1; nca.package = np->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; @@ -675,10 +701,12 @@ static void 
ncsi_configure_channel(struct ncsi_dev_priv *ndp) goto error; break; case ncsi_dev_state_config_done: + spin_lock_irqsave(&nc->lock, flags); if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) - xchg(&nc->state, NCSI_CHANNEL_ACTIVE); + nc->state = NCSI_CHANNEL_ACTIVE; else - xchg(&nc->state, NCSI_CHANNEL_INACTIVE); + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); ncsi_start_channel_monitor(nc); ncsi_process_next_channel(ndp); @@ -707,18 +735,25 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) found = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { + spin_lock_irqsave(&nc->lock, flags); + if (!list_empty(&nc->link) || - nc->state != NCSI_CHANNEL_INACTIVE) + nc->state != NCSI_CHANNEL_INACTIVE) { + spin_unlock_irqrestore(&nc->lock, flags); continue; + } if (!found) found = nc; ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { + spin_unlock_irqrestore(&nc->lock, flags); found = nc; goto out; } + + spin_unlock_irqrestore(&nc->lock, flags); } } @@ -797,7 +832,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) int ret; nca.ndp = ndp; - nca.driven = true; + nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_probe: nd->state = ncsi_dev_state_probe_deselect; @@ -807,7 +842,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Deselect all possible packages */ nca.type = NCSI_PKT_CMD_DP; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; for (index = 0; index < 8; index++) { nca.package = index; ret = ncsi_xmit_cmd(&nca); @@ -823,7 +858,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Select all possible packages */ nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; for (index = 0; index < 8; index++) { nca.package = index; ret = ncsi_xmit_cmd(&nca); @@ -876,7 +911,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; nca.package = ndp->active_package->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; @@ -884,12 +919,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_cis; break; case ncsi_dev_state_probe_cis: - ndp->pending_req_num = 32; + ndp->pending_req_num = NCSI_RESERVED_CHANNEL; /* Clear initial state */ nca.type = NCSI_PKT_CMD_CIS; nca.package = ndp->active_package->id; - for (index = 0; index < 0x20; index++) { + for (index = 0; index < NCSI_RESERVED_CHANNEL; index++) { nca.channel = index; ret = ncsi_xmit_cmd(&nca); if (ret) @@ -933,7 +968,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) /* Deselect the active package */ nca.type = NCSI_PKT_CMD_DP; nca.package = ndp->active_package->id; - nca.channel = 0x1f; + nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; @@ -987,11 +1022,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) goto out; } - old_state = xchg(&nc->state, NCSI_CHANNEL_INVISIBLE); list_del_init(&nc->link); - spin_unlock_irqrestore(&ndp->lock, flags); + spin_lock_irqsave(&nc->lock, flags); + old_state = nc->state; + nc->state = NCSI_CHANNEL_INVISIBLE; + spin_unlock_irqrestore(&nc->lock, flags); + ndp->active_channel = nc; ndp->active_package = nc->package; @@ -1006,7 +1044,7 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) break; default: netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", - nc->state, nc->package->id, nc->id); 
+ old_state, nc->package->id, nc->id); ncsi_report_link(ndp, false); return -EINVAL; } @@ -1070,7 +1108,7 @@ static int ncsi_inet6addr_event(struct notifier_block *this, return NOTIFY_OK; nca.ndp = ndp; - nca.driven = false; + nca.req_flags = 0; nca.package = np->id; nca.channel = nc->id; nca.dwords[0] = nc->caps[NCSI_CAP_MC].cap; @@ -1118,7 +1156,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); INIT_LIST_HEAD(&ndp->packages); - ndp->request_id = 0; + ndp->request_id = NCSI_REQ_START_IDX; for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { ndp->requests[i].id = i; ndp->requests[i].ndp = ndp; @@ -1149,9 +1187,7 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); - struct ncsi_package *np; - struct ncsi_channel *nc; - int old_state, ret; + int ret; if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) @@ -1163,15 +1199,6 @@ int ncsi_start_dev(struct ncsi_dev *nd) return 0; } - /* Reset channel's state and start over */ - NCSI_FOR_EACH_PACKAGE(ndp, np) { - NCSI_FOR_EACH_CHANNEL(np, nc) { - old_state = xchg(&nc->state, NCSI_CHANNEL_INACTIVE); - WARN_ON_ONCE(!list_empty(&nc->link) || - old_state == NCSI_CHANNEL_INVISIBLE); - } - } - if (ndp->flags & NCSI_DEV_HWA) ret = ncsi_enable_hwa(ndp); else @@ -1181,6 +1208,35 @@ int ncsi_start_dev(struct ncsi_dev *nd) } EXPORT_SYMBOL_GPL(ncsi_start_dev); +void ncsi_stop_dev(struct ncsi_dev *nd) +{ + struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); + struct ncsi_package *np; + struct ncsi_channel *nc; + bool chained; + int old_state; + unsigned long flags; + + /* Stop the channel monitor and reset channel's state */ + NCSI_FOR_EACH_PACKAGE(ndp, np) { + NCSI_FOR_EACH_CHANNEL(np, nc) { + ncsi_stop_channel_monitor(nc); + + spin_lock_irqsave(&nc->lock, flags); + chained = !list_empty(&nc->link); + old_state = nc->state; + nc->state = NCSI_CHANNEL_INACTIVE; + spin_unlock_irqrestore(&nc->lock, flags); + + WARN_ON_ONCE(chained || + old_state == NCSI_CHANNEL_INVISIBLE); + } + } + + ncsi_report_link(ndp, true); +} +EXPORT_SYMBOL_GPL(ncsi_stop_dev); + void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index af84389a6bf1..087db775b3dc 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -317,12 +317,12 @@ static int ncsi_rsp_handler_gls(struct ncsi_request *nr) ncm->data[3] = ntohl(rsp->other); ncm->data[4] = ntohl(rsp->oem_status); - if (nr->driven) + if (nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN) return 0; /* Reset the channel monitor if it has been enabled */ spin_lock_irqsave(&nc->lock, flags); - nc->timeout = 0; + nc->monitor.state = NCSI_CHANNEL_MONITOR_START; spin_unlock_irqrestore(&nc->lock, flags); return 0; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9266ceebd112..e8d56d9a4df2 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -474,6 +474,12 @@ config NFT_META This option adds the "meta" expression that you can use to match and to set packet metainformation such as the packet mark. +config NFT_NUMGEN + tristate "Netfilter nf_tables number generator module" + help + This option adds the number generator expression used to perform + incremental counting and random numbers bound to a upper limit. 
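The help text above names the two modes the new numgen expression provides. A small standalone C model of those semantics, with invented names (the real expression operates on nft registers rather than printing), assuming modulus > 0:

        #include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>

        /* "inc" mode: successive evaluations count 0, 1, ..., modulus - 1, 0, ... */
        static uint32_t ng_inc(uint32_t *state, uint32_t modulus)
        {
                uint32_t v = *state % modulus;

                *state = v + 1;
                return v;
        }

        /* "random" mode: a uniform value below modulus, scaled from rand(). */
        static uint32_t ng_random(uint32_t modulus)
        {
                return (uint32_t)(((uint64_t)rand() * modulus) /
                                  ((uint64_t)RAND_MAX + 1));
        }

        int main(void)
        {
                uint32_t state = 0;
                int i;

                for (i = 0; i < 6; i++)
                        printf("inc=%u random=%u\n",
                               ng_inc(&state, 3), ng_random(3));
                return 0;
        }

The incremental mode is effectively a round-robin counter, which is what makes the expression useful for spreading packets evenly across a fixed number of queues, chains or map buckets.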
+ config NFT_CT depends on NF_CONNTRACK tristate "Netfilter nf_tables conntrack module" @@ -481,13 +487,13 @@ config NFT_CT This option adds the "meta" expression that you can use to match connection tracking information such as the flow state. -config NFT_RBTREE +config NFT_SET_RBTREE tristate "Netfilter nf_tables rbtree set module" help This option adds the "rbtree" set type (Red Black tree) that is used to build interval-based sets. -config NFT_HASH +config NFT_SET_HASH tristate "Netfilter nf_tables hash set module" help This option adds the "hash" set type that is used to build one-way @@ -542,6 +548,12 @@ config NFT_QUEUE This is required if you intend to use the userspace queueing infrastructure (also known as NFQUEUE) from nftables. +config NFT_QUOTA + tristate "Netfilter nf_tables quota module" + help + This option adds the "quota" expression that you can use to match + enforce bytes quotas. + config NFT_REJECT default m if NETFILTER_ADVANCED=n tristate "Netfilter nf_tables reject support" @@ -563,6 +575,12 @@ config NFT_COMPAT x_tables match/target extensions over the nf_tables framework. +config NFT_HASH + tristate "Netfilter nf_tables hash module" + help + This option adds the "hash" expression that you can use to perform + a hash operation on registers. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 69134541d65b..c23c3c84416f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -71,8 +71,9 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o -nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o +nf_tables-objs += nft_immediate.o nft_cmp.o nft_range.o nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o +nf_tables-objs += nft_lookup.o nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o @@ -80,18 +81,21 @@ obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o +obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o +obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o -obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o -obj-$(CONFIG_NFT_HASH) += nft_hash.o +obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o +obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o obj-$(CONFIG_NFT_REDIR) += nft_redir.o +obj-$(CONFIG_NFT_HASH) += nft_hash.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index f39276d1c2d7..fa6715db4581 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -22,6 +22,7 @@ #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/slab.h> +#include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -61,33 +62,55 @@ EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); +#define nf_entry_dereference(e) \ + rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) -static struct list_head *nf_find_hook_list(struct net *net, - const struct nf_hook_ops *reg) +static struct nf_hook_entry 
*nf_hook_entry_head(struct net *net, + const struct nf_hook_ops *reg) { - struct list_head *hook_list = NULL; + struct nf_hook_entry *hook_head = NULL; if (reg->pf != NFPROTO_NETDEV) - hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_head = nf_entry_dereference(net->nf.hooks[reg->pf] + [reg->hooknum]); else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - hook_list = ®->dev->nf_hooks_ingress; + hook_head = + nf_entry_dereference( + reg->dev->nf_hooks_ingress); #endif } - return hook_list; + return hook_head; } -struct nf_hook_entry { - const struct nf_hook_ops *orig_ops; - struct nf_hook_ops ops; -}; +/* must hold nf_hook_mutex */ +static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg, + struct nf_hook_entry *entry) +{ + switch (reg->pf) { + case NFPROTO_NETDEV: + /* We already checked in nf_register_net_hook() that this is + * used from ingress. + */ + rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry); + break; + default: + rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], + entry); + break; + } +} int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; + struct nf_hook_entry *hooks_entry; struct nf_hook_entry *entry; - struct nf_hook_ops *elem; + + if (reg->pf == NFPROTO_NETDEV && + (reg->hooknum != NF_NETDEV_INGRESS || + !reg->dev || dev_net(reg->dev) != net)) + return -EINVAL; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) @@ -95,19 +118,30 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; + entry->next = NULL; + + mutex_lock(&nf_hook_mutex); + hooks_entry = nf_hook_entry_head(net, reg); - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) { - kfree(entry); - return -ENOENT; + if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) { + /* This is the case where we need to insert at the head */ + entry->next = hooks_entry; + hooks_entry = NULL; } - mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - if (reg->priority < elem->priority) - break; + while (hooks_entry && + reg->priority >= hooks_entry->orig_ops->priority && + nf_entry_dereference(hooks_entry->next)) { + hooks_entry = nf_entry_dereference(hooks_entry->next); + } + + if (hooks_entry) { + entry->next = nf_entry_dereference(hooks_entry->next); + rcu_assign_pointer(hooks_entry->next, entry); + } else { + nf_set_hooks_head(net, reg, entry); } - list_add_rcu(&entry->ops.list, elem->list.prev); + mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -122,24 +156,33 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *hook_list; - struct nf_hook_entry *entry; - struct nf_hook_ops *elem; - - hook_list = nf_find_hook_list(net, reg); - if (!hook_list) - return; + struct nf_hook_entry *hooks_entry; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, hook_list, list) { - entry = container_of(elem, struct nf_hook_entry, ops); - if (entry->orig_ops == reg) { - list_del_rcu(&entry->ops.list); - break; + hooks_entry = nf_hook_entry_head(net, reg); + if (hooks_entry->orig_ops == reg) { + nf_set_hooks_head(net, reg, + nf_entry_dereference(hooks_entry->next)); + goto unlock; + } + while (hooks_entry && nf_entry_dereference(hooks_entry->next)) { + struct nf_hook_entry *next = + 
nf_entry_dereference(hooks_entry->next); + struct nf_hook_entry *nnext; + + if (next->orig_ops != reg) { + hooks_entry = next; + continue; } + nnext = nf_entry_dereference(next->next); + rcu_assign_pointer(hooks_entry->next, nnext); + hooks_entry = next; + break; } + +unlock: mutex_unlock(&nf_hook_mutex); - if (&elem->list == hook_list) { + if (!hooks_entry) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } @@ -151,10 +194,10 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, &entry->ops); + nf_queue_nf_hook_drop(net, hooks_entry); /* other cpu might still process nfqueue verdict that used reg */ synchronize_net(); - kfree(entry); + kfree(hooks_entry); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -188,19 +231,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks); static LIST_HEAD(nf_hook_list); -int nf_register_hook(struct nf_hook_ops *reg) +static int _nf_register_hook(struct nf_hook_ops *reg) { struct net *net, *last; int ret; - rtnl_lock(); for_each_net(net) { ret = nf_register_net_hook(net, reg); if (ret && ret != -ENOENT) goto rollback; } list_add_tail(®->list, &nf_hook_list); - rtnl_unlock(); return 0; rollback: @@ -210,19 +251,34 @@ rollback: break; nf_unregister_net_hook(net, reg); } + return ret; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + int ret; + + rtnl_lock(); + ret = _nf_register_hook(reg); rtnl_unlock(); + return ret; } EXPORT_SYMBOL(nf_register_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +static void _nf_unregister_hook(struct nf_hook_ops *reg) { struct net *net; - rtnl_lock(); list_del(®->list); for_each_net(net) nf_unregister_net_hook(net, reg); +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + rtnl_lock(); + _nf_unregister_hook(reg); rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -246,6 +302,26 @@ err: } EXPORT_SYMBOL(nf_register_hooks); +/* Caller MUST take rtnl_lock() */ +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = _nf_register_hook(®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + _nf_unregister_hooks(reg, i); + return err; +} +EXPORT_SYMBOL(_nf_register_hooks); + void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { while (n-- > 0) @@ -253,10 +329,17 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) } EXPORT_SYMBOL(nf_unregister_hooks); -unsigned int nf_iterate(struct list_head *head, - struct sk_buff *skb, +/* Caller MUST take rtnl_lock */ +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) +{ + while (n-- > 0) + _nf_unregister_hook(®[n]); +} +EXPORT_SYMBOL(_nf_unregister_hooks); + +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, - struct nf_hook_ops **elemp) + struct nf_hook_entry **entryp) { unsigned int verdict; @@ -264,20 +347,23 @@ unsigned int nf_iterate(struct list_head *head, * The caller must not block between calls to this * function because of risk of continuing from deleted element. */ - list_for_each_entry_continue_rcu((*elemp), head, list) { - if (state->thresh > (*elemp)->priority) + while (*entryp) { + if (state->thresh > (*entryp)->ops.priority) { + *entryp = rcu_dereference((*entryp)->next); continue; + } /* Optimization: we don't need to hold module reference here, since function can't sleep. 
--RR */ repeat: - verdict = (*elemp)->hook((*elemp)->priv, skb, state); + verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); if (verdict != NF_ACCEPT) { #ifdef CONFIG_NETFILTER_DEBUG if (unlikely((verdict & NF_VERDICT_MASK) > NF_MAX_VERDICT)) { NFDEBUG("Evil return from %p(%u).\n", - (*elemp)->hook, state->hook); + (*entryp)->ops.hook, state->hook); + *entryp = rcu_dereference((*entryp)->next); continue; } #endif @@ -285,25 +371,23 @@ repeat: return verdict; goto repeat; } + *entryp = rcu_dereference((*entryp)->next); } return NF_ACCEPT; } /* Returns 1 if okfn() needs to be executed by the caller, - * -EPERM for NF_DROP, 0 otherwise. */ + * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) { - struct nf_hook_ops *elem; + struct nf_hook_entry *entry; unsigned int verdict; int ret = 0; - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); + entry = rcu_dereference(state->hook_entries); next_hook: - verdict = nf_iterate(state->hook_list, skb, state, &elem); + verdict = nf_iterate(skb, state, &entry); if (verdict == NF_ACCEPT || verdict == NF_STOP) { ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { @@ -312,8 +396,10 @@ next_hook: if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - int err = nf_queue(skb, elem, state, - verdict >> NF_VERDICT_QBITS); + int err; + + RCU_INIT_POINTER(state->hook_entries, entry); + err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) @@ -321,7 +407,6 @@ next_hook: kfree_skb(skb); } } - rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_hook_slow); @@ -441,7 +526,7 @@ static int __net_init netfilter_net_init(struct net *net) for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&net->nf.hooks[i][h]); + RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); } #ifdef CONFIG_PROC_FS diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index f04fd8df210b..fc230d99aa3b 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -281,13 +281,10 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); - /* Show what happens instead of calling nf_ct_kill() */ - if (del_timer(&ct->timeout)) { - IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + if (nf_ct_kill(ct)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&tuple)); - if (ct->timeout.function) - ct->timeout.function(ct->timeout.data); } else { IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" FMT_TUPLE "\n", diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9934b0c93c1e..ba6a1d421222 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -72,12 +72,24 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); struct hlist_nulls_head *nf_conntrack_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash); +struct conntrack_gc_work { + struct delayed_work dwork; + u32 last_bucket; + bool exiting; +}; + static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; -static __read_mostly seqcount_t nf_conntrack_generation; 
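Several hunks below replace open-coded read_seqcount_begin()/read_seqcount_retry() loops with calls to nf_conntrack_get_ht(), which snapshots the hash table pointer and its size as one consistent pair even while a resize swaps both. A minimal userspace model of that seqlock read side, assuming C11 atomics and deliberately omitting the memory barriers that the kernel's seqcount_t supplies:

        #include <stdatomic.h>
        #include <stdio.h>

        struct bucket { int dummy; };

        static _Atomic unsigned int generation; /* even = stable, odd = resize */
        static struct bucket *table;
        static unsigned int table_size = 16;

        /* Read side, mirroring nf_conntrack_get_ht(): retry until the
         * (pointer, size) snapshot was not torn by a concurrent resize. */
        static void get_ht(struct bucket **tbl, unsigned int *size)
        {
                unsigned int seq;

                do {
                        while ((seq = atomic_load(&generation)) & 1)
                                ;       /* writer active, spin */
                        *tbl = table;
                        *size = table_size;
                } while (atomic_load(&generation) != seq);
        }

        /* Write side: bump to odd, swap both fields, bump back to even. */
        static void resize(struct bucket *new_tbl, unsigned int new_size)
        {
                atomic_fetch_add(&generation, 1);
                table = new_tbl;
                table_size = new_size;
                atomic_fetch_add(&generation, 1);
        }

        int main(void)
        {
                struct bucket small[16], big[64];
                struct bucket *t;
                unsigned int n;

                table = small;
                get_ht(&t, &n);
                printf("size=%u\n", n);
                resize(big, 64);
                get_ht(&t, &n);
                printf("size=%u\n", n);
                return 0;
        }

Readers never block writers here; they simply retry the cheap snapshot, which is why the conntrack lookup paths can keep running under nothing stronger than rcu_read_lock().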
static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +#define GC_MAX_BUCKETS_DIV 64u +#define GC_MAX_BUCKETS 8192u +#define GC_INTERVAL (5 * HZ) +#define GC_MAX_EVICTS 256u + +static struct conntrack_gc_work conntrack_gc_work; + void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { spin_lock(lock); @@ -164,7 +176,7 @@ unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_max); +seqcount_t nf_conntrack_generation __read_mostly; DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); @@ -367,12 +379,10 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); NF_CT_ASSERT(atomic_read(&nfct->use) == 0); - NF_CT_ASSERT(!timer_pending(&ct->timeout)); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -395,7 +405,6 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_ct_del_from_dying_or_unconfirmed_list(ct); - NF_CT_STAT_INC(net, delete); local_bh_enable(); if (ct->master) @@ -427,7 +436,6 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) nf_ct_add_to_dying_list(ct); - NF_CT_STAT_INC(net, delete_list); local_bh_enable(); } @@ -435,35 +443,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) + return false; + tstamp = nf_conn_tstamp_find(ct); if (tstamp && tstamp->stop == 0) tstamp->stop = ktime_get_real_ns(); - if (nf_ct_is_dying(ct)) - goto delete; - if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) { - /* destroy event was not delivered */ + /* destroy event was not delivered. nf_ct_put will + * be done by event cache worker on redelivery. 
+ */ nf_ct_delete_from_lists(ct); nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); return false; } nf_conntrack_ecache_work(nf_ct_net(ct)); - set_bit(IPS_DYING_BIT, &ct->status); - delete: nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; } EXPORT_SYMBOL_GPL(nf_ct_delete); -static void death_by_timeout(unsigned long ul_conntrack) -{ - nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0); -} - static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, @@ -481,22 +484,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, net_eq(net, nf_ct_net(ct)); } -/* must be called with rcu read lock held */ -void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) +/* caller must hold rcu readlock and none of the nf_conntrack_locks */ +static void nf_ct_gc_expired(struct nf_conn *ct) { - struct hlist_nulls_head *hptr; - unsigned int sequence, hsz; + if (!atomic_inc_not_zero(&ct->ct_general.use)) + return; - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hsz = nf_conntrack_htable_size; - hptr = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + if (nf_ct_should_gc(ct)) + nf_ct_kill(ct); - *hash = hptr; - *hsize = hsz; + nf_ct_put(ct); } -EXPORT_SYMBOL_GPL(nf_conntrack_get_ht); /* * Warning : @@ -510,21 +508,26 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket, sequence; + unsigned int bucket, hsize; begin: - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - bucket = scale_hash(hash); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + bucket = reciprocal_scale(hash, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone, net)) { - NF_CT_STAT_INC_ATOMIC(net, found); - return h; + struct nf_conn *ct; + + ct = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; } - NF_CT_STAT_INC_ATOMIC(net, searched); + + if (nf_ct_is_dying(ct)) + continue; + + if (nf_ct_key_equal(h, tuple, zone, net)) + return h; } /* * if the nulls value we got at the end of this lookup is @@ -618,7 +621,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone, net)) goto out; - add_timer(&ct->timeout); smp_wmb(); /* The caller holds a reference to this object */ atomic_set(&ct->ct_general.use, 2); @@ -771,8 +773,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
*/ - ct->timeout.expires += jiffies; - add_timer(&ct->timeout); + ct->timeout += nfct_time_stamp; atomic_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; @@ -791,7 +792,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert); local_bh_enable(); help = nfct_help(ct); @@ -823,29 +823,40 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; - unsigned int hash, sequence; + unsigned int hash, hsize; struct hlist_nulls_node *n; struct nf_conn *ct; zone = nf_ct_zone(ignored_conntrack); rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = hash_conntrack(net, tuple); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + begin: + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = __hash_conntrack(net, tuple, hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); - if (ct != ignored_conntrack && - nf_ct_key_equal(h, tuple, zone, net)) { + + if (ct == ignored_conntrack) + continue; + + if (nf_ct_is_expired(ct)) { + nf_ct_gc_expired(ct); + continue; + } + + if (nf_ct_key_equal(h, tuple, zone, net)) { NF_CT_STAT_INC_ATOMIC(net, found); rcu_read_unlock(); return 1; } - NF_CT_STAT_INC_ATOMIC(net, searched); } + + if (get_nulls_value(n) != hash) { + NF_CT_STAT_INC_ATOMIC(net, search_restart); + goto begin; + } + rcu_read_unlock(); return 0; @@ -867,6 +878,11 @@ static unsigned int early_drop_list(struct net *net, hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(tmp)) { + nf_ct_gc_expired(tmp); + continue; + } + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || !net_eq(nf_ct_net(tmp), net) || nf_ct_is_dying(tmp)) @@ -884,7 +900,6 @@ static unsigned int early_drop_list(struct net *net, */ if (net_eq(nf_ct_net(tmp), net) && nf_ct_is_confirmed(tmp) && - del_timer(&tmp->timeout) && nf_ct_delete(tmp, 0, 0)) drops++; @@ -900,14 +915,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash) for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { struct hlist_nulls_head *ct_hash; - unsigned hash, sequence, drops; + unsigned int hash, hsize, drops; rcu_read_lock(); - do { - sequence = read_seqcount_begin(&nf_conntrack_generation); - hash = scale_hash(_hash++); - ct_hash = nf_conntrack_hash; - } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + nf_conntrack_get_ht(&ct_hash, &hsize); + hash = reciprocal_scale(_hash++, hsize); drops = early_drop_list(net, &ct_hash[hash]); rcu_read_unlock(); @@ -921,6 +933,69 @@ static noinline int early_drop(struct net *net, unsigned int _hash) return false; } +static void gc_worker(struct work_struct *work) +{ + unsigned int i, goal, buckets = 0, expired_count = 0; + unsigned long next_run = GC_INTERVAL; + unsigned int ratio, scanned = 0; + struct conntrack_gc_work *gc_work; + + gc_work = container_of(work, struct conntrack_gc_work, dwork.work); + + goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS); + i = gc_work->last_bucket; + + do { + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + struct hlist_nulls_node *n; + unsigned int hashsz; + struct nf_conn *tmp; + + i++; + rcu_read_lock(); + + nf_conntrack_get_ht(&ct_hash, &hashsz); + if (i >= hashsz) + i = 0; + + 
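        /* From here the worker walks one RCU-protected chain per pass.
         * Expiry is now lazy: nf_ct_is_expired() compares ct->timeout, an
         * absolute nfct_time_stamp deadline set at confirm/refresh time,
         * against the clock, so this scanner, the lookup paths and
         * early_drop all reap dead entries through the same
         * nf_ct_gc_expired() helper instead of relying on a per-conntrack
         * struct timer_list. */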
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); + + scanned++; + if (nf_ct_is_expired(tmp)) { + nf_ct_gc_expired(tmp); + expired_count++; + continue; + } + } + + /* could check get_nulls_value() here and restart if ct + * was moved to another chain. But given gc is best-effort + * we will just continue with next hash slot. + */ + rcu_read_unlock(); + cond_resched_rcu_qs(); + } while (++buckets < goal && + expired_count < GC_MAX_EVICTS); + + if (gc_work->exiting) + return; + + ratio = scanned ? expired_count * 100 / scanned : 0; + if (ratio >= 90) + next_run = 0; + + gc_work->last_bucket = i; + schedule_delayed_work(&gc_work->dwork, next_run); +} + +static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) +{ + INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); + gc_work->exiting = false; +} + static struct nf_conn * __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, @@ -957,8 +1032,6 @@ __nf_conntrack_alloc(struct net *net, /* save hash for reusing when confirming */ *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; ct->status = 0; - /* Don't set timer yet: wait for confirmation */ - setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); write_pnet(&ct->ct_net, net); memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - @@ -1096,10 +1169,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock(&nf_conntrack_expect_lock); } - if (!exp) { + if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - NF_CT_STAT_INC(net, new); - } /* Now it is inserted into the unconfirmed list, bump refcount */ nf_conntrack_get(&ct->ct_general); @@ -1204,7 +1275,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, skb->nfct = NULL; } - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ l3proto = __nf_ct_l3proto_find(pf); ret = l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum); @@ -1332,7 +1403,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); /* Only update if this is not a fixed timeout */ @@ -1340,39 +1410,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, goto acct; /* If not in hash table, timer will not be active yet */ - if (!nf_ct_is_confirmed(ct)) { - ct->timeout.expires = extra_jiffies; - } else { - unsigned long newtime = jiffies + extra_jiffies; - - /* Only update the timeout if the new timeout is at least - HZ jiffies from the old timeout. Need del_timer for race - avoidance (may already be dying). 
*/ - if (newtime - ct->timeout.expires >= HZ) - mod_timer_pending(&ct->timeout, newtime); - } + if (nf_ct_is_confirmed(ct)) + extra_jiffies += nfct_time_stamp; + ct->timeout = extra_jiffies; acct: if (do_acct) nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); -bool __nf_ct_kill_acct(struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, - int do_acct) +bool nf_ct_kill_acct(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) { - if (do_acct) - nf_ct_acct_update(ct, ctinfo, skb->len); + nf_ct_acct_update(ct, ctinfo, skb->len); - if (del_timer(&ct->timeout)) { - ct->timeout.function((unsigned long)ct); - return true; - } - return false; + return nf_ct_delete(ct, 0, 0); } -EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); +EXPORT_SYMBOL_GPL(nf_ct_kill_acct); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -1505,11 +1561,8 @@ void nf_ct_iterate_cleanup(struct net *net, while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, portid, report); - - /* ... else the timer will get him soon. */ + nf_ct_delete(ct, portid, report); nf_ct_put(ct); cond_resched(); } @@ -1545,6 +1598,7 @@ static int untrack_refs(void) void nf_conntrack_cleanup_start(void) { + conntrack_gc_work.exiting = true; RCU_INIT_POINTER(ip_ct_attach, NULL); } @@ -1554,6 +1608,7 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); nf_conntrack_proto_fini(); @@ -1828,6 +1883,10 @@ int nf_conntrack_init_start(void) } /* - and look it like as a confirmed connection */ nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); + + conntrack_gc_work_init(&conntrack_gc_work); + schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL); + return 0; err_proto: diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index d28011b42845..da9df2d56e66 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -49,8 +49,13 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_ecache *e; - if (nf_ct_is_dying(ct)) + if (!nf_ct_is_confirmed(ct)) + continue; + + e = nf_ct_ecache_find(ct); + if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) continue; if (nf_conntrack_event(IPCT_DESTROY, ct)) { @@ -58,8 +63,7 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) break; } - /* we've got the event delivered, now it's dying */ - set_bit(IPS_DYING_BIT, &ct->status); + e->state = NFCT_ECACHE_DESTROY_SENT; refs[evicted] = ct; if (++evicted >= ARRAY_SIZE(refs)) { @@ -130,7 +134,7 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, if (!e) goto out_unlock; - if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + if (nf_ct_is_confirmed(ct)) { struct nf_ct_event item = { .ct = ct, .portid = e->portid ? e->portid : portid, @@ -150,11 +154,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, * triggered by a process, we store the PORTID * to include it in the retransmission. 
*/ - if (eventmask & (1 << IPCT_DESTROY) && - e->portid == 0 && portid != 0) - e->portid = portid; - else + if (eventmask & (1 << IPCT_DESTROY)) { + if (e->portid == 0 && portid != 0) + e->portid = portid; + e->state = NFCT_ECACHE_DESTROY_FAIL; + } else { e->missed |= eventmask; + } } else { e->missed &= ~missed; } diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 43147005bea3..e3ed20060878 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -237,7 +237,7 @@ static int try_eprt(const char *data, size_t dlen, struct nf_conntrack_man *cmd, } delim = data[0]; if (isdigit(delim) || delim < 33 || delim > 126 || data[2] != delim) { - pr_debug("try_eprt: invalid delimitter.\n"); + pr_debug("try_eprt: invalid delimiter.\n"); return 0; } @@ -301,8 +301,6 @@ static int find_pattern(const char *data, size_t dlen, size_t i = plen; pr_debug("find_pattern `%s': dlen = %Zu\n", pattern, dlen); - if (dlen == 0) - return 0; if (dlen <= plen) { /* Short packet: try for partial? */ @@ -311,19 +309,8 @@ static int find_pattern(const char *data, size_t dlen, else return 0; } - if (strncasecmp(data, pattern, plen) != 0) { -#if 0 - size_t i; - - pr_debug("ftp: string mismatch\n"); - for (i = 0; i < plen; i++) { - pr_debug("ftp:char %u `%c'(%u) vs `%c'(%u)\n", - i, data[i], data[i], - pattern[i], pattern[i]); - } -#endif + if (strncasecmp(data, pattern, plen) != 0) return 0; - } pr_debug("Pattern matches!\n"); /* Now we've found the constant string, try to skip diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5c0db5c64734..f65d93639d12 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -736,7 +736,7 @@ static int callforward_do_filter(struct net *net, const struct nf_afinfo *afinfo; int ret = 0; - /* rcu_read_lock()ed by nf_hook_slow() */ + /* rcu_read_lock()ed by nf_hook_thresh */ afinfo = nf_get_afinfo(family); if (!afinfo) return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index b989b81ac156..336e21559e01 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -189,7 +189,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; struct net *net = nf_ct_net(ct); - int ret = 0; /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -223,15 +222,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (helper == NULL) { if (help) RCU_INIT_POINTER(help->helper, NULL); - goto out; + return 0; } if (help == NULL) { help = nf_ct_helper_ext_add(ct, helper, flags); - if (help == NULL) { - ret = -ENOMEM; - goto out; - } + if (help == NULL) + return -ENOMEM; } else { /* We only allow helper re-assignment of the same sort since * we cannot reallocate the helper extension area. 
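 * (Aside: "the same sort" means the new helper's ->help callback
 * matches the one the existing extension was allocated for; on a
 * mismatch the code below simply clears help->helper and returns 0,
 * it never attempts to reallocate the extension area.)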
@@ -240,13 +237,13 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (tmp && tmp->help != helper->help) { RCU_INIT_POINTER(help->helper, NULL); - goto out; + return 0; } } rcu_assign_pointer(help->helper, helper); -out: - return ret; + + return 0; } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); @@ -349,7 +346,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct, /* Called from the helper function, this call never fails */ help = nfct_help(ct); - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index fdfc71f416b7..27540455dc62 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -149,10 +149,7 @@ nla_put_failure: static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { - long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; - - if (timeout < 0) - timeout = 0; + long timeout = nf_ct_expires(ct) / HZ; if (nla_put_be32(skb, CTA_TIMEOUT, htonl(timeout))) goto nla_put_failure; @@ -818,14 +815,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_nulls_node *n; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; - int res; + struct nf_conn *nf_ct_evict[8]; + int res, i; spinlock_t *lockp; last = (struct nf_conn *)cb->args[1]; + i = 0; local_bh_disable(); for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: + while (i) { + i--; + if (nf_ct_should_gc(nf_ct_evict[i])) + nf_ct_kill(nf_ct_evict[i]); + nf_ct_put(nf_ct_evict[i]); + } + lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); if (cb->args[0] >= nf_conntrack_htable_size) { @@ -837,6 +843,13 @@ restart: if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (nf_ct_is_expired(ct)) { + if (i < ARRAY_SIZE(nf_ct_evict) && + atomic_inc_not_zero(&ct->ct_general.use)) + nf_ct_evict[i++] = ct; + continue; + } + if (!net_eq(net, nf_ct_net(ct))) continue; @@ -878,6 +891,13 @@ out: if (last) nf_ct_put(last); + while (i) { + i--; + if (nf_ct_should_gc(nf_ct_evict[i])) + nf_ct_kill(nf_ct_evict[i]); + nf_ct_put(nf_ct_evict[i]); + } + return skb->len; } @@ -1147,9 +1167,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - if (del_timer(&ct->timeout)) - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); - + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); return 0; @@ -1517,11 +1535,10 @@ static int ctnetlink_change_timeout(struct nf_conn *ct, { u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - if (!del_timer(&ct->timeout)) - return -ETIME; + ct->timeout = nfct_time_stamp + timeout * HZ; - ct->timeout.expires = jiffies + timeout * HZ; - add_timer(&ct->timeout); + if (test_bit(IPS_DYING_BIT, &ct->status)) + return -ETIME; return 0; } @@ -1719,9 +1736,8 @@ ctnetlink_create_conntrack(struct net *net, if (!cda[CTA_TIMEOUT]) goto err1; - ct->timeout.expires = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); - ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->timeout = nfct_time_stamp + ntohl(nla_get_be32(cda[CTA_TIMEOUT])) * HZ; rcu_read_lock(); if (cda[CTA_HELP]) { @@ -1968,13 +1984,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, 
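/* Aside: the "searched", "new", "delete" and "delete_list" counters
 * lost their increment sites in this series (e.g. the
 * NF_CT_STAT_INC(net, new) removed above), so the hunk below drops
 * them from the netlink dump; ct_cpu_seq_show later emits literal
 * zeroes in their columns to keep the /proc layout unchanged.
 */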
nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_SEARCHED, htonl(st->searched)) || - nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || - nla_put_be32(skb, CTA_STATS_NEW, htonl(st->new)) || + if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || - nla_put_be32(skb, CTA_STATS_DELETE, htonl(st->delete)) || - nla_put_be32(skb, CTA_STATS_DELETE_LIST, htonl(st->delete_list)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5588c7ae1ac2..f60a4755d71e 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -157,8 +157,7 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, pr_debug("setting timeout of conntrack %p to 0\n", sibling); sibling->proto.gre.timeout = 0; sibling->proto.gre.stream_timeout = 0; - if (del_timer(&sibling->timeout)) - sibling->timeout.function((unsigned long)sibling); + nf_ct_kill(sibling); nf_ct_put(sibling); return 1; } else { diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index b65d5864b6d9..8d2c7d8c666a 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -159,54 +159,6 @@ static int kill_l4proto(struct nf_conn *i, void *data) nf_ct_l3num(i) == l4proto->l3proto; } -static struct nf_ip_net *nf_ct_l3proto_net(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - if (l3proto->l3proto == PF_INET) - return &net->ct.nf_ct_proto; - else - return NULL; -} - -static int nf_ct_l3proto_register_sysctl(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - int err = 0; - struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); - /* nf_conntrack_l3proto_ipv6 doesn't support sysctl */ - if (in == NULL) - return 0; - -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - if (in->ctl_table != NULL) { - err = nf_ct_register_sysctl(net, - &in->ctl_table_header, - l3proto->ctl_table_path, - in->ctl_table); - if (err < 0) { - kfree(in->ctl_table); - in->ctl_table = NULL; - } - } -#endif - return err; -} - -static void nf_ct_l3proto_unregister_sysctl(struct net *net, - struct nf_conntrack_l3proto *l3proto) -{ - struct nf_ip_net *in = nf_ct_l3proto_net(net, l3proto); - - if (in == NULL) - return; -#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) - if (in->ctl_table_header != NULL) - nf_ct_unregister_sysctl(&in->ctl_table_header, - &in->ctl_table, - 0); -#endif -} - int nf_ct_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; @@ -241,7 +193,7 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_register); int nf_ct_l3proto_pernet_register(struct net *net, struct nf_conntrack_l3proto *proto) { - int ret = 0; + int ret; if (proto->init_net) { ret = proto->init_net(net); @@ -249,7 +201,7 @@ int nf_ct_l3proto_pernet_register(struct net *net, return ret; } - return nf_ct_l3proto_register_sysctl(net, proto); + return 0; } EXPORT_SYMBOL_GPL(nf_ct_l3proto_pernet_register); @@ -272,8 +224,6 @@ EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister); void nf_ct_l3proto_pernet_unregister(struct net *net, struct nf_conntrack_l3proto *proto) { - nf_ct_l3proto_unregister_sysctl(net, proto); - /* Remove all contrack entries for this protocol */ nf_ct_iterate_cleanup(net, kill_l3proto, proto, 0, 0); 
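	/* Aside: all the deletions in this file, and in the per-protocol
	 * files that follow, are CONFIG_NF_CONNTRACK_PROC_COMPAT
	 * plumbing; with the old /proc/net/ip_conntrack compat interface
	 * gone, every *_init_net() collapses to a plain
	 *
	 *	return xxx_kmemdup_sysctl_table(pn, xn);
	 *
	 * (xxx = generic/sctp/tcp/udp as in the hunks below) with no
	 * compat table left to duplicate, or to free on error.
	 */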
} @@ -312,26 +262,6 @@ int nf_ct_l4proto_register_sysctl(struct net *net, } } } -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_table != NULL) { - if (err < 0) { - nf_ct_kfree_compat_sysctl_table(pn); - goto out; - } - err = nf_ct_register_sysctl(net, - &pn->ctl_compat_header, - "net/ipv4/netfilter", - pn->ctl_compat_table); - if (err == 0) - goto out; - - nf_ct_kfree_compat_sysctl_table(pn); - nf_ct_unregister_sysctl(&pn->ctl_table_header, - &pn->ctl_table, - pn->users); - } -out: -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ return err; } @@ -346,13 +276,6 @@ void nf_ct_l4proto_unregister_sysctl(struct net *net, nf_ct_unregister_sysctl(&pn->ctl_table_header, &pn->ctl_table, pn->users); - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - if (l4proto->l3proto != AF_INET6 && pn->ctl_compat_header != NULL) - nf_ct_unregister_sysctl(&pn->ctl_compat_header, - &pn->ctl_compat_table, - 0); -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ } diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 399a38fd685a..a45bee52dccc 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -402,7 +402,8 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, { struct dccp_hdr _hdr, *dh; - dh = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + /* Actually only need first 4 bytes to get ports. */ + dh = skb_header_pointer(skb, dataoff, 4, &_hdr); if (dh == NULL) return false; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index 86dc752e5349..d5868bad33a7 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -151,17 +151,6 @@ static struct ctl_table generic_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table generic_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_generic_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -179,40 +168,14 @@ static int generic_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int generic_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_generic_net *gn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(generic_compat_sysctl_table, - sizeof(generic_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &gn->timeout; -#endif -#endif - return 0; -} - static int generic_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_generic_net *gn = generic_pernet(net); struct nf_proto_net *pn = &gn->pn; gn->timeout = nf_ct_generic_timeout; - ret = generic_kmemdup_compat_sysctl_table(pn, gn); - if (ret < 0) - return ret; - - ret = generic_kmemdup_sysctl_table(pn, gn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - - return ret; + return generic_kmemdup_sysctl_table(pn, gn); } static struct nf_proto_net *generic_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index a96451a7af20..9a715f88b2f1 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ 
-192,15 +192,15 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { - const struct gre_hdr_pptp *pgrehdr; - struct gre_hdr_pptp _pgrehdr; + const struct pptp_gre_header *pgrehdr; + struct pptp_gre_header _pgrehdr; __be16 srckey; - const struct gre_hdr *grehdr; - struct gre_hdr _grehdr; + const struct gre_base_hdr *grehdr; + struct gre_base_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); - if (!grehdr || grehdr->version != GRE_VERSION_PPTP) { + if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; @@ -212,8 +212,8 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, if (!pgrehdr) return true; - if (ntohs(grehdr->protocol) != GRE_PROTOCOL_PPTP) { - pr_debug("GRE_VERSION_PPTP but unknown proto\n"); + if (grehdr->protocol != GRE_PROTO_PPP) { + pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol)); return false; } diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 1d7ab960a9e6..982ea62606c7 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -161,8 +161,8 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, const struct sctphdr *hp; struct sctphdr _hdr; - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + /* Actually only need first 4 bytes to get ports. */ + hp = skb_header_pointer(skb, dataoff, 4, &_hdr); if (hp == NULL) return false; @@ -705,54 +705,6 @@ static struct ctl_table sctp_sysctl_table[] = { }, { } }; - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table sctp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_sctp_timeout_closed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_cookie_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_cookie_echoed", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_recd", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -781,32 +733,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int sctp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct sctp_net *sn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(sctp_compat_sysctl_table, - sizeof(sctp_compat_sysctl_table), - 
GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &sn->timeouts[SCTP_CONNTRACK_CLOSED]; - pn->ctl_compat_table[1].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_WAIT]; - pn->ctl_compat_table[2].data = &sn->timeouts[SCTP_CONNTRACK_COOKIE_ECHOED]; - pn->ctl_compat_table[3].data = &sn->timeouts[SCTP_CONNTRACK_ESTABLISHED]; - pn->ctl_compat_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; - pn->ctl_compat_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; - pn->ctl_compat_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; -#endif -#endif - return 0; -} - static int sctp_init_net(struct net *net, u_int16_t proto) { - int ret; struct sctp_net *sn = sctp_pernet(net); struct nf_proto_net *pn = &sn->pn; @@ -817,18 +745,7 @@ static int sctp_init_net(struct net *net, u_int16_t proto) sn->timeouts[i] = sctp_timeouts[i]; } - if (proto == AF_INET) { - ret = sctp_kmemdup_compat_sysctl_table(pn, sn); - if (ret < 0) - return ret; - - ret = sctp_kmemdup_sysctl_table(pn, sn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = sctp_kmemdup_sysctl_table(pn, sn); - - return ret; + return sctp_kmemdup_sysctl_table(pn, sn); } static struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 70c8381641a7..69f687740c76 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -282,8 +282,8 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *hp; struct tcphdr _hdr; - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + /* Actually only need first 4 bytes to get ports. 
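 * (ports are the first two __be16 fields of the TCP, UDP, UDP-Lite,
 * DCCP and SCTP headers alike, i.e.
 *
 *	tuple->src.u.all = hp->source;	bytes 0-1
 *	tuple->dst.u.all = hp->dest;	bytes 2-3
 *
 * so linearizing 4 bytes is always enough for tuple extraction)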
*/ + hp = skb_header_pointer(skb, dataoff, 4, &_hdr); if (hp == NULL) return false; @@ -1481,90 +1481,6 @@ static struct ctl_table tcp_sysctl_table[] = { }, { } }; - -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table tcp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_tcp_timeout_syn_sent", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_syn_sent2", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_syn_recv", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_established", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_fin_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_close_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_last_ack", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_time_wait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_close", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_timeout_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_tcp_loose", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_tcp_be_liberal", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ip_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -1597,38 +1513,8 @@ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_tcp_net *tn) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, - sizeof(tcp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; - pn->ctl_compat_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2]; - pn->ctl_compat_table[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; - pn->ctl_compat_table[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; - pn->ctl_compat_table[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; - pn->ctl_compat_table[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; - pn->ctl_compat_table[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; - pn->ctl_compat_table[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; - pn->ctl_compat_table[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; - pn->ctl_compat_table[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; - pn->ctl_compat_table[10].data = &tn->tcp_loose; - pn->ctl_compat_table[11].data = 
&tn->tcp_be_liberal; - pn->ctl_compat_table[12].data = &tn->tcp_max_retrans; -#endif -#endif - return 0; -} - static int tcp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_tcp_net *tn = tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; @@ -1643,18 +1529,7 @@ static int tcp_init_net(struct net *net, u_int16_t proto) tn->tcp_max_retrans = nf_ct_tcp_max_retrans; } - if (proto == AF_INET) { - ret = tcp_kmemdup_compat_sysctl_table(pn, tn); - if (ret < 0) - return ret; - - ret = tcp_kmemdup_sysctl_table(pn, tn); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = tcp_kmemdup_sysctl_table(pn, tn); - - return ret; + return tcp_kmemdup_sysctl_table(pn, tn); } static struct nf_proto_net *tcp_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 4fd040575ffe..20f35ed68030 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -44,8 +44,8 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb, const struct udphdr *hp; struct udphdr _hdr; - /* Actually only need first 8 bytes. */ - hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + /* Actually only need first 4 bytes to get ports. */ + hp = skb_header_pointer(skb, dataoff, 4, &_hdr); if (hp == NULL) return false; @@ -218,23 +218,6 @@ static struct ctl_table udp_sysctl_table[] = { }, { } }; -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT -static struct ctl_table udp_compat_sysctl_table[] = { - { - .procname = "ip_conntrack_udp_timeout", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "ip_conntrack_udp_timeout_stream", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { } -}; -#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, @@ -254,27 +237,8 @@ static int udp_kmemdup_sysctl_table(struct nf_proto_net *pn, return 0; } -static int udp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn, - struct nf_udp_net *un) -{ -#ifdef CONFIG_SYSCTL -#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT - pn->ctl_compat_table = kmemdup(udp_compat_sysctl_table, - sizeof(udp_compat_sysctl_table), - GFP_KERNEL); - if (!pn->ctl_compat_table) - return -ENOMEM; - - pn->ctl_compat_table[0].data = &un->timeouts[UDP_CT_UNREPLIED]; - pn->ctl_compat_table[1].data = &un->timeouts[UDP_CT_REPLIED]; -#endif -#endif - return 0; -} - static int udp_init_net(struct net *net, u_int16_t proto) { - int ret; struct nf_udp_net *un = udp_pernet(net); struct nf_proto_net *pn = &un->pn; @@ -285,18 +249,7 @@ static int udp_init_net(struct net *net, u_int16_t proto) un->timeouts[i] = udp_timeouts[i]; } - if (proto == AF_INET) { - ret = udp_kmemdup_compat_sysctl_table(pn, un); - if (ret < 0) - return ret; - - ret = udp_kmemdup_sysctl_table(pn, un); - if (ret < 0) - nf_ct_kfree_compat_sysctl_table(pn); - } else - ret = udp_kmemdup_sysctl_table(pn, un); - - return ret; + return udp_kmemdup_sysctl_table(pn, un); } static struct nf_proto_net *udp_get_net_proto(struct net *net) diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 9d692f5adb94..029206e8dec4 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -54,7 +54,8 @@ static bool udplite_pkt_to_tuple(const struct sk_buff *skb, const struct udphdr *hp; struct udphdr _hdr; - hp = 
skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + /* Actually only need first 4 bytes to get ports. */ + hp = skb_header_pointer(skb, dataoff, 4, &_hdr); if (hp == NULL) return false; diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index dff0f0cc59e4..ef7063eced7c 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -169,7 +169,7 @@ int nf_ct_seq_adjust(struct sk_buff *skb, s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; - int res; + int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; @@ -184,27 +184,31 @@ int nf_ct_seq_adjust(struct sk_buff *skb, else seqoff = this_way->offset_before; + newseq = htonl(ntohl(tcph->seq) + seqoff); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); + pr_debug("Adjusting sequence number from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq)); + tcph->seq = newseq; + + if (!tcph->ack) + goto out; + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; - newseq = htonl(ntohl(tcph->seq) + seqoff); newack = htonl(ntohl(tcph->ack_seq) - ackoff); - - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); - - pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", + pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); - - tcph->seq = newseq; tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, tcph, ct, ctinfo); +out: spin_unlock_bh(&ct->lock); return res; diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 7d77217de6a3..621b81c7bddc 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -83,9 +83,10 @@ static int digits_len(const struct nf_conn *ct, const char *dptr, static int iswordc(const char c) { if (isalnum(c) || c == '!' || c == '"' || c == '%' || - (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + (c >= '(' && c <= '+') || c == ':' || c == '<' || c == '>' || c == '?' 
|| (c >= '[' && c <= ']') || c == '_' || c == '`' || - c == '{' || c == '}' || c == '~') + c == '{' || c == '}' || c == '~' || (c >= '-' && c <= '/') || + c == '\'') return 1; return 0; } @@ -329,13 +330,12 @@ static const char *sip_follow_continuation(const char *dptr, const char *limit) static const char *sip_skip_whitespace(const char *dptr, const char *limit) { for (; dptr < limit; dptr++) { - if (*dptr == ' ') + if (*dptr == ' ' || *dptr == '\t') continue; if (*dptr != '\r' && *dptr != '\n') break; dptr = sip_follow_continuation(dptr, limit); - if (dptr == NULL) - return NULL; + break; } return dptr; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9f267c3ffb39..5f446cd9f3fd 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -212,6 +212,11 @@ static int ct_seq_show(struct seq_file *s, void *v) if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) return 0; + if (nf_ct_should_gc(ct)) { + nf_ct_kill(ct); + goto release; + } + /* we only want to print DIR_ORIGINAL */ if (NF_CT_DIRECTION(hash)) goto release; @@ -228,8 +233,7 @@ static int ct_seq_show(struct seq_file *s, void *v) seq_printf(s, "%-8s %u %-8s %u %ld ", l3proto->name, nf_ct_l3num(ct), l4proto->name, nf_ct_protonum(ct), - timer_pending(&ct->timeout) - ? (long)(ct->timeout.expires - jiffies)/HZ : 0); + nf_ct_expires(ct) / HZ); if (l4proto->print_conntrack) l4proto->print_conntrack(s, ct); @@ -353,13 +357,13 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - st->searched, + 0, st->found, - st->new, + 0, st->invalid, st->ignore, - st->delete, - st->delete_list, + 0, + 0, st->insert, st->insert_failed, st->drop, diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 065522564ac6..e0adb5959342 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -13,13 +13,13 @@ /* core.c */ -unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - struct nf_hook_state *state, struct nf_hook_ops **elemp); +unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, + struct nf_hook_entry **entryp); /* nf_queue.c */ -int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, - struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + unsigned int queuenum); +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index aa5847a16713..30a17d649a83 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -39,12 +39,12 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger) return NULL; } -void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) +int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) { const struct nf_logger *log; - if (pf == NFPROTO_UNSPEC) - return; + if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers)) + return -EOPNOTSUPP; mutex_lock(&nf_log_mutex); log = nft_log_dereference(net->nf.nf_loggers[pf]); @@ -52,6 +52,8 @@ void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger) rcu_assign_pointer(net->nf.nf_loggers[pf], logger); 
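	/* Aside (hypothetical caller, not from this patch): since
	 * nf_log_set() can now fail for out-of-range protocol families,
	 * registration paths are expected to propagate the error:
	 *
	 *	err = nf_log_set(net, NFPROTO_IPV4, &my_logger);
	 *	if (err)
	 *		return err;
	 */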
mutex_unlock(&nf_log_mutex); + + return 0; } EXPORT_SYMBOL(nf_log_set); diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a5aa5967b8e1..119fe1cb1ea9 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -77,7 +77,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, nf_log_buf_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & XT_LOG_TCPSEQ) { + if (logflags & NF_LOG_TCPSEQ) { nf_log_buf_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); } @@ -107,7 +107,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, /* Max length: 11 "URGP=65535 " */ nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { + if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { u_int8_t _opt[60 - sizeof(struct tcphdr)]; const u_int8_t *op; unsigned int i; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index ecee105bbada..bbb8f3df79f7 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -566,16 +566,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ - if (!del_timer(&ct->timeout)) - return 1; - ct->status &= ~IPS_NAT_DONE_MASK; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, nf_nat_bysource_params); - add_timer(&ct->timeout); - /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b19ad20a705c..96964a0070e1 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,14 +96,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - qh->nf_hook_drop(net, ops); + qh->nf_hook_drop(net, entry); rcu_read_unlock(); } @@ -112,7 +112,6 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) * through nf_reinject(). */ int nf_queue(struct sk_buff *skb, - struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum) { @@ -141,7 +140,6 @@ int nf_queue(struct sk_buff *skb, *entry = (struct nf_queue_entry) { .skb = skb, - .elem = elem, .state = *state, .size = sizeof(*entry) + afinfo->route_key_size, }; @@ -165,11 +163,15 @@ err: void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { + struct nf_hook_entry *hook_entry; struct sk_buff *skb = entry->skb; - struct nf_hook_ops *elem = entry->elem; const struct nf_afinfo *afinfo; + struct nf_hook_ops *elem; int err; + hook_entry = rcu_dereference(entry->state.hook_entries); + elem = &hook_entry->ops; + nf_queue_entry_release_refs(entry); /* Continue traversal iff userspace said ok... 
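 * (Aside, sketch: the traversal cursor is no longer a struct
 * nf_hook_ops pointer stored in the queue entry but the RCU-protected
 * nf_hook_entry saved in entry->state, so resuming after a verdict is
 *
 *	hook_entry = rcu_dereference(entry->state.hook_entries);
 *	verdict = nf_iterate(skb, &entry->state, &hook_entry);
 * )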
*/ @@ -186,8 +188,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) if (verdict == NF_ACCEPT) { next_hook: - verdict = nf_iterate(entry->state.hook_list, - skb, &entry->state, &elem); + verdict = nf_iterate(skb, &entry->state, &hook_entry); } switch (verdict & NF_VERDICT_MASK) { @@ -198,7 +199,8 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - err = nf_queue(skb, elem, &entry->state, + RCU_INIT_POINTER(entry->state.hook_entries, hook_entry); + err = nf_queue(skb, &entry->state, verdict >> NF_VERDICT_QBITS); if (err < 0) { if (err == -ESRCH && diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7e1c876c7608..b70d3ea1430e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1196,6 +1196,83 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) } } +struct nft_chain_hook { + u32 num; + u32 priority; + const struct nf_chain_type *type; + struct net_device *dev; +}; + +static int nft_chain_parse_hook(struct net *net, + const struct nlattr * const nla[], + struct nft_af_info *afi, + struct nft_chain_hook *hook, bool create) +{ + struct nlattr *ha[NFTA_HOOK_MAX + 1]; + const struct nf_chain_type *type; + struct net_device *dev; + int err; + + err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], + nft_hook_policy); + if (err < 0) + return err; + + if (ha[NFTA_HOOK_HOOKNUM] == NULL || + ha[NFTA_HOOK_PRIORITY] == NULL) + return -EINVAL; + + hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); + if (hook->num >= afi->nhooks) + return -EINVAL; + + hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); + + type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT]; + if (nla[NFTA_CHAIN_TYPE]) { + type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE], + create); + if (IS_ERR(type)) + return PTR_ERR(type); + } + if (!(type->hook_mask & (1 << hook->num))) + return -EOPNOTSUPP; + if (!try_module_get(type->owner)) + return -ENOENT; + + hook->type = type; + + hook->dev = NULL; + if (afi->flags & NFT_AF_NEEDS_DEV) { + char ifname[IFNAMSIZ]; + + if (!ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); + dev = dev_get_by_name(net, ifname); + if (!dev) { + module_put(type->owner); + return -ENOENT; + } + hook->dev = dev; + } else if (ha[NFTA_HOOK_DEV]) { + module_put(type->owner); + return -EOPNOTSUPP; + } + + return 0; +} + +static void nft_chain_release_hook(struct nft_chain_hook *hook) +{ + module_put(hook->type->owner); + if (hook->dev != NULL) + dev_put(hook->dev); +} + static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) @@ -1206,10 +1283,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_chain *chain; struct nft_base_chain *basechain = NULL; - struct nlattr *ha[NFTA_HOOK_MAX + 1]; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; - struct net_device *dev = NULL; u8 policy = NF_ACCEPT; u64 handle = 0; unsigned int i; @@ -1273,6 +1348,37 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + if (nla[NFTA_CHAIN_HOOK]) { + struct nft_base_chain *basechain; + struct nft_chain_hook hook; + struct nf_hook_ops *ops; + + if (!(chain->flags & NFT_BASE_CHAIN)) + return -EBUSY; + + err = nft_chain_parse_hook(net, nla, afi, 
&hook, + create); + if (err < 0) + return err; + + basechain = nft_base_chain(chain); + if (basechain->type != hook.type) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + + for (i = 0; i < afi->nops; i++) { + ops = &basechain->ops[i]; + if (ops->hooknum != hook.num || + ops->priority != hook.priority || + ops->dev != hook.dev) { + nft_chain_release_hook(&hook); + return -EBUSY; + } + } + nft_chain_release_hook(&hook); + } + if (nla[NFTA_CHAIN_HANDLE] && name) { struct nft_chain *chain2; @@ -1320,102 +1426,53 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return -EOVERFLOW; if (nla[NFTA_CHAIN_HOOK]) { - const struct nf_chain_type *type; + struct nft_chain_hook hook; struct nf_hook_ops *ops; nf_hookfn *hookfn; - u32 hooknum, priority; - - type = chain_type[family][NFT_CHAIN_T_DEFAULT]; - if (nla[NFTA_CHAIN_TYPE]) { - type = nf_tables_chain_type_lookup(afi, - nla[NFTA_CHAIN_TYPE], - create); - if (IS_ERR(type)) - return PTR_ERR(type); - } - err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], - nft_hook_policy); + err = nft_chain_parse_hook(net, nla, afi, &hook, create); if (err < 0) return err; - if (ha[NFTA_HOOK_HOOKNUM] == NULL || - ha[NFTA_HOOK_PRIORITY] == NULL) - return -EINVAL; - - hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])); - if (hooknum >= afi->nhooks) - return -EINVAL; - priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY])); - - if (!(type->hook_mask & (1 << hooknum))) - return -EOPNOTSUPP; - if (!try_module_get(type->owner)) - return -ENOENT; - hookfn = type->hooks[hooknum]; - - if (afi->flags & NFT_AF_NEEDS_DEV) { - char ifname[IFNAMSIZ]; - - if (!ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } - - nla_strlcpy(ifname, ha[NFTA_HOOK_DEV], IFNAMSIZ); - dev = dev_get_by_name(net, ifname); - if (!dev) { - module_put(type->owner); - return -ENOENT; - } - } else if (ha[NFTA_HOOK_DEV]) { - module_put(type->owner); - return -EOPNOTSUPP; - } basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); if (basechain == NULL) { - module_put(type->owner); - if (dev != NULL) - dev_put(dev); + nft_chain_release_hook(&hook); return -ENOMEM; } - if (dev != NULL) - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + if (hook.dev != NULL) + strncpy(basechain->dev_name, hook.dev->name, IFNAMSIZ); if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); if (IS_ERR(stats)) { - module_put(type->owner); + nft_chain_release_hook(&hook); kfree(basechain); - if (dev != NULL) - dev_put(dev); return PTR_ERR(stats); } basechain->stats = stats; } else { stats = netdev_alloc_pcpu_stats(struct nft_stats); if (stats == NULL) { - module_put(type->owner); + nft_chain_release_hook(&hook); kfree(basechain); - if (dev != NULL) - dev_put(dev); return -ENOMEM; } rcu_assign_pointer(basechain->stats, stats); } - basechain->type = type; + hookfn = hook.type->hooks[hook.num]; + basechain->type = hook.type; chain = &basechain->chain; for (i = 0; i < afi->nops; i++) { ops = &basechain->ops[i]; ops->pf = family; - ops->hooknum = hooknum; - ops->priority = priority; + ops->hooknum = hook.num; + ops->priority = hook.priority; ops->priv = chain; ops->hook = afi->hooks[ops->hooknum]; - ops->dev = dev; + ops->dev = hook.dev; if (hookfn) ops->hook = hookfn; if (afi->hook_ops_init) @@ -3426,12 +3483,12 @@ static int nft_setelem_parse_flags(const struct nft_set *set, } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, u32 nlmsg_flags) { struct nlattr 
*nla[NFTA_SET_ELEM_MAX + 1]; struct nft_data_desc d1, d2; struct nft_set_ext_tmpl tmpl; - struct nft_set_ext *ext; + struct nft_set_ext *ext, *ext2; struct nft_set_elem elem; struct nft_set_binding *binding; struct nft_userdata *udata; @@ -3558,9 +3615,19 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err4; ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem); - if (err < 0) + err = set->ops->insert(ctx->net, set, &elem, &ext2); + if (err) { + if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && + memcmp(nft_set_ext_data(ext), + nft_set_ext_data(ext2), set->dlen) != 0) + err = -EBUSY; + else if (!(nlmsg_flags & NLM_F_EXCL)) + err = 0; + } goto err5; + } nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); @@ -3616,7 +3683,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) return -ENFILE; - err = nft_add_set_elem(&ctx, set, attr); + err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); if (err < 0) { atomic_dec(&set->nelems); break; @@ -4343,6 +4410,31 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, } /** + * nft_parse_u32_check - fetch u32 attribute and check for maximum value + * + * @attr: netlink attribute to fetch value from + * @max: maximum value to be stored in dest + * @dest: pointer to the variable + * + * Parse, check and store a given u32 netlink attribute into variable. + * This function returns -ERANGE if the value goes over maximum value. + * Otherwise a 0 is returned and the attribute value is stored in the + * destination variable. + */ +unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) +{ + int val; + + val = ntohl(nla_get_be32(attr)); + if (val > max) + return -ERANGE; + + *dest = val; + return 0; +} +EXPORT_SYMBOL_GPL(nft_parse_u32_check); + +/** * nft_parse_register - parse a register value from a netlink attribute * * @attr: netlink attribute diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index fb8b5892b5ff..0dd5c695482f 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -34,7 +34,7 @@ static struct nf_loginfo trace_loginfo = { .u = { .log = { .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_MASK, + .logflags = NF_LOG_DEFAULT_MASK, }, }, }; @@ -93,12 +93,15 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb); - else + else { + if (!pkt->tprot_set) + return false; ptr = skb_network_header(skb) + pkt->xt.thoff; + } ptr += priv->offset; - if (unlikely(ptr + priv->len >= skb_tail_pointer(skb))) + if (unlikely(ptr + priv->len > skb_tail_pointer(skb))) return false; *dest = 0; @@ -260,8 +263,13 @@ int __init nf_tables_core_module_init(void) if (err < 0) goto err7; - return 0; + err = nft_range_module_init(); + if (err < 0) + goto err8; + return 0; +err8: + nft_dynset_module_exit(); err7: nft_payload_module_exit(); err6: diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index 6b5f76295d3d..f713cc205669 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -82,7 +82,10 @@ static int __init nf_tables_inet_init(void) { int ret; - nft_register_chain_type(&filter_inet); + ret = nft_register_chain_type(&filter_inet); + if (ret < 0) + return ret; + ret = 
register_pernet_subsys(&nf_tables_inet_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_inet); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 75d696f11045..9e2ae424b640 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -15,78 +15,6 @@ #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> -static inline void -nft_netdev_set_pktinfo_ipv4(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct iphdr *iph, _iph; - u32 len, thoff; - - nft_set_pktinfo(pkt, skb, state); - - iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), - &_iph); - if (!iph) - return; - - if (iph->ihl < 5 || iph->version != 4) - return; - - len = ntohs(iph->tot_len); - thoff = iph->ihl * 4; - if (skb->len < len) - return; - else if (len < thoff) - return; - - pkt->tprot = iph->protocol; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; -} - -static inline void -__nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_IPV6) - struct ipv6hdr *ip6h, _ip6h; - unsigned int thoff = 0; - unsigned short frag_off; - int protohdr; - u32 pkt_len; - - ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), - &_ip6h); - if (!ip6h) - return; - - if (ip6h->version != 6) - return; - - pkt_len = ntohs(ip6h->payload_len); - if (pkt_len + sizeof(*ip6h) > skb->len) - return; - - protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL); - if (protohdr < 0) - return; - - pkt->tprot = protohdr; - pkt->xt.thoff = thoff; - pkt->xt.fragoff = frag_off; -#endif -} - -static inline void nft_netdev_set_pktinfo_ipv6(struct nft_pktinfo *pkt, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - nft_set_pktinfo(pkt, skb, state); - __nft_netdev_set_pktinfo_ipv6(pkt, skb, state); -} - static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -95,13 +23,13 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_IP): - nft_netdev_set_pktinfo_ipv4(&pkt, skb, state); + nft_set_pktinfo_ipv4_validate(&pkt, skb, state); break; case htons(ETH_P_IPV6): - nft_netdev_set_pktinfo_ipv6(&pkt, skb, state); + nft_set_pktinfo_ipv6_validate(&pkt, skb, state); break; default: - nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb, state); break; } @@ -221,14 +149,25 @@ static int __init nf_tables_netdev_init(void) { int ret; - nft_register_chain_type(&nft_filter_chain_netdev); - ret = register_pernet_subsys(&nf_tables_netdev_net_ops); - if (ret < 0) { - nft_unregister_chain_type(&nft_filter_chain_netdev); + ret = nft_register_chain_type(&nft_filter_chain_netdev); + if (ret) return ret; - } - register_netdevice_notifier(&nf_tables_netdev_notifier); + + ret = register_pernet_subsys(&nf_tables_netdev_net_ops); + if (ret) + goto err1; + + ret = register_netdevice_notifier(&nf_tables_netdev_notifier); + if (ret) + goto err2; + return 0; + +err2: + unregister_pernet_subsys(&nf_tables_netdev_net_ops); +err1: + nft_unregister_chain_type(&nft_filter_chain_netdev); + return ret; } static void __exit nf_tables_netdev_exit(void) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index fa24a5b398b1..ab695f8e2d29 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,20 +113,22 @@ static int 
nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { const struct sk_buff *skb = pkt->skb; - unsigned int len = min_t(unsigned int, - pkt->xt.thoff - skb_network_offset(skb), - NFT_TRACETYPE_NETWORK_HSIZE); int off = skb_network_offset(skb); + unsigned int len, nh_end; + nh_end = pkt->tprot_set ? pkt->xt.thoff : skb->len; + len = min_t(unsigned int, nh_end - skb_network_offset(skb), + NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; - len = min_t(unsigned int, skb->len - pkt->xt.thoff, - NFT_TRACETYPE_TRANSPORT_HSIZE); - - if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, - pkt->xt.thoff, len)) - return -1; + if (pkt->tprot_set) { + len = min_t(unsigned int, skb->len - pkt->xt.thoff, + NFT_TRACETYPE_TRANSPORT_HSIZE); + if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, + pkt->xt.thoff, len)) + return -1; + } if (!skb_mac_header_was_set(skb)) return 0; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index e924e95fcc7f..3b79f34b5095 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -43,7 +43,7 @@ nfnl_userspace_cthelper(struct sk_buff *skb, unsigned int protoff, if (help == NULL) return NF_DROP; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(help->helper); if (helper == NULL) return NF_DROP; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6577db524ef6..eb086a192c5a 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -442,7 +442,9 @@ __build_packet_message(struct nfnl_log_net *log, if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + /* rcu_read_lock()ed by nf_hook_thresh or + * nf_log_packet. + */ nla_put_be32(inst->skb, NFULA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; @@ -477,7 +479,9 @@ __build_packet_message(struct nfnl_log_net *log, if (nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ - /* rcu_read_lock()ed by nf_hook_slow or nf_log_packet */ + /* rcu_read_lock()ed by nf_hook_thresh or + * nf_log_packet. 
+ */ nla_put_be32(inst->skb, NFULA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f49f45081acb..af832c526048 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -740,7 +740,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); - /* rcu_read_lock()ed by nf_hook_slow() */ + /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; @@ -917,12 +917,14 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; -static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long ops_ptr) +static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr) { - return entry->elem == (struct nf_hook_ops *)ops_ptr; + return rcu_access_pointer(entry->state.hook_entries) == + (struct nf_hook_entry *)entry_ptr; } -static void nfqnl_nf_hook_drop(struct net *net, struct nf_hook_ops *hook) +static void nfqnl_nf_hook_drop(struct net *net, + const struct nf_hook_entry *hook) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; @@ -1522,9 +1524,16 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_notifier; } - register_netdevice_notifier(&nfqnl_dev_notifier); + status = register_netdevice_notifier(&nfqnl_dev_notifier); + if (status < 0) { + pr_err("nf_queue: failed to register netdevice notifier\n"); + goto cleanup_netlink_subsys; + } + return status; +cleanup_netlink_subsys: + nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index d71cc18fa35d..31c15ed2e5fc 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -52,6 +52,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, { struct nft_bitwise *priv = nft_expr_priv(expr); struct nft_data_desc d1, d2; + u32 len; int err; if (tb[NFTA_BITWISE_SREG] == NULL || @@ -61,7 +62,12 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, tb[NFTA_BITWISE_XOR] == NULL) return -EINVAL; - priv->len = ntohl(nla_get_be32(tb[NFTA_BITWISE_LEN])); + err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]); err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index b78c28ba465f..ee63d981268d 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -99,6 +99,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_byteorder *priv = nft_expr_priv(expr); + u32 size, len; int err; if (tb[NFTA_BYTEORDER_SREG] == NULL || @@ -117,7 +118,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, return -EINVAL; } - priv->size = ntohl(nla_get_be32(tb[NFTA_BYTEORDER_SIZE])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_SIZE], U8_MAX, &size); + if (err < 0) + return err; + + priv->size = size; + switch (priv->size) { case 2: case 4: @@ -128,7 +134,12 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, } priv->sreg = nft_parse_register(tb[NFTA_BYTEORDER_SREG]); - priv->len = 
ntohl(nla_get_be32(tb[NFTA_BYTEORDER_LEN])); + err = nft_parse_u32_check(tb[NFTA_BYTEORDER_LEN], U8_MAX, &len); + if (err < 0) + return err; + + priv->len = len; + err = nft_validate_register_load(priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index e25b35d70e4d..2e53739812b1 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -84,6 +84,9 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; + if (desc.len > U8_MAX) + return -ERANGE; + priv->op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); priv->len = desc.len; return 0; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 51e180f2a003..d7b0d171172a 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -128,15 +128,18 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, &count, sizeof(count)); return; } + case NFT_CT_L3PROTOCOL: + *dest = nf_ct_l3num(ct); + return; + case NFT_CT_PROTOCOL: + *dest = nf_ct_protonum(ct); + return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { - case NFT_CT_L3PROTOCOL: - *dest = nf_ct_l3num(ct); - return; case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); @@ -145,9 +148,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; - case NFT_CT_PROTOCOL: - *dest = nf_ct_protonum(ct); - return; case NFT_CT_PROTO_SRC: *dest = (__force __u16)tuple->src.u.all; return; @@ -283,8 +283,9 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: - if (tb[NFTA_CT_DIRECTION] == NULL) - return -EINVAL; + /* For compatibility, do not report error if NFTA_CT_DIRECTION + * attribute is specified. 
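 * (NFT_CT_L3PROTOCOL and NFT_CT_PROTOCOL were moved above the tuple
 * switch in nft_ct_get_eval: both read the conntrack itself via
 * nf_ct_l3num()/nf_ct_protonum(), so a direction attribute, if
 * present, is meaningless here and is now ignored rather than
 * rejected.)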
+ */ len = sizeof(u8); break; case NFT_CT_SRC: @@ -363,6 +364,8 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif @@ -432,8 +435,6 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; switch (priv->key) { - case NFT_CT_L3PROTOCOL: - case NFT_CT_PROTOCOL: case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_PROTO_SRC: diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 0af26699bf04..e3b83c31da2e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -22,6 +22,7 @@ struct nft_dynset { enum nft_dynset_ops op:8; enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; + bool invert; u64 timeout; struct nft_expr *expr; struct nft_set_binding binding; @@ -82,10 +83,14 @@ static void nft_dynset_eval(const struct nft_expr *expr, if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); + + if (priv->invert) + regs->verdict.code = NFT_BREAK; return; } out: - regs->verdict.code = NFT_BREAK; + if (!priv->invert) + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { @@ -96,6 +101,7 @@ static const struct nla_policy nft_dynset_policy[NFTA_DYNSET_MAX + 1] = { [NFTA_DYNSET_SREG_DATA] = { .type = NLA_U32 }, [NFTA_DYNSET_TIMEOUT] = { .type = NLA_U64 }, [NFTA_DYNSET_EXPR] = { .type = NLA_NESTED }, + [NFTA_DYNSET_FLAGS] = { .type = NLA_U32 }, }; static int nft_dynset_init(const struct nft_ctx *ctx, @@ -113,6 +119,15 @@ static int nft_dynset_init(const struct nft_ctx *ctx, tb[NFTA_DYNSET_SREG_KEY] == NULL) return -EINVAL; + if (tb[NFTA_DYNSET_FLAGS]) { + u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); + + if (flags & ~NFT_DYNSET_F_INV) + return -EINVAL; + if (flags & NFT_DYNSET_F_INV) + priv->invert = true; + } + set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], genmask); if (IS_ERR(set)) { @@ -220,6 +235,7 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_dynset *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? 
NFT_DYNSET_F_INV : 0; if (nft_dump_register(skb, NFTA_DYNSET_SREG_KEY, priv->sreg_key)) goto nla_put_failure; @@ -235,6 +251,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_DYNSET_FLAGS, htonl(flags))) + goto nla_put_failure; return 0; nla_put_failure: diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 82c264e40278..a84cf3d66056 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -59,7 +59,7 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_exthdr *priv = nft_expr_priv(expr); - u32 offset, len; + u32 offset, len, err; if (tb[NFTA_EXTHDR_DREG] == NULL || tb[NFTA_EXTHDR_TYPE] == NULL || @@ -67,11 +67,13 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, tb[NFTA_EXTHDR_LEN] == NULL) return -EINVAL; - offset = ntohl(nla_get_be32(tb[NFTA_EXTHDR_OFFSET])); - len = ntohl(nla_get_be32(tb[NFTA_EXTHDR_LEN])); + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; - if (offset > U8_MAX || len > U8_MAX) - return -ERANGE; + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); priv->offset = offset; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 564fa7929ed5..09473b415b95 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -1,395 +1,145 @@ /* - * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2016 Laura Garcia <nevola@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
* - * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/list.h> -#include <linux/log2.h> -#include <linux/jhash.h> #include <linux/netlink.h> -#include <linux/workqueue.h> -#include <linux/rhashtable.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> - -/* We target a hash table size of 4, element hint is 75% of final size */ -#define NFT_HASH_ELEMENT_HINT 3 +#include <net/netfilter/nf_tables_core.h> +#include <linux/jhash.h> struct nft_hash { - struct rhashtable ht; - struct delayed_work gc_work; -}; - -struct nft_hash_elem { - struct rhash_head node; - struct nft_set_ext ext; + enum nft_registers sreg:8; + enum nft_registers dreg:8; + u8 len; + u32 modulus; + u32 seed; + u32 offset; }; -struct nft_hash_cmp_arg { - const struct nft_set *set; - const u32 *key; - u8 genmask; -}; - -static const struct rhashtable_params nft_hash_params; - -static inline u32 nft_hash_key(const void *data, u32 len, u32 seed) -{ - const struct nft_hash_cmp_arg *arg = data; - - return jhash(arg->key, len, seed); -} - -static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed) -{ - const struct nft_hash_elem *he = data; - - return jhash(nft_set_ext_key(&he->ext), len, seed); -} - -static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct nft_hash_cmp_arg *x = arg->key; - const struct nft_hash_elem *he = ptr; - - if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) - return 1; - if (nft_set_elem_expired(&he->ext)) - return 1; - if (!nft_set_elem_active(&he->ext, x->genmask)) - return 1; - return 0; -} - -static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) -{ - struct nft_hash *priv = nft_set_priv(set); - const struct nft_hash_elem *he; - struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_cur(net), - .set = set, - .key = key, - }; - - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); - if (he != NULL) - *ext = &he->ext; - - return !!he; -} - -static bool nft_hash_update(struct nft_set *set, const u32 *key, - void *(*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) -{ - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; - struct nft_hash_cmp_arg arg = { - .genmask = NFT_GENMASK_ANY, - .set = set, - .key = key, - }; - - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); - if (he != NULL) - goto out; - - he = new(set, expr, regs); - if (he == NULL) - goto err1; - if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params)) - goto err2; -out: - *ext = &he->ext; - return true; - -err2: - nft_set_elem_destroy(set, he); -err1: - return false; -} - -static int nft_hash_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) -{ - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he = elem->priv; - struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(net), - .set = set, - .key = elem->key.val.data, - }; - - return rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params); -} - -static void nft_hash_activate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) -{ - struct nft_hash_elem *he = elem->priv; - - 
nft_set_elem_change_active(net, set, &he->ext); - nft_set_elem_clear_busy(&he->ext); -} - -static void *nft_hash_deactivate(const struct net *net, - const struct nft_set *set, - const struct nft_set_elem *elem) +static void nft_hash_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; - struct nft_hash_cmp_arg arg = { - .genmask = nft_genmask_next(net), - .set = set, - .key = elem->key.val.data, - }; - - rcu_read_lock(); - he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); - if (he != NULL) { - if (!nft_set_elem_mark_busy(&he->ext) || - !nft_is_active(net, &he->ext)) - nft_set_elem_change_active(net, set, &he->ext); - else - he = NULL; - } - rcu_read_unlock(); + struct nft_hash *priv = nft_expr_priv(expr); + const void *data = ®s->data[priv->sreg]; + u32 h; - return he; + h = reciprocal_scale(jhash(data, priv->len, priv->seed), priv->modulus); + regs->data[priv->dreg] = h + priv->offset; } -static void nft_hash_remove(const struct nft_set *set, - const struct nft_set_elem *elem) -{ - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he = elem->priv; - - rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); -} +static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { + [NFTA_HASH_SREG] = { .type = NLA_U32 }, + [NFTA_HASH_DREG] = { .type = NLA_U32 }, + [NFTA_HASH_LEN] = { .type = NLA_U32 }, + [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, + [NFTA_HASH_SEED] = { .type = NLA_U32 }, +}; -static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, - struct nft_set_iter *iter) +static int nft_hash_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { - struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; - struct rhashtable_iter hti; - struct nft_set_elem elem; - int err; - - err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); - iter->err = err; - if (err) - return; - - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) { - iter->err = err; - goto out; - } - - while ((he = rhashtable_walk_next(&hti))) { - if (IS_ERR(he)) { - err = PTR_ERR(he); - if (err != -EAGAIN) { - iter->err = err; - goto out; - } + struct nft_hash *priv = nft_expr_priv(expr); + u32 len; - continue; - } + if (!tb[NFTA_HASH_SREG] || + !tb[NFTA_HASH_DREG] || + !tb[NFTA_HASH_LEN] || + !tb[NFTA_HASH_SEED] || + !tb[NFTA_HASH_MODULUS]) + return -EINVAL; - if (iter->count < iter->skip) - goto cont; - if (nft_set_elem_expired(&he->ext)) - goto cont; - if (!nft_set_elem_active(&he->ext, iter->genmask)) - goto cont; + if (tb[NFTA_HASH_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); - elem.priv = he; + priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); + priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - iter->err = iter->fn(ctx, set, iter, &elem); - if (iter->err < 0) - goto out; + len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN])); + if (len == 0 || len > U8_MAX) + return -ERANGE; -cont: - iter->count++; - } + priv->len = len; -out: - rhashtable_walk_stop(&hti); - rhashtable_walk_exit(&hti); -} - -static void nft_hash_gc(struct work_struct *work) -{ - struct nft_set *set; - struct nft_hash_elem *he; - struct nft_hash *priv; - struct nft_set_gc_batch *gcb = NULL; - struct rhashtable_iter hti; - int err; - - priv = container_of(work, struct nft_hash, gc_work.work); - set = nft_set_container_of(priv); - - err = rhashtable_walk_init(&priv->ht, &hti, 
GFP_KERNEL); - if (err) - goto schedule; + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); + if (priv->modulus <= 1) + return -ERANGE; - err = rhashtable_walk_start(&hti); - if (err && err != -EAGAIN) - goto out; + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; - while ((he = rhashtable_walk_next(&hti))) { - if (IS_ERR(he)) { - if (PTR_ERR(he) != -EAGAIN) - goto out; - continue; - } + priv->seed = ntohl(nla_get_be32(tb[NFTA_HASH_SEED])); - if (!nft_set_elem_expired(&he->ext)) - continue; - if (nft_set_elem_mark_busy(&he->ext)) - continue; - - gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); - if (gcb == NULL) - goto out; - rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); - atomic_dec(&set->nelems); - nft_set_gc_batch_add(gcb, he); - } -out: - rhashtable_walk_stop(&hti); - rhashtable_walk_exit(&hti); - - nft_set_gc_batch_complete(gcb); -schedule: - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + return nft_validate_register_load(priv->sreg, len) && + nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); } -static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +static int nft_hash_dump(struct sk_buff *skb, + const struct nft_expr *expr) { - return sizeof(struct nft_hash); -} - -static const struct rhashtable_params nft_hash_params = { - .head_offset = offsetof(struct nft_hash_elem, node), - .hashfn = nft_hash_key, - .obj_hashfn = nft_hash_obj, - .obj_cmpfn = nft_hash_cmp, - .automatic_shrinking = true, -}; + const struct nft_hash *priv = nft_expr_priv(expr); -static int nft_hash_init(const struct nft_set *set, - const struct nft_set_desc *desc, - const struct nlattr * const tb[]) -{ - struct nft_hash *priv = nft_set_priv(set); - struct rhashtable_params params = nft_hash_params; - int err; - - params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT; - params.key_len = set->klen; - - err = rhashtable_init(&priv->ht, ¶ms); - if (err < 0) - return err; - - INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc); - if (set->flags & NFT_SET_TIMEOUT) - queue_delayed_work(system_power_efficient_wq, &priv->gc_work, - nft_set_gc_interval(set)); + if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_LEN, htonl(priv->len))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_SEED, htonl(priv->seed))) + goto nla_put_failure; + if (priv->offset != 0) + if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) + goto nla_put_failure; return 0; -} -static void nft_hash_elem_destroy(void *ptr, void *arg) -{ - nft_set_elem_destroy((const struct nft_set *)arg, ptr); +nla_put_failure: + return -1; } -static void nft_hash_destroy(const struct nft_set *set) -{ - struct nft_hash *priv = nft_set_priv(set); - - cancel_delayed_work_sync(&priv->gc_work); - rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy, - (void *)set); -} - -static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, - struct nft_set_estimate *est) -{ - unsigned int esize; - - esize = sizeof(struct nft_hash_elem); - if (desc->size) { - est->size = sizeof(struct nft_hash) + - roundup_pow_of_two(desc->size * 4 / 3) * - sizeof(struct nft_hash_elem *) + - desc->size * esize; - } else { - /* Resizing happens when the load drops below 30% or goes 
- * above 75%. The average of 52.5% load (approximated by 50%) - * is used for the size estimation of the hash buckets, - * meaning we calculate two buckets per element. - */ - est->size = esize + 2 * sizeof(struct nft_hash_elem *); - } - - est->class = NFT_SET_CLASS_O_1; - - return true; -} - -static struct nft_set_ops nft_hash_ops __read_mostly = { - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, +static struct nft_expr_type nft_hash_type; +static const struct nft_expr_ops nft_hash_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_hash)), + .eval = nft_hash_eval, .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .update = nft_hash_update, - .walk = nft_hash_walk, - .features = NFT_SET_MAP | NFT_SET_TIMEOUT, + .dump = nft_hash_dump, +}; + +static struct nft_expr_type nft_hash_type __read_mostly = { + .name = "hash", + .ops = &nft_hash_ops, + .policy = nft_hash_policy, + .maxattr = NFTA_HASH_MAX, .owner = THIS_MODULE, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_ops); + return nft_register_expr(&nft_hash_type); } static void __exit nft_hash_module_exit(void) { - nft_unregister_set(&nft_hash_ops); + nft_unregister_expr(&nft_hash_type); } module_init(nft_hash_module_init); module_exit(nft_hash_module_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_SET(); +MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); +MODULE_ALIAS_NFT_EXPR("hash"); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index db3b746858e3..d17018ff54e6 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -53,6 +53,10 @@ static int nft_immediate_init(const struct nft_ctx *ctx, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; + + if (desc.len > U8_MAX) + return -ERANGE; + priv->dlen = desc.len; priv->dreg = nft_parse_register(tb[NFTA_IMMEDIATE_DREG]); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 24a73bb26e94..1b01404bb33f 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -58,8 +58,11 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_LEVEL] != NULL && tb[NFTA_LOG_GROUP] != NULL) return -EINVAL; - if (tb[NFTA_LOG_GROUP] != NULL) + if (tb[NFTA_LOG_GROUP] != NULL) { li->type = NF_LOG_TYPE_ULOG; + if (tb[NFTA_LOG_FLAGS] != NULL) + return -EINVAL; + } nla = tb[NFTA_LOG_PREFIX]; if (nla != NULL) { @@ -87,6 +90,10 @@ static int nft_log_init(const struct nft_ctx *ctx, if (tb[NFTA_LOG_FLAGS] != NULL) { li->u.log.logflags = ntohl(nla_get_be32(tb[NFTA_LOG_FLAGS])); + if (li->u.log.logflags & ~NF_LOG_MASK) { + err = -EINVAL; + goto err1; + } } break; case NF_LOG_TYPE_ULOG: diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index e164325d1bc0..8166b6994cc7 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -43,7 +43,7 @@ static void nft_lookup_eval(const struct nft_expr *expr, return; } - if (found && set->flags & NFT_SET_MAP) + if (set->flags & NFT_SET_MAP) nft_data_copy(®s->data[priv->dreg], nft_set_ext_data(ext), set->dlen); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8a6bc7630912..6c1e0246706e 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -52,6 +52,8 @@ void nft_meta_get_eval(const 
struct nft_expr *expr, *dest = pkt->pf; break; case NFT_META_L4PROTO: + if (!pkt->tprot_set) + goto err; *dest = pkt->tprot; break; case NFT_META_PRIORITY: diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c new file mode 100644 index 000000000000..55bc5ab78d4a --- /dev/null +++ b/net/netfilter/nft_numgen.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2016 Laura Garcia <nevola@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <linux/static_key.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> + +static DEFINE_PER_CPU(struct rnd_state, nft_numgen_prandom_state); + +struct nft_ng_inc { + enum nft_registers dreg:8; + u32 modulus; + atomic_t counter; + u32 offset; +}; + +static void nft_ng_inc_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + u32 nval, oval; + + do { + oval = atomic_read(&priv->counter); + nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; + } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); + + regs->data[priv->dreg] = nval + priv->offset; +} + +static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { + [NFTA_NG_DREG] = { .type = NLA_U32 }, + [NFTA_NG_MODULUS] = { .type = NLA_U32 }, + [NFTA_NG_TYPE] = { .type = NLA_U32 }, + [NFTA_NG_OFFSET] = { .type = NLA_U32 }, +}; + +static int nft_ng_inc_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) + return -ERANGE; + + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); + atomic_set(&priv->counter, 0); + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, + u32 modulus, enum nft_ng_types type, u32 offset) +{ + if (nft_dump_register(skb, NFTA_NG_DREG, dreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NG_MODULUS, htonl(modulus))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NG_TYPE, htonl(type))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_NG_OFFSET, htonl(offset))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_INCREMENTAL, + priv->offset); +} + +struct nft_ng_random { + enum nft_registers dreg:8; + u32 modulus; + u32 offset; +}; + +static void nft_ng_random_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + struct rnd_state *state = this_cpu_ptr(&nft_numgen_prandom_state); + u32 val; + + val = reciprocal_scale(prandom_u32_state(state), priv->modulus); + regs->data[priv->dreg] = val + priv->offset; +} + +static int 
nft_ng_random_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_random *priv = nft_expr_priv(expr); + + if (tb[NFTA_NG_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_NG_OFFSET])); + + priv->modulus = ntohl(nla_get_be32(tb[NFTA_NG_MODULUS])); + if (priv->modulus == 0) + return -ERANGE; + + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + + prandom_init_once(&nft_numgen_prandom_state); + + priv->dreg = nft_parse_register(tb[NFTA_NG_DREG]); + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_ng_random *priv = nft_expr_priv(expr); + + return nft_ng_dump(skb, priv->dreg, priv->modulus, NFT_NG_RANDOM, + priv->offset); +} + +static struct nft_expr_type nft_ng_type; +static const struct nft_expr_ops nft_ng_inc_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), + .eval = nft_ng_inc_eval, + .init = nft_ng_inc_init, + .dump = nft_ng_inc_dump, +}; + +static const struct nft_expr_ops nft_ng_random_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), + .eval = nft_ng_random_eval, + .init = nft_ng_random_init, + .dump = nft_ng_random_dump, +}; + +static const struct nft_expr_ops * +nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) +{ + u32 type; + + if (!tb[NFTA_NG_DREG] || + !tb[NFTA_NG_MODULUS] || + !tb[NFTA_NG_TYPE]) + return ERR_PTR(-EINVAL); + + type = ntohl(nla_get_be32(tb[NFTA_NG_TYPE])); + + switch (type) { + case NFT_NG_INCREMENTAL: + return &nft_ng_inc_ops; + case NFT_NG_RANDOM: + return &nft_ng_random_ops; + } + + return ERR_PTR(-EINVAL); +} + +static struct nft_expr_type nft_ng_type __read_mostly = { + .name = "numgen", + .select_ops = &nft_ng_select_ops, + .policy = nft_ng_policy, + .maxattr = NFTA_NG_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_ng_module_init(void) +{ + return nft_register_expr(&nft_ng_type); +} + +static void __exit nft_ng_module_exit(void) +{ + nft_unregister_expr(&nft_ng_type); +} + +module_init(nft_ng_module_init); +module_exit(nft_ng_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laura Garcia <nevola@gmail.com>"); +MODULE_ALIAS_NFT_EXPR("numgen"); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 12cd4bf16d17..b2f88617611a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -92,6 +92,8 @@ static void nft_payload_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!pkt->tprot_set) + goto err; offset = pkt->xt.thoff; break; default: @@ -184,6 +186,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!pkt->tprot_set) + goto err; offset = pkt->xt.thoff; break; default: diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index 61d216eb7917..393d359a1889 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -22,9 +22,10 @@ static u32 jhash_initval __read_mostly; struct nft_queue { - u16 queuenum; - u16 queues_total; - u16 flags; + enum nft_registers sreg_qnum:8; + u16 queuenum; + u16 queues_total; + u16 flags; }; static void nft_queue_eval(const struct nft_expr *expr, @@ -54,31 +55,78 @@ static void nft_queue_eval(const struct nft_expr *expr, regs->verdict.code = ret; } 
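[Editorial note on the queue verdict encoding used above and in the register-based variant that follows; a minimal sketch, with the constants assumed from the uapi netfilter verdict layout rather than pulled from the header.]

Both eval paths of the queue expression end by packing the verdict the same way: NF_QUEUE sits in the low verdict bits, the target queue number in the upper 16 bits, and NFT_QUEUE_FLAG_BYPASS sets the bypass bit so packets are accepted instead of dropped when no userspace program is bound to the queue.

#include <stdint.h>
#include <stdio.h>

/* Verdict layout as in include/uapi/linux/netfilter.h (values assumed). */
#define NF_QUEUE			3
#define NF_VERDICT_MASK			0x000000ff
#define NF_VERDICT_FLAG_QUEUE_BYPASS	0x00008000
#define NF_VERDICT_QBITS		16
#define NF_VERDICT_QMASK		0xffff0000
#define NF_QUEUE_NR(x) ((((x) << NF_VERDICT_QBITS) & NF_VERDICT_QMASK) | NF_QUEUE)

int main(void)
{
	uint32_t queue = 42;	/* from priv->queuenum, or a source register */
	uint32_t verdict = NF_QUEUE_NR(queue) | NF_VERDICT_FLAG_QUEUE_BYPASS;

	/* The hook infrastructure later takes the word apart again: */
	printf("verdict %u, queue %u, bypass %s\n",
	       verdict & NF_VERDICT_MASK,	/* 3, i.e. NF_QUEUE */
	       verdict >> NF_VERDICT_QBITS,	/* 42 */
	       (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS) ? "yes" : "no");
	return 0;
}

The register-based variant added below reuses exactly this encoding with the queue number read from sreg_qnum; it rejects NFT_QUEUE_FLAG_CPU_FANOUT because a single register-supplied number leaves no queue range to fan out over.
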
+static void nft_queue_sreg_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_queue *priv = nft_expr_priv(expr); + u32 queue, ret; + + queue = regs->data[priv->sreg_qnum]; + + ret = NF_QUEUE_NR(queue); + if (priv->flags & NFT_QUEUE_FLAG_BYPASS) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + + regs->verdict.code = ret; +} + static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = { [NFTA_QUEUE_NUM] = { .type = NLA_U16 }, [NFTA_QUEUE_TOTAL] = { .type = NLA_U16 }, [NFTA_QUEUE_FLAGS] = { .type = NLA_U16 }, + [NFTA_QUEUE_SREG_QNUM] = { .type = NLA_U32 }, }; static int nft_queue_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_queue *priv = nft_expr_priv(expr); + u32 maxid; - if (tb[NFTA_QUEUE_NUM] == NULL) - return -EINVAL; - - init_hashrandom(&jhash_initval); priv->queuenum = ntohs(nla_get_be16(tb[NFTA_QUEUE_NUM])); - if (tb[NFTA_QUEUE_TOTAL] != NULL) + if (tb[NFTA_QUEUE_TOTAL]) priv->queues_total = ntohs(nla_get_be16(tb[NFTA_QUEUE_TOTAL])); - if (tb[NFTA_QUEUE_FLAGS] != NULL) { + else + priv->queues_total = 1; + + if (priv->queues_total == 0) + return -EINVAL; + + maxid = priv->queues_total - 1 + priv->queuenum; + if (maxid > U16_MAX) + return -ERANGE; + + if (tb[NFTA_QUEUE_FLAGS]) { + priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); + if (priv->flags & ~NFT_QUEUE_FLAG_MASK) + return -EINVAL; + } + return 0; +} + +static int nft_queue_sreg_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_queue *priv = nft_expr_priv(expr); + int err; + + priv->sreg_qnum = nft_parse_register(tb[NFTA_QUEUE_SREG_QNUM]); + err = nft_validate_register_load(priv->sreg_qnum, sizeof(u32)); + if (err < 0) + return err; + + if (tb[NFTA_QUEUE_FLAGS]) { priv->flags = ntohs(nla_get_be16(tb[NFTA_QUEUE_FLAGS])); if (priv->flags & ~NFT_QUEUE_FLAG_MASK) return -EINVAL; + if (priv->flags & NFT_QUEUE_FLAG_CPU_FANOUT) + return -EOPNOTSUPP; } + return 0; } @@ -97,6 +145,21 @@ nla_put_failure: return -1; } +static int +nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_queue *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_QUEUE_SREG_QNUM, priv->sreg_qnum) || + nla_put_be16(skb, NFTA_QUEUE_FLAGS, htons(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_queue_type; static const struct nft_expr_ops nft_queue_ops = { .type = &nft_queue_type, @@ -106,9 +169,35 @@ static const struct nft_expr_ops nft_queue_ops = { .dump = nft_queue_dump, }; +static const struct nft_expr_ops nft_queue_sreg_ops = { + .type = &nft_queue_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_queue)), + .eval = nft_queue_sreg_eval, + .init = nft_queue_sreg_init, + .dump = nft_queue_sreg_dump, +}; + +static const struct nft_expr_ops * +nft_queue_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_QUEUE_NUM] && tb[NFTA_QUEUE_SREG_QNUM]) + return ERR_PTR(-EINVAL); + + init_hashrandom(&jhash_initval); + + if (tb[NFTA_QUEUE_NUM]) + return &nft_queue_ops; + + if (tb[NFTA_QUEUE_SREG_QNUM]) + return &nft_queue_sreg_ops; + + return ERR_PTR(-EINVAL); +} + static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", - .ops = &nft_queue_ops, + .select_ops = &nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = 
NFTA_QUEUE_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c new file mode 100644 index 000000000000..c00104c07095 --- /dev/null +++ b/net/netfilter/nft_quota.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/atomic.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +struct nft_quota { + u64 quota; + bool invert; + atomic64_t remain; +}; + +static inline bool nft_overquota(struct nft_quota *priv, + const struct nft_pktinfo *pkt) +{ + return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0; +} + +static void nft_quota_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_quota *priv = nft_expr_priv(expr); + + if (nft_overquota(priv, pkt) ^ priv->invert) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = { + [NFTA_QUOTA_BYTES] = { .type = NLA_U64 }, + [NFTA_QUOTA_FLAGS] = { .type = NLA_U32 }, +}; + +static int nft_quota_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_quota *priv = nft_expr_priv(expr); + u32 flags = 0; + u64 quota; + + if (!tb[NFTA_QUOTA_BYTES]) + return -EINVAL; + + quota = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_BYTES])); + if (quota > S64_MAX) + return -EOVERFLOW; + + if (tb[NFTA_QUOTA_FLAGS]) { + flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS])); + if (flags & ~NFT_QUOTA_F_INV) + return -EINVAL; + } + + priv->quota = quota; + priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false; + atomic64_set(&priv->remain, quota); + + return 0; +} + +static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_quota *priv = nft_expr_priv(expr); + u32 flags = priv->invert ? 
NFT_QUOTA_F_INV : 0; + + if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota), + NFTA_QUOTA_PAD) || + nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_quota_type; +static const struct nft_expr_ops nft_quota_ops = { + .type = &nft_quota_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_quota)), + .eval = nft_quota_eval, + .init = nft_quota_init, + .dump = nft_quota_dump, +}; + +static struct nft_expr_type nft_quota_type __read_mostly = { + .name = "quota", + .ops = &nft_quota_ops, + .policy = nft_quota_policy, + .maxattr = NFTA_QUOTA_MAX, + .flags = NFT_EXPR_STATEFUL, + .owner = THIS_MODULE, +}; + +static int __init nft_quota_module_init(void) +{ + return nft_register_expr(&nft_quota_type); +} + +static void __exit nft_quota_module_exit(void) +{ + nft_unregister_expr(&nft_quota_type); +} + +module_init(nft_quota_module_init); +module_exit(nft_quota_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_EXPR("quota"); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c new file mode 100644 index 000000000000..c6d5358482d1 --- /dev/null +++ b/net/netfilter/nft_range.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> + +struct nft_range_expr { + struct nft_data data_from; + struct nft_data data_to; + enum nft_registers sreg:8; + u8 len; + enum nft_range_ops op:8; +}; + +static void nft_range_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + bool mismatch; + int d1, d2; + + d1 = memcmp(®s->data[priv->sreg], &priv->data_from, priv->len); + d2 = memcmp(®s->data[priv->sreg], &priv->data_to, priv->len); + switch (priv->op) { + case NFT_RANGE_EQ: + mismatch = (d1 < 0 || d2 > 0); + break; + case NFT_RANGE_NEQ: + mismatch = (d1 >= 0 && d2 <= 0); + break; + } + + if (mismatch) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_range_policy[NFTA_RANGE_MAX + 1] = { + [NFTA_RANGE_SREG] = { .type = NLA_U32 }, + [NFTA_RANGE_OP] = { .type = NLA_U32 }, + [NFTA_RANGE_FROM_DATA] = { .type = NLA_NESTED }, + [NFTA_RANGE_TO_DATA] = { .type = NLA_NESTED }, +}; + +static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_range_expr *priv = nft_expr_priv(expr); + struct nft_data_desc desc_from, desc_to; + int err; + + err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), + &desc_from, tb[NFTA_RANGE_FROM_DATA]); + if (err < 0) + return err; + + err = nft_data_init(NULL, &priv->data_to, sizeof(priv->data_to), + &desc_to, tb[NFTA_RANGE_TO_DATA]); + if (err < 0) + goto err1; + + if (desc_from.len != desc_to.len) { + err = -EINVAL; + goto err2; + } + + priv->sreg = nft_parse_register(tb[NFTA_RANGE_SREG]); + err = nft_validate_register_load(priv->sreg, desc_from.len); + if (err < 0) + goto err2; + + priv->op = 
ntohl(nla_get_be32(tb[NFTA_RANGE_OP])); + priv->len = desc_from.len; + return 0; +err2: + nft_data_uninit(&priv->data_to, desc_to.type); +err1: + nft_data_uninit(&priv->data_from, desc_from.type); + return err; +} + +static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_range_expr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_RANGE_SREG, priv->sreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_RANGE_OP, htonl(priv->op))) + goto nla_put_failure; + + if (nft_data_dump(skb, NFTA_RANGE_FROM_DATA, &priv->data_from, + NFT_DATA_VALUE, priv->len) < 0 || + nft_data_dump(skb, NFTA_RANGE_TO_DATA, &priv->data_to, + NFT_DATA_VALUE, priv->len) < 0) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_range_type; +static const struct nft_expr_ops nft_range_ops = { + .type = &nft_range_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_range_expr)), + .eval = nft_range_eval, + .init = nft_range_init, + .dump = nft_range_dump, +}; + +static struct nft_expr_type nft_range_type __read_mostly = { + .name = "range", + .ops = &nft_range_ops, + .policy = nft_range_policy, + .maxattr = NFTA_RANGE_MAX, + .owner = THIS_MODULE, +}; + +int __init nft_range_module_init(void) +{ + return nft_register_expr(&nft_range_type); +} + +void nft_range_module_exit(void) +{ + nft_unregister_expr(&nft_range_type); +} diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c new file mode 100644 index 000000000000..3794cb2fc788 --- /dev/null +++ b/net/netfilter/nft_set_hash.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/log2.h> +#include <linux/jhash.h> +#include <linux/netlink.h> +#include <linux/workqueue.h> +#include <linux/rhashtable.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> + +/* We target a hash table size of 4, element hint is 75% of final size */ +#define NFT_HASH_ELEMENT_HINT 3 + +struct nft_hash { + struct rhashtable ht; + struct delayed_work gc_work; +}; + +struct nft_hash_elem { + struct rhash_head node; + struct nft_set_ext ext; +}; + +struct nft_hash_cmp_arg { + const struct nft_set *set; + const u32 *key; + u8 genmask; +}; + +static const struct rhashtable_params nft_hash_params; + +static inline u32 nft_hash_key(const void *data, u32 len, u32 seed) +{ + const struct nft_hash_cmp_arg *arg = data; + + return jhash(arg->key, len, seed); +} + +static inline u32 nft_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct nft_hash_elem *he = data; + + return jhash(nft_set_ext_key(&he->ext), len, seed); +} + +static inline int nft_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct nft_hash_cmp_arg *x = arg->key; + const struct nft_hash_elem *he = ptr; + + if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) + return 1; + if (nft_set_elem_expired(&he->ext)) + return 1; + if (!nft_set_elem_active(&he->ext, x->genmask)) + return 1; + return 0; +} + +static bool nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + const struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_cur(net), + .set = set, + .key = key, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + *ext = &he->ext; + + return !!he; +} + +static bool nft_hash_update(struct nft_set *set, const u32 *key, + void *(*new)(struct nft_set *, + const struct nft_expr *, + struct nft_regs *regs), + const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = NFT_GENMASK_ANY, + .set = set, + .key = key, + }; + + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) + goto out; + + he = new(set, expr, regs); + if (he == NULL) + goto err1; + if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params)) + goto err2; +out: + *ext = &he->ext; + return true; + +err2: + nft_set_elem_destroy(set, he); +err1: + return false; +} + +static int nft_hash_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he = elem->priv; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_next(net), + .set = set, + .key = elem->key.val.data, + }; + struct nft_hash_elem *prev; + + prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); + if (IS_ERR(prev)) + return PTR_ERR(prev); + if (prev) { + *ext = &prev->ext; + return -EEXIST; + } + return 0; +} + +static void nft_hash_activate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash_elem *he = elem->priv; + + 
nft_set_elem_change_active(net, set, &he->ext); + nft_set_elem_clear_busy(&he->ext); +} + +static void *nft_hash_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct nft_hash_cmp_arg arg = { + .genmask = nft_genmask_next(net), + .set = set, + .key = elem->key.val.data, + }; + + rcu_read_lock(); + he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params); + if (he != NULL) { + if (!nft_set_elem_mark_busy(&he->ext) || + !nft_is_active(net, &he->ext)) + nft_set_elem_change_active(net, set, &he->ext); + else + he = NULL; + } + rcu_read_unlock(); + + return he; +} + +static void nft_hash_remove(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he = elem->priv; + + rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); +} + +static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_hash *priv = nft_set_priv(set); + struct nft_hash_elem *he; + struct rhashtable_iter hti; + struct nft_set_elem elem; + int err; + + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); + iter->err = err; + if (err) + return; + + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) { + iter->err = err; + goto out; + } + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + err = PTR_ERR(he); + if (err != -EAGAIN) { + iter->err = err; + goto out; + } + + continue; + } + + if (iter->count < iter->skip) + goto cont; + if (nft_set_elem_expired(&he->ext)) + goto cont; + if (!nft_set_elem_active(&he->ext, iter->genmask)) + goto cont; + + elem.priv = he; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + goto out; + +cont: + iter->count++; + } + +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); +} + +static void nft_hash_gc(struct work_struct *work) +{ + struct nft_set *set; + struct nft_hash_elem *he; + struct nft_hash *priv; + struct nft_set_gc_batch *gcb = NULL; + struct rhashtable_iter hti; + int err; + + priv = container_of(work, struct nft_hash, gc_work.work); + set = nft_set_container_of(priv); + + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); + if (err) + goto schedule; + + err = rhashtable_walk_start(&hti); + if (err && err != -EAGAIN) + goto out; + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) + goto out; + continue; + } + + if (!nft_set_elem_expired(&he->ext)) + continue; + if (nft_set_elem_mark_busy(&he->ext)) + continue; + + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb == NULL) + goto out; + rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params); + atomic_dec(&set->nelems); + nft_set_gc_batch_add(gcb, he); + } +out: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + nft_set_gc_batch_complete(gcb); +schedule: + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); +} + +static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) +{ + return sizeof(struct nft_hash); +} + +static const struct rhashtable_params nft_hash_params = { + .head_offset = offsetof(struct nft_hash_elem, node), + .hashfn = nft_hash_key, + .obj_hashfn = nft_hash_obj, + .obj_cmpfn = nft_hash_cmp, + .automatic_shrinking = true, +}; + +static int nft_hash_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * 
const tb[]) +{ + struct nft_hash *priv = nft_set_priv(set); + struct rhashtable_params params = nft_hash_params; + int err; + + params.nelem_hint = desc->size ?: NFT_HASH_ELEMENT_HINT; + params.key_len = set->klen; + + err = rhashtable_init(&priv->ht, ¶ms); + if (err < 0) + return err; + + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_hash_gc); + if (set->flags & NFT_SET_TIMEOUT) + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + return 0; +} + +static void nft_hash_elem_destroy(void *ptr, void *arg) +{ + nft_set_elem_destroy((const struct nft_set *)arg, ptr); +} + +static void nft_hash_destroy(const struct nft_set *set) +{ + struct nft_hash *priv = nft_set_priv(set); + + cancel_delayed_work_sync(&priv->gc_work); + rhashtable_free_and_destroy(&priv->ht, nft_hash_elem_destroy, + (void *)set); +} + +static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned int esize; + + esize = sizeof(struct nft_hash_elem); + if (desc->size) { + est->size = sizeof(struct nft_hash) + + roundup_pow_of_two(desc->size * 4 / 3) * + sizeof(struct nft_hash_elem *) + + desc->size * esize; + } else { + /* Resizing happens when the load drops below 30% or goes + * above 75%. The average of 52.5% load (approximated by 50%) + * is used for the size estimation of the hash buckets, + * meaning we calculate two buckets per element. + */ + est->size = esize + 2 * sizeof(struct nft_hash_elem *); + } + + est->class = NFT_SET_CLASS_O_1; + + return true; +} + +static struct nft_set_ops nft_hash_ops __read_mostly = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .update = nft_hash_update, + .walk = nft_hash_walk, + .features = NFT_SET_MAP | NFT_SET_TIMEOUT, + .owner = THIS_MODULE, +}; + +static int __init nft_hash_module_init(void) +{ + return nft_register_set(&nft_hash_ops); +} + +static void __exit nft_hash_module_exit(void) +{ + nft_unregister_set(&nft_hash_ops); +} + +module_init(nft_hash_module_init); +module_exit(nft_hash_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_set_rbtree.c index ffe9ae062d23..38b5bda242f8 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -96,7 +96,8 @@ out: } static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, - struct nft_rbtree_elem *new) + struct nft_rbtree_elem *new, + struct nft_set_ext **ext) { struct nft_rbtree *priv = nft_set_priv(set); u8 genmask = nft_genmask_next(net); @@ -124,8 +125,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, else if (!nft_rbtree_interval_end(rbe) && nft_rbtree_interval_end(new)) p = &parent->rb_right; - else + else { + *ext = &rbe->ext; return -EEXIST; + } } } } @@ -135,13 +138,14 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) + const struct nft_set_elem *elem, + struct nft_set_ext **ext) { struct nft_rbtree_elem *rbe = elem->priv; int err; spin_lock_bh(&nft_rbtree_lock); - err = __nft_rbtree_insert(net, 
set, rbe); + err = __nft_rbtree_insert(net, set, rbe, ext); spin_unlock_bh(&nft_rbtree_lock); return err; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 515131f9e021..dbd6c4a12b97 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -24,7 +24,6 @@ static DEFINE_MUTEX(xt_rateest_mutex); #define RATEEST_HSIZE 16 static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; static unsigned int jhash_rnd __read_mostly; -static bool rnd_inited __read_mostly; static unsigned int xt_rateest_hash(const char *name) { @@ -99,10 +98,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; - if (unlikely(!rnd_inited)) { - get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); - rnd_inited = true; - } + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); est = xt_rateest_lookup(info->name); if (est) { diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e118397254af..872db2d0e2a9 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -110,18 +110,14 @@ tcpmss_mangle_packet(struct sk_buff *skb, if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = par->net; unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); + unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); - if (dst_mtu(skb_dst(skb)) <= minlen) { + if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", - dst_mtu(skb_dst(skb))); + min_mtu); return -1; } - if (in_mtu <= minlen) { - net_err_ratelimited("unknown or invalid path-MTU (%u)\n", - in_mtu); - return -1; - } - newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen; + newmss = min_mtu - minlen; } else newmss = info->mss; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 6e57a3966dc5..0471db4032c5 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -89,6 +89,8 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { + int ret; + if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -101,7 +103,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->notifier.notifier_call = tee_netdev_event; info->priv = priv; - register_netdevice_notifier(&priv->notifier); + ret = register_netdevice_notifier(&priv->notifier); + if (ret) { + kfree(priv); + return ret; + } } else info->priv = NULL; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 99bbc829868d..b6dc322593a3 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -366,14 +366,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) unsigned int i; int ret; - if (unlikely(!connlimit_rnd)) { - u_int32_t rand; + net_get_random_once(&connlimit_rnd, sizeof(connlimit_rnd)); - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&connlimit_rnd, 0, rand); - } ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) { pr_info("cannot load conntrack support for " diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 188404b9b002..a3b8f697cfc5 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -233,10 +233,8 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, return false; if (info->match_flags & XT_CONNTRACK_EXPIRES) { - unsigned long expires = 0; + unsigned long expires = nf_ct_expires(ct) / HZ; - if (timer_pending(&ct->timeout)) - expires = (ct->timeout.expires - jiffies) / HZ; if ((expires >= info->expires_min && expires <= 
info->expires_max) ^ !(info->invert_flags & XT_CONNTRACK_EXPIRES)) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 178696852bde..44a095ecc7b7 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) } /* need to declare this at the top */ +static const struct file_operations dl_file_ops_v1; static const struct file_operations dl_file_ops; /* hash table crap */ @@ -86,8 +87,8 @@ struct dsthash_ent { unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ - u_int32_t credit; - u_int32_t credit_cap, cost; + u_int64_t credit; + u_int64_t credit_cap, cost; } rateinfo; struct rcu_head rcu; }; @@ -98,7 +99,7 @@ struct xt_hashlimit_htable { u_int8_t family; bool rnd_initialized; - struct hashlimit_cfg1 cfg; /* config */ + struct hashlimit_cfg2 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ @@ -114,6 +115,30 @@ struct xt_hashlimit_htable { struct hlist_head hash[0]; /* hashtable itself */ }; +static int +cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) +{ + if (revision == 1) { + struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; + + to->mode = cfg->mode; + to->avg = cfg->avg; + to->burst = cfg->burst; + to->size = cfg->size; + to->max = cfg->max; + to->gc_interval = cfg->gc_interval; + to->expire = cfg->expire; + to->srcmask = cfg->srcmask; + to->dstmask = cfg->dstmask; + } else if (revision == 2) { + memcpy(to, from, sizeof(struct hashlimit_cfg2)); + } else { + return -EINVAL; + } + + return 0; +} + static DEFINE_MUTEX(hashlimit_mutex); /* protects htables list */ static struct kmem_cache *hashlimit_cachep __read_mostly; @@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, - u_int8_t family) +static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, + const char *name, u_int8_t family, + struct xt_hashlimit_htable **out_hinfo, + int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; - unsigned int size; - unsigned int i; + unsigned int size, i; + int ret; - if (minfo->cfg.size) { - size = minfo->cfg.size; + if (cfg->size) { + size = cfg->size; } else { size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); @@ -238,10 +265,14 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, sizeof(struct list_head) * size); if (hinfo == NULL) return -ENOMEM; - minfo->hinfo = hinfo; + *out_hinfo = hinfo; /* copy match config into hashtable config */ - memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2); + + if (ret) + return ret; + hinfo->cfg.size = size; if (hinfo->cfg.max == 0) hinfo->cfg.max = 8 * hinfo->cfg.size; @@ -255,17 +286,18 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo, hinfo->count = 0; hinfo->family = family; hinfo->rnd_initialized = false; - hinfo->name = kstrdup(minfo->name, GFP_KERNEL); + hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { vfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); - hinfo->pde = proc_create_data(minfo->name, 0, + hinfo->pde = proc_create_data(name, 0, (family == NFPROTO_IPV4) ? 
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - &dl_file_ops, hinfo); + (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops, + hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -398,7 +430,8 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. */ -#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) +#define MAX_CPJ_v1 (0xFFFFFFFF / (HZ*60*60*24)) +#define MAX_CPJ (0xFFFFFFFFFFFFFFFF / (HZ*60*60*24)) /* Repeated shift and or gives us all 1s, final shift and add 1 gives * us the power of 2 below the theoretical max, so GCC simply does a @@ -408,9 +441,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) #define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) #define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32)) #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) +#define POW2_BELOW64(x) ((_POW2_BELOW64(x)>>1) + 1) -#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) +#define CREDITS_PER_JIFFY POW2_BELOW64(MAX_CPJ) +#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1) /* in byte mode, the lowest possible rate is one packet/second. * credit_cap is used as a counter that tells us how many times we can @@ -425,14 +461,24 @@ static u32 xt_hashlimit_len_to_chunks(u32 len) } /* Precision saver. */ -static u32 user2credits(u32 user) +static u64 user2credits(u64 user, int revision) { - /* If multiplying would overflow... */ - if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) - /* Divide first. */ - return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + if (revision == 1) { + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY_v1)) + /* Divide first. */ + return (user / XT_HASHLIMIT_SCALE) *\ + HZ * CREDITS_PER_JIFFY_v1; + + return (user * HZ * CREDITS_PER_JIFFY_v1) \ + / XT_HASHLIMIT_SCALE; + } else { + if (user > 0xFFFFFFFFFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + return (user / XT_HASHLIMIT_SCALE_v2) *\ + HZ * CREDITS_PER_JIFFY; - return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE; + return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2; + } } static u32 user2credits_byte(u32 user) @@ -442,10 +488,11 @@ static u32 user2credits_byte(u32 user) return (u32) (us >> 32); } -static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) +static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, + u32 mode, int revision) { unsigned long delta = now - dh->rateinfo.prev; - u32 cap; + u64 cap, cpj; if (delta == 0) return; @@ -453,7 +500,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { - u32 tmp = dh->rateinfo.credit; + u64 tmp = dh->rateinfo.credit; dh->rateinfo.credit += CREDITS_PER_JIFFY_BYTES * delta; cap = CREDITS_PER_JIFFY_BYTES * HZ; if (tmp >= dh->rateinfo.credit) {/* overflow */ @@ -461,7 +508,9 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) return; } } else { - dh->rateinfo.credit += delta * CREDITS_PER_JIFFY; + cpj = (revision == 1) ? 
+ CREDITS_PER_JIFFY_v1 : CREDITS_PER_JIFFY; + dh->rateinfo.credit += delta * cpj; cap = dh->rateinfo.credit_cap; } if (dh->rateinfo.credit > cap) @@ -469,7 +518,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode) } static void rateinfo_init(struct dsthash_ent *dh, - struct xt_hashlimit_htable *hinfo) + struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { @@ -478,8 +527,8 @@ static void rateinfo_init(struct dsthash_ent *dh, dh->rateinfo.credit_cap = hinfo->cfg.burst; } else { dh->rateinfo.credit = user2credits(hinfo->cfg.avg * - hinfo->cfg.burst); - dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + hinfo->cfg.burst, revision); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg, revision); dh->rateinfo.credit_cap = dh->rateinfo.credit; } } @@ -603,15 +652,15 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) } static bool -hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, + struct xt_hashlimit_htable *hinfo, + const struct hashlimit_cfg2 *cfg, int revision) { - const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - struct xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; bool race = false; - u32 cost; + u64 cost; if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; @@ -626,18 +675,18 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) } else if (race) { /* Already got an entry, update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } else { dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_init(dh, hinfo); + rateinfo_init(dh, hinfo, revision); } } else { /* update expiration timeout */ dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); - rateinfo_recalc(dh, now, hinfo->cfg.mode); + rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } - if (info->cfg.mode & XT_HASHLIMIT_BYTES) + if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else cost = dh->rateinfo.cost; @@ -647,84 +696,157 @@ hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) dh->rateinfo.credit -= cost; spin_unlock(&dh->lock); rcu_read_unlock_bh(); - return !(info->cfg.mode & XT_HASHLIMIT_INVERT); + return !(cfg->mode & XT_HASHLIMIT_INVERT); } spin_unlock(&dh->lock); rcu_read_unlock_bh(); /* default match is underlimit - so over the limit, we need to invert */ - return info->cfg.mode & XT_HASHLIMIT_INVERT; + return cfg->mode & XT_HASHLIMIT_INVERT; hotdrop: par->hotdrop = true; return false; } -static int hashlimit_mt_check(const struct xt_mtchk_param *par) +static bool +hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_common(skb, par, hinfo, &cfg, 1); +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; + + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2); +} + 
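A note on the _POW2_BELOW* chain above: OR-ing a value with right-shifted copies of itself smears the highest set bit across every lower bit, so the final shift-and-add-one yields the largest power of two that does not exceed the input, entirely at constant-folding time. Below is a minimal userspace check of the same trick at the new 64-bit width; the macro names mirror the kernel's, and _POW2_BELOW2/_POW2_BELOW4 are restated from the unchanged context lines:

#include <stdio.h>
#include <stdint.h>

#define _POW2_BELOW2(x)  ((x)|((x)>>1))
#define _POW2_BELOW4(x)  (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
#define _POW2_BELOW8(x)  (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
#define _POW2_BELOW64(x) (_POW2_BELOW32(x)|_POW2_BELOW32((x)>>32))
#define POW2_BELOW64(x)  ((_POW2_BELOW64(x)>>1) + 1)

int main(void)
{
    /* 0x3e8 (1000) smears to 0x3ff; >>1 gives 0x1ff; +1 gives 0x200. */
    printf("%llu\n", (unsigned long long)POW2_BELOW64(1000ULL)); /* 512 */
    /* An exact power of two maps to itself. */
    printf("%llu\n", (unsigned long long)POW2_BELOW64(512ULL));  /* 512 */
    return 0;
}

Because CREDITS_PER_JIFFY ends up a power of two, the divisions it feeds compile down to shifts, which is what the "GCC simply does a shift" comment is getting at.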
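The cfg_copy() helper introduced earlier in this patch is the compatibility pivot for the whole change: revision 1 configs are widened field by field into the 64-bit hashlimit_cfg2 layout, while revision 2 configs are taken verbatim, so everything past checkentry only ever sees cfg2. A condensed, runnable sketch of that pattern (only two fields kept and the struct names shortened; the real structs also carry mode, size, masks and timers):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cfg1 { uint32_t avg; uint32_t burst; };  /* old 32-bit layout */
struct cfg2 { uint64_t avg; uint64_t burst; };  /* new 64-bit layout */

static int cfg_copy(struct cfg2 *to, const void *from, int revision)
{
    if (revision == 1) {
        const struct cfg1 *cfg = from;

        to->avg   = cfg->avg;   /* implicit u32 -> u64 widening */
        to->burst = cfg->burst;
    } else if (revision == 2) {
        memcpy(to, from, sizeof(*to));  /* layouts already match */
    } else {
        return -EINVAL;
    }
    return 0;
}

int main(void)
{
    struct cfg1 old = { .avg = 10000, .burst = 5 };
    struct cfg2 new;

    if (cfg_copy(&new, &old, 1) == 0)
        printf("%llu/%llu\n", (unsigned long long)new.avg,
               (unsigned long long)new.burst);
    return 0;
}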
+static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, + struct xt_hashlimit_htable **hinfo, + struct hashlimit_cfg2 *cfg, + const char *name, int revision) { struct net *net = par->net; - struct xt_hashlimit_mtinfo1 *info = par->matchinfo; int ret; - if (info->cfg.gc_interval == 0 || info->cfg.expire == 0) - return -EINVAL; - if (info->name[sizeof(info->name)-1] != '\0') + if (cfg->gc_interval == 0 || cfg->expire == 0) return -EINVAL; if (par->family == NFPROTO_IPV4) { - if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) + if (cfg->srcmask > 32 || cfg->dstmask > 32) return -EINVAL; } else { - if (info->cfg.srcmask > 128 || info->cfg.dstmask > 128) + if (cfg->srcmask > 128 || cfg->dstmask > 128) return -EINVAL; } - if (info->cfg.mode & ~XT_HASHLIMIT_ALL) { + if (cfg->mode & ~XT_HASHLIMIT_ALL) { pr_info("Unknown mode mask %X, kernel too old?\n", - info->cfg.mode); + cfg->mode); return -EINVAL; } /* Check for overflow. */ - if (info->cfg.mode & XT_HASHLIMIT_BYTES) { - if (user2credits_byte(info->cfg.avg) == 0) { - pr_info("overflow, rate too high: %u\n", info->cfg.avg); + if (cfg->mode & XT_HASHLIMIT_BYTES) { + if (user2credits_byte(cfg->avg) == 0) { + pr_info("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; } - } else if (info->cfg.burst == 0 || - user2credits(info->cfg.avg * info->cfg.burst) < - user2credits(info->cfg.avg)) { - pr_info("overflow, try lower: %u/%u\n", - info->cfg.avg, info->cfg.burst); + } else if (cfg->burst == 0 || + user2credits(cfg->avg * cfg->burst, revision) < + user2credits(cfg->avg, revision)) { + pr_info("overflow, try lower: %llu/%llu\n", + cfg->avg, cfg->burst); return -ERANGE; } mutex_lock(&hashlimit_mutex); - info->hinfo = htable_find_get(net, info->name, par->family); - if (info->hinfo == NULL) { - ret = htable_create(net, info, par->family); + *hinfo = htable_find_get(net, name, par->family); + if (*hinfo == NULL) { + ret = htable_create(net, cfg, name, par->family, + hinfo, revision); if (ret < 0) { mutex_unlock(&hashlimit_mutex); return ret; } } mutex_unlock(&hashlimit_mutex); + return 0; } -static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; + struct hashlimit_cfg2 cfg = {}; + int ret; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 1); + + if (ret) + return ret; + + return hashlimit_mt_check_common(par, &info->hinfo, + &cfg, info->name, 1); +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, + info->name, 2); +} + +static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; htable_put(info->hinfo); } +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + htable_put(info->hinfo); +} + static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV4, - .match = hashlimit_mt, + .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check_v1, + .destroy = hashlimit_mt_destroy_v1, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 2, + .family = 
NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -734,8 +856,18 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 1, .family = NFPROTO_IPV6, - .match = hashlimit_mt, + .match = hashlimit_mt_v1, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), + .checkentry = hashlimit_mt_check_v1, + .destroy = hashlimit_mt_destroy_v1, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 2, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -786,18 +918,12 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, - struct seq_file *s) +static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) { - const struct xt_hashlimit_htable *ht = s->private; - - spin_lock(&ent->lock); - /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies, ht->cfg.mode); - switch (family) { case NFPROTO_IPV4: - seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n", + seq_printf(s, "%ld %pI4:%u->%pI4:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip.src, ntohs(ent->dst.src_port), @@ -808,7 +934,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: - seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n", + seq_printf(s, "%ld %pI6:%u->%pI6:%u %llu %llu %llu\n", (long)(ent->expires - jiffies)/HZ, &ent->dst.ip6.src, ntohs(ent->dst.src_port), @@ -821,10 +947,52 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, default: BUG(); } +} + +static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 1); + + dl_seq_print(ent, family, s); + + spin_unlock(&ent->lock); + return seq_has_overflowed(s); +} + +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + + dl_seq_print(ent, family, s); + spin_unlock(&ent->lock); return seq_has_overflowed(s); } +static int dl_seq_show_v1(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, &htable->hash[*bucket], node) + if (dl_seq_real_show_v1(ent, htable->family, s)) + return -1; + } + return 0; +} + static int dl_seq_show(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; @@ -839,6 +1007,13 @@ static int dl_seq_show(struct seq_file *s, void *v) return 0; } +static const struct seq_operations dl_seq_ops_v1 = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show_v1 +}; + static const struct seq_operations dl_seq_ops = { .start = dl_seq_start, .next = dl_seq_next, @@ -846,17 +1021,37 @@ static const struct seq_operations dl_seq_ops = { .show = dl_seq_show }; 
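One more note on user2credits() above: the divide-first branch is an overflow guard, not a micro-optimisation. The naive user * HZ * CREDITS_PER_JIFFY product can wrap even at 64 bits, so whenever it would, the code divides by the scale first and accepts a small precision loss. A standalone sketch with stand-in constants (HZ is config-dependent, SCALE stands in for XT_HASHLIMIT_SCALE_v2, and CPJ approximates CREDITS_PER_JIFFY, which works out to roughly 2^39 at HZ=250):

#include <stdint.h>
#include <stdio.h>

#define HZ    250           /* assumed; set by kernel config */
#define SCALE 1000000ULL    /* stand-in for XT_HASHLIMIT_SCALE_v2 */
#define CPJ   (1ULL << 39)  /* stand-in for CREDITS_PER_JIFFY */

static uint64_t user2credits(uint64_t user)
{
    /* Divide first if the multiply would wrap, as the v2 code does;
     * otherwise keep full precision. */
    if (user > UINT64_MAX / (HZ * CPJ))
        return (user / SCALE) * HZ * CPJ;
    return (user * HZ * CPJ) / SCALE;
}

int main(void)
{
    /* Small value keeps full precision; large one divides first. */
    printf("%llu\n", (unsigned long long)user2credits(1000ULL));
    printf("%llu\n", (unsigned long long)user2credits(1000000000ULL));
    return 0;
}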
+static int dl_proc_open_v1(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops_v1); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE_DATA(inode); + } + return ret; +} + static int dl_proc_open(struct inode *inode, struct file *file) { int ret = seq_open(file, &dl_seq_ops); if (!ret) { struct seq_file *sf = file->private_data; + sf->private = PDE_DATA(inode); } return ret; } +static const struct file_operations dl_file_ops_v1 = { + .owner = THIS_MODULE, + .open = dl_proc_open_v1, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static const struct file_operations dl_file_ops = { .owner = THIS_MODULE, .open = dl_proc_open, diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 9f4ab00c8050..f679dd4c272a 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -41,7 +41,7 @@ helper_mt(const struct sk_buff *skb, struct xt_action_param *par) if (!master_help) return ret; - /* rcu_read_lock()ed by nf_hook_slow */ + /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(master_help->helper); if (!helper) return ret; @@ -65,7 +65,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par) par->family); return ret; } - info->name[29] = '\0'; + info->name[sizeof(info->name) - 1] = '\0'; return 0; } diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index e5f18988aee0..bb33598e4530 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -107,8 +107,8 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) info->invert & XT_PHYSDEV_OP_BRIDGED) && par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { - pr_info("using --physdev-out and --physdev-is-out are only" - "supported in the FORWARD and POSTROUTING chains with" + pr_info("using --physdev-out and --physdev-is-out are only " + "supported in the FORWARD and POSTROUTING chains with " "bridged traffic.\n"); if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return -EINVAL; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index d725a27743a1..e3b7a09b103e 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -110,7 +110,6 @@ static const struct file_operations recent_old_fops, recent_mt_fops; #endif static u_int32_t hash_rnd __read_mostly; -static bool hash_rnd_inited __read_mostly; static inline unsigned int recent_entry_hash4(const union nf_inet_addr *addr) { @@ -340,10 +339,8 @@ static int recent_mt_check(const struct xt_mtchk_param *par, int ret = -EINVAL; size_t sz; - if (unlikely(!hash_rnd_inited)) { - get_random_bytes(&hash_rnd, sizeof(hash_rnd)); - hash_rnd_inited = true; - } + net_get_random_once(&hash_rnd, sizeof(hash_rnd)); + if (info->check_set & ~XT_RECENT_VALID_FLAGS) { pr_info("Unsupported user space flags (%08x)\n", info->check_set); diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ef36a56a02c6..4dedb96d1a06 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -68,7 +68,7 @@ match_packet(const struct sk_buff *skb, ++i, offset, sch->type, htons(sch->length), sch->flags); #endif - offset += WORD_ROUND(ntohs(sch->length)); + offset += SCTP_PAD4(ntohs(sch->length)); pr_debug("skb->len: %d\toffset: %d\n", skb->len, offset); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 8dd836a8dd60..b2f0e986a6f4 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -63,43 +63,74 @@ out_nlmsg_trim: static int __netlink_diag_dump(struct 
sk_buff *skb, struct netlink_callback *cb, int protocol, int s_num) { + struct rhashtable_iter *hti = (void *)cb->args[2]; struct netlink_table *tbl = &nl_table[protocol]; - struct rhashtable *ht = &tbl->hash; - const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht); struct net *net = sock_net(skb->sk); struct netlink_diag_req *req; struct netlink_sock *nlsk; struct sock *sk; - int ret = 0, num = 0, i; + int num = 2; + int ret = 0; req = nlmsg_data(cb->nlh); - for (i = 0; i < htbl->size; i++) { - struct rhash_head *pos; + if (s_num > 1) + goto mc_list; - rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) { - sk = (struct sock *)nlsk; + num--; - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) { - num++; + if (!hti) { + hti = kmalloc(sizeof(*hti), GFP_KERNEL); + if (!hti) + return -ENOMEM; + + cb->args[2] = (long)hti; + } + + if (!s_num) + rhashtable_walk_enter(&tbl->hash, hti); + + ret = rhashtable_walk_start(hti); + if (ret == -EAGAIN) + ret = 0; + if (ret) + goto stop; + + while ((nlsk = rhashtable_walk_next(hti))) { + if (IS_ERR(nlsk)) { + ret = PTR_ERR(nlsk); + if (ret == -EAGAIN) { + ret = 0; continue; } + break; + } - if (sk_diag_fill(sk, skb, req, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - sock_i_ino(sk)) < 0) { - ret = 1; - goto done; - } + sk = (struct sock *)nlsk; - num++; + if (!net_eq(sock_net(sk), net)) + continue; + + if (sk_diag_fill(sk, skb, req, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + ret = 1; + break; } } +stop: + rhashtable_walk_stop(hti); + if (ret) + goto done; + + rhashtable_walk_exit(hti); + num++; + +mc_list: + read_lock(&nl_table_lock); sk_for_each_bound(sk, &tbl->mc_list) { if (sk_hashed(sk)) continue; @@ -116,13 +147,14 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, NLM_F_MULTI, sock_i_ino(sk)) < 0) { ret = 1; - goto done; + break; } num++; } + read_unlock(&nl_table_lock); + done: cb->args[0] = num; - cb->args[1] = protocol; return ret; } @@ -131,20 +163,20 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_diag_req *req; int s_num = cb->args[0]; + int err = 0; req = nlmsg_data(cb->nlh); - rcu_read_lock(); - read_lock(&nl_table_lock); - if (req->sdiag_protocol == NDIAG_PROTO_ALL) { int i; for (i = cb->args[1]; i < MAX_LINKS; i++) { - if (__netlink_diag_dump(skb, cb, i, s_num)) + err = __netlink_diag_dump(skb, cb, i, s_num); + if (err) break; s_num = 0; } + cb->args[1] = i; } else { if (req->sdiag_protocol >= MAX_LINKS) { read_unlock(&nl_table_lock); @@ -152,13 +184,22 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) return -ENOENT; } - __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); + err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } - read_unlock(&nl_table_lock); - rcu_read_unlock(); + return err < 0 ? 
err : skb->len; +} + +static int netlink_diag_dump_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *hti = (void *)cb->args[2]; + + if (cb->args[0] == 1) + rhashtable_walk_exit(hti); - return skb->len; + kfree(hti); + + return 0; } static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) @@ -172,6 +213,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = netlink_diag_dump, + .done = netlink_diag_dump_done, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a09132a69869..23cc12639ba7 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -977,7 +977,7 @@ static int genl_ctrl_event(int event, struct genl_family *family, return 0; } -static struct genl_ops genl_ctrl_ops[] = { +static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, .doit = ctrl_getfamily, @@ -986,7 +986,7 @@ static struct genl_ops genl_ctrl_ops[] = { }, }; -static struct genl_multicast_group genl_ctrl_groups[] = { +static const struct genl_multicast_group genl_ctrl_groups[] = { { .name = "notify", }, }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 1ecbd7715f6d..4e03f64709bc 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -71,6 +71,8 @@ struct ovs_frag_data { static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); #define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) struct action_fifo { int head; int tail; @@ -78,7 +80,12 @@ struct action_fifo { struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; }; +struct recirc_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + static struct action_fifo __percpu *action_fifos; +static struct recirc_keys __percpu *recirc_keys; static DEFINE_PER_CPU(int, exec_actions_level); static void action_fifo_init(struct action_fifo *fifo) @@ -153,7 +160,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { - __be32 *new_mpls_lse; + struct mpls_shim_hdr *new_mpls_lse; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. 
*/ if (skb->encapsulation) @@ -162,19 +169,23 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (skb_cow_head(skb, MPLS_HLEN) < 0) return -ENOMEM; + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb->mac_len); + skb_set_inner_protocol(skb, skb->protocol); + } + skb_push(skb, MPLS_HLEN); memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), skb->mac_len); skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); - new_mpls_lse = (__be32 *)skb_mpls_header(skb); - *new_mpls_lse = mpls->mpls_lse; + new_mpls_lse = mpls_hdr(skb); + new_mpls_lse->label_stack_entry = mpls->mpls_lse; skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); - if (!skb->inner_protocol) - skb_set_inner_protocol(skb, skb->protocol); skb->protocol = mpls->mpls_ethertype; invalidate_flow_key(key); @@ -191,18 +202,19 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, if (unlikely(err)) return err; - skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), skb->mac_len); __skb_pull(skb, MPLS_HLEN); skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); - /* skb_mpls_header() is used to locate the ethertype - * field correctly in the presence of VLAN tags. + /* mpls_hdr() is used to locate the ethertype field correctly in the + * presence of VLAN tags. */ - hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; @@ -214,7 +226,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { - __be32 *stack; + struct mpls_shim_hdr *stack; __be32 lse; int err; @@ -222,16 +234,16 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (unlikely(err)) return err; - stack = (__be32 *)skb_mpls_header(skb); - lse = OVS_MASKED(*stack, *mpls_lse, *mask); + stack = mpls_hdr(skb); + lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(*stack), lse }; + __be32 diff[] = { ~(stack->label_stack_entry), lse }; skb->csum = ~csum_partial((char *)diff, sizeof(diff), ~skb->csum); } - *stack = lse; + stack->label_stack_entry = lse; flow_key->mpls.top_lse = lse; return 0; } @@ -241,20 +253,24 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = 0; + } else { + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + } return err; } static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (skb_vlan_tag_present(skb)) + if (skb_vlan_tag_present(skb)) { invalidate_flow_key(key); - else - key->eth.tci = vlan->vlan_tci; + } else { + key->eth.vlan.tci = vlan->vlan_tci; + key->eth.vlan.tpid = vlan->vlan_tpid; + } return skb_vlan_push(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); } @@ -1011,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, const struct nlattr *a, int rem) { struct deferred_action *da; + int level; if (!is_flow_key_valid(key)) { int err; @@ 
-1034,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, return 0; } + level = this_cpu_read(exec_actions_level); + if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { + struct recirc_keys *rks = this_cpu_ptr(recirc_keys); + struct sw_flow_key *recirc_key = &rks->key[level - 1]; + + *recirc_key = *key; + recirc_key->recirc_id = nla_get_u32(a); + ovs_dp_process_packet(skb, recirc_key); + + return 0; + } + da = add_deferred_actions(skb, key, NULL); if (da) { da->pkt_key.recirc_id = nla_get_u32(a); @@ -1200,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key) { - static const int ovs_recursion_limit = 5; int err, level; level = __this_cpu_inc_return(exec_actions_level); - if (unlikely(level > ovs_recursion_limit)) { + if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); kfree_skb(skb); @@ -1229,10 +1257,17 @@ int action_fifos_init(void) if (!action_fifos) return -ENOMEM; + recirc_keys = alloc_percpu(struct recirc_keys); + if (!recirc_keys) { + free_percpu(action_fifos); + return -ENOMEM; + } + return 0; } void action_fifos_exit(void) { free_percpu(action_fifos); + free_percpu(recirc_keys); } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e054a748ff25..31045ef44a82 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1367,7 +1367,7 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) if (ct_info->helper) module_put(ct_info->helper->me); if (ct_info->ct) - nf_ct_put(ct_info->ct); + nf_ct_tmpl_free(ct_info->ct); } void ovs_ct_init(struct net *net) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 524c0fd3078e..4d67ea856067 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -928,7 +928,6 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct sw_flow_key key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -956,20 +955,24 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &new_flow->key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; - ovs_flow_mask_key(&new_flow->key, &key, true, &mask); - /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &key, log); + &new_flow->key, log); if (error) goto err_kfree_flow; + /* unmasked key is needed to match when ufid is not used. */ + if (ovs_identifier_is_key(&new_flow->id)) + match.key = new_flow->id.unmasked_key; + + ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + /* Validate actions. 
*/ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); @@ -996,7 +999,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &key); + flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1121,7 +1124,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, &mask); + ovs_match_init(&match, &key, true, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); } else if (!ufid_present) { @@ -1238,7 +1241,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { @@ -1297,7 +1300,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (unlikely(err)) @@ -2437,3 +2440,7 @@ module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 0ea128eeeab2..c8c82e109c68 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> +#include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> @@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); + int cpu = smp_processor_id(); int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - stats = rcu_dereference(flow->stats[node]); + stats = rcu_dereference(flow->stats[cpu]); - /* Check if already have node-specific stats. */ + /* Check if already have CPU-specific stats. */ if (likely(stats)) { spin_lock(&stats->lock); /* Mark if we write on the pre-allocated stats. */ - if (node == 0 && unlikely(flow->stats_last_writer != node)) - flow->stats_last_writer = node; + if (cpu == 0 && unlikely(flow->stats_last_writer != cpu)) + flow->stats_last_writer = cpu; } else { stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ spin_lock(&stats->lock); - /* If the current NUMA-node is the only writer on the + /* If the current CPU is the only writer on the * pre-allocated stats keep using them. */ - if (unlikely(flow->stats_last_writer != node)) { + if (unlikely(flow->stats_last_writer != cpu)) { /* A previous locker may have already allocated the - * stats, so we need to check again. If node-specific + * stats, so we need to check again. 
If CPU-specific * stats were already allocated, we update the pre- * allocated stats as we have already locked them. */ - if (likely(flow->stats_last_writer != NUMA_NO_NODE) - && likely(!rcu_access_pointer(flow->stats[node]))) { - /* Try to allocate node-specific stats. */ + if (likely(flow->stats_last_writer != -1) && + likely(!rcu_access_pointer(flow->stats[cpu]))) { + /* Try to allocate CPU-specific stats. */ struct flow_stats *new_stats; new_stats = @@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, new_stats->tcp_flags = tcp_flags; spin_lock_init(&new_stats->lock); - rcu_assign_pointer(flow->stats[node], + rcu_assign_pointer(flow->stats[cpu], new_stats); goto unlock; } } - flow->stats_last_writer = node; + flow->stats_last_writer = cpu; } } @@ -136,14 +138,15 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int node; + int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - for_each_node(node) { - struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { /* Local CPU may write on non-local stats, so we must @@ -163,10 +166,11 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex. */ void ovs_flow_stats_clear(struct sw_flow *flow) { - int node; + int cpu; - for_each_node(node) { - struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) { + struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { spin_lock_bh(&stats->lock); @@ -302,24 +306,57 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +/** + * Parse vlan tag from vlan header. + * Returns ERROR on memory error. + * Returns 0 if it encounters a non-vlan or incomplete packet. + * Returns 1 after successfully parsing vlan tag. 
+ */ +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh) { - struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ - __be16 tci; - }; - struct qtag_prefix *qp; + struct vlan_head *vh = (struct vlan_head *)skb->data; + + if (likely(!eth_type_vlan(vh->tpid))) + return 0; - if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) return 0; - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + - sizeof(__be16)))) + if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + + sizeof(__be16)))) return -ENOMEM; - qp = (struct qtag_prefix *) skb->data; - key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + vh = (struct vlan_head *)skb->data; + key_vh->tci = vh->tci | htons(VLAN_TAG_PRESENT); + key_vh->tpid = vh->tpid; + + __skb_pull(skb, sizeof(struct vlan_head)); + return 1; +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int res; + + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + key->eth.cvlan.tci = 0; + key->eth.cvlan.tpid = 0; + + if (likely(skb_vlan_tag_present(skb))) { + key->eth.vlan.tci = htons(skb->vlan_tci); + key->eth.vlan.tpid = skb->vlan_proto; + } else { + /* Parse outer vlan tag in the non-accelerated case. */ + res = parse_vlan_tag(skb, &key->eth.vlan); + if (res <= 0) + return res; + } + + /* Parse inner vlan tag. */ + res = parse_vlan_tag(skb, &key->eth.cvlan); + if (res <= 0) + return res; return 0; } @@ -480,12 +517,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) * update skb->csum here. */ - key->eth.tci = 0; - if (skb_vlan_tag_present(skb)) - key->eth.tci = htons(skb->vlan_tci); - else if (eth->h_proto == htons(ETH_P_8021Q)) - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; key->eth.type = parse_ethertype(skb); if (unlikely(key->eth.type == htons(0))) @@ -600,12 +633,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) } else if (eth_p_mpls(key->eth.type)) { size_t stack_len = MPLS_HLEN; - /* In the presence of an MPLS label stack the end of the L2 - * header and the beginning of the L3 header differ. - * - * Advance network_header to the beginning of the L3 - * header. mac_len corresponds to the end of the L2 header. - */ + skb_set_inner_network_header(skb, skb->mac_len); while (1) { __be32 lse; @@ -613,12 +641,12 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (unlikely(error)) return 0; - memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); if (stack_len == MPLS_HLEN) memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); - skb_set_network_header(skb, skb->mac_len + stack_len); + skb_set_inner_network_header(skb, skb->mac_len + stack_len); if (lse & htonl(MPLS_LS_S_MASK)) break; @@ -734,8 +762,6 @@ int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, { int err; - memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); - /* Extract metadata from netlink attributes. */ err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 03378e75a67c..ae783f5c6695 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -50,6 +50,11 @@ struct ovs_tunnel_info { struct metadata_dst *tun_dst; }; +struct vlan_head { + __be16 tpid; /* Vlan type. 
Generally 802.1q or 802.1ad.*/ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -69,7 +74,8 @@ struct sw_flow_key { struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + struct vlan_head vlan; + struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; union { @@ -172,14 +178,14 @@ struct sw_flow { struct hlist_node node[2]; u32 hash; } flow_table, ufid_table; - int stats_last_writer; /* NUMA-node id of the last writer on + int stats_last_writer; /* CPU id of the last writer on * 'stats[0]'. */ struct sw_flow_key key; struct sw_flow_id id; struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + struct flow_stats __rcu *stats[]; /* One for each CPU. First one * is allocated at flow creation time, * the rest are allocated on demand * while holding the 'stats[0].lock'. diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c78a6a1476fb..ae25ded82b3b 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -808,6 +808,167 @@ int ovs_nla_put_tunnel_info(struct sk_buff *skb, ip_tunnel_info_af(tun_info)); } +static int encode_vlan_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *a[], + bool is_mask, bool inner) +{ + __be16 tci = 0; + __be16 tpid = 0; + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (likely(!inner)) { + SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); + } else { + SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); + } + return 0; +} + +static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { + /* Not a VLAN. */ + return 0; + } + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (tci) { + OVS_NLERR(log, "%s TCI does not have VLAN_TAG_PRESENT bit set.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { + /* Corner case for truncated VLAN header. */ + OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + } + + return 1; +} + +static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + __be16 tpid = 0; + bool encap_valid = !!(match->key->eth.vlan.tci & + htons(VLAN_TAG_PRESENT)); + bool i_encap_valid = !!(match->key->eth.cvlan.tci & + htons(VLAN_TAG_PRESENT)); + + if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { + /* Not a VLAN. 
*/ + return 0; + } + + if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { + OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (tpid != htons(0xffff)) { + OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", + (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); + return -EINVAL; + } + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_TAG_PRESENT bit.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + return 1; +} + +static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, bool inner, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + const struct nlattr *encap; + + if (!is_mask) + err = validate_vlan_from_nlattrs(match, *key_attrs, inner, + a, log); + else + err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, + a, log); + if (err <= 0) + return err; + + err = encode_vlan_from_nlattrs(match, a, is_mask, inner); + if (err) + return err; + + *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + + encap = a[OVS_KEY_ATTR_ENCAP]; + + if (!is_mask) + err = parse_flow_nlattrs(encap, a, key_attrs, log); + else + err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); + + return err; +} + +static int parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + bool encap_valid = false; + + err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, + is_mask, log); + if (err) + return err; + + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)); + if (encap_valid) { + err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, + is_mask, log); + if (err) + return err; + } + + return 0; +} + static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask, bool log) @@ -923,20 +1084,11 @@ static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, } if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { - __be16 tci; - - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - if (is_mask) - OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); - else - OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); - - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. 
+ */ + OVS_NLERR(log, "VLAN attribute unexpected."); + return -EINVAL; } if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { @@ -1182,49 +1334,18 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct nlattr *encap; struct nlattr *newmask = NULL; u64 key_attrs = 0; u64 mask_attrs = 0; - bool encap_valid = false; int err; err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; - if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { - __be16 tci; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR(log, "Invalid Vlan frame."); - return -EINVAL; - } - - key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - encap = a[OVS_KEY_ATTR_ENCAP]; - key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - encap_valid = true; - - if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs, log); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) { - OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); - return -EINVAL; - } - } else { - OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); - return -EINVAL; - } - } + err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); + if (err) + return err; err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) @@ -1265,46 +1386,12 @@ int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, goto free_newmask; /* Always match on tci. */ - SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); - - if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { - __be16 eth_type = 0; - __be16 tci = 0; - - if (!encap_valid) { - OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); - err = -EINVAL; - goto free_newmask; - } - - mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - if (a[OVS_KEY_ATTR_ETHERTYPE]) - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (eth_type == htons(0xffff)) { - mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, - &mask_attrs, log); - if (err) - goto free_newmask; - } else { - OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", - ntohs(eth_type)); - err = -EINVAL; - goto free_newmask; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); - if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", - ntohs(tci)); - err = -EINVAL; - goto free_newmask; - } - } + err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); + if (err) + goto free_newmask; err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, log); @@ -1410,12 +1497,25 @@ int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } +static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, + bool is_mask) +{ + __be16 eth_type = !is_mask ? 
vh->tpid : htons(0xffff); + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) + return -EMSGSIZE; + return 0; +} + static int __ovs_nla_put_key(const struct sw_flow_key *swkey, const struct sw_flow_key *output, bool is_mask, struct sk_buff *skb) { struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; + struct nlattr *nla; + struct nlattr *encap = NULL; + struct nlattr *in_encap = NULL; if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) goto nla_put_failure; @@ -1464,17 +1564,21 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, ether_addr_copy(eth_key->eth_src, output->eth.src); ether_addr_copy(eth_key->eth_dst, output->eth.dst); - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - __be16 eth_type; - eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) goto nla_put_failure; encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) + if (!swkey->eth.vlan.tci) goto unencap; - } else - encap = NULL; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } + } if (swkey->eth.type == htons(ETH_P_802_2)) { /* @@ -1493,6 +1597,14 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) goto nla_put_failure; + if (eth_type_vlan(swkey->eth.type)) { + /* There are 3 VLAN tags, we don't know anything about the rest + * of the packet, so truncate here. 
+ */ + WARN_ON_ONCE(!(encap && in_encap)); + goto unencap; + } + if (swkey->eth.type == htons(ETH_P_IP)) { struct ovs_key_ipv4 *ipv4_key; @@ -1619,6 +1731,8 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, } unencap: + if (in_encap) + nla_nest_end(skb, in_encap); if (encap) nla_nest_end(skb, encap); @@ -1882,13 +1996,15 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, void ovs_match_init(struct sw_flow_match *match, struct sw_flow_key *key, + bool reset_key, struct sw_flow_mask *mask) { memset(match, 0, sizeof(*match)); match->key = key; match->mask = mask; - memset(key, 0, sizeof(*key)); + if (reset_key) + memset(key, 0, sizeof(*key)); if (mask) { memset(&mask->key, 0, sizeof(mask->key)); @@ -1935,7 +2051,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, struct nlattr *a; int err = 0, start, opts_type; - ovs_match_init(&match, &key, NULL); + ovs_match_init(&match, &key, true, NULL); opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); if (opts_type < 0) return opts_type; @@ -2283,7 +2399,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_PUSH_VLAN: vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + if (!eth_type_vlan(vlan->vlan_tpid)) return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; @@ -2388,7 +2504,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, - key->eth.tci, log); + key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 47dd142eca1c..45f9769e5aac 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -41,7 +41,8 @@ size_t ovs_tun_key_attr_size(void); size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, struct sw_flow_mask *mask); + struct sw_flow_key *key, bool reset_key, + struct sw_flow_mask *mask); int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d073fff82fdb..ea7a8073fa02 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -32,6 +32,7 @@ #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> +#include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> @@ -79,17 +80,12 @@ struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; struct flow_stats *stats; - int node; - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); - flow->sf_acts = NULL; - flow->mask = NULL; - flow->id.unmasked_key = NULL; - flow->id.ufid_len = 0; - flow->stats_last_writer = NUMA_NO_NODE; + flow->stats_last_writer = -1; /* Initialize the default stat node. 
*/ stats = kmem_cache_alloc_node(flow_stats_cache, @@ -102,10 +98,6 @@ struct sw_flow *ovs_flow_alloc(void) RCU_INIT_POINTER(flow->stats[0], stats); - for_each_node(node) - if (node != 0) - RCU_INIT_POINTER(flow->stats[node], NULL); - return flow; err: kmem_cache_free(flow_cache, flow); @@ -142,16 +134,17 @@ static struct flex_array *alloc_buckets(unsigned int n_buckets) static void flow_free(struct sw_flow *flow) { - int node; + int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); - for_each_node(node) - if (flow->stats[node]) + /* We open code this to make sure cpu 0 is always considered */ + for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) + if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, - (struct flow_stats __force *)flow->stats[node]); + (struct flow_stats __force *)flow->stats[cpu]); kmem_cache_free(flow_cache, flow); } @@ -756,7 +749,7 @@ int ovs_flow_init(void) BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (nr_node_ids + + (nr_cpu_ids * sizeof(struct flow_stats *)), 0, 0, NULL); if (flow_cache == NULL) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 6b21fd068d87..8f198437c724 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -485,9 +485,14 @@ static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; - if (skb->protocol == htons(ETH_P_8021Q)) + if (skb_vlan_tagged(skb)) length -= VLAN_HLEN; + /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow + * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none + * account for 802.1ad. e.g. is_skb_forwardable(). + */ + return length; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 046f7508c06b..45ac8e8e58f4 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -333,6 +333,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp); void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); void rds_ib_listen_stop(void); +__printf(2, 3) void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); diff --git a/net/rds/rds.h b/net/rds/rds.h index b2d17f0fafa8..fd0bccb2f9f9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -688,6 +688,7 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); #define rds_conn_error(conn, fmt...) \ __rds_conn_error(conn, KERN_WARNING "RDS: " fmt) +__printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) \ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 784c53163b7b..86f8853a038c 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -19,6 +19,20 @@ config AF_RXRPC See Documentation/networking/rxrpc.txt. +config AF_RXRPC_IPV6 + bool "IPv6 support for RxRPC" + depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC) + help + Say Y here to allow AF_RXRPC to use IPV6 UDP as well as IPV4 UDP as + its network transport. + +config AF_RXRPC_INJECT_LOSS + bool "Inject packet loss into RxRPC packet stream" + depends on AF_RXRPC + help + Say Y here to inject packet loss by discarding some received and some + transmitted packets. 
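Looping back to the netlink diag rewrite earlier in this diff: the open-coded bucket scan was replaced with the rhashtable walker so a dump can be suspended mid-table, survive concurrent resizes, and resume on the next netlink read. The shape of that loop as a kernel-style sketch (not standalone; emit() is a hypothetical per-object callback standing in for sk_diag_fill()):

#include <linux/err.h>
#include <linux/rhashtable.h>

static int dump_table(struct rhashtable *ht, struct rhashtable_iter *hti,
                      int (*emit)(void *obj))
{
    void *obj;
    int err;

    rhashtable_walk_enter(ht, hti);

    err = rhashtable_walk_start(hti);
    if (err == -EAGAIN)     /* a resize was seen before we started */
        err = 0;
    if (err)
        goto out;

    while ((obj = rhashtable_walk_next(hti)) != NULL) {
        if (IS_ERR(obj)) {
            err = PTR_ERR(obj);
            if (err == -EAGAIN) {   /* resize raced the walk */
                err = 0;
                continue;
            }
            break;
        }
        err = emit(obj);    /* stop early if the skb fills up */
        if (err)
            break;
    }

    rhashtable_walk_stop(hti);
out:
    rhashtable_walk_exit(hti);
    return err;
}

The real code parks the iterator in cb->args[2] between dump calls and only runs rhashtable_walk_exit() from the new netlink_diag_dump_done() handler, so a partially filled skb does not tear the walk down.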
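The execute_recirc() change above adds a fast path: the first few recirculations are handled by direct recursion using preallocated per-CPU key slots (a sw_flow_key is too large to copy onto the kernel stack), and only deeper nesting falls back to the deferred-action FIFO, with a hard cap at OVS_RECURSION_LIMIT. A toy userspace model of just the control flow (keys shrunk to ints, the FIFO reduced to a printout):

#include <stdio.h>

#define OVS_RECURSION_LIMIT           5
#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)

static int exec_level;
static int recirc_key[OVS_DEFERRED_ACTION_THRESHOLD]; /* per-level scratch */

static void process(int key, int more_recircs)
{
    exec_level++;
    if (exec_level > OVS_RECURSION_LIMIT) {
        printf("limit hit, drop\n");
    } else if (more_recircs) {
        if (exec_level <= OVS_DEFERRED_ACTION_THRESHOLD) {
            /* Shallow: recirculate by direct call, copying the key
             * into a preallocated per-level slot. */
            recirc_key[exec_level - 1] = key + 1;
            process(recirc_key[exec_level - 1], more_recircs - 1);
        } else {
            /* Deep: the real code queues the action on the per-CPU
             * FIFO, which is drained back at level 1. */
            printf("defer key %d\n", key + 1);
        }
    }
    exec_level--;
}

int main(void)
{
    process(0, 5);
    return 0;
}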
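The openvswitch flow-key rework above splits the old single eth.tci into outer (eth.vlan) and inner (eth.cvlan) vlan_head pairs and parses up to two tags: an 802.1ad or 802.1q outer tag, then an optional 802.1q inner tag. A runnable userspace rendering of the two-pass parse; the kernel version additionally honours the accelerated skb tag and sets VLAN_TAG_PRESENT, which this sketch drops:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct vlan_head { uint16_t tpid; uint16_t tci; }; /* mirrors the patch */

static int eth_type_vlan(uint16_t tpid_be)
{
    return tpid_be == htons(0x8100) ||  /* ETH_P_8021Q  */
           tpid_be == htons(0x88a8);    /* ETH_P_8021AD */
}

/* Returns 1 and advances *p past the tag if one was present, else 0. */
static int parse_vlan_tag(const uint8_t **p, size_t *len,
                          struct vlan_head *out)
{
    struct vlan_head vh;

    if (*len < sizeof(vh) + 2)  /* the tag plus the next ethertype */
        return 0;
    memcpy(&vh, *p, sizeof(vh));
    if (!eth_type_vlan(vh.tpid))
        return 0;
    *out = vh;
    *p += sizeof(vh);
    *len -= sizeof(vh);
    return 1;
}

int main(void)
{
    /* 802.1ad S-tag (88a8) followed by an 802.1q C-tag (8100). */
    const uint8_t frame[] = { 0x88,0xa8, 0x00,0x64, 0x81,0x00, 0x00,0xc8,
                              0x08,0x00, 0x00,0x00 };
    const uint8_t *p = frame;
    size_t len = sizeof(frame);
    struct vlan_head outer = {0}, inner = {0};

    if (parse_vlan_tag(&p, &len, &outer))
        parse_vlan_tag(&p, &len, &inner); /* QinQ: look once more */
    printf("outer tci %u inner tci %u\n",
           ntohs(outer.tci), ntohs(inner.tci));
    return 0;
}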
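Also from the openvswitch changes above: flow statistics moved from per-NUMA-node to per-CPU slots, and every walker is open coded with cpumask_next() rather than for_each_possible_cpu(). The point is slot 0, which holds the preallocated stats block and must always be visited, even if CPU 0 were absent from cpu_possible_mask. A kernel-style fragment of a reader (struct reduced to the counters; callers hold rcu_read_lock() or ovs_mutex, as in the originals):

#include <linux/cpumask.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct flow_stats {     /* reduced mirror of the OVS struct */
    u64 packet_count;
    u64 byte_count;
};

static u64 flow_sum_packets(struct flow_stats __rcu **stats)
{
    u64 packets = 0;
    int cpu;

    /* Slot 0 first, unconditionally, then along cpu_possible_mask. */
    for (cpu = 0; cpu < nr_cpu_ids;
         cpu = cpumask_next(cpu, cpu_possible_mask)) {
        struct flow_stats *s = rcu_dereference(stats[cpu]);

        if (s)
            packets += s->packet_count;
    }
    return packets;
}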
+ config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 10f3f48a16a8..8fc6ea347182 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -22,6 +22,7 @@ af-rxrpc-y := \ peer_object.o \ recvmsg.o \ security.o \ + sendmsg.o \ skbuff.o \ utils.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 88effadd4b16..44c9c2b0b190 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -16,12 +16,14 @@ #include <linux/net.h> #include <linux/slab.h> #include <linux/skbuff.h> +#include <linux/random.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/key-type.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/af_rxrpc.h> +#define CREATE_TRACE_POINTS #include "ar-internal.h" MODULE_DESCRIPTION("RxRPC network protocol"); @@ -43,7 +45,7 @@ u32 rxrpc_epoch; atomic_t rxrpc_debug_id; /* count of skbs currently in use */ -atomic_t rxrpc_n_skbs; +atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -104,19 +106,25 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, case AF_INET: if (srx->transport_len < sizeof(struct sockaddr_in)) return -EINVAL; - _debug("INET: %x @ %pI4", - ntohs(srx->transport.sin.sin_port), - &srx->transport.sin.sin_addr); tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; +#ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: + if (srx->transport_len < sizeof(struct sockaddr_in6)) + return -EINVAL; + tail = offsetof(struct sockaddr_rxrpc, transport) + + sizeof(struct sockaddr_in6); + break; +#endif + default: return -EAFNOSUPPORT; } if (tail < len) memset((void *)srx + tail, 0, len - tail); + _debug("INET: %pISp", &srx->transport); return 0; } @@ -128,7 +136,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct sock *sk = sock->sk; struct rxrpc_local *local; - struct rxrpc_sock *rx = rxrpc_sk(sk), *prx; + struct rxrpc_sock *rx = rxrpc_sk(sk); + u16 service_id = srx->srx_service; int ret; _enter("%p,%p,%d", rx, saddr, len); @@ -152,16 +161,13 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) goto error_unlock; } - if (rx->srx.srx_service) { - write_lock_bh(&local->services_lock); - list_for_each_entry(prx, &local->services, listen_link) { - if (prx->srx.srx_service == rx->srx.srx_service) - goto service_in_use; - } - + if (service_id) { + write_lock(&local->services_lock); + if (rcu_access_pointer(local->service)) + goto service_in_use; rx->local = local; - list_add_tail(&rx->listen_link, &local->services); - write_unlock_bh(&local->services_lock); + rcu_assign_pointer(local->service, rx); + write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { @@ -174,7 +180,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) return 0; service_in_use: - write_unlock_bh(&local->services_lock); + write_unlock(&local->services_lock); rxrpc_put_local(local); ret = -EADDRINUSE; error_unlock: @@ -191,7 +197,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); - unsigned int max; + unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); @@ -210,9 +216,13 @@ static int rxrpc_listen(struct socket *sock, int backlog) backlog = max; else if (backlog < 0 || backlog > max) break; + old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; - rx->sk.sk_state = RXRPC_SERVER_LISTENING; - 
ret = 0; + ret = rxrpc_service_prealloc(rx, GFP_KERNEL); + if (ret == 0) + rx->sk.sk_state = RXRPC_SERVER_LISTENING; + else + sk->sk_max_ack_backlog = old; break; default: ret = -EBUSY; @@ -230,6 +240,8 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @srx: The address of the peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use + * @gfp: The allocation constraints + * @notify_rx: Where to send notifications instead of socket queue * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and @@ -242,7 +254,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct sockaddr_rxrpc *srx, struct key *key, unsigned long user_call_ID, - gfp_t gfp) + gfp_t gfp, + rxrpc_notify_rx_t notify_rx) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -269,6 +282,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.service_id = srx->srx_service; call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp); + if (!IS_ERR(call)) + call->notify_rx = notify_rx; release_sock(&rx->sk); _leave(" = %p", call); @@ -278,40 +293,39 @@ EXPORT_SYMBOL(rxrpc_kernel_begin_call); /** * rxrpc_kernel_end_call - Allow a kernel service to end a call it was using + * @sock: The socket the call is on * @call: The call to end * * Allow a kernel service to end a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ -void rxrpc_kernel_end_call(struct rxrpc_call *call) +void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, atomic_read(&call->usage)); - rxrpc_remove_user_ID(call->socket, call); - rxrpc_put_call(call); + rxrpc_release_call(rxrpc_sk(sock->sk), call); + rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_end_call); /** - * rxrpc_kernel_intercept_rx_messages - Intercept received RxRPC messages + * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on - * @interceptor: The function to pass the messages to + * @notify_new_call: Function to be called when new calls appear + * @discard_new_call: Function to discard preallocated calls * - * Allow a kernel service to intercept messages heading for the Rx queue on an - * RxRPC socket. They get passed to the specified function instead. - * @interceptor should free the socket buffers it is given. @interceptor is - * called with the socket receive queue spinlock held and softirqs disabled - - * this ensures that the messages will be delivered in the right order. + * Allow a kernel service to be given notifications about new calls. 
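rxrpc_kernel_new_call_notification(), documented just above, replaces the old message interceptor with two function pointers stored on the socket: one invoked when a new call arrives, one used to discard preallocated calls at shutdown. A toy userspace sketch of that registration pattern (all names invented; no rxrpc internals implied):

#include <stdio.h>

typedef void (*notify_new_call_fn)(int call_id);
typedef void (*discard_new_call_fn)(int call_id);

struct service_sock {
        notify_new_call_fn notify_new_call;
        discard_new_call_fn discard_new_call;
};

static void set_call_notifications(struct service_sock *s,
                                   notify_new_call_fn notify,
                                   discard_new_call_fn discard)
{
        s->notify_new_call = notify;    /* stash the hooks on the socket */
        s->discard_new_call = discard;
}

static void on_new_call(int call_id)
{
        printf("new call %d\n", call_id);
}

static void on_discard(int call_id)
{
        printf("discarding preallocated call %d\n", call_id);
}

int main(void)
{
        struct service_sock s;

        set_call_notifications(&s, on_new_call, on_discard);
        s.notify_new_call(1);           /* what the transport would do */
        s.discard_new_call(2);
        return 0;
}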
 */
-void rxrpc_kernel_intercept_rx_messages(struct socket *sock,
-                                        rxrpc_interceptor_t interceptor)
+void rxrpc_kernel_new_call_notification(
+        struct socket *sock,
+        rxrpc_notify_new_call_t notify_new_call,
+        rxrpc_discard_new_call_t discard_new_call)
 {
         struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
 
-        _enter("");
-        rx->interceptor = interceptor;
+        rx->notify_new_call = notify_new_call;
+        rx->discard_new_call = discard_new_call;
 }
-
-EXPORT_SYMBOL(rxrpc_kernel_intercept_rx_messages);
+EXPORT_SYMBOL(rxrpc_kernel_new_call_notification);
 
 /*
  * connect an RxRPC socket
@@ -391,6 +405,23 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
 
         switch (rx->sk.sk_state) {
         case RXRPC_UNBOUND:
+                rx->srx.srx_family = AF_RXRPC;
+                rx->srx.srx_service = 0;
+                rx->srx.transport_type = SOCK_DGRAM;
+                rx->srx.transport.family = rx->family;
+                switch (rx->family) {
+                case AF_INET:
+                        rx->srx.transport_len = sizeof(struct sockaddr_in);
+                        break;
+#ifdef CONFIG_AF_RXRPC_IPV6
+                case AF_INET6:
+                        rx->srx.transport_len = sizeof(struct sockaddr_in6);
+                        break;
+#endif
+                default:
+                        ret = -EAFNOSUPPORT;
+                        goto error_unlock;
+                }
                 local = rxrpc_lookup_local(&rx->srx);
                 if (IS_ERR(local)) {
                         ret = PTR_ERR(local);
@@ -505,15 +536,16 @@ error:
 static unsigned int rxrpc_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
 {
-        unsigned int mask;
         struct sock *sk = sock->sk;
+        struct rxrpc_sock *rx = rxrpc_sk(sk);
+        unsigned int mask;
 
         sock_poll_wait(file, sk_sleep(sk), wait);
         mask = 0;
 
         /* the socket is readable if there are any messages waiting on the Rx
          * queue */
-        if (!skb_queue_empty(&sk->sk_receive_queue))
+        if (!list_empty(&rx->recvmsg_q))
                 mask |= POLLIN | POLLRDNORM;
 
         /* the socket is writable if there is space to add new data to the
@@ -540,7 +572,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
                 return -EAFNOSUPPORT;
 
         /* we support transport protocol UDP/UDP6 only */
-        if (protocol != PF_INET)
+        if (protocol != PF_INET &&
+            !(IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol == PF_INET6))
                 return -EPROTONOSUPPORT;
 
         if (sock->type != SOCK_DGRAM)
@@ -554,6 +587,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
                 return -ENOMEM;
 
         sock_init_data(sock, sk);
+        sock_set_flag(sk, SOCK_RCU_FREE);
         sk->sk_state = RXRPC_UNBOUND;
         sk->sk_write_space = rxrpc_write_space;
         sk->sk_max_ack_backlog = 0;
@@ -563,9 +597,11 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
         rx->family = protocol;
         rx->calls = RB_ROOT;
 
-        INIT_LIST_HEAD(&rx->listen_link);
-        INIT_LIST_HEAD(&rx->secureq);
-        INIT_LIST_HEAD(&rx->acceptq);
+        spin_lock_init(&rx->incoming_lock);
+        INIT_LIST_HEAD(&rx->sock_calls);
+        INIT_LIST_HEAD(&rx->to_be_accepted);
+        INIT_LIST_HEAD(&rx->recvmsg_q);
+        rwlock_init(&rx->recvmsg_lock);
         rwlock_init(&rx->call_lock);
         memset(&rx->srx, 0, sizeof(rx->srx));
 
@@ -574,6 +610,39 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 }
 
 /*
+ * Kill all the calls on a socket and shut it down.
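Note the negated IS_ENABLED() clause in the rxrpc_create() guard above: without it, an IPv4-only build would fail to reject PF_INET6. The accepted combinations reduce to "IPv4 always, IPv6 only when configured", which this standalone check exercises (constants copied from the Linux ABI values):

#include <assert.h>
#include <stdbool.h>

#define PF_INET  2
#define PF_INET6 10

/* Mirrors the guard: true if the protocol should be accepted. */
static bool proto_supported(int protocol, bool ipv6_configured)
{
        return protocol == PF_INET ||
               (ipv6_configured && protocol == PF_INET6);
}

int main(void)
{
        assert(proto_supported(PF_INET, false));
        assert(!proto_supported(PF_INET6, false)); /* -EPROTONOSUPPORT */
        assert(proto_supported(PF_INET6, true));
        return 0;
}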
+ */ +static int rxrpc_shutdown(struct socket *sock, int flags) +{ + struct sock *sk = sock->sk; + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + _enter("%p,%d", sk, flags); + + if (flags != SHUT_RDWR) + return -EOPNOTSUPP; + if (sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + lock_sock(sk); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (sk->sk_state < RXRPC_CLOSE) { + sk->sk_state = RXRPC_CLOSE; + sk->sk_shutdown = SHUTDOWN_MASK; + } else { + ret = -ESHUTDOWN; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + rxrpc_discard_prealloc(rx); + + release_sock(sk); + return ret; +} + +/* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) @@ -609,15 +678,14 @@ static int rxrpc_release_sock(struct sock *sk) sk->sk_state = RXRPC_CLOSE; spin_unlock_bh(&sk->sk_receive_queue.lock); - ASSERTCMP(rx->listen_link.next, !=, LIST_POISON1); - - if (!list_empty(&rx->listen_link)) { - write_lock_bh(&rx->local->services_lock); - list_del(&rx->listen_link); - write_unlock_bh(&rx->local->services_lock); + if (rx->local && rx->local->service == rx) { + write_lock(&rx->local->services_lock); + rx->local->service = NULL; + write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ + rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); @@ -666,7 +734,7 @@ static const struct proto_ops rxrpc_rpc_ops = { .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, - .shutdown = sock_no_shutdown, + .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = rxrpc_sendmsg, @@ -697,7 +765,13 @@ static int __init af_rxrpc_init(void) BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); - rxrpc_epoch = get_seconds(); + get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); + rxrpc_epoch |= RXRPC_RANDOM_EPOCH; + get_random_bytes(&rxrpc_client_conn_ids.cur, + sizeof(rxrpc_client_conn_ids.cur)); + rxrpc_client_conn_ids.cur &= 0x3fffffff; + if (rxrpc_client_conn_ids.cur == 0) + rxrpc_client_conn_ids.cur = 1; ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( @@ -788,7 +862,8 @@ static void __exit af_rxrpc_exit(void) proto_unregister(&rxrpc_proto); rxrpc_destroy_all_calls(); rxrpc_destroy_all_connections(); - ASSERTCMP(atomic_read(&rxrpc_n_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); + ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); rxrpc_destroy_all_locals(); remove_proc_entry("rxrpc_conns", init_net.proc_net); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ff83fb1ddd47..d38dffd78085 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -35,11 +35,23 @@ struct rxrpc_crypt { #define rxrpc_queue_delayed_work(WS,D) \ queue_delayed_work(rxrpc_workqueue, (WS), (D)) -#define rxrpc_queue_call(CALL) rxrpc_queue_work(&(CALL)->processor) - struct rxrpc_connection; /* + * Mark applied to socket buffers. 
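The af_rxrpc_init() changes in this hunk also replace the get_seconds() epoch with random data (with RXRPC_RANDOM_EPOCH set to mark it as randomly generated) and start the client connection ID cursor at a random value constrained to 30 bits and away from the reserved value zero. A userspace sketch of that constrain-a-random-ID step (rand()/srand() standing in for get_random_bytes(); not the kernel's RNG):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void)
{
        uint32_t cur;

        srand((unsigned int)time(NULL));
        cur = (uint32_t)rand();
        cur &= 0x3fffffff;      /* conn IDs come from a 30-bit space */
        if (cur == 0)           /* zero is never handed out, so skip it */
                cur = 1;
        printf("first client conn ID: %u\n", cur);
        return 0;
}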
+ */ +enum rxrpc_skb_mark { + RXRPC_SKB_MARK_DATA, /* data message */ + RXRPC_SKB_MARK_FINAL_ACK, /* final ACK received message */ + RXRPC_SKB_MARK_BUSY, /* server busy message */ + RXRPC_SKB_MARK_REMOTE_ABORT, /* remote abort message */ + RXRPC_SKB_MARK_LOCAL_ABORT, /* local abort message */ + RXRPC_SKB_MARK_NET_ERROR, /* network error message */ + RXRPC_SKB_MARK_LOCAL_ERROR, /* local error message */ + RXRPC_SKB_MARK_NEW_CALL, /* local error message */ +}; + +/* * sk_state for RxRPC sockets */ enum { @@ -52,19 +64,44 @@ enum { }; /* + * Service backlog preallocation. + * + * This contains circular buffers of preallocated peers, connections and calls + * for incoming service calls and their head and tail pointers. This allows + * calls to be set up in the data_ready handler, thereby avoiding the need to + * shuffle packets around so much. + */ +struct rxrpc_backlog { + unsigned short peer_backlog_head; + unsigned short peer_backlog_tail; + unsigned short conn_backlog_head; + unsigned short conn_backlog_tail; + unsigned short call_backlog_head; + unsigned short call_backlog_tail; +#define RXRPC_BACKLOG_MAX 32 + struct rxrpc_peer *peer_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_connection *conn_backlog[RXRPC_BACKLOG_MAX]; + struct rxrpc_call *call_backlog[RXRPC_BACKLOG_MAX]; +}; + +/* * RxRPC socket definition */ struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_interceptor_t interceptor; /* kernel service Rx interceptor function */ + rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ + rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ struct rxrpc_local *local; /* local endpoint */ - struct list_head listen_link; /* link in the local endpoint's listen list */ - struct list_head secureq; /* calls awaiting connection security clearance */ - struct list_head acceptq; /* calls awaiting acceptance */ + struct rxrpc_backlog *backlog; /* Preallocation for services */ + spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ + struct list_head sock_calls; /* List of calls owned by this socket */ + struct list_head to_be_accepted; /* calls awaiting acceptance */ + struct list_head recvmsg_q; /* Calls awaiting recvmsg's attention */ + rwlock_t recvmsg_lock; /* Lock for recvmsg_q */ struct key *key; /* security for this socket */ struct key *securities; /* list of server security descriptors */ - struct rb_root calls; /* outstanding calls on this socket */ + struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ rwlock_t call_lock; /* lock for calls */ @@ -103,13 +140,11 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - struct rxrpc_call *call; /* call with which associated */ - unsigned long resend_at; /* time in jiffies at which to resend */ union { - unsigned int offset; /* offset into buffer of next read */ + u8 nr_jumbo; /* Number of jumbo subpackets */ + }; + union { int remain; /* amount of space remaining for next write */ - u32 error; /* network error code */ - bool need_resend; /* T if needs resending */ }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ @@ -117,13 +152,6 @@ struct rxrpc_skb_priv { #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) -enum rxrpc_command { - RXRPC_CMD_SEND_DATA, /* send data message */ - RXRPC_CMD_SEND_ABORT, /* request abort generation */ - RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ - 
RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ -}; - /* * RxRPC security module interface */ @@ -150,7 +178,12 @@ struct rxrpc_security { void *); /* verify the security on a received packet */ - int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, u32 *); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, + unsigned int, unsigned int, rxrpc_seq_t, u16); + + /* Locate the data in a received packet that has been verified. */ + void (*locate_data)(struct rxrpc_call *, struct sk_buff *, + unsigned int *, unsigned int *); /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -180,9 +213,8 @@ struct rxrpc_local { struct list_head link; struct socket *socket; /* my UDP socket */ struct work_struct processor; - struct list_head services; /* services listening on this endpoint */ + struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ - struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct rb_root client_conns; /* Client connections by socket params */ @@ -220,10 +252,12 @@ struct rxrpc_peer { /* calculated RTT cache */ #define RXRPC_RTT_CACHE_SIZE 32 - suseconds_t rtt; /* current RTT estimate (in uS) */ - unsigned int rtt_point; /* next entry at which to insert */ - unsigned int rtt_usage; /* amount of cache actually used */ - suseconds_t rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + u64 rtt; /* Current RTT estimate (in nS) */ + u64 rtt_sum; /* Sum of cache contents */ + u64 rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */ + u8 rtt_cursor; /* next entry at which to insert */ + u8 rtt_usage; /* amount of cache actually used */ }; /* @@ -255,6 +289,9 @@ enum rxrpc_conn_flag { RXRPC_CONN_HAS_IDR, /* Has a client conn ID assigned */ RXRPC_CONN_IN_SERVICE_CONNS, /* Conn is in peer->service_conns */ RXRPC_CONN_IN_CLIENT_CONNS, /* Conn is in local->client_conns */ + RXRPC_CONN_EXPOSED, /* Conn has extra ref for exposure */ + RXRPC_CONN_DONT_REUSE, /* Don't reuse this connection */ + RXRPC_CONN_COUNTED, /* Counted by rxrpc_nr_client_conns */ }; /* @@ -265,17 +302,29 @@ enum rxrpc_conn_event { }; /* + * The connection cache state. + */ +enum rxrpc_conn_cache_state { + RXRPC_CONN_CLIENT_INACTIVE, /* Conn is not yet listed */ + RXRPC_CONN_CLIENT_WAITING, /* Conn is on wait list, waiting for capacity */ + RXRPC_CONN_CLIENT_ACTIVE, /* Conn is on active list, doing calls */ + RXRPC_CONN_CLIENT_CULLED, /* Conn is culled and delisted, doing calls */ + RXRPC_CONN_CLIENT_IDLE, /* Conn is on idle list, doing mostly nothing */ + RXRPC_CONN__NR_CACHE_STATES +}; + +/* * The connection protocol state. 
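The reworked rxrpc_peer RTT fields a little further up (rtt_cache[], rtt_sum, rtt_cursor, rtt_usage) imply a fixed window of samples plus a running sum, so a mean RTT can be maintained in O(1) per sample instead of rescanning the cache. A minimal userspace sketch of that bookkeeping (an invented update rule for illustration, not the kernel's exact code):

#include <stdint.h>
#include <stdio.h>

#define RTT_CACHE_SIZE 32

struct peer_rtt {
        uint64_t sum;                   /* sum of live samples, in ns */
        uint64_t cache[RTT_CACHE_SIZE];
        uint8_t cursor;                 /* next slot to overwrite */
        uint8_t usage;                  /* number of live samples */
};

static void rtt_add(struct peer_rtt *p, uint64_t sample_ns)
{
        if (p->usage < RTT_CACHE_SIZE)
                p->usage++;
        else
                p->sum -= p->cache[p->cursor];  /* evict the oldest sample */
        p->cache[p->cursor] = sample_ns;
        p->cursor = (p->cursor + 1) % RTT_CACHE_SIZE;
        p->sum += sample_ns;
}

int main(void)
{
        struct peer_rtt p = { 0 };

        rtt_add(&p, 1000);
        rtt_add(&p, 3000);
        printf("mean rtt: %llu ns\n", (unsigned long long)(p.sum / p.usage));
        return 0;
}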
*/ enum rxrpc_conn_proto_state { RXRPC_CONN_UNUSED, /* Connection not yet attempted */ RXRPC_CONN_CLIENT, /* Client connection */ + RXRPC_CONN_SERVICE_PREALLOC, /* Service connection preallocation */ RXRPC_CONN_SERVICE_UNSECURED, /* Service unsecured connection */ RXRPC_CONN_SERVICE_CHALLENGING, /* Service challenging for security */ RXRPC_CONN_SERVICE, /* Service secured connection */ RXRPC_CONN_REMOTELY_ABORTED, /* Conn aborted by peer */ RXRPC_CONN_LOCALLY_ABORTED, /* Conn aborted locally */ - RXRPC_CONN_NETWORK_ERROR, /* Conn terminated by network error */ RXRPC_CONN__NR_STATES }; @@ -288,23 +337,33 @@ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_conn_parameters params; - spinlock_t channel_lock; + atomic_t usage; + struct rcu_head rcu; + struct list_head cache_link; + spinlock_t channel_lock; + unsigned char active_chans; /* Mask of active channels */ +#define RXRPC_ACTIVE_CHANS_MASK ((1 << RXRPC_MAXCALLS) - 1) + struct list_head waiting_calls; /* Calls waiting for channels */ struct rxrpc_channel { struct rxrpc_call __rcu *call; /* Active call */ u32 call_id; /* ID of current call */ u32 call_counter; /* Call ID counter */ u32 last_call; /* ID of last call */ - u32 last_result; /* Result of last call (0/abort) */ + u8 last_type; /* Type of last packet */ + u16 last_service_id; + union { + u32 last_seq; + u32 last_abort; + }; } channels[RXRPC_MAXCALLS]; - wait_queue_head_t channel_wq; /* queue to wait for channel to become available */ - struct rcu_head rcu; struct work_struct processor; /* connection event processor */ union { struct rb_node client_node; /* Node in local->client_conns */ struct rb_node service_node; /* Node in peer->service_conns */ }; + struct list_head proc_link; /* link in procfs list */ struct list_head link; /* link in master connection list */ struct sk_buff_head rx_queue; /* received conn-level packets */ const struct rxrpc_security *security; /* applied security module */ @@ -313,21 +372,18 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ unsigned long flags; unsigned long events; - unsigned long put_time; /* Time at which last put */ + unsigned long idle_timestamp; /* Time at which last became idle */ spinlock_t state_lock; /* state-change lock */ - atomic_t usage; - enum rxrpc_conn_proto_state state : 8; /* current state of connection */ + enum rxrpc_conn_cache_state cache_state; + enum rxrpc_conn_proto_state state; /* current state of connection */ u32 local_abort; /* local abort code */ u32 remote_abort; /* remote abort code */ - int error; /* local error incurred */ int debug_id; /* debug ID for printks */ atomic_t serial; /* packet serial number counter */ - atomic_t hi_serial; /* highest serial number received */ - atomic_t avail_chans; /* number of channels available */ + unsigned int hi_serial; /* highest serial number received */ + u32 security_nonce; /* response re-use preventer */ u8 size_align; /* data size alignment (for security) */ - u8 header_size; /* rxrpc + security header size */ u8 security_size; /* security header size */ - u32 security_nonce; /* response re-use preventer */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ }; @@ -337,37 +393,23 @@ struct rxrpc_connection { */ enum rxrpc_call_flag { RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */ - RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */ - RXRPC_CALL_RCVD_LAST, /* all packets received */ - RXRPC_CALL_RUN_RTIMER, 
/* Tx resend timer started */ - RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */ - RXRPC_CALL_PROC_BUSY, /* the processor is busy */ - RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */ RXRPC_CALL_HAS_USERID, /* has a user ID attached */ - RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */ + RXRPC_CALL_IS_SERVICE, /* Call is service call */ + RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ + RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ + RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_PINGING, /* Ping in process */ + RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */ - RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */ - RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */ - RXRPC_CALL_EV_RCVD_ERROR, /* network error received */ - RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */ RXRPC_CALL_EV_ACK, /* need to generate ACK */ - RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */ - RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */ + RXRPC_CALL_EV_TIMER, /* Timer expired */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */ - RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */ - RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */ - RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */ - RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */ - RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */ }; /* @@ -379,20 +421,38 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ - RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */ + RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */ - RXRPC_CALL_COMPLETE, /* - call completed */ - RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */ + RXRPC_CALL_COMPLETE, /* - call complete */ + NR__RXRPC_CALL_STATES +}; + +/* + * Call completion condition (state == RXRPC_CALL_COMPLETE). + */ +enum rxrpc_call_completion { + RXRPC_CALL_SUCCEEDED, /* - Normal termination */ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */ + RXRPC_CALL_LOCAL_ERROR, /* - call failed due to local error */ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */ - RXRPC_CALL_DEAD, /* - call is dead */ - NR__RXRPC_CALL_STATES + NR__RXRPC_CALL_COMPLETIONS +}; + +/* + * Call Tx congestion management modes. 
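These modes follow TCP's RFC 5681 machinery, and because the sender's segment size is pinned to RXRPC_JUMBO_DATALEN the window arithmetic is done in whole DATA packets rather than bytes. A toy illustration of the slow-start versus congestion-avoidance split in packet units (an invented update rule for illustration only, not this patch's algorithm):

#include <stdint.h>
#include <stdio.h>

static void on_ack(uint8_t *cwnd, uint8_t ssthresh, uint8_t max_cwnd)
{
        static unsigned int acks_this_window;

        if (*cwnd < ssthresh) {
                *cwnd += 1;             /* slow start: +1 packet per ACK */
        } else if (++acks_this_window >= *cwnd) {
                *cwnd += 1;             /* avoidance: +1 packet per window */
                acks_this_window = 0;
        }
        if (*cwnd > max_cwnd)
                *cwnd = max_cwnd;
}

int main(void)
{
        uint8_t cwnd = 2, ssthresh = 8;
        int i;

        for (i = 0; i < 64; i++)
                on_ack(&cwnd, ssthresh, 63);
        printf("cwnd after 64 ACKs: %u packets\n", cwnd);
        return 0;
}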
+ */ +enum rxrpc_congest_mode { + RXRPC_CALL_SLOW_START, + RXRPC_CALL_CONGEST_AVOIDANCE, + RXRPC_CALL_PACKET_LOSS, + RXRPC_CALL_FAST_RETRANSMIT, + NR__RXRPC_CONGEST_MODES }; /* @@ -402,92 +462,329 @@ enum rxrpc_call_state { struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ - struct rxrpc_sock *socket; /* socket responsible */ - struct timer_list lifetimer; /* lifetime remaining on call */ - struct timer_list deadspan; /* reap timer for re-ACK'ing, etc */ - struct timer_list ack_timer; /* ACK generation timer */ - struct timer_list resend_timer; /* Tx resend timer */ - struct work_struct destroyer; /* call destroyer */ - struct work_struct processor; /* packet processor and ACK generator */ + struct rxrpc_peer *peer; /* Peer record for remote address */ + struct rxrpc_sock __rcu *socket; /* socket responsible */ + ktime_t ack_at; /* When deferred ACK needs to happen */ + ktime_t resend_at; /* When next resend needs to happen */ + ktime_t expire_at; /* When the call times out */ + struct timer_list timer; /* Combined event timer */ + struct work_struct processor; /* Event processor */ + rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ + struct list_head chan_wait_link; /* Link in conn->waiting_calls */ struct hlist_node error_link; /* link in error distribution list */ - struct list_head accept_link; /* calls awaiting acceptance */ - struct rb_node sock_node; /* node in socket call tree */ - struct sk_buff_head rx_queue; /* received packets */ - struct sk_buff_head rx_oos_queue; /* packets received out of sequence */ + struct list_head accept_link; /* Link in rx->acceptq */ + struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ + struct list_head sock_link; /* Link in rx->sock_calls */ + struct rb_node sock_node; /* Node in rx->calls */ struct sk_buff *tx_pending; /* Tx socket buffer being filled */ - wait_queue_head_t tx_waitq; /* wait for Tx window space to become available */ + wait_queue_head_t waitq; /* Wait queue for channel or Tx */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ - unsigned long creation_jif; /* time of call creation */ unsigned long flags; unsigned long events; spinlock_t lock; rwlock_t state_lock; /* lock for state transition */ - atomic_t usage; - atomic_t skb_count; /* Outstanding packets on this call */ - atomic_t sequence; /* Tx data packet sequence counter */ - u32 local_abort; /* local abort code */ - u32 remote_abort; /* remote abort code */ - int error_report; /* Network error (ICMP/local transport) */ + u32 abort_code; /* Local/remote abort code */ int error; /* Local error incurred */ - enum rxrpc_call_state state : 8; /* current state of call */ + enum rxrpc_call_state state; /* current state of call */ + enum rxrpc_call_completion completion; /* Call completion condition */ + atomic_t usage; + u16 service_id; /* service ID */ + u8 security_ix; /* Security type */ + u32 call_id; /* call ID on connection */ + u32 cid; /* connection ID plus channel index */ int debug_id; /* debug ID for printks */ - u8 channel; /* connection channel occupied by this call */ - - /* transmission-phase ACK management */ - u8 acks_head; /* offset into window of first entry */ - u8 acks_tail; /* offset into window of last entry */ - u8 acks_winsz; /* size of un-ACK'd window */ - u8 acks_unacked; /* lowest unacked packet in last ACK received */ - int acks_latest; /* serial 
number of latest ACK received */ - rxrpc_seq_t acks_hard; /* highest definitively ACK'd msg seq */ - unsigned long *acks_window; /* sent packet window - * - elements are pointers with LSB set if ACK'd + unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ + unsigned short rx_pkt_len; /* Current recvmsg packet len */ + + /* Rx/Tx circular buffer, depending on phase. + * + * In the Rx phase, packets are annotated with 0 or the number of the + * segment of a jumbo packet each buffer refers to. There can be up to + * 47 segments in a maximum-size UDP packet. + * + * In the Tx phase, packets are annotated with which buffers have been + * acked. + */ +#define RXRPC_RXTX_BUFF_SIZE 64 +#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) +#define RXRPC_INIT_RX_WINDOW_SIZE 32 + struct sk_buff **rxtx_buffer; + u8 *rxtx_annotations; +#define RXRPC_TX_ANNO_ACK 0 +#define RXRPC_TX_ANNO_UNACK 1 +#define RXRPC_TX_ANNO_NAK 2 +#define RXRPC_TX_ANNO_RETRANS 3 +#define RXRPC_TX_ANNO_MASK 0x03 +#define RXRPC_TX_ANNO_LAST 0x04 +#define RXRPC_TX_ANNO_RESENT 0x08 + +#define RXRPC_RX_ANNO_JUMBO 0x3f /* Jumbo subpacket number + 1 if not zero */ +#define RXRPC_RX_ANNO_JLAST 0x40 /* Set if last element of a jumbo packet */ +#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ + rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but + * not hard-ACK'd packet follows this. + */ + rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + + /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS + * is fixed, we keep these numbers in terms of segments (ie. DATA + * packets) rather than bytes. + */ +#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN + u8 cong_cwnd; /* Congestion window size */ + u8 cong_extra; /* Extra to send for congestion management */ + u8 cong_ssthresh; /* Slow-start threshold */ + enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ + u8 cong_dup_acks; /* Count of ACKs showing missing packets */ + u8 cong_cumul_acks; /* Cumulative ACK count */ + ktime_t cong_tstamp; /* Last time cwnd was changed */ + + rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not + * consumed packet follows this. */ + rxrpc_seq_t rx_top; /* Highest Rx slot allocated. 
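Because RXRPC_RXTX_BUFF_SIZE is a power of two, a 32-bit sequence number maps straight to its ring slot with one mask, and the live window delimited by hard_ack and top behaves correctly across sequence wrap. A standalone check of that indexing, with window values chosen to straddle the wrap:

#include <assert.h>
#include <stdint.h>

#define BUFF_SIZE 64u
#define BUFF_MASK (BUFF_SIZE - 1)

int main(void)
{
        const void *ring[BUFF_SIZE] = { 0 };
        static int page;                        /* stand-in payload */
        uint32_t hard_ack = 0xfffffffe, top = 0x00000001;
        uint32_t seq;

        assert(top - hard_ack == 3);    /* modular size of (hard_ack, top] */
        for (seq = hard_ack + 1; seq != top + 1; seq++)
                ring[seq & BUFF_MASK] = &page;  /* slots reuse automatically */
        return 0;
}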
*/ + rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ + u8 rx_winsize; /* Size of Rx window */ + u8 tx_winsize; /* Maximum size of Tx window */ + bool tx_phase; /* T if transmission phase, F if receive phase */ + u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ /* receive-phase ACK management */ - rxrpc_seq_t rx_data_expect; /* next data seq ID expected to be received */ - rxrpc_seq_t rx_data_post; /* next data seq ID expected to be posted */ - rxrpc_seq_t rx_data_recv; /* last data seq ID encountered by recvmsg */ - rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */ - rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */ - rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */ - rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ u8 ackr_reason; /* reason to ACK */ + u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - atomic_t ackr_not_idle; /* number of packets in Rx queue */ - - /* received packet records, 1 bit per record */ -#define RXRPC_ACKR_WINDOW_ASZ DIV_ROUND_UP(RXRPC_MAXACKS, BITS_PER_LONG) - unsigned long ackr_window[RXRPC_ACKR_WINDOW_ASZ + 1]; + rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ + rxrpc_seq_t ackr_consumed; /* Highest packet shown consumed */ + rxrpc_seq_t ackr_seen; /* Highest packet shown seen */ + rxrpc_serial_t ackr_ping; /* Last ping sent */ + ktime_t ackr_ping_time; /* Time last ping sent */ - u8 in_clientflag; /* Copy of conn->in_clientflag */ - struct rxrpc_local *local; /* Local endpoint. */ - u32 call_id; /* call ID on connection */ - u32 cid; /* connection ID plus channel index */ - u32 epoch; /* epoch of this connection */ - u16 service_id; /* service ID */ + /* transmission-phase ACK management */ + ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ + rxrpc_serial_t acks_latest; /* serial number of latest ACK received */ + rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ }; /* - * locally abort an RxRPC call + * Summary of a new ACK and the changes it made to the Tx buffer packet states. 
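All of this ACK bookkeeping leans on modular sequence comparison, which the before()/after() helpers added to ar-internal.h further down implement with a signed subtraction. A quick userspace check that the trick holds across 32-bit wrap:

#include <assert.h>
#include <stdint.h>

static int seq_before(uint32_t a, uint32_t b)   /* same trick as before() */
{
        return (int32_t)(a - b) < 0;
}

int main(void)
{
        assert(seq_before(1, 2));
        assert(seq_before(0xfffffffe, 1));      /* still "before" across wrap */
        assert(!seq_before(2, 0xffffffff));
        return 0;
}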
*/ -static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) -{ - write_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - call->local_abort = abort_code; - call->state = RXRPC_CALL_LOCALLY_ABORTED; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - } - write_unlock_bh(&call->state_lock); -} +struct rxrpc_ack_summary { + u8 ack_reason; + u8 nr_acks; /* Number of ACKs in packet */ + u8 nr_nacks; /* Number of NACKs in packet */ + u8 nr_new_acks; /* Number of new ACKs in packet */ + u8 nr_new_nacks; /* Number of new NACKs in packet */ + u8 nr_rot_new_acks; /* Number of rotated new ACKs */ + bool new_low_nack; /* T if new low NACK found */ + bool retrans_timeo; /* T if reTx due to timeout happened */ + u8 flight_size; /* Number of unreceived transmissions */ + /* Place to stash values for tracing */ + enum rxrpc_congest_mode mode:8; + u8 cwnd; + u8 ssthresh; + u8 dup_acks; + u8 cumulative_acks; +}; + +enum rxrpc_skb_trace { + rxrpc_skb_rx_cleaned, + rxrpc_skb_rx_freed, + rxrpc_skb_rx_got, + rxrpc_skb_rx_lost, + rxrpc_skb_rx_received, + rxrpc_skb_rx_rotated, + rxrpc_skb_rx_purged, + rxrpc_skb_rx_seen, + rxrpc_skb_tx_cleaned, + rxrpc_skb_tx_freed, + rxrpc_skb_tx_got, + rxrpc_skb_tx_new, + rxrpc_skb_tx_rotated, + rxrpc_skb_tx_seen, + rxrpc_skb__nr_trace +}; + +extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; + +enum rxrpc_conn_trace { + rxrpc_conn_new_client, + rxrpc_conn_new_service, + rxrpc_conn_queued, + rxrpc_conn_seen, + rxrpc_conn_got, + rxrpc_conn_put_client, + rxrpc_conn_put_service, + rxrpc_conn__nr_trace +}; + +extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; + +enum rxrpc_client_trace { + rxrpc_client_activate_chans, + rxrpc_client_alloc, + rxrpc_client_chan_activate, + rxrpc_client_chan_disconnect, + rxrpc_client_chan_pass, + rxrpc_client_chan_unstarted, + rxrpc_client_cleanup, + rxrpc_client_count, + rxrpc_client_discard, + rxrpc_client_duplicate, + rxrpc_client_exposed, + rxrpc_client_replace, + rxrpc_client_to_active, + rxrpc_client_to_culled, + rxrpc_client_to_idle, + rxrpc_client_to_inactive, + rxrpc_client_to_waiting, + rxrpc_client_uncount, + rxrpc_client__nr_trace +}; + +extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; +extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; + +enum rxrpc_call_trace { + rxrpc_call_new_client, + rxrpc_call_new_service, + rxrpc_call_queued, + rxrpc_call_queued_ref, + rxrpc_call_seen, + rxrpc_call_connected, + rxrpc_call_release, + rxrpc_call_got, + rxrpc_call_got_userid, + rxrpc_call_got_kernel, + rxrpc_call_put, + rxrpc_call_put_userid, + rxrpc_call_put_kernel, + rxrpc_call_put_noqueue, + rxrpc_call_error, + rxrpc_call__nr_trace +}; + +extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; + +enum rxrpc_transmit_trace { + rxrpc_transmit_wait, + rxrpc_transmit_queue, + rxrpc_transmit_queue_last, + rxrpc_transmit_rotate, + rxrpc_transmit_rotate_last, + rxrpc_transmit_await_reply, + rxrpc_transmit_end, + rxrpc_transmit__nr_trace +}; + +extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; + +enum rxrpc_receive_trace { + rxrpc_receive_incoming, + rxrpc_receive_queue, + rxrpc_receive_queue_last, + rxrpc_receive_front, + rxrpc_receive_rotate, + rxrpc_receive_end, + rxrpc_receive__nr_trace +}; + +extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; + +enum rxrpc_recvmsg_trace { + rxrpc_recvmsg_enter, + rxrpc_recvmsg_wait, + rxrpc_recvmsg_dequeue, + rxrpc_recvmsg_hole, + rxrpc_recvmsg_next, + rxrpc_recvmsg_cont, + 
rxrpc_recvmsg_full, + rxrpc_recvmsg_data_return, + rxrpc_recvmsg_terminal, + rxrpc_recvmsg_to_be_accepted, + rxrpc_recvmsg_return, + rxrpc_recvmsg__nr_trace +}; + +extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; + +enum rxrpc_rtt_tx_trace { + rxrpc_rtt_tx_ping, + rxrpc_rtt_tx_data, + rxrpc_rtt_tx__nr_trace +}; + +extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; + +enum rxrpc_rtt_rx_trace { + rxrpc_rtt_rx_ping_response, + rxrpc_rtt_rx_requested_ack, + rxrpc_rtt_rx__nr_trace +}; + +extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; + +enum rxrpc_timer_trace { + rxrpc_timer_begin, + rxrpc_timer_init_for_reply, + rxrpc_timer_expired, + rxrpc_timer_set_for_ack, + rxrpc_timer_set_for_resend, + rxrpc_timer_set_for_send, + rxrpc_timer__nr_trace +}; + +extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; + +enum rxrpc_propose_ack_trace { + rxrpc_propose_ack_client_tx_end, + rxrpc_propose_ack_input_data, + rxrpc_propose_ack_ping_for_lost_ack, + rxrpc_propose_ack_ping_for_lost_reply, + rxrpc_propose_ack_ping_for_params, + rxrpc_propose_ack_respond_to_ack, + rxrpc_propose_ack_respond_to_ping, + rxrpc_propose_ack_retry_tx, + rxrpc_propose_ack_rotate_rx, + rxrpc_propose_ack_terminal_ack, + rxrpc_propose_ack__nr_trace +}; + +enum rxrpc_propose_ack_outcome { + rxrpc_propose_ack_use, + rxrpc_propose_ack_update, + rxrpc_propose_ack_subsume, + rxrpc_propose_ack__nr_outcomes +}; + +extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; +extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; + +enum rxrpc_congest_change { + rxrpc_cong_begin_retransmission, + rxrpc_cong_cleared_nacks, + rxrpc_cong_new_low_nack, + rxrpc_cong_no_change, + rxrpc_cong_progress, + rxrpc_cong_retransmit_again, + rxrpc_cong_rtt_window_end, + rxrpc_cong_saw_nack, + rxrpc_congest__nr_change +}; + +extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; +extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; + +extern const char *const rxrpc_pkts[]; +extern const char const rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; + +#include <trace/events/rxrpc.h> /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_skbs; +extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; extern u32 rxrpc_epoch; extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; @@ -495,70 +792,178 @@ extern struct workqueue_struct *rxrpc_workqueue; /* * call_accept.c */ +int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); +void rxrpc_discard_prealloc(struct rxrpc_sock *); +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, + struct rxrpc_connection *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); -struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long); +struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *, unsigned long, + rxrpc_notify_rx_t); int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); +void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); /* * call_object.c */ +extern const char *const rxrpc_call_states[]; +extern const char *const rxrpc_call_completions[]; extern unsigned int rxrpc_max_call_lifetime; -extern unsigned int 
rxrpc_dead_call_expiry;
 extern struct kmem_cache *rxrpc_call_jar;
 extern struct list_head rxrpc_calls;
 extern rwlock_t rxrpc_call_lock;
 
 struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
+struct rxrpc_call *rxrpc_alloc_call(gfp_t);
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
                                          struct rxrpc_conn_parameters *,
                                          struct sockaddr_rxrpc *,
                                          unsigned long, gfp_t);
-struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
-                                       struct rxrpc_connection *,
-                                       struct sk_buff *);
-void rxrpc_release_call(struct rxrpc_call *);
+void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *,
+                         struct sk_buff *);
+void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *);
 void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
-void __rxrpc_put_call(struct rxrpc_call *);
+bool __rxrpc_queue_call(struct rxrpc_call *);
+bool rxrpc_queue_call(struct rxrpc_call *);
+void rxrpc_see_call(struct rxrpc_call *);
+void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace);
+void rxrpc_cleanup_call(struct rxrpc_call *);
 void __exit rxrpc_destroy_all_calls(void);
 
+static inline bool rxrpc_is_service_call(const struct rxrpc_call *call)
+{
+        return test_bit(RXRPC_CALL_IS_SERVICE, &call->flags);
+}
+
+static inline bool rxrpc_is_client_call(const struct rxrpc_call *call)
+{
+        return !rxrpc_is_service_call(call);
+}
+
+/*
+ * Transition a call to the complete state.
+ */
+static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
+                                               enum rxrpc_call_completion compl,
+                                               u32 abort_code,
+                                               int error)
+{
+        if (call->state < RXRPC_CALL_COMPLETE) {
+                call->abort_code = abort_code;
+                call->error = error;
+                call->completion = compl;
+                call->state = RXRPC_CALL_COMPLETE;
+                wake_up(&call->waitq);
+                return true;
+        }
+        return false;
+}
+
+static inline bool rxrpc_set_call_completion(struct rxrpc_call *call,
+                                             enum rxrpc_call_completion compl,
+                                             u32 abort_code,
+                                             int error)
+{
+        bool ret;
+
+        write_lock_bh(&call->state_lock);
+        ret = __rxrpc_set_call_completion(call, compl, abort_code, error);
+        write_unlock_bh(&call->state_lock);
+        return ret;
+}
+
+/*
+ * Record that a call successfully completed.
+ */
+static inline bool __rxrpc_call_completed(struct rxrpc_call *call)
+{
+        return __rxrpc_set_call_completion(call, RXRPC_CALL_SUCCEEDED, 0, 0);
+}
+
+static inline bool rxrpc_call_completed(struct rxrpc_call *call)
+{
+        bool ret;
+
+        write_lock_bh(&call->state_lock);
+        ret = __rxrpc_call_completed(call);
+        write_unlock_bh(&call->state_lock);
+        return ret;
+}
+
+/*
+ * Record that a call is locally aborted.
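The __rxrpc_set_call_completion()/rxrpc_set_call_completion() pairing above follows the usual kernel convention: the double-underscore variant assumes state_lock is already held, the plain variant takes it. A userspace pthread sketch of the same first-completion-wins pattern (names invented):

#include <pthread.h>
#include <stdbool.h>

enum { CALL_ACTIVE, CALL_COMPLETE };

struct call {
        pthread_rwlock_t state_lock;
        int state;
        int error;
};

static bool __call_complete(struct call *c, int error) /* lock held */
{
        if (c->state != CALL_COMPLETE) {
                c->error = error;
                c->state = CALL_COMPLETE;
                return true;
        }
        return false;   /* already complete; first completion wins */
}

static bool call_complete(struct call *c, int error)
{
        bool ret;

        pthread_rwlock_wrlock(&c->state_lock);
        ret = __call_complete(c, error);
        pthread_rwlock_unlock(&c->state_lock);
        return ret;
}

int main(void)
{
        struct call c = { .state_lock = PTHREAD_RWLOCK_INITIALIZER,
                          .state = CALL_ACTIVE };

        return call_complete(&c, 0) ? 0 : 1;
}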
+ */ +static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, + u32 abort_code, int error) +{ + trace_rxrpc_abort(why, call->cid, call->call_id, seq, + abort_code, error); + return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, + abort_code, error); +} + +static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, + rxrpc_seq_t seq, u32 abort_code, int error) +{ + bool ret; + + write_lock_bh(&call->state_lock); + ret = __rxrpc_abort_call(why, call, seq, abort_code, error); + write_unlock_bh(&call->state_lock); + return ret; +} + /* * conn_client.c */ +extern unsigned int rxrpc_max_client_connections; +extern unsigned int rxrpc_reap_client_connections; +extern unsigned int rxrpc_conn_idle_client_expiry; +extern unsigned int rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); int rxrpc_connect_call(struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); -void rxrpc_unpublish_client_conn(struct rxrpc_connection *); +void rxrpc_expose_client_call(struct rxrpc_call *); +void rxrpc_disconnect_client_call(struct rxrpc_call *); +void rxrpc_put_client_conn(struct rxrpc_connection *); +void __exit rxrpc_destroy_all_client_connections(void); /* * conn_event.c */ void rxrpc_process_connection(struct work_struct *); -void rxrpc_reject_packet(struct rxrpc_local *, struct sk_buff *); -void rxrpc_reject_packets(struct rxrpc_local *); /* * conn_object.c */ extern unsigned int rxrpc_connection_expiry; extern struct list_head rxrpc_connections; +extern struct list_head rxrpc_connection_proc_list; extern rwlock_t rxrpc_connection_lock; int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, struct sk_buff *); -void __rxrpc_disconnect_call(struct rxrpc_call *); +void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); -void rxrpc_put_connection(struct rxrpc_connection *); +void rxrpc_kill_connection(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *); +void rxrpc_see_connection(struct rxrpc_connection *); +void rxrpc_get_connection(struct rxrpc_connection *); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); +void rxrpc_put_service_conn(struct rxrpc_connection *); void __exit rxrpc_destroy_all_connections(void); static inline bool rxrpc_conn_is_client(const struct rxrpc_connection *conn) @@ -571,24 +976,15 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_get_connection(struct rxrpc_connection *conn) -{ - atomic_inc(&conn->usage); -} - -static inline -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +static inline void rxrpc_put_connection(struct rxrpc_connection *conn) { - return atomic_inc_not_zero(&conn->usage) ? 
conn : NULL; -} + if (!conn) + return; -static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) -{ - if (!rxrpc_get_connection_maybe(conn)) - return false; - if (!rxrpc_queue_work(&conn->processor)) - rxrpc_put_connection(conn); - return true; + if (rxrpc_conn_is_client(conn)) + rxrpc_put_client_conn(conn); + else + rxrpc_put_service_conn(conn); } /* @@ -596,17 +992,14 @@ static inline bool rxrpc_queue_conn(struct rxrpc_connection *conn) */ struct rxrpc_connection *rxrpc_find_service_conn_rcu(struct rxrpc_peer *, struct sk_buff *); -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *, - struct sockaddr_rxrpc *, - struct sk_buff *); +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t); +void rxrpc_new_incoming_connection(struct rxrpc_connection *, struct sk_buff *); void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ void rxrpc_data_ready(struct sock *); -int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); -void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); /* * insecure.c @@ -668,25 +1061,24 @@ extern unsigned int rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; +extern unsigned int rxrpc_resend_timeout; -extern const char *const rxrpc_pkts[]; extern const s8 rxrpc_ack_priority[]; -extern const char *rxrpc_acks(u8 reason); - /* * output.c */ -extern unsigned int rxrpc_resend_timeout; - -int rxrpc_send_data_packet(struct rxrpc_connection *, struct sk_buff *); -int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); +int rxrpc_send_call_packet(struct rxrpc_call *, u8); +int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); +void rxrpc_reject_packets(struct rxrpc_local *); /* * peer_event.c */ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); +void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, + rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); /* * peer_object.c @@ -696,10 +1088,13 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, + struct rxrpc_peer *); -static inline void rxrpc_get_peer(struct rxrpc_peer *peer) +static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { atomic_inc(&peer->usage); + return peer; } static inline @@ -718,14 +1113,13 @@ static inline void rxrpc_put_peer(struct rxrpc_peer *peer) /* * proc.c */ -extern const char *const rxrpc_call_states[]; extern const struct file_operations rxrpc_call_seq_fops; extern const struct file_operations rxrpc_connection_seq_fops; /* * recvmsg.c */ -void rxrpc_remove_user_ID(struct rxrpc_sock *, struct rxrpc_call *); +void rxrpc_notify_socket(struct rxrpc_call *); int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* @@ -744,9 +1138,21 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *); int rxrpc_init_server_conn_security(struct rxrpc_connection *); /* + * sendmsg.c + */ +int rxrpc_do_sendmsg(struct rxrpc_sock *, struct msghdr *, size_t); + +/* * skbuff.c */ +void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); void rxrpc_packet_destructor(struct sk_buff *); +void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); +void 
rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_lose_skb(struct sk_buff *, enum rxrpc_skb_trace); +void rxrpc_purge_queue(struct sk_buff_head *); /* * sysctl.c @@ -764,6 +1170,23 @@ static inline void rxrpc_sysctl_exit(void) {} */ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *, struct sk_buff *); +static inline bool before(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) < 0; +} +static inline bool before_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) <= 0; +} +static inline bool after(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) > 0; +} +static inline bool after_eq(u32 seq1, u32 seq2) +{ + return (s32)(seq1 - seq2) >= 0; +} + /* * debug tracing */ @@ -846,11 +1269,12 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ - unsigned long _x = (unsigned long)(X); \ - unsigned long _y = (unsigned long)(Y); \ + __typeof__(X) _x = (X); \ + __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely(!(_x OP _y))) { \ - pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ - _x, _x, #OP, _y, _y); \ + pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ + (unsigned long)_x, (unsigned long)_x, #OP, \ + (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) @@ -865,11 +1289,12 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ - unsigned long _x = (unsigned long)(X); \ - unsigned long _y = (unsigned long)(Y); \ + __typeof__(X) _x = (X); \ + __typeof__(Y) _y = (__typeof__(X))(Y); \ if (unlikely((C) && !(_x OP _y))) { \ pr_err("Assertion failed - %lu(0x%lx) %s %lu(0x%lx) is false\n", \ - _x, _x, #OP, _y, _y); \ + (unsigned long)_x, (unsigned long)_x, #OP, \ + (unsigned long)_y, (unsigned long)_y); \ BUG(); \ } \ } while (0) @@ -893,54 +1318,3 @@ do { \ } while (0) #endif /* __KDEBUGALL */ - -/* - * socket buffer accounting / leak finding - */ -static inline void __rxrpc_new_skb(struct sk_buff *skb, const char *fn) -{ - //_net("new skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_inc(&rxrpc_n_skbs); -} - -#define rxrpc_new_skb(skb) __rxrpc_new_skb((skb), __func__) - -static inline void __rxrpc_kill_skb(struct sk_buff *skb, const char *fn) -{ - //_net("kill skb %p %s [%d]", skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); -} - -#define rxrpc_kill_skb(skb) __rxrpc_kill_skb((skb), __func__) - -static inline void __rxrpc_free_skb(struct sk_buff *skb, const char *fn) -{ - if (skb) { - CHECK_SLAB_OKAY(&skb->users); - //_net("free skb %p %s [%d]", - // skb, fn, atomic_read(&rxrpc_n_skbs)); - //atomic_dec(&rxrpc_n_skbs); - kfree_skb(skb); - } -} - -#define rxrpc_free_skb(skb) __rxrpc_free_skb((skb), __func__) - -static inline void rxrpc_purge_queue(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = skb_dequeue((list))) != NULL) - rxrpc_free_skb(skb); -} - -#define rxrpc_get_call(CALL) \ -do { \ - CHECK_SLAB_OKAY(&(CALL)->usage); \ - if (atomic_inc_return(&(CALL)->usage) == 1) \ - BUG(); \ -} while (0) - -#define rxrpc_put_call(CALL) \ -do { \ - __rxrpc_put_call(CALL); \ -} while (0) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 9bae21e66d65..3cac231d8405 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -20,265 +20,409 @@ #include <linux/in6.h> #include <linux/icmp.h> #include <linux/gfp.h> +#include <linux/circ_buf.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include "ar-internal.h" /* - * generate 
a connection-level abort + * Preallocate a single service call, connection and peer and, if possible, + * give them a user ID and attach the user's side of the ID to them. */ -static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx, - struct rxrpc_wire_header *whdr) +static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, + struct rxrpc_backlog *b, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) { - struct msghdr msg; - struct kvec iov[1]; - size_t len; - int ret; + const void *here = __builtin_return_address(0); + struct rxrpc_call *call; + int max, tmp; + unsigned int size = RXRPC_BACKLOG_MAX; + unsigned int head, tail, call_head, call_tail; + + max = rx->sk.sk_max_ack_backlog; + tmp = rx->sk.sk_ack_backlog; + if (tmp >= max) { + _leave(" = -ENOBUFS [full %u]", max); + return -ENOBUFS; + } + max -= tmp; + + /* We don't need more conns and peers than we have calls, but on the + * other hand, we shouldn't ever use more peers than conns or conns + * than calls. + */ + call_head = b->call_backlog_head; + call_tail = READ_ONCE(b->call_backlog_tail); + tmp = CIRC_CNT(call_head, call_tail, size); + if (tmp >= max) { + _leave(" = -ENOBUFS [enough %u]", tmp); + return -ENOBUFS; + } + max = tmp + 1; + + head = b->peer_backlog_head; + tail = READ_ONCE(b->peer_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp); + if (!peer) + return -ENOMEM; + b->peer_backlog[head] = peer; + smp_store_release(&b->peer_backlog_head, + (head + 1) & (size - 1)); + } - _enter("%d,,", local->debug_id); + head = b->conn_backlog_head; + tail = READ_ONCE(b->conn_backlog_tail); + if (CIRC_CNT(head, tail, size) < max) { + struct rxrpc_connection *conn; - whdr->type = RXRPC_PACKET_TYPE_BUSY; - whdr->serial = htonl(1); + conn = rxrpc_prealloc_service_connection(gfp); + if (!conn) + return -ENOMEM; + b->conn_backlog[head] = conn; + smp_store_release(&b->conn_backlog_head, + (head + 1) & (size - 1)); - msg.msg_name = &srx->transport.sin; - msg.msg_namelen = sizeof(srx->transport.sin); - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), here); + } - iov[0].iov_base = whdr; - iov[0].iov_len = sizeof(*whdr); + /* Now it gets complicated, because calls get registered with the + * socket here, particularly if a user ID is preassigned by the user. 
+ */ + call = rxrpc_alloc_call(gfp); + if (!call) + return -ENOMEM; + call->flags |= (1 << RXRPC_CALL_IS_SERVICE); + call->state = RXRPC_CALL_SERVER_PREALLOC; - len = iov[0].iov_len; + trace_rxrpc_call(call, rxrpc_call_new_service, + atomic_read(&call->usage), + here, (const void *)user_call_ID); - _proto("Tx BUSY %%1"); + write_lock(&rx->call_lock); + if (user_attach_call) { + struct rxrpc_call *xcall; + struct rb_node *parent, **pp; + + /* Check the user ID isn't already in use */ + pp = &rx->calls.rb_node; + parent = NULL; + while (*pp) { + parent = *pp; + xcall = rb_entry(parent, struct rxrpc_call, sock_node); + if (user_call_ID < call->user_call_ID) + pp = &(*pp)->rb_left; + else if (user_call_ID > call->user_call_ID) + pp = &(*pp)->rb_right; + else + goto id_in_use; + } - ret = kernel_sendmsg(local->socket, &msg, iov, 1, len); - if (ret < 0) { - _leave(" = -EAGAIN [sendmsg failed: %d]", ret); - return -EAGAIN; + call->user_call_ID = user_call_ID; + call->notify_rx = notify_rx; + rxrpc_get_call(call, rxrpc_call_got_kernel); + user_attach_call(call, user_call_ID); + rxrpc_get_call(call, rxrpc_call_got_userid); + rb_link_node(&call->sock_node, parent, pp); + rb_insert_color(&call->sock_node, &rx->calls); + set_bit(RXRPC_CALL_HAS_USERID, &call->flags); } - _leave(" = 0"); + list_add(&call->sock_link, &rx->sock_calls); + + write_unlock(&rx->call_lock); + + write_lock(&rxrpc_call_lock); + list_add_tail(&call->link, &rxrpc_calls); + write_unlock(&rxrpc_call_lock); + + b->call_backlog[call_head] = call; + smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); + _leave(" = 0 [%d -> %lx]", call->debug_id, user_call_ID); return 0; + +id_in_use: + write_unlock(&rx->call_lock); + rxrpc_cleanup_call(call); + _leave(" = -EBADSLT"); + return -EBADSLT; } /* - * accept an incoming call that needs peer, transport and/or connection setting - * up + * Preallocate sufficient service connections, calls and peers to cover the + * entire backlog of a socket. When a new call comes in, if we don't have + * sufficient of each available, the call gets rejected as busy or ignored. + * + * The backlog is replenished when a connection is accepted or rejected. */ -static int rxrpc_accept_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sk_buff *skb, - struct sockaddr_rxrpc *srx) +int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) { - struct rxrpc_connection *conn; - struct rxrpc_skb_priv *sp, *nsp; - struct rxrpc_call *call; - struct sk_buff *notification; - int ret; + struct rxrpc_backlog *b = rx->backlog; - _enter(""); + if (!b) { + b = kzalloc(sizeof(struct rxrpc_backlog), gfp); + if (!b) + return -ENOMEM; + rx->backlog = b; + } + + if (rx->discard_new_call) + return 0; + + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + ; - sp = rxrpc_skb(skb); + return 0; +} - /* get a notification message to send to the server app */ - notification = alloc_skb(0, GFP_NOFS); - if (!notification) { - _debug("no memory"); - ret = -ENOMEM; - goto error_nofree; +/* + * Discard the preallocation on a service. + */ +void rxrpc_discard_prealloc(struct rxrpc_sock *rx) +{ + struct rxrpc_backlog *b = rx->backlog; + unsigned int size = RXRPC_BACKLOG_MAX, head, tail; + + if (!b) + return; + rx->backlog = NULL; + + /* Make sure that there aren't any incoming calls in progress before we + * clear the preallocation buffers. 
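One quirk worth flagging in the rb-tree walk of rxrpc_service_prealloc_one() above: the duplicate check compares user_call_ID against call->user_call_ID, the ID of the call being inserted, which is still unset at that point, where it presumably means xcall->user_call_ID; as written, colliding preallocated IDs are not actually caught (later kernels compare against xcall). A standalone sketch of the intended check-before-insert walk, with plain pointers standing in for rb_node:

#include <stdio.h>
#include <stdlib.h>

struct node {
        unsigned long id;
        struct node *left, *right;
};

/* Returns 0 on insert, -1 if id is already present (cf. -EBADSLT above). */
static int insert_unique(struct node **root, unsigned long id)
{
        struct node **pp = root, *x;

        while (*pp) {
                x = *pp;                /* the existing node: "xcall" */
                if (id < x->id)
                        pp = &x->left;
                else if (id > x->id)
                        pp = &x->right;
                else
                        return -1;      /* id in use */
        }
        x = calloc(1, sizeof(*x));
        if (!x)
                return -1;
        x->id = id;
        *pp = x;
        return 0;
}

int main(void)
{
        struct node *root = NULL;

        insert_unique(&root, 1);
        printf("re-insert 1 -> %d\n", insert_unique(&root, 1)); /* -1 */
        return 0;
}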
+ */ + spin_lock_bh(&rx->incoming_lock); + spin_unlock_bh(&rx->incoming_lock); + + head = b->peer_backlog_head; + tail = b->peer_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_peer *peer = b->peer_backlog[tail]; + kfree(peer); + tail = (tail + 1) & (size - 1); } - rxrpc_new_skb(notification); - notification->mark = RXRPC_SKB_MARK_NEW_CALL; - - conn = rxrpc_incoming_connection(local, srx, skb); - if (IS_ERR(conn)) { - _debug("no conn"); - ret = PTR_ERR(conn); - goto error; + + head = b->conn_backlog_head; + tail = b->conn_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_connection *conn = b->conn_backlog[tail]; + write_lock(&rxrpc_connection_lock); + list_del(&conn->link); + list_del(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); + kfree(conn); + tail = (tail + 1) & (size - 1); } - call = rxrpc_incoming_call(rx, conn, skb); - rxrpc_put_connection(conn); - if (IS_ERR(call)) { - _debug("no call"); - ret = PTR_ERR(call); - goto error; + head = b->call_backlog_head; + tail = b->call_backlog_tail; + while (CIRC_CNT(head, tail, size) > 0) { + struct rxrpc_call *call = b->call_backlog[tail]; + if (rx->discard_new_call) { + _debug("discard %lx", call->user_call_ID); + rx->discard_new_call(call, call->user_call_ID); + rxrpc_put_call(call, rxrpc_call_put_kernel); + } + rxrpc_call_completed(call); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + tail = (tail + 1) & (size - 1); } - /* attach the call to the socket */ - read_lock_bh(&local->services_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) - goto invalid_service; + kfree(b); +} - write_lock(&rx->call_lock); - if (!test_and_set_bit(RXRPC_CALL_INIT_ACCEPT, &call->flags)) { - rxrpc_get_call(call); - - spin_lock(&call->conn->state_lock); - if (sp->hdr.securityIndex > 0 && - call->conn->state == RXRPC_CONN_SERVICE_UNSECURED) { - _debug("await conn sec"); - list_add_tail(&call->accept_link, &rx->secureq); - call->conn->state = RXRPC_CONN_SERVICE_CHALLENGING; - set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); - } else { - _debug("conn ready"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_add_tail(&call->accept_link, &rx->acceptq); - rxrpc_get_call(call); - atomic_inc(&call->skb_count); - nsp = rxrpc_skb(notification); - nsp->call = call; - - ASSERTCMP(atomic_read(&call->usage), >=, 3); - - _debug("notify"); - spin_lock(&call->lock); - ret = rxrpc_queue_rcv_skb(call, notification, true, - false); - spin_unlock(&call->lock); - notification = NULL; - BUG_ON(ret < 0); +/* + * Allocate a new incoming call from the prealloc pool, along with a connection + * and a peer as necessary. + */ +static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_backlog *b = rx->backlog; + struct rxrpc_peer *peer, *xpeer; + struct rxrpc_call *call; + unsigned short call_head, conn_head, peer_head; + unsigned short call_tail, conn_tail, peer_tail; + unsigned short call_count, conn_count; + + /* #calls >= #conns >= #peers must hold true. 
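The backlog rings drained above are classic single-producer/single-consumer circular buffers: one side fills a slot and then publishes the index with smp_store_release(), the other reads the opposite index with smp_load_acquire(), and CIRC_CNT() from include/linux/circ_buf.h measures occupancy. A compilable userspace analogue using C11 atomics in place of the kernel barriers:

#include <stdatomic.h>
#include <stdio.h>

#define RING_SIZE 32    /* must be a power of two, like RXRPC_BACKLOG_MAX */
#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))

static int slots[RING_SIZE];
static _Atomic unsigned int head, tail;

static int produce(int val)
{
        unsigned int h = atomic_load_explicit(&head, memory_order_relaxed);
        unsigned int t = atomic_load_explicit(&tail, memory_order_acquire);

        if (CIRC_CNT(h, t, RING_SIZE) == RING_SIZE - 1)
                return -1;      /* full: one slot always stays empty */
        slots[h] = val;
        /* publish the slot before moving head, like smp_store_release() */
        atomic_store_explicit(&head, (h + 1) & (RING_SIZE - 1),
                              memory_order_release);
        return 0;
}

static int consume(int *val)
{
        unsigned int t = atomic_load_explicit(&tail, memory_order_relaxed);
        unsigned int h = atomic_load_explicit(&head, memory_order_acquire);

        if (CIRC_CNT(h, t, RING_SIZE) == 0)
                return -1;      /* empty */
        *val = slots[t];
        atomic_store_explicit(&tail, (t + 1) & (RING_SIZE - 1),
                              memory_order_release);
        return 0;
}

int main(void)
{
        int v;

        produce(42);
        if (consume(&v) == 0)
                printf("got %d\n", v);
        return 0;
}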
*/ + call_head = smp_load_acquire(&b->call_backlog_head); + call_tail = b->call_backlog_tail; + call_count = CIRC_CNT(call_head, call_tail, RXRPC_BACKLOG_MAX); + conn_head = smp_load_acquire(&b->conn_backlog_head); + conn_tail = b->conn_backlog_tail; + conn_count = CIRC_CNT(conn_head, conn_tail, RXRPC_BACKLOG_MAX); + ASSERTCMP(conn_count, >=, call_count); + peer_head = smp_load_acquire(&b->peer_backlog_head); + peer_tail = b->peer_backlog_tail; + ASSERTCMP(CIRC_CNT(peer_head, peer_tail, RXRPC_BACKLOG_MAX), >=, + conn_count); + + if (call_count == 0) + return NULL; + + if (!conn) { + /* No connection. We're going to need a peer to start off + * with. If one doesn't yet exist, use a spare from the + * preallocation set. We dump the address into the spare in + * anticipation - and to save on stack space. + */ + xpeer = b->peer_backlog[peer_tail]; + if (rxrpc_extract_addr_from_skb(&xpeer->srx, skb) < 0) + return NULL; + + peer = rxrpc_lookup_incoming_peer(local, xpeer); + if (peer == xpeer) { + b->peer_backlog[peer_tail] = NULL; + smp_store_release(&b->peer_backlog_tail, + (peer_tail + 1) & + (RXRPC_BACKLOG_MAX - 1)); } - spin_unlock(&call->conn->state_lock); - _debug("queued"); + /* Now allocate and set up the connection */ + conn = b->conn_backlog[conn_tail]; + b->conn_backlog[conn_tail] = NULL; + smp_store_release(&b->conn_backlog_tail, + (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); + rxrpc_get_local(local); + conn->params.local = local; + conn->params.peer = peer; + rxrpc_see_connection(conn); + rxrpc_new_incoming_connection(conn, skb); + } else { + rxrpc_get_connection(conn); } - write_unlock(&rx->call_lock); - _debug("process"); - rxrpc_fast_process_packet(call, skb); + /* And now we can allocate and set up a new call */ + call = b->call_backlog[call_tail]; + b->call_backlog[call_tail] = NULL; + smp_store_release(&b->call_backlog_tail, + (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - _debug("done"); - read_unlock_bh(&local->services_lock); - rxrpc_free_skb(notification); - rxrpc_put_call(call); - _leave(" = 0"); - return 0; - -invalid_service: - _debug("invalid"); - read_unlock_bh(&local->services_lock); - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_get_call(call); - rxrpc_queue_call(call); - } - read_unlock_bh(&call->state_lock); - rxrpc_put_call(call); - ret = -ECONNREFUSED; -error: - rxrpc_free_skb(notification); -error_nofree: - _leave(" = %d", ret); - return ret; + rxrpc_see_call(call); + call->conn = conn; + call->peer = rxrpc_get_peer(conn->params.peer); + return call; } /* - * accept incoming calls that need peer, transport and/or connection setting up - * - the packets we get are all incoming client DATA packets that have seq == 1 + * Set up a new incoming call. Called in BH context with the RCU read lock + * held. + * + * If this is for a kernel service, when we allocate the call, it will have + * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the + * retainer ref obtained from the backlog buffer. Prealloc calls for userspace + * services only have the ref from the backlog buffer. We want to pass this + * ref to non-BH context to dispose of. + * + * If we want to report an error, we mark the skb with the packet type and + * abort code and return NULL. 
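On failure this function returns NULL and encodes the desired response in the skb itself. A sketch of how a caller in the packet input path might consume that convention; rxrpc_reject_packet() is the helper the removed code above used, and its reuse here is an assumption about the new input path:

        	call = rxrpc_new_incoming_call(local, conn, skb);
        	if (!call) {
        		/* skb->mark selects the reply (local abort or busy);
        		 * for an abort, skb->priority carries the abort code. */
        		rxrpc_reject_packet(local, skb);
        		return;
        	}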
*/ -void rxrpc_accept_incoming_calls(struct rxrpc_local *local) +struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_connection *conn, + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp; - struct sockaddr_rxrpc srx; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_sock *rx; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - int ret; + struct rxrpc_call *call; + u16 service_id = sp->hdr.serviceId; - _enter("%d", local->debug_id); + _enter(""); - skb = skb_dequeue(&local->accept_queue); - if (!skb) { - _leave("\n"); - return; + /* Get the socket providing the service */ + rx = rcu_dereference(local->service); + if (service_id == rx->srx.srx_service) + goto found_service; + + trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [service]"); + return NULL; + +found_service: + spin_lock(&rx->incoming_lock); + if (rx->sk.sk_state == RXRPC_CLOSE) { + trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; + skb->priority = RX_INVALID_OPERATION; + _leave(" = NULL [close]"); + call = NULL; + goto out; } - _net("incoming call skb %p", skb); - - sp = rxrpc_skb(skb); - - /* Set up a response packet header in case we need it */ - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = 0; - whdr.flags = 0; - whdr.type = 0; - whdr.userStatus = 0; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = 0; - whdr.serviceId = htons(sp->hdr.serviceId); - - if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) - goto drop; - - /* get the socket providing the service */ - read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == sp->hdr.serviceId && - rx->sk.sk_state != RXRPC_CLOSE) - goto found_service; + call = rxrpc_alloc_incoming_call(rx, local, conn, skb); + if (!call) { + skb->mark = RXRPC_SKB_MARK_BUSY; + _leave(" = NULL [busy]"); + call = NULL; + goto out; } - read_unlock_bh(&local->services_lock); - goto invalid_service; -found_service: - _debug("found service %hd", rx->srx.srx_service); - if (sk_acceptq_is_full(&rx->sk)) - goto backlog_full; - sk_acceptq_added(&rx->sk); - sock_hold(&rx->sk); - read_unlock_bh(&local->services_lock); - - ret = rxrpc_accept_incoming_call(local, rx, skb, &srx); - if (ret < 0) - sk_acceptq_removed(&rx->sk); - sock_put(&rx->sk); - switch (ret) { - case -ECONNRESET: /* old calls are ignored */ - case -ECONNABORTED: /* aborted calls are reaborted or ignored */ - case 0: - return; - case -ECONNREFUSED: - goto invalid_service; - case -EBUSY: - goto busy; - case -EKEYREJECTED: - goto security_mismatch; + trace_rxrpc_receive(call, rxrpc_receive_incoming, + sp->hdr.serial, sp->hdr.seq); + + /* Make the call live. 
*/ + rxrpc_incoming_call(rx, call, skb); + conn = call->conn; + + if (rx->notify_new_call) + rx->notify_new_call(&rx->sk, call, call->user_call_ID); + else + sk_acceptq_added(&rx->sk); + + spin_lock(&conn->state_lock); + switch (conn->state) { + case RXRPC_CONN_SERVICE_UNSECURED: + conn->state = RXRPC_CONN_SERVICE_CHALLENGING; + set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); + rxrpc_queue_conn(call->conn); + break; + + case RXRPC_CONN_SERVICE: + write_lock(&call->state_lock); + if (rx->discard_new_call) + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + else + call->state = RXRPC_CALL_SERVER_ACCEPTING; + write_unlock(&call->state_lock); + break; + + case RXRPC_CONN_REMOTELY_ABORTED: + rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + conn->remote_abort, ECONNABORTED); + break; + case RXRPC_CONN_LOCALLY_ABORTED: + rxrpc_abort_call("CON", call, sp->hdr.seq, + conn->local_abort, ECONNABORTED); + break; default: BUG(); } + spin_unlock(&conn->state_lock); -backlog_full: - read_unlock_bh(&local->services_lock); -busy: - rxrpc_busy(local, &srx, &whdr); - rxrpc_free_skb(skb); - return; + if (call->state == RXRPC_CALL_SERVER_ACCEPTING) + rxrpc_notify_socket(call); -drop: - rxrpc_free_skb(skb); - return; + /* We have to discard the prealloc queue's ref here and rely on a + * combination of the RCU read lock and refs held either by the socket + * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel + * service to prevent the call from being deallocated too early. + */ + rxrpc_put_call(call, rxrpc_call_put); -invalid_service: - skb->priority = RX_INVALID_OPERATION; - rxrpc_reject_packet(local, skb); - return; - - /* can't change connection security type mid-flow */ -security_mismatch: - skb->priority = RX_PROTOCOL_ERROR; - rxrpc_reject_packet(local, skb); - return; + _leave(" = %p{%d}", call, call->debug_id); +out: + spin_unlock(&rx->incoming_lock); + return call; } /* @@ -286,7 +430,8 @@ security_mismatch: * - assign the user call ID to the call at the front of the queue */ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, - unsigned long user_call_ID) + unsigned long user_call_ID, + rxrpc_notify_rx_t notify_rx) { struct rxrpc_call *call; struct rb_node *parent, **pp; @@ -298,12 +443,13 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->acceptq)) - goto out; + if (list_empty(&rx->to_be_accepted)) { + write_unlock(&rx->call_lock); + kleave(" = -ENODATA [empty]"); + return ERR_PTR(-ENODATA); + } /* check the user ID isn't already in use */ - ret = -EBADSLT; pp = &rx->calls.rb_node; parent = NULL; while (*pp) { @@ -315,62 +461,59 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, else if (user_call_ID > call->user_call_ID) pp = &(*pp)->rb_right; else - goto out; + goto id_in_use; } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. 
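rxrpc_accept_call() is the kernel half of a userspace accept: the application learns of a pending call via recvmsg() and answers with a sendmsg() carrying an RXRPC_ACCEPT control message plus the user call ID to assign, per Documentation/networking/rxrpc.txt. A sketch of that user-side step; the exact two-cmsg composition is my reading of the interface, and the constants come from the AF_RXRPC header shipped with the kernel:

        #include <string.h>
        #include <sys/socket.h>
        /* RXRPC_USER_CALL_ID, RXRPC_ACCEPT and SOL_RXRPC are taken from the
         * kernel's AF_RXRPC header. */

        static int rxrpc_accept_one(int fd, unsigned long user_call_id)
        {
        	union {
        		char buf[CMSG_SPACE(sizeof(unsigned long)) + CMSG_SPACE(0)];
        		struct cmsghdr align;
        	} control;
        	struct msghdr msg;
        	struct cmsghdr *cmsg;

        	memset(&control, 0, sizeof(control));
        	memset(&msg, 0, sizeof(msg));
        	msg.msg_control = control.buf;
        	msg.msg_controllen = sizeof(control.buf);

        	cmsg = CMSG_FIRSTHDR(&msg);
        	cmsg->cmsg_level = SOL_RXRPC;
        	cmsg->cmsg_type = RXRPC_USER_CALL_ID;	/* ID to assign */
        	cmsg->cmsg_len = CMSG_LEN(sizeof(user_call_id));
        	memcpy(CMSG_DATA(cmsg), &user_call_id, sizeof(user_call_id));

        	cmsg = CMSG_NXTHDR(&msg, cmsg);
        	cmsg->cmsg_level = SOL_RXRPC;
        	cmsg->cmsg_type = RXRPC_ACCEPT;		/* no payload */
        	cmsg->cmsg_len = CMSG_LEN(0);

        	return sendmsg(fd, &msg, 0);
        }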
+ */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); + rxrpc_see_call(call); write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: call->state = RXRPC_CALL_SERVER_RECV_REQUEST; break; - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_LOCALLY_ABORTED: - ret = -ECONNABORTED; - goto out_release; - case RXRPC_CALL_NETWORK_ERROR: - ret = call->conn->error; + case RXRPC_CALL_COMPLETE: + ret = call->error; goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; - goto out_discard; default: BUG(); } /* formalise the acceptance */ + call->notify_rx = notify_rx; call->user_call_ID = user_call_ID; + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags)) BUG(); - if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) - BUG(); - rxrpc_queue_call(call); - rxrpc_get_call(call); write_unlock_bh(&call->state_lock); write_unlock(&rx->call_lock); + rxrpc_notify_socket(call); + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %p{%d}", call, call->debug_id); return call; - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ out_release: _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); -out_discard: write_unlock_bh(&call->state_lock); - _debug("discard %p", call); -out: write_unlock(&rx->call_lock); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + goto out; + +id_in_use: + ret = -EBADSLT; + write_unlock(&rx->call_lock); +out: + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ERR_PTR(ret); } @@ -382,6 +525,7 @@ out: int rxrpc_reject_call(struct rxrpc_sock *rx) { struct rxrpc_call *call; + bool abort = false; int ret; _enter(""); @@ -390,88 +534,73 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock(&rx->call_lock); - ret = -ENODATA; - if (list_empty(&rx->acceptq)) - goto out; + if (list_empty(&rx->to_be_accepted)) { + write_unlock(&rx->call_lock); + return -ENODATA; + } - /* dequeue the first call and check it's still valid */ - call = list_entry(rx->acceptq.next, struct rxrpc_call, accept_link); + /* Dequeue the first call and check it's still valid. We gain + * responsibility for the queue's reference. 
+ */ + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); list_del_init(&call->accept_link); sk_acceptq_removed(&rx->sk); + rxrpc_see_call(call); write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: - call->state = RXRPC_CALL_SERVER_BUSY; - if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) - rxrpc_queue_call(call); - ret = 0; - goto out_release; - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_LOCALLY_ABORTED: - ret = -ECONNABORTED; - goto out_release; - case RXRPC_CALL_NETWORK_ERROR: - ret = call->conn->error; - goto out_release; - case RXRPC_CALL_DEAD: - ret = -ETIME; + __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED); + abort = true; + /* fall through */ + case RXRPC_CALL_COMPLETE: + ret = call->error; goto out_discard; default: BUG(); } - /* if the call is already dying or dead, then we leave the socket's ref - * on it to be released by rxrpc_dead_call_expired() as induced by - * rxrpc_release_call() */ -out_release: - _debug("release %p", call); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); out_discard: write_unlock_bh(&call->state_lock); - _debug("discard %p", call); -out: write_unlock(&rx->call_lock); + if (abort) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + } + rxrpc_service_prealloc(rx, GFP_KERNEL); _leave(" = %d", ret); return ret; } -/** - * rxrpc_kernel_accept_call - Allow a kernel service to accept an incoming call - * @sock: The socket on which the impending call is waiting - * @user_call_ID: The tag to attach to the call +/* + * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls + * @sock: The socket on which to preallocate + * @notify_rx: Event notification function for the call + * @user_attach_call: Func to attach call to user_call_ID + * @user_call_ID: The tag to attach to the preallocated call + * @gfp: The allocation conditions. * - * Allow a kernel service to accept an incoming call, assuming the incoming - * call is still valid. - */ -struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock, - unsigned long user_call_ID) -{ - struct rxrpc_call *call; - - _enter(",%lx", user_call_ID); - call = rxrpc_accept_call(rxrpc_sk(sock->sk), user_call_ID); - _leave(" = %p", call); - return call; -} -EXPORT_SYMBOL(rxrpc_kernel_accept_call); - -/** - * rxrpc_kernel_reject_call - Allow a kernel service to reject an incoming call - * @sock: The socket on which the impending call is waiting + * Charge up the socket with preallocated calls, each with a user ID. A + * function should be provided to effect the attachment from the user's side. + * The user is given a ref to hold on the call. * - * Allow a kernel service to reject an incoming call with a BUSY message, - * assuming the incoming call is still valid. + * Note that the call may be come connected before this function returns. 
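A sketch of how a kernel service would keep its backlog charged with this API. Only rxrpc_kernel_charge_accept() and the two callback signatures come from the interface above; the service-side helpers and structures are hypothetical:

        /* Hypothetical service-side state for one preallocated call slot. */
        struct my_server_call {
        	struct rxrpc_call	*rxcall;
        	unsigned long		index;
        };

        static void my_notify_rx(struct sock *sk, struct rxrpc_call *call,
        			 unsigned long user_call_ID)
        {
        	/* Hypothetical: kick our worker to recvmsg() on this call. */
        }

        static void my_attach_call(struct rxrpc_call *call,
        			   unsigned long user_call_ID)
        {
        	/* Hypothetical: bind the call to slot user_call_ID; the ref
        	 * this API grants the user is stashed here. */
        }

        /* Recharge one slot; called at startup and as calls complete. */
        static int my_charge_one(struct socket *rxsock, unsigned long id)
        {
        	return rxrpc_kernel_charge_accept(rxsock, my_notify_rx,
        					  my_attach_call, id, GFP_KERNEL);
        }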
*/ -int rxrpc_kernel_reject_call(struct socket *sock) +int rxrpc_kernel_charge_accept(struct socket *sock, + rxrpc_notify_rx_t notify_rx, + rxrpc_user_attach_call_t user_attach_call, + unsigned long user_call_ID, gfp_t gfp) { - int ret; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct rxrpc_backlog *b = rx->backlog; - _enter(""); - ret = rxrpc_reject_call(rxrpc_sk(sock->sk)); - _leave(" = %d", ret); - return ret; + if (sock->sk->sk_state == RXRPC_CLOSE) + return -ESHUTDOWN; + + return rxrpc_service_prealloc_one(rx, b, notify_rx, + user_attach_call, user_call_ID, + gfp); } -EXPORT_SYMBOL(rxrpc_kernel_reject_call); +EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index e60cf65c2232..4f00476630b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -22,1281 +22,351 @@ #include "ar-internal.h" /* - * propose an ACK be sent + * Set the timer */ -void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate) +void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why, + ktime_t now) { - unsigned long expiry; - s8 prior = rxrpc_ack_priority[ack_reason]; - - ASSERTCMP(prior, >, 0); - - _enter("{%d},%s,%%%x,%u", - call->debug_id, rxrpc_acks(ack_reason), serial, immediate); - - if (prior < rxrpc_ack_priority[call->ackr_reason]) { - if (immediate) - goto cancel_timer; - return; - } + unsigned long t_j, now_j = jiffies; + ktime_t t; + bool queue = false; - /* update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers */ - if (prior == rxrpc_ack_priority[call->ackr_reason]) { - if (prior <= 4) - call->ackr_serial = serial; - if (immediate) - goto cancel_timer; - return; - } - - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - - switch (ack_reason) { - case RXRPC_ACK_DELAY: - _debug("run delay timer"); - expiry = rxrpc_soft_ack_delay; - goto run_timer; - - case RXRPC_ACK_IDLE: - if (!immediate) { - _debug("run defer timer"); - expiry = rxrpc_idle_ack_delay; - goto run_timer; - } - goto cancel_timer; - - case RXRPC_ACK_REQUESTED: - expiry = rxrpc_requested_ack_delay; - if (!expiry) - goto cancel_timer; - if (!immediate || serial == 1) { - _debug("run defer timer"); - goto run_timer; - } - - default: - _debug("immediate ACK"); - goto cancel_timer; - } - -run_timer: - expiry += jiffies; - if (!timer_pending(&call->ack_timer) || - time_after(call->ack_timer.expires, expiry)) - mod_timer(&call->ack_timer, expiry); - return; - -cancel_timer: - _debug("cancel timer %%%u", serial); - try_to_del_timer_sync(&call->ack_timer); read_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); -} - -/* - * propose an ACK be sent, locking the call structure - */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate) -{ - s8 prior = rxrpc_ack_priority[ack_reason]; - - if (prior > rxrpc_ack_priority[call->ackr_reason]) { - spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, serial, immediate); - spin_unlock_bh(&call->lock); - } -} -/* - * set the resend timer - */ -static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend, - unsigned long resend_at) -{ - read_lock_bh(&call->state_lock); - if (call->state >= RXRPC_CALL_COMPLETE) - resend = 0; - - if (resend & 1) { - _debug("SET RESEND"); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); - } - - if (resend & 2) { - _debug("MODIFY RESEND 
TIMER"); - set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - mod_timer(&call->resend_timer, resend_at); - } else { - _debug("KILL RESEND TIMER"); - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - read_unlock_bh(&call->state_lock); -} - -/* - * resend packets - */ -static void rxrpc_resend(struct rxrpc_call *call) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - bool stop; - int loop; - u8 resend; - - _enter("{%d,%d,%d,%d},", - call->acks_hard, call->acks_unacked, - atomic_read(&call->sequence), - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - stop = false; - resend = 0; - resend_at = 0; - - for (loop = call->acks_tail; - loop != call->acks_head || stop; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - if (*p_txb & 1) - continue; - - txb = (struct sk_buff *) *p_txb; - sp = rxrpc_skb(txb); - - if (sp->need_resend) { - sp->need_resend = false; - - /* each Tx packet has a new serial number */ - sp->hdr.serial = atomic_inc_return(&call->conn->serial); - - whdr = (struct rxrpc_wire_header *)txb->head; - whdr->serial = htonl(sp->hdr.serial); - - _proto("Tx DATA %%%u { #%d }", - sp->hdr.serial, sp->hdr.seq); - if (rxrpc_send_data_packet(call->conn, txb) < 0) { - stop = true; - sp->resend_at = jiffies + 3; - } else { - sp->resend_at = - jiffies + rxrpc_resend_timeout; - } + if (call->state < RXRPC_CALL_COMPLETE) { + t = call->expire_at; + if (!ktime_after(t, now)) + goto out; + + if (!ktime_after(call->resend_at, now)) { + call->resend_at = call->expire_at; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + queue = true; + } else if (ktime_before(call->resend_at, t)) { + t = call->resend_at; } - if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + if (!ktime_after(call->ack_at, now)) { + call->ack_at = call->expire_at; + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) + queue = true; + } else if (ktime_before(call->ack_at, t)) { + t = call->ack_at; } - } - - rxrpc_set_resend(call, resend, resend_at); - _leave(""); -} - -/* - * handle resend timer expiry - */ -static void rxrpc_resend_timer(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 resend; - - _enter("%d,%d,%d", - call->acks_tail, call->acks_unacked, call->acks_head); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - resend = 0; - resend_at = 0; - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); + t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now))); + t_j += jiffies; - ASSERT(!(*p_txb & 1)); + /* We have to make sure that the calculated jiffies value falls + * at or after the nsec value, or we may loop ceaselessly + * because the timer times out, but we haven't reached the nsec + * timeout yet. 
+ */ + t_j++; - if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + if (call->timer.expires != t_j || !timer_pending(&call->timer)) { + mod_timer(&call->timer, t_j); + trace_rxrpc_timer(call, why, now, now_j); } + + if (queue) + rxrpc_queue_call(call); } - rxrpc_set_resend(call, resend, resend_at); - _leave(""); +out: + read_unlock_bh(&call->state_lock); } /* - * process soft ACKs of our transmitted packets - * - these indicate packets the peer has or has not received, but hasn't yet - * given to the consumer, and so can still be discarded and re-requested + * propose an ACK be sent */ -static int rxrpc_process_soft_ACKs(struct rxrpc_call *call, - struct rxrpc_ackpacket *ack, - struct sk_buff *skb) +static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, + bool background, + enum rxrpc_propose_ack_trace why) { - struct rxrpc_skb_priv *sp; - struct sk_buff *txb; - unsigned long *p_txb, resend_at; - int loop; - u8 sacks[RXRPC_MAXACKS], resend; - - _enter("{%d,%d},{%d},", - call->acks_hard, - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz), - ack->nAcks); - - if (skb_copy_bits(skb, 0, sacks, ack->nAcks) < 0) - goto protocol_error; - - resend = 0; - resend_at = 0; - for (loop = 0; loop < ack->nAcks; loop++) { - p_txb = call->acks_window; - p_txb += (call->acks_tail + loop) & (call->acks_winsz - 1); - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); - - switch (sacks[loop]) { - case RXRPC_ACK_TYPE_ACK: - sp->need_resend = false; - *p_txb |= 1; - break; - case RXRPC_ACK_TYPE_NACK: - sp->need_resend = true; - *p_txb &= ~1; - resend = 1; - break; - default: - _debug("Unsupported ACK type %d", sacks[loop]); - goto protocol_error; - } - } - - smp_mb(); - call->acks_unacked = (call->acks_tail + loop) & (call->acks_winsz - 1); - - /* anything not explicitly ACK'd is implicitly NACK'd, but may just not - * have been received or processed yet by the far end */ - for (loop = call->acks_unacked; - loop != call->acks_head; - loop = (loop + 1) & (call->acks_winsz - 1) - ) { - p_txb = call->acks_window + loop; - smp_read_barrier_depends(); - txb = (struct sk_buff *) (*p_txb & ~1); - sp = rxrpc_skb(txb); + enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; + unsigned int expiry = rxrpc_soft_ack_delay; + ktime_t now, ack_at; + s8 prior = rxrpc_ack_priority[ack_reason]; - if (*p_txb & 1) { - /* packet must have been discarded */ - sp->need_resend = true; - *p_txb &= ~1; - resend |= 1; - } else if (sp->need_resend) { - ; - } else if (time_after_eq(jiffies + 1, sp->resend_at)) { - sp->need_resend = true; - resend |= 1; - } else if (resend & 2) { - if (time_before(sp->resend_at, resend_at)) - resend_at = sp->resend_at; - } else { - resend_at = sp->resend_at; - resend |= 2; + /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial + * numbers, but we don't alter the timeout. 
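Per the comment above, only DELAY, IDLE, REQUESTED and PING_RESPONSE ACKs may have their serial and skew refreshed in place; the RXRPC_ACK_UPDATEABLE test below implements that as a bitmask over the reason codes. A sketch of how such a mask is composed (mirroring the comment; the authoritative definition lives in rxrpc's internal header):

        #define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED)	|	\
        			      (1 << RXRPC_ACK_PING_RESPONSE)	|	\
        			      (1 << RXRPC_ACK_DELAY)		|	\
        			      (1 << RXRPC_ACK_IDLE))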
+ */ + _debug("prior %u %u vs %u %u", + ack_reason, prior, + call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); + if (ack_reason == call->ackr_reason) { + if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { + outcome = rxrpc_propose_ack_update; + call->ackr_serial = serial; + call->ackr_skew = skew; } + if (!immediate) + goto trace; + } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { + call->ackr_reason = ack_reason; + call->ackr_serial = serial; + call->ackr_skew = skew; + } else { + outcome = rxrpc_propose_ack_subsume; } - rxrpc_set_resend(call, resend, resend_at); - _leave(" = 0"); - return 0; - -protocol_error: - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * discard hard-ACK'd packets from the Tx window - */ -static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) -{ - unsigned long _skb; - int tail = call->acks_tail, old_tail; - int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); + switch (ack_reason) { + case RXRPC_ACK_REQUESTED: + if (rxrpc_requested_ack_delay < expiry) + expiry = rxrpc_requested_ack_delay; + if (serial == 1) + immediate = false; + break; - _enter("{%u,%u},%u", call->acks_hard, win, hard); + case RXRPC_ACK_DELAY: + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_soft_ack_delay; + break; - ASSERTCMP(hard - call->acks_hard, <=, win); + case RXRPC_ACK_PING: + case RXRPC_ACK_IDLE: + if (rxrpc_idle_ack_delay < expiry) + expiry = rxrpc_idle_ack_delay; + break; - while (call->acks_hard < hard) { - smp_read_barrier_depends(); - _skb = call->acks_window[tail] & ~1; - rxrpc_free_skb((struct sk_buff *) _skb); - old_tail = tail; - tail = (tail + 1) & (call->acks_winsz - 1); - call->acks_tail = tail; - if (call->acks_unacked == old_tail) - call->acks_unacked = tail; - call->acks_hard++; + default: + immediate = true; + break; } - wake_up(&call->tx_waitq); -} - -/* - * clear the Tx window in the event of a failure - */ -static void rxrpc_clear_tx_window(struct rxrpc_call *call) -{ - rxrpc_rotate_tx_window(call, atomic_read(&call->sequence)); -} - -/* - * drain the out of sequence received packet queue into the packet Rx queue - */ -static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - bool terminal; - int ret; - - _enter("{%d,%d}", call->rx_data_post, call->rx_first_oos); - - spin_lock_bh(&call->lock); - - ret = -ECONNRESET; - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) - goto socket_unavailable; - - skb = skb_dequeue(&call->rx_oos_queue); - if (skb) { - sp = rxrpc_skb(skb); - - _debug("drain OOS packet %d [%d]", - sp->hdr.seq, call->rx_first_oos); - - if (sp->hdr.seq != call->rx_first_oos) { - skb_queue_head(&call->rx_oos_queue, skb); - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - _debug("requeue %p {%u}", skb, call->rx_first_oos); - } else { - skb->mark = RXRPC_SKB_MARK_DATA; - terminal = ((sp->hdr.flags & RXRPC_LAST_PACKET) && - !(sp->hdr.flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, true, terminal); - BUG_ON(ret < 0); - _debug("drain #%u", call->rx_data_post); - call->rx_data_post++; - - /* find out what the next packet is */ - skb = skb_peek(&call->rx_oos_queue); - if (skb) - call->rx_first_oos = rxrpc_skb(skb)->hdr.seq; - else - call->rx_first_oos = 0; - _debug("peek %p {%u}", skb, call->rx_first_oos); + if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { + _debug("already scheduled"); + } else if (immediate || expiry == 0) { + _debug("immediate ACK %lx", call->events); + if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) && + 
background) + rxrpc_queue_call(call); + } else { + now = ktime_get_real(); + ack_at = ktime_add_ms(now, expiry); + if (ktime_before(ack_at, call->ack_at)) { + call->ack_at = ack_at; + rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now); } } - ret = 0; -socket_unavailable: - spin_unlock_bh(&call->lock); - _leave(" = %d", ret); - return ret; +trace: + trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, + background, outcome); } /* - * insert an out of sequence packet into the buffer + * propose an ACK be sent, locking the call structure */ -static void rxrpc_insert_oos_packet(struct rxrpc_call *call, - struct sk_buff *skb) +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, + u16 skew, u32 serial, bool immediate, bool background, + enum rxrpc_propose_ack_trace why) { - struct rxrpc_skb_priv *sp, *psp; - struct sk_buff *p; - u32 seq; - - sp = rxrpc_skb(skb); - seq = sp->hdr.seq; - _enter(",,{%u}", seq); - - skb->destructor = rxrpc_packet_destructor; - ASSERTCMP(sp->call, ==, NULL); - sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); - - /* insert into the buffer in sequence order */ spin_lock_bh(&call->lock); - - skb_queue_walk(&call->rx_oos_queue, p) { - psp = rxrpc_skb(p); - if (psp->hdr.seq > seq) { - _debug("insert oos #%u before #%u", seq, psp->hdr.seq); - skb_insert(p, skb, &call->rx_oos_queue); - goto inserted; - } - } - - _debug("append oos #%u", seq); - skb_queue_tail(&call->rx_oos_queue, skb); -inserted: - - /* we might now have a new front to the queue */ - if (call->rx_first_oos == 0 || seq < call->rx_first_oos) - call->rx_first_oos = seq; - - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - } - read_unlock(&call->state_lock); - + __rxrpc_propose_ACK(call, ack_reason, skew, serial, + immediate, background, why); spin_unlock_bh(&call->lock); - _leave(" [stored #%u]", call->rx_first_oos); -} - -/* - * clear the Tx window on final ACK reception - */ -static void rxrpc_zap_tx_window(struct rxrpc_call *call) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - unsigned long _skb, *acks_window; - u8 winsz = call->acks_winsz; - int tail; - - acks_window = call->acks_window; - call->acks_window = NULL; - - while (CIRC_CNT(call->acks_head, call->acks_tail, winsz) > 0) { - tail = call->acks_tail; - smp_read_barrier_depends(); - _skb = acks_window[tail] & ~1; - smp_mb(); - call->acks_tail = (call->acks_tail + 1) & (winsz - 1); - - skb = (struct sk_buff *) _skb; - sp = rxrpc_skb(skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb(skb); - } - - kfree(acks_window); } /* - * process the extra information that may be appended to an ACK packet + * Handle congestion being detected by the retransmit timeout. 
*/ -static void rxrpc_extract_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int latest, int nAcks) +static void rxrpc_congestion_timeout(struct rxrpc_call *call) { - struct rxrpc_ackinfo ackinfo; - struct rxrpc_peer *peer; - unsigned int mtu; - - if (skb_copy_bits(skb, nAcks + 3, &ackinfo, sizeof(ackinfo)) < 0) { - _leave(" [no ackinfo]"); - return; - } - - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - latest, - ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU), - ntohl(ackinfo.rwind), ntohl(ackinfo.jumbo_max)); - - mtu = min(ntohl(ackinfo.rxMTU), ntohl(ackinfo.maxMTU)); - - peer = call->conn->params.peer; - if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); - } + set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); } /* - * process packets in the reception queue + * Perform retransmission of NAK'd and unack'd packets. */ -static int rxrpc_process_rx_queue(struct rxrpc_call *call, - u32 *_abort_code) +static void rxrpc_resend(struct rxrpc_call *call, ktime_t now) { - struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp; struct sk_buff *skb; - bool post_ACK; - int latest; - u32 hard, tx; - - _enter(""); - -process_further: - skb = skb_dequeue(&call->rx_queue); - if (!skb) - return -EAGAIN; - - _net("deferred skb %p", skb); - - sp = rxrpc_skb(skb); - - _debug("process %s [st %d]", rxrpc_pkts[sp->hdr.type], call->state); - - post_ACK = false; - - switch (sp->hdr.type) { - /* data packets that wind up here have been received out of - * order, need security processing or are jumbo packets */ - case RXRPC_PACKET_TYPE_DATA: - _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + rxrpc_seq_t cursor, seq, top; + ktime_t max_age, oldest, ack_ts; + int ix; + u8 annotation, anno_type, retrans = 0, unacked = 0; - /* secured packets must be verified and possibly decrypted */ - if (call->conn->security->verify_packet(call, skb, - _abort_code) < 0) - goto protocol_error; - - rxrpc_insert_oos_packet(call, skb); - goto process_further; - - /* partial ACK to process */ - case RXRPC_PACKET_TYPE_ACK: - if (skb_copy_bits(skb, 0, &ack, sizeof(ack)) < 0) { - _debug("extraction failure"); - goto protocol_error; - } - if (!skb_pull(skb, sizeof(ack))) - BUG(); - - latest = sp->hdr.serial; - hard = ntohl(ack.firstPacket); - tx = atomic_read(&call->sequence); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - latest, - ntohs(ack.maxSkew), - hard, - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - rxrpc_extract_ackinfo(call, skb, latest, ack.nAcks); - - if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", latest); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - sp->hdr.serial, true); - } - - /* discard any out-of-order or duplicate ACKs */ - if (latest - call->acks_latest <= 0) { - _debug("discard ACK %d <= %d", - latest, call->acks_latest); - goto discard; - } - call->acks_latest = latest; + _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); - if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_CLIENT_AWAIT_REPLY && - call->state != RXRPC_CALL_SERVER_SEND_REPLY && - call->state != RXRPC_CALL_SERVER_AWAIT_ACK) - goto discard; + max_age = ktime_sub_ms(now, rxrpc_resend_timeout); - _debug("Tx=%d H=%u S=%d", tx, call->acks_hard, call->state); - - if (hard > 0) { - if (hard - 1 > tx) { - _debug("hard-ACK'd packet %d not 
transmitted" - " (%d top)", - hard - 1, tx); - goto protocol_error; - } + spin_lock_bh(&call->lock); - if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY || - call->state == RXRPC_CALL_SERVER_AWAIT_ACK) && - hard > tx) { - call->acks_hard = tx; - goto all_acked; - } + cursor = call->tx_hard_ack; + top = call->tx_top; + ASSERT(before_eq(cursor, top)); + if (cursor == top) + goto out_unlock; + + /* Scan the packet list without dropping the lock and decide which of + * the packets in the Tx buffer we're going to resend and what the new + * resend timeout will be. + */ + oldest = now; + for (seq = cursor + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_ACK) + continue; - smp_rmb(); - rxrpc_rotate_tx_window(call, hard - 1); - } + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); + sp = rxrpc_skb(skb); - if (ack.nAcks > 0) { - if (hard - 1 + ack.nAcks > tx) { - _debug("soft-ACK'd packet %d+%d not" - " transmitted (%d top)", - hard - 1, ack.nAcks, tx); - goto protocol_error; + if (anno_type == RXRPC_TX_ANNO_UNACK) { + if (ktime_after(skb->tstamp, max_age)) { + if (ktime_before(skb->tstamp, oldest)) + oldest = skb->tstamp; + continue; } - - if (rxrpc_process_soft_ACKs(call, &ack, skb) < 0) - goto protocol_error; + if (!(annotation & RXRPC_TX_ANNO_RESENT)) + unacked++; } - goto discard; - - /* complete ACK to process */ - case RXRPC_PACKET_TYPE_ACKALL: - goto all_acked; - - /* abort and busy are handled elsewhere */ - case RXRPC_PACKET_TYPE_BUSY: - case RXRPC_PACKET_TYPE_ABORT: - BUG(); - /* connection level events - also handled elsewhere */ - case RXRPC_PACKET_TYPE_CHALLENGE: - case RXRPC_PACKET_TYPE_RESPONSE: - case RXRPC_PACKET_TYPE_DEBUG: - BUG(); + /* Okay, we need to retransmit a packet. 
*/ + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; + retrans++; + trace_rxrpc_retransmit(call, seq, annotation | anno_type, + ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - /* if we've had a hard ACK that covers all the packets we've sent, then - * that ends that phase of the operation */ -all_acked: - write_lock_bh(&call->state_lock); - _debug("ack all %d", call->state); + call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - break; - case RXRPC_CALL_SERVER_AWAIT_ACK: - _debug("srv complete"); - call->state = RXRPC_CALL_COMPLETE; - post_ACK = true; - break; - case RXRPC_CALL_CLIENT_SEND_REQUEST: - case RXRPC_CALL_SERVER_RECV_REQUEST: - goto protocol_error_unlock; /* can't occur yet */ - default: - write_unlock_bh(&call->state_lock); - goto discard; /* assume packet left over from earlier phase */ - } - - write_unlock_bh(&call->state_lock); + if (unacked) + rxrpc_congestion_timeout(call); - /* if all the packets we sent are hard-ACK'd, then we can discard - * whatever we've got left */ - _debug("clear Tx %d", - CIRC_CNT(call->acks_head, call->acks_tail, call->acks_winsz)); - - del_timer_sync(&call->resend_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - - if (call->acks_window) - rxrpc_zap_tx_window(call); - - if (post_ACK) { - /* post the final ACK message for userspace to pick up */ - _debug("post ACK"); - skb->mark = RXRPC_SKB_MARK_FINAL_ACK; - sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); - spin_lock_bh(&call->lock); - if (rxrpc_queue_rcv_skb(call, skb, true, true) < 0) - BUG(); + /* If there was nothing that needed retransmission then it's likely + * that an ACK got lost somewhere. Send a ping to find out instead of + * retransmitting data. + */ + if (!retrans) { + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); spin_unlock_bh(&call->lock); - goto process_further; - } - -discard: - rxrpc_free_skb(skb); - goto process_further; - -protocol_error_unlock: - write_unlock_bh(&call->state_lock); -protocol_error: - rxrpc_free_skb(skb); - _leave(" = -EPROTO"); - return -EPROTO; -} - -/* - * post a message to the socket Rx queue for recvmsg() to pick up - */ -static int rxrpc_post_message(struct rxrpc_call *call, u32 mark, u32 error, - bool fatal) -{ - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; - int ret; - - _enter("{%d,%lx},%u,%u,%d", - call->debug_id, call->flags, mark, error, fatal); - - /* remove timers and things for fatal messages */ - if (fatal) { - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } + ack_ts = ktime_sub(now, call->acks_latest_ts); + if (ktime_to_ns(ack_ts) < call->peer->rtt) + goto out; + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ack_ping_for_lost_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto out; + } + + /* Now go through the Tx window and perform the retransmissions. We + * have to drop the lock for each send. If an ACK comes in whilst the + * lock is dropped, it may clear some of the retransmission markers for + * packets that it soft-ACKs. 
+ */ + for (seq = cursor + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type != RXRPC_TX_ANNO_RETRANS) + continue; - if (mark != RXRPC_SKB_MARK_NEW_CALL && - !test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - _leave("[no userid]"); - return 0; - } + skb = call->rxtx_buffer[ix]; + rxrpc_get_skb(skb, rxrpc_skb_tx_got); + spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - skb = alloc_skb(0, GFP_NOFS); - if (!skb) - return -ENOMEM; + if (rxrpc_send_data_packet(call, skb, true) < 0) { + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + return; + } - rxrpc_new_skb(skb); + if (rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); - skb->mark = mark; + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + spin_lock_bh(&call->lock); - sp = rxrpc_skb(skb); - memset(sp, 0, sizeof(*sp)); - sp->error = error; - sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); + /* We need to clear the retransmit state, but there are two + * things we need to be aware of: A new ACK/NAK might have been + * received and the packet might have been hard-ACK'd (in which + * case it will no longer be in the buffer). + */ + if (after(seq, call->tx_hard_ack)) { + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + if (anno_type == RXRPC_TX_ANNO_RETRANS || + anno_type == RXRPC_TX_ANNO_NAK) { + annotation &= ~RXRPC_TX_ANNO_MASK; + annotation |= RXRPC_TX_ANNO_UNACK; + } + annotation |= RXRPC_TX_ANNO_RESENT; + call->rxtx_annotations[ix] = annotation; + } - spin_lock_bh(&call->lock); - ret = rxrpc_queue_rcv_skb(call, skb, true, fatal); - spin_unlock_bh(&call->lock); - BUG_ON(ret < 0); + if (after(call->tx_hard_ack, seq)) + seq = call->tx_hard_ack; } - return 0; +out_unlock: + spin_unlock_bh(&call->lock); +out: + _leave(""); } /* - * handle background processing of incoming call packets and ACK / abort - * generation + * Handle retransmission and deferred ACK/abort generation. 
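The rewritten work function below follows a consume-and-recheck shape: each event bit is cleared with test_and_clear_bit() before being acted on, and control returns to the top so that events raised concurrently are picked up in the same pass rather than lost. Its skeleton, abstracted from the real handlers (helper names here are hypothetical):

        recheck_state:
        	if (test_and_clear_bit(EV_ABORT, &call->events)) {
        		send_abort_packet(call);	/* hypothetical helper */
        		goto recheck_state;		/* re-scan from the top */
        	}
        	if (call->state == CALL_COMPLETE)
        		goto out_put;			/* drop the worker's ref */
        	if (test_and_clear_bit(EV_ACK, &call->events)) {
        		send_ack_packet(call);		/* hypothetical helper */
        		goto recheck_state;
        	}
        	if (call->events)			/* raised since last scan */
        		requeue_work(call);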
*/ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo ackinfo; - struct msghdr msg; - struct kvec iov[5]; - enum rxrpc_call_event genbit; - unsigned long bits; - __be32 data, pad; - size_t len; - int loop, nbit, ioc, ret, mtu; - u32 serial, abort_code = RX_PROTOCOL_ERROR; - u8 *acks = NULL; - - //printk("\n--------------------\n"); - _enter("{%d,%s,%lx} [%lu]", - call->debug_id, rxrpc_call_states[call->state], call->events, - (jiffies - call->creation_jif) / (HZ / 10)); - - if (test_and_set_bit(RXRPC_CALL_PROC_BUSY, &call->flags)) { - _debug("XXXXXXXXXXXXX RUNNING ON MULTIPLE CPUS XXXXXXXXXXXXX"); - return; - } - - if (!call->conn) - goto skip_msg_init; - - /* there's a good chance we're going to have to send a message, so set - * one up in advance */ - msg.msg_name = &call->conn->params.peer->srx.transport; - msg.msg_namelen = call->conn->params.peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - whdr.epoch = htonl(call->conn->proto.epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = 0; - whdr.type = RXRPC_PACKET_TYPE_ACK; - whdr.flags = call->conn->out_clientflag; - whdr.userStatus = 0; - whdr.securityIndex = call->conn->security_ix; - whdr._rsvd = 0; - whdr.serviceId = htons(call->service_id); - - memset(iov, 0, sizeof(iov)); - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); -skip_msg_init: - - /* deal with events of a final nature */ - if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) { - enum rxrpc_skb_mark mark; - int error; - - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); - - error = call->error_report; - if (error < RXRPC_LOCAL_ERROR_OFFSET) { - mark = RXRPC_SKB_MARK_NET_ERROR; - _debug("post net error %d", error); - } else { - mark = RXRPC_SKB_MARK_LOCAL_ERROR; - error -= RXRPC_LOCAL_ERROR_OFFSET; - _debug("post net local error %d", error); - } - - if (rxrpc_post_message(call, mark, error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) { - ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - - clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_ABORT, &call->events); + ktime_t now; - _debug("post conn abort"); + rxrpc_see_call(call); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - call->conn->error, true) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - goto kill_ACKs; - } - - if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) { - whdr.type = RXRPC_PACKET_TYPE_BUSY; - genbit = RXRPC_CALL_EV_REJECT_BUSY; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE); - - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ECONNABORTED, true) < 0) - goto no_mem; - whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->local_abort); - iov[1].iov_base = &data; - iov[1].iov_len = sizeof(data); - genbit = RXRPC_CALL_EV_ABORT; - goto send_message; - } - - if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) { - genbit = RXRPC_CALL_EV_ACK_FINAL; - - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - ack.serial = 0; - ack.reason = RXRPC_ACK_IDLE; - ack.nAcks = 
0; - call->ackr_reason = 0; - - spin_lock_bh(&call->lock); - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - spin_unlock_bh(&call->lock); - - pad = 0; + //printk("\n--------------------\n"); + _enter("{%d,%s,%lx}", + call->debug_id, rxrpc_call_states[call->state], call->events); - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = &pad; - iov[2].iov_len = 3; - iov[3].iov_base = &ackinfo; - iov[3].iov_len = sizeof(ackinfo); - goto send_ACK; +recheck_state: + if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + goto recheck_state; } - if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) | - (1 << RXRPC_CALL_EV_RCVD_ABORT)) - ) { - u32 mark; - - if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events)) - mark = RXRPC_SKB_MARK_REMOTE_ABORT; - else - mark = RXRPC_SKB_MARK_BUSY; - - _debug("post abort/busy"); - rxrpc_clear_tx_window(call); - if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; + if (call->state == RXRPC_CALL_COMPLETE) { + del_timer_sync(&call->timer); + goto out_put; } - if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) { - _debug("do implicit ackall"); - rxrpc_clear_tx_window(call); + now = ktime_get_real(); + if (ktime_before(call->expire_at, now)) { + rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + goto recheck_state; } - if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) { - write_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_TIMEOUT; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); + if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) { + call->ack_at = call->expire_at; + if (call->ackr_reason) { + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + goto recheck_state; } - write_unlock_bh(&call->state_lock); - - _debug("post timeout"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR, - ETIME, true) < 0) - goto no_mem; - - clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - goto kill_ACKs; } - /* deal with assorted inbound messages */ - if (!skb_queue_empty(&call->rx_queue)) { - switch (rxrpc_process_rx_queue(call, &abort_code)) { - case 0: - case -EAGAIN: - break; - case -ENOMEM: - goto no_mem; - case -EKEYEXPIRED: - case -EKEYREJECTED: - case -EPROTO: - rxrpc_abort_call(call, abort_code); - goto kill_ACKs; - } + if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) { + rxrpc_resend(call, now); + goto recheck_state; } - /* handle resending */ - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_resend_timer(call); - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_resend(call); - - /* consider sending an ordinary ACK */ - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("send ACK: window: %d - %d { %lx }", - call->rx_data_eaten, call->ackr_win_top, - call->ackr_window[0]); - - if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST && - call->ackr_reason != RXRPC_ACK_PING_RESPONSE) { - /* ACK by sending reply DATA packet in this state */ - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - goto maybe_reschedule; - } - - genbit = RXRPC_CALL_EV_ACK; - - acks = kzalloc(call->ackr_win_top - 
call->rx_data_eaten, - GFP_NOFS); - if (!acks) - goto no_mem; - - //hdr.flags = RXRPC_SLOW_START_OK; - ack.bufferSpace = htons(8); - ack.maxSkew = 0; - - spin_lock_bh(&call->lock); - ack.reason = call->ackr_reason; - ack.serial = htonl(call->ackr_serial); - ack.previousPacket = htonl(call->ackr_prev_seq); - ack.firstPacket = htonl(call->rx_data_eaten + 1); - - ack.nAcks = 0; - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - nbit = loop * BITS_PER_LONG; - for (bits = call->ackr_window[loop]; bits; bits >>= 1 - ) { - _debug("- l=%d n=%d b=%lx", loop, nbit, bits); - if (bits & 1) { - acks[nbit] = RXRPC_ACK_TYPE_ACK; - ack.nAcks = nbit + 1; - } - nbit++; - } - } - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); - - pad = 0; - - iov[1].iov_base = &ack; - iov[1].iov_len = sizeof(ack); - iov[2].iov_base = acks; - iov[2].iov_len = ack.nAcks; - iov[3].iov_base = &pad; - iov[3].iov_len = 3; - iov[4].iov_base = &ackinfo; - iov[4].iov_len = sizeof(ackinfo); - - switch (ack.reason) { - case RXRPC_ACK_REQUESTED: - case RXRPC_ACK_DUPLICATE: - case RXRPC_ACK_OUT_OF_SEQUENCE: - case RXRPC_ACK_EXCEEDS_WINDOW: - case RXRPC_ACK_NOSPACE: - case RXRPC_ACK_PING: - case RXRPC_ACK_PING_RESPONSE: - goto send_ACK_with_skew; - case RXRPC_ACK_DELAY: - case RXRPC_ACK_IDLE: - goto send_ACK; - } - } - - /* handle completion of security negotiations on an incoming - * connection */ - if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) { - _debug("secured"); - spin_lock_bh(&call->lock); - - if (call->state == RXRPC_CALL_SERVER_SECURING) { - _debug("securing"); - write_lock(&call->socket->call_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - _debug("not released"); - call->state = RXRPC_CALL_SERVER_ACCEPTING; - list_move_tail(&call->accept_link, - &call->socket->acceptq); - } - write_unlock(&call->socket->call_lock); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - read_unlock(&call->state_lock); - } - - spin_unlock_bh(&call->lock); - if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) - goto maybe_reschedule; - } - - /* post a notification of an acceptable connection to the app */ - if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) { - _debug("post accept"); - if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL, - 0, false) < 0) - goto no_mem; - clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events); - goto maybe_reschedule; - } - - /* handle incoming call acceptance */ - if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) { - _debug("accepted"); - ASSERTCMP(call->rx_data_post, ==, 0); - call->rx_data_post = 1; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events); - read_unlock_bh(&call->state_lock); - } - - /* drain the out of sequence received packet queue into the packet Rx - * queue */ - if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) { - while (call->rx_data_post == call->rx_first_oos) - if (rxrpc_drain_rx_oos_queue(call) < 0) - break; - goto maybe_reschedule; - } - - if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) { - rxrpc_release_call(call); - clear_bit(RXRPC_CALL_EV_RELEASE, &call->events); - } + rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now); /* other events may have been raised since we started checking */ - goto maybe_reschedule; - -send_ACK_with_skew: - ack.maxSkew = htons(atomic_read(&call->conn->hi_serial) - - 
ntohl(ack.serial)); -send_ACK: - mtu = call->conn->params.peer->if_mtu; - mtu -= call->conn->params.peer->hdrsize; - ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(rxrpc_rx_window_size); - - /* permit the peer to send us jumbo packets if it wants to */ - ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - serial, - ntohs(ack.maxSkew), - ntohl(ack.firstPacket), - ntohl(ack.previousPacket), - ntohl(ack.serial), - rxrpc_acks(ack.reason), - ack.nAcks); - - del_timer_sync(&call->ack_timer); - if (ack.nAcks > 0) - set_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags); - goto send_message_2; - -send_message: - _debug("send message"); - - serial = atomic_inc_return(&call->conn->serial); - whdr.serial = htonl(serial); - _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial); -send_message_2: - - len = iov[0].iov_len; - ioc = 1; - if (iov[4].iov_len) { - ioc = 5; - len += iov[4].iov_len; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[3].iov_len) { - ioc = 4; - len += iov[3].iov_len; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[2].iov_len) { - ioc = 3; - len += iov[2].iov_len; - len += iov[1].iov_len; - } else if (iov[1].iov_len) { - ioc = 2; - len += iov[1].iov_len; - } - - ret = kernel_sendmsg(call->conn->params.local->socket, - &msg, iov, ioc, len); - if (ret < 0) { - _debug("sendmsg failed: %d", ret); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - goto error; - } - - switch (genbit) { - case RXRPC_CALL_EV_ABORT: - clear_bit(genbit, &call->events); - clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - goto kill_ACKs; - - case RXRPC_CALL_EV_ACK_FINAL: - write_lock_bh(&call->state_lock); - if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK) - call->state = RXRPC_CALL_COMPLETE; - write_unlock_bh(&call->state_lock); - goto kill_ACKs; - - default: - clear_bit(genbit, &call->events); - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - case RXRPC_CALL_CLIENT_RECV_REPLY: - case RXRPC_CALL_SERVER_RECV_REQUEST: - case RXRPC_CALL_SERVER_ACK_REQUEST: - _debug("start ACK timer"); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, - call->ackr_serial, false); - default: - break; - } - goto maybe_reschedule; - } - -kill_ACKs: - del_timer_sync(&call->ack_timer); - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) - rxrpc_put_call(call); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - -maybe_reschedule: - if (call->events || !skb_queue_empty(&call->rx_queue)) { - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - } - - /* don't leave aborted connections on the accept queue */ - if (call->state >= RXRPC_CALL_COMPLETE && - !list_empty(&call->accept_link)) { - _debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }", - call, call->events, call->flags, call->conn->proto.cid); - - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); - } - -error: - clear_bit(RXRPC_CALL_PROC_BUSY, &call->flags); - kfree(acks); - - /* because we don't want two CPUs both processing the work item for one - * call at the same time, we use a flag to note 
when it's busy; however - * this means there's a race between clearing the flag and setting the - * work pending bit and the work item being processed again */ - if (call->events && !work_pending(&call->processor)) { - _debug("jumpstart %x", call->conn->proto.cid); - rxrpc_queue_call(call); + if (call->events && call->state < RXRPC_CALL_COMPLETE) { + __rxrpc_queue_call(call); + goto out; } +out_put: + rxrpc_put_call(call, rxrpc_call_put); +out: _leave(""); - return; - -no_mem: - _debug("out of memory"); - goto maybe_reschedule; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index ae057e0740f3..364b42dc3dce 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -19,23 +19,13 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" -/* - * Maximum lifetime of a call (in jiffies). - */ -unsigned int rxrpc_max_call_lifetime = 60 * HZ; - -/* - * Time till dead call expires after last use (in jiffies). - */ -unsigned int rxrpc_dead_call_expiry = 2 * HZ; - const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { - [RXRPC_CALL_UNINITIALISED] = "Uninit", + [RXRPC_CALL_UNINITIALISED] = "Uninit ", [RXRPC_CALL_CLIENT_AWAIT_CONN] = "ClWtConn", [RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq", [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", - [RXRPC_CALL_CLIENT_FINAL_ACK] = "ClFnlACK", + [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_ACCEPTING] = "SvAccept", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", @@ -43,22 +33,47 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", [RXRPC_CALL_SERVER_AWAIT_ACK] = "SvAwtACK", [RXRPC_CALL_COMPLETE] = "Complete", - [RXRPC_CALL_SERVER_BUSY] = "SvBusy ", +}; + +const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { + [RXRPC_CALL_SUCCEEDED] = "Complete", [RXRPC_CALL_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CALL_LOCALLY_ABORTED] = "LocAbort", + [RXRPC_CALL_LOCAL_ERROR] = "LocError", [RXRPC_CALL_NETWORK_ERROR] = "NetError", - [RXRPC_CALL_DEAD] = "Dead ", +}; + +const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { + [rxrpc_call_new_client] = "NWc", + [rxrpc_call_new_service] = "NWs", + [rxrpc_call_queued] = "QUE", + [rxrpc_call_queued_ref] = "QUR", + [rxrpc_call_connected] = "CON", + [rxrpc_call_release] = "RLS", + [rxrpc_call_seen] = "SEE", + [rxrpc_call_got] = "GOT", + [rxrpc_call_got_userid] = "Gus", + [rxrpc_call_got_kernel] = "Gke", + [rxrpc_call_put] = "PUT", + [rxrpc_call_put_userid] = "Pus", + [rxrpc_call_put_kernel] = "Pke", + [rxrpc_call_put_noqueue] = "PNQ", + [rxrpc_call_error] = "*E*", }; struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); -static void rxrpc_destroy_call(struct work_struct *work); -static void rxrpc_call_life_expired(unsigned long _call); -static void rxrpc_dead_call_expired(unsigned long _call); -static void rxrpc_ack_time_expired(unsigned long _call); -static void rxrpc_resend_time_expired(unsigned long _call); +static void rxrpc_call_timer_expired(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *)_call; + + _enter("%d", call->debug_id); + + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real()); +} /* * find an extant server call @@ -91,7 +106,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, return NULL; found_extant_call: - rxrpc_get_call(call); + rxrpc_get_call(call, rxrpc_call_got); 
read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, atomic_read(&call->usage)); return call; @@ -100,7 +115,7 @@ found_extant_call: /* * allocate a new call */ -static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) { struct rxrpc_call *call; @@ -108,29 +123,25 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) if (!call) return NULL; - call->acks_winsz = 16; - call->acks_window = kmalloc(call->acks_winsz * sizeof(unsigned long), + call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE, + sizeof(struct sk_buff *), gfp); - if (!call->acks_window) { - kmem_cache_free(rxrpc_call_jar, call); - return NULL; - } + if (!call->rxtx_buffer) + goto nomem; + + call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp); + if (!call->rxtx_annotations) + goto nomem_2; - setup_timer(&call->lifetimer, &rxrpc_call_life_expired, - (unsigned long) call); - setup_timer(&call->deadspan, &rxrpc_dead_call_expired, - (unsigned long) call); - setup_timer(&call->ack_timer, &rxrpc_ack_time_expired, - (unsigned long) call); - setup_timer(&call->resend_timer, &rxrpc_resend_time_expired, - (unsigned long) call); - INIT_WORK(&call->destroyer, &rxrpc_destroy_call); + setup_timer(&call->timer, rxrpc_call_timer_expired, + (unsigned long)call); INIT_WORK(&call->processor, &rxrpc_process_call); INIT_LIST_HEAD(&call->link); + INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); - skb_queue_head_init(&call->rx_queue); - skb_queue_head_init(&call->rx_oos_queue); - init_waitqueue_head(&call->tx_waitq); + INIT_LIST_HEAD(&call->recvmsg_link); + INIT_LIST_HEAD(&call->sock_link); + init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); @@ -138,70 +149,65 @@ static struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp) memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - call->rx_data_expect = 1; - call->rx_data_eaten = 0; - call->rx_first_oos = 0; - call->ackr_win_top = call->rx_data_eaten + 1 + rxrpc_rx_window_size; - call->creation_jif = jiffies; + /* Leave space in the ring to handle a maxed-out jumbo packet */ + call->rx_winsize = rxrpc_rx_window_size; + call->tx_winsize = 16; + call->rx_expect_next = 1; + + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; + call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; return call; + +nomem_2: + kfree(call->rxtx_buffer); +nomem: + kmem_cache_free(rxrpc_call_jar, call); + return NULL; } /* * Allocate a new client call. */ -static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, - struct sockaddr_rxrpc *srx, +static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_call *call; + ktime_t now; _enter(""); - ASSERT(rx->local != NULL); - call = rxrpc_alloc_call(gfp); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - - sock_hold(&rx->sk); - call->socket = rx; - call->rx_data_post = 1; - - call->local = rx->local; call->service_id = srx->srx_service; - call->in_clientflag = 0; + call->tx_phase = true; + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; _leave(" = %p", call); return call; } /* - * Begin client call. + * Initiate the call ack/resend/expiry timer. 
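
The congestion-window initialisation in rxrpc_alloc_call() above follows the RFC 3390 schedule: the larger the sender MSS, the fewer packets in the initial window, so the initial burst stays roughly constant in bytes. A standalone sketch of the same table (RXRPC_TX_SMSS is a compile-time constant in the real code; the helper below is invented):

    #include <assert.h>

    static unsigned int initial_cwnd(unsigned int smss)
    {
            if (smss > 2190)
                    return 2;       /* big segments: 2-packet window */
            if (smss > 1095)
                    return 3;
            return 4;               /* small segments: 4-packet window */
    }

    int main(void)
    {
            assert(initial_cwnd(4096) == 2);
            assert(initial_cwnd(1460) == 3);
            assert(initial_cwnd(512)  == 4);
            return 0;
    }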
*/ -static int rxrpc_begin_client_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static void rxrpc_start_call_timer(struct rxrpc_call *call) { - int ret; - - /* Set up or get a connection record and set the protocol parameters, - * including channel number and call ID. - */ - ret = rxrpc_connect_call(call, cp, srx, gfp); - if (ret < 0) - return ret; - - call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - - spin_lock(&call->conn->params.peer->lock); - hlist_add_head(&call->error_link, &call->conn->params.peer->error_targets); - spin_unlock(&call->conn->params.peer->lock); - - call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime; - add_timer(&call->lifetimer); - return 0; + ktime_t now = ktime_get_real(), expire_at; + + expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime); + call->expire_at = expire_at; + call->ack_at = expire_at; + call->resend_at = expire_at; + call->timer.expires = jiffies + LONG_MAX / 2; + rxrpc_set_timer(call, rxrpc_timer_begin, now); } /* @@ -216,20 +222,21 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, { struct rxrpc_call *call, *xcall; struct rb_node *parent, **pp; + const void *here = __builtin_return_address(0); int ret; _enter("%p,%lx", rx, user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(srx, gfp); if (IS_ERR(call)) { _leave(" = %ld", PTR_ERR(call)); return call; } - /* Publish the call, even though it is incompletely set up as yet */ - call->user_call_ID = user_call_ID; - __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage), + here, (const void *)user_call_ID); + /* Publish the call, even though it is incompletely set up as yet */ write_lock(&rx->call_lock); pp = &rx->calls.rb_node; @@ -243,369 +250,285 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, else if (user_call_ID > xcall->user_call_ID) pp = &(*pp)->rb_right; else - goto found_user_ID_now_present; + goto error_dup_user_ID; } - rxrpc_get_call(call); - + rcu_assign_pointer(call->socket, rx); + call->user_call_ID = user_call_ID; + __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); + rxrpc_get_call(call, rxrpc_call_got_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); + list_add(&call->sock_link, &rx->sock_calls); + write_unlock(&rx->call_lock); - write_lock_bh(&rxrpc_call_lock); + write_lock(&rxrpc_call_lock); list_add_tail(&call->link, &rxrpc_calls); - write_unlock_bh(&rxrpc_call_lock); + write_unlock(&rxrpc_call_lock); - ret = rxrpc_begin_client_call(call, cp, srx, gfp); + /* Set up or get a connection record and set the protocol parameters, + * including channel number and call ID. 
+ */ + ret = rxrpc_connect_call(call, cp, srx, gfp); if (ret < 0) goto error; - _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); + trace_rxrpc_call(call, rxrpc_call_connected, atomic_read(&call->usage), + here, ERR_PTR(ret)); - _leave(" = %p [new]", call); - return call; + spin_lock_bh(&call->conn->params.peer->lock); + hlist_add_head(&call->error_link, + &call->conn->params.peer->error_targets); + spin_unlock_bh(&call->conn->params.peer->lock); -error: - write_lock(&rx->call_lock); - rb_erase(&call->sock_node, &rx->calls); - write_unlock(&rx->call_lock); - rxrpc_put_call(call); + rxrpc_start_call_timer(call); - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); + _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); - set_bit(RXRPC_CALL_RELEASED, &call->flags); - call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); - _leave(" = %d", ret); - return ERR_PTR(ret); + _leave(" = %p [new]", call); + return call; /* We unexpectedly found the user ID in the list after taking * the call_lock. This shouldn't happen unless the user races * with itself and tries to add the same user ID twice at the * same time in different threads. */ -found_user_ID_now_present: +error_dup_user_ID: write_unlock(&rx->call_lock); - set_bit(RXRPC_CALL_RELEASED, &call->flags); - call->state = RXRPC_CALL_DEAD; - rxrpc_put_call(call); - _leave(" = -EEXIST [%p]", call); - return ERR_PTR(-EEXIST); + ret = -EEXIST; + +error: + __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + RX_CALL_DEAD, ret); + trace_rxrpc_call(call, rxrpc_call_error, atomic_read(&call->usage), + here, ERR_PTR(ret)); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); + _leave(" = %d", ret); + return ERR_PTR(ret); } /* - * set up an incoming call - * - called in process context with IRQs enabled + * Set up an incoming call. call->conn points to the connection. + * This is called in BH context and isn't allowed to fail. 
*/ -struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct sk_buff *skb) +void rxrpc_incoming_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct sk_buff *skb) { + struct rxrpc_connection *conn = call->conn; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call, *candidate; - u32 call_id, chan; - - _enter(",%d", conn->debug_id); - - ASSERT(rx != NULL); - - candidate = rxrpc_alloc_call(GFP_NOIO); - if (!candidate) - return ERR_PTR(-EBUSY); - - chan = sp->hdr.cid & RXRPC_CHANNELMASK; - candidate->socket = rx; - candidate->conn = conn; - candidate->cid = sp->hdr.cid; - candidate->call_id = sp->hdr.callNumber; - candidate->channel = chan; - candidate->rx_data_post = 0; - candidate->state = RXRPC_CALL_SERVER_ACCEPTING; - if (conn->security_ix > 0) - candidate->state = RXRPC_CALL_SERVER_SECURING; - - spin_lock(&conn->channel_lock); - - /* set the channel for this call */ - call = rcu_dereference_protected(conn->channels[chan].call, - lockdep_is_held(&conn->channel_lock)); - - _debug("channel[%u] is %p", candidate->channel, call); - if (call && call->call_id == sp->hdr.callNumber) { - /* already set; must've been a duplicate packet */ - _debug("extant call [%d]", call->state); - ASSERTCMP(call->conn, ==, conn); - - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - rxrpc_queue_call(call); - case RXRPC_CALL_REMOTELY_ABORTED: - read_unlock(&call->state_lock); - goto aborted_call; - default: - rxrpc_get_call(call); - read_unlock(&call->state_lock); - goto extant_call; - } - } - - if (call) { - /* it seems the channel is still in use from the previous call - * - ditch the old binding if its call is now complete */ - _debug("CALL: %u { %s }", - call->debug_id, rxrpc_call_states[call->state]); - - if (call->state >= RXRPC_CALL_COMPLETE) { - __rxrpc_disconnect_call(call); - } else { - spin_unlock(&conn->channel_lock); - kmem_cache_free(rxrpc_call_jar, candidate); - _leave(" = -EBUSY"); - return ERR_PTR(-EBUSY); - } - } - - /* check the call number isn't duplicate */ - _debug("check dup"); - call_id = sp->hdr.callNumber; - - /* We just ignore calls prior to the current call ID. Terminated calls - * are handled via the connection. + u32 chan; + + _enter(",%d", call->conn->debug_id); + + rcu_assign_pointer(call->socket, rx); + call->call_id = sp->hdr.callNumber; + call->service_id = sp->hdr.serviceId; + call->cid = sp->hdr.cid; + call->state = RXRPC_CALL_SERVER_ACCEPTING; + if (sp->hdr.securityIndex > 0) + call->state = RXRPC_CALL_SERVER_SECURING; + call->cong_tstamp = skb->tstamp; + + /* Set the channel for this call. We don't get channel_lock as we're + * only defending against the data_ready handler (which we're called + * from) and the RESPONSE packet parser (which is only really + * interested in call_counter and can cope with a disagreement with the + * call pointer). 
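
The channel binding performed just below relies on ordered publication: the channel's counters are written first and the call pointer is published last with release semantics (rcu_assign_pointer() in the kernel), so any reader that sees the pointer also sees the counters. A userspace approximation with C11 atomics (names invented):

    #include <stdatomic.h>

    struct call { unsigned int call_id; };

    struct channel {
            unsigned int call_counter;
            unsigned int call_id;
            _Atomic(struct call *) call;
    };

    static void publish_call(struct channel *chan, struct call *call)
    {
            chan->call_counter = call->call_id;     /* plain writes first */
            chan->call_id = call->call_id;
            /* release store pairs with the acquire load in the reader */
            atomic_store_explicit(&chan->call, call, memory_order_release);
    }

    static struct call *lookup_call(struct channel *chan)
    {
            return atomic_load_explicit(&chan->call, memory_order_acquire);
    }

    int main(void)
    {
            static struct channel chan;
            static struct call call = { .call_id = 1 };
            publish_call(&chan, &call);
            return lookup_call(&chan) == &call ? 0 : 1;
    }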
 */
-	if (call_id <= conn->channels[chan].call_counter)
-		goto old_call; /* TODO: Just drop packet */
-
-	/* make the call available */
-	_debug("new call");
-	call = candidate;
-	candidate = NULL;
-	conn->channels[chan].call_counter = call_id;
+	chan = sp->hdr.cid & RXRPC_CHANNELMASK;
+	conn->channels[chan].call_counter = call->call_id;
+	conn->channels[chan].call_id = call->call_id;
 	rcu_assign_pointer(conn->channels[chan].call, call);
-	sock_hold(&rx->sk);
-	rxrpc_get_connection(conn);
-	spin_unlock(&conn->channel_lock);
 
 	spin_lock(&conn->params.peer->lock);
 	hlist_add_head(&call->error_link, &conn->params.peer->error_targets);
 	spin_unlock(&conn->params.peer->lock);
 
-	write_lock_bh(&rxrpc_call_lock);
-	list_add_tail(&call->link, &rxrpc_calls);
-	write_unlock_bh(&rxrpc_call_lock);
+	_net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
 
-	call->local = conn->params.local;
-	call->epoch = conn->proto.epoch;
-	call->service_id = conn->params.service_id;
-	call->in_clientflag = RXRPC_CLIENT_INITIATED;
+	rxrpc_start_call_timer(call);
+	_leave("");
+}
 
-	_net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id);
+/*
+ * Queue a call's work processor, getting a ref to pass to the work queue.
+ */
+bool rxrpc_queue_call(struct rxrpc_call *call)
+{
+	const void *here = __builtin_return_address(0);
+	int n = __atomic_add_unless(&call->usage, 1, 0);
+	if (n == 0)
+		return false;
+	if (rxrpc_queue_work(&call->processor))
+		trace_rxrpc_call(call, rxrpc_call_queued, n + 1, here, NULL);
+	else
+		rxrpc_put_call(call, rxrpc_call_put_noqueue);
+	return true;
+}
 
-	call->lifetimer.expires = jiffies + rxrpc_max_call_lifetime;
-	add_timer(&call->lifetimer);
-	_leave(" = %p {%d} [new]", call, call->debug_id);
-	return call;
+/*
+ * Queue a call's work processor, passing the caller's ref to the work queue.
+ */
+bool __rxrpc_queue_call(struct rxrpc_call *call)
+{
+	const void *here = __builtin_return_address(0);
+	int n = atomic_read(&call->usage);
+	ASSERTCMP(n, >=, 1);
+	if (rxrpc_queue_work(&call->processor))
+		trace_rxrpc_call(call, rxrpc_call_queued_ref, n, here, NULL);
+	else
+		rxrpc_put_call(call, rxrpc_call_put_noqueue);
+	return true;
+}
 
-extant_call:
-	spin_unlock(&conn->channel_lock);
-	kmem_cache_free(rxrpc_call_jar, candidate);
-	_leave(" = %p {%d} [extant]", call, call ? call->debug_id : -1);
-	return call;
+/*
+ * Note the re-emergence of a call.
+ */
+void rxrpc_see_call(struct rxrpc_call *call)
+{
+	const void *here = __builtin_return_address(0);
+	if (call) {
+		int n = atomic_read(&call->usage);
 
-aborted_call:
-	spin_unlock(&conn->channel_lock);
-	kmem_cache_free(rxrpc_call_jar, candidate);
-	_leave(" = -ECONNABORTED");
-	return ERR_PTR(-ECONNABORTED);
-
-old_call:
-	spin_unlock(&conn->channel_lock);
-	kmem_cache_free(rxrpc_call_jar, candidate);
-	_leave(" = -ECONNRESET [old]");
-	return ERR_PTR(-ECONNRESET);
+		trace_rxrpc_call(call, rxrpc_call_seen, n, here, NULL);
+	}
 }
 
 /*
- * detach a call from a socket and set up for release
+ * Note the addition of a ref on a call.
  */
-void rxrpc_release_call(struct rxrpc_call *call)
+void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 {
+	const void *here = __builtin_return_address(0);
+	int n = atomic_inc_return(&call->usage);
+
+	trace_rxrpc_call(call, op, n, here, NULL);
+}
+
+/*
+ * Detach a call from its owning socket.
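
The two queueing helpers above differ only in who owns the reference that travels with the work item: rxrpc_queue_call() tries to take a new ref and backs off if the count has already hit zero, whereas __rxrpc_queue_call() donates the reference its caller already holds. A reduced userspace sketch of those two patterns (enqueue_work() and the other names are invented stand-ins):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct call { atomic_int usage; };

    /* Stand-in: returns false if the item was already queued */
    static bool enqueue_work(struct call *call) { (void)call; return true; }

    static void call_put(struct call *call)
    {
            atomic_fetch_sub(&call->usage, 1);
    }

    /* Take our own ref, unless the count has already reached zero. */
    static bool queue_call(struct call *call)
    {
            int n = atomic_load(&call->usage);

            do {
                    if (n == 0)
                            return false;   /* object is dying; hands off */
            } while (!atomic_compare_exchange_weak(&call->usage, &n, n + 1));

            if (!enqueue_work(call))
                    call_put(call);         /* already queued: drop our ref */
            return true;
    }

    /* Donate the caller's ref to the work item. */
    static void queue_call_donate_ref(struct call *call)
    {
            if (!enqueue_work(call))
                    call_put(call);         /* nobody took the ref: drop it */
    }

    int main(void)
    {
            struct call c = { .usage = 1 };
            if (!queue_call(&c))
                    return 1;
            queue_call_donate_ref(&c);
            return 0;
    }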
+ */ +void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) +{ + const void *here = __builtin_return_address(0); struct rxrpc_connection *conn = call->conn; - struct rxrpc_sock *rx = call->socket; + bool put = false; + int i; - _enter("{%d,%d,%d,%d}", - call->debug_id, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - call->rx_first_oos); + _enter("{%d,%d}", call->debug_id, atomic_read(&call->usage)); + + trace_rxrpc_call(call, rxrpc_call_release, atomic_read(&call->usage), + here, (const void *)call->flags); + + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); spin_lock_bh(&call->lock); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); spin_unlock_bh(&call->lock); - /* dissociate from the socket - * - the socket's ref on the call is passed to the death timer - */ - _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); + del_timer_sync(&call->timer); - spin_lock(&conn->params.peer->lock); - hlist_del_init(&call->error_link); - spin_unlock(&conn->params.peer->lock); + /* Make sure we don't get any more notifications */ + write_lock_bh(&rx->recvmsg_lock); - write_lock_bh(&rx->call_lock); - if (!list_empty(&call->accept_link)) { + if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", call, call->events, call->flags); - ASSERT(!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - list_del_init(&call->accept_link); - sk_acceptq_removed(&rx->sk); - } else if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - rb_erase(&call->sock_node, &rx->calls); - memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); + list_del(&call->recvmsg_link); + put = true; } - write_unlock_bh(&rx->call_lock); - /* free up the channel for reuse */ - write_lock_bh(&call->state_lock); + /* list_empty() must return false in rxrpc_notify_socket() */ + call->recvmsg_link.next = NULL; + call->recvmsg_link.prev = NULL; - if (call->state < RXRPC_CALL_COMPLETE && - call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { - _debug("+++ ABORTING STATE %d +++\n", call->state); - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_DEAD; - } - write_unlock_bh(&call->state_lock); + write_unlock_bh(&rx->recvmsg_lock); + if (put) + rxrpc_put_call(call, rxrpc_call_put); - rxrpc_disconnect_call(call); + write_lock(&rx->call_lock); - /* clean up the Rx queue */ - if (!skb_queue_empty(&call->rx_queue) || - !skb_queue_empty(&call->rx_oos_queue)) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; + if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + rb_erase(&call->sock_node, &rx->calls); + memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); + rxrpc_put_call(call, rxrpc_call_put_userid); + } - _debug("purge Rx queues"); + list_del(&call->sock_link); + write_unlock(&rx->call_lock); - spin_lock_bh(&call->lock); - while ((skb = skb_dequeue(&call->rx_queue)) || - (skb = skb_dequeue(&call->rx_oos_queue))) { - spin_unlock_bh(&call->lock); + _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn); - sp = rxrpc_skb(skb); - _debug("- zap %s %%%u #%u", - rxrpc_pkts[sp->hdr.type], - sp->hdr.serial, sp->hdr.seq); - rxrpc_free_skb(skb); - spin_lock_bh(&call->lock); - } - spin_unlock_bh(&call->lock); + if (conn) + rxrpc_disconnect_call(call); - ASSERTCMP(call->state, !=, RXRPC_CALL_COMPLETE); + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? 
rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); + call->rxtx_buffer[i] = NULL; } - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->lifetimer); - call->deadspan.expires = jiffies + rxrpc_dead_call_expiry; - add_timer(&call->deadspan); - _leave(""); } /* - * handle a dead call being ready for reaping - */ -static void rxrpc_dead_call_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - write_lock_bh(&call->state_lock); - call->state = RXRPC_CALL_DEAD; - write_unlock_bh(&call->state_lock); - rxrpc_put_call(call); -} - -/* - * mark a call as to be released, aborting it if it's still in progress - * - called with softirqs disabled - */ -static void rxrpc_mark_call_released(struct rxrpc_call *call) -{ - bool sched; - - write_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { - sched = false; - if (call->state < RXRPC_CALL_COMPLETE) { - _debug("abort call %p", call); - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_CALL_DEAD; - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) - sched = true; - } - if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - sched = true; - if (sched) - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); -} - -/* * release all the calls associated with a socket */ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) { struct rxrpc_call *call; - struct rb_node *p; _enter("%p", rx); - read_lock_bh(&rx->call_lock); - - /* mark all the calls as no longer wanting incoming packets */ - for (p = rb_first(&rx->calls); p; p = rb_next(p)) { - call = rb_entry(p, struct rxrpc_call, sock_node); - rxrpc_mark_call_released(call); - } - - /* kill the not-yet-accepted incoming calls */ - list_for_each_entry(call, &rx->secureq, accept_link) { - rxrpc_mark_call_released(call); + while (!list_empty(&rx->to_be_accepted)) { + call = list_entry(rx->to_be_accepted.next, + struct rxrpc_call, accept_link); + list_del(&call->accept_link); + rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_put_call(call, rxrpc_call_put); } - list_for_each_entry(call, &rx->acceptq, accept_link) { - rxrpc_mark_call_released(call); + while (!list_empty(&rx->sock_calls)) { + call = list_entry(rx->sock_calls.next, + struct rxrpc_call, sock_link); + rxrpc_get_call(call, rxrpc_call_got); + rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + rxrpc_release_call(rx, call); + rxrpc_put_call(call, rxrpc_call_put); } - read_unlock_bh(&rx->call_lock); _leave(""); } /* * release a call */ -void __rxrpc_put_call(struct rxrpc_call *call) +void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { + const void *here = __builtin_return_address(0); + int n; + ASSERT(call != NULL); - _enter("%p{u=%d}", call, atomic_read(&call->usage)); + n = atomic_dec_return(&call->usage); + trace_rxrpc_call(call, op, n, here, NULL); + ASSERTCMP(n, >=, 0); + if (n == 0) { + _debug("call %d dead", call->debug_id); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - ASSERTCMP(atomic_read(&call->usage), >, 0); + write_lock(&rxrpc_call_lock); + list_del_init(&call->link); + write_unlock(&rxrpc_call_lock); - if (atomic_dec_and_test(&call->usage)) { - _debug("call %d dead", call->debug_id); - WARN_ON(atomic_read(&call->skb_count) != 0); - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - rxrpc_queue_work(&call->destroyer); + rxrpc_cleanup_call(call); } - _leave(""); } /* 
@@ -615,187 +538,70 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); - rxrpc_purge_queue(&call->rx_queue); + rxrpc_put_peer(call->peer); + kfree(call->rxtx_buffer); + kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); } /* * clean up a call */ -static void rxrpc_cleanup_call(struct rxrpc_call *call) +void rxrpc_cleanup_call(struct rxrpc_call *call) { - _net("DESTROY CALL %d", call->debug_id); + int i; - ASSERT(call->socket); + _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->lifetimer); - del_timer_sync(&call->deadspan); - del_timer_sync(&call->ack_timer); - del_timer_sync(&call->resend_timer); + del_timer_sync(&call->timer); + ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - ASSERTCMP(call->events, ==, 0); - if (work_pending(&call->processor)) { - _debug("defer destroy"); - rxrpc_queue_work(&call->destroyer); - return; - } - ASSERTCMP(call->conn, ==, NULL); - if (call->acks_window) { - _debug("kill Tx window %d", - CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz)); - smp_mb(); - while (CIRC_CNT(call->acks_head, call->acks_tail, - call->acks_winsz) > 0) { - struct rxrpc_skb_priv *sp; - unsigned long _skb; - - _skb = call->acks_window[call->acks_tail] & ~1; - sp = rxrpc_skb((struct sk_buff *)_skb); - _debug("+++ clear Tx %u", sp->hdr.seq); - rxrpc_free_skb((struct sk_buff *)_skb); - call->acks_tail = - (call->acks_tail + 1) & (call->acks_winsz - 1); - } - - kfree(call->acks_window); - } + /* Clean up the Rx/Tx buffer */ + for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) + rxrpc_free_skb(call->rxtx_buffer[i], + (call->tx_phase ? rxrpc_skb_tx_cleaned : + rxrpc_skb_rx_cleaned)); - rxrpc_free_skb(call->tx_pending); + rxrpc_free_skb(call->tx_pending, rxrpc_skb_tx_cleaned); - rxrpc_purge_queue(&call->rx_queue); - ASSERT(skb_queue_empty(&call->rx_oos_queue)); - sock_put(&call->socket->sk); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } /* - * destroy a call - */ -static void rxrpc_destroy_call(struct work_struct *work) -{ - struct rxrpc_call *call = - container_of(work, struct rxrpc_call, destroyer); - - _enter("%p{%d,%d,%p}", - call, atomic_read(&call->usage), call->channel, call->conn); - - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - - write_lock_bh(&rxrpc_call_lock); - list_del_init(&call->link); - write_unlock_bh(&rxrpc_call_lock); - - rxrpc_cleanup_call(call); - _leave(""); -} - -/* - * preemptively destroy all the call records from a transport endpoint rather - * than waiting for them to time out + * Make sure that all calls are gone. 
*/ void __exit rxrpc_destroy_all_calls(void) { struct rxrpc_call *call; _enter(""); - write_lock_bh(&rxrpc_call_lock); + + if (list_empty(&rxrpc_calls)) + return; + + write_lock(&rxrpc_call_lock); while (!list_empty(&rxrpc_calls)) { call = list_entry(rxrpc_calls.next, struct rxrpc_call, link); _debug("Zapping call %p", call); + rxrpc_see_call(call); list_del_init(&call->link); - switch (atomic_read(&call->usage)) { - case 0: - ASSERTCMP(call->state, ==, RXRPC_CALL_DEAD); - break; - case 1: - if (del_timer_sync(&call->deadspan) != 0 && - call->state != RXRPC_CALL_DEAD) - rxrpc_dead_call_expired((unsigned long) call); - if (call->state != RXRPC_CALL_DEAD) - break; - default: - pr_err("Call %p still in use (%d,%d,%s,%lx,%lx)!\n", - call, atomic_read(&call->usage), - atomic_read(&call->ackr_not_idle), - rxrpc_call_states[call->state], - call->flags, call->events); - if (!skb_queue_empty(&call->rx_queue)) - pr_err("Rx queue occupied\n"); - if (!skb_queue_empty(&call->rx_oos_queue)) - pr_err("OOS queue occupied\n"); - break; - } - - write_unlock_bh(&rxrpc_call_lock); - cond_resched(); - write_lock_bh(&rxrpc_call_lock); - } - - write_unlock_bh(&rxrpc_call_lock); - _leave(""); -} - -/* - * handle call lifetime being exceeded - */ -static void rxrpc_call_life_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; + pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", + call, atomic_read(&call->usage), + rxrpc_call_states[call->state], + call->flags, call->events); - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - _enter("{%d}", call->debug_id); - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events); - rxrpc_queue_call(call); + write_unlock(&rxrpc_call_lock); + cond_resched(); + write_lock(&rxrpc_call_lock); } - read_unlock_bh(&call->state_lock); -} - -/* - * handle resend timer expiry - * - may not take call->state_lock as this can deadlock against del_timer_sync() - */ -static void rxrpc_resend_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); -} - -/* - * handle ACK timer expiry - */ -static void rxrpc_ack_time_expired(unsigned long _call) -{ - struct rxrpc_call *call = (struct rxrpc_call *) _call; - - _enter("{%d}", call->debug_id); - - if (call->state >= RXRPC_CALL_COMPLETE) - return; - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); + write_unlock(&rxrpc_call_lock); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9e91f27b0d0f..60ef9605167e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -7,6 +7,68 @@ * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. + * + * + * Client connections need to be cached for a little while after they've made a + * call so as to handle retransmitted DATA packets in case the server didn't + * receive the final ACK or terminating ABORT we sent it. 
+ *
+ * Client connections can be in one of a number of cache states:
+ *
+ * (1) INACTIVE - The connection is not held in any list and may not have been
+ *     exposed to the world. If it has been previously exposed, it was
+ *     discarded from the idle list after expiring.
+ *
+ * (2) WAITING - The connection is waiting for the number of client conns to
+ *     drop below the maximum capacity. Calls may be in progress upon it from
+ *     when it was active and got culled.
+ *
+ *     The connection is on the rxrpc_waiting_client_conns list which is kept
+ *     in to-be-granted order. Culled conns with waiters go to the back of
+ *     the queue just like new conns.
+ *
+ * (3) ACTIVE - The connection has at least one call in progress upon it, it
+ *     may freely grant available channels to new calls and calls may be
+ *     waiting on it for channels to become available.
+ *
+ *     The connection is on the rxrpc_active_client_conns list which is kept
+ *     in activation order for culling purposes.
+ *
+ *     rxrpc_nr_active_client_conns is held incremented also.
+ *
+ * (4) CULLED - The connection got summarily culled to try and free up
+ *     capacity. Calls currently in progress on the connection are allowed to
+ *     continue, but new calls will have to wait. There can be no waiters in
+ *     this state - the conn would have to go to the WAITING state instead.
+ *
+ * (5) IDLE - The connection has no calls in progress upon it and must have
+ *     been exposed to the world (ie. the EXPOSED flag must be set). When it
+ *     expires, the EXPOSED flag is cleared and the connection transitions to
+ *     the INACTIVE state.
+ *
+ *     The connection is on the rxrpc_idle_client_conns list which is kept in
+ *     order of how soon they'll expire.
+ *
+ * There are flags of relevance to the cache:
+ *
+ * (1) EXPOSED - The connection ID got exposed to the world. If this flag is
+ *     set, an extra ref is added to the connection preventing it from being
+ *     reaped when it has no calls outstanding. This flag is cleared and the
+ *     ref dropped when a conn is discarded from the idle list.
+ *
+ *     This allows us to move terminal call state retransmission to the
+ *     connection and to discard the call as soon as we think it is done
+ *     with. It also gives us a chance to reuse the connection.
+ *
+ * (2) DONT_REUSE - The connection should be discarded as soon as possible and
+ *     should not be reused. This is set when an exclusive connection is used
+ *     or a call ID counter overflows.
+ *
+ * The caching state may only be changed if the cache lock is held.
+ *
+ * There are two idle client connection expiry durations. If the total number
+ * of connections is below the reap threshold, we use the normal duration; if
+ * it's above, we use the fast duration.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -16,27 +78,50 @@ #include <linux/timer.h> #include "ar-internal.h" +__read_mostly unsigned int rxrpc_max_client_connections = 1000; +__read_mostly unsigned int rxrpc_reap_client_connections = 900; +__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ; +__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ; + +static unsigned int rxrpc_nr_client_conns; +static unsigned int rxrpc_nr_active_client_conns; +static __read_mostly bool rxrpc_kill_all_client_conns; + +static DEFINE_SPINLOCK(rxrpc_client_conn_cache_lock); +static DEFINE_SPINLOCK(rxrpc_client_conn_discard_mutex); +static LIST_HEAD(rxrpc_waiting_client_conns); +static LIST_HEAD(rxrpc_active_client_conns); +static LIST_HEAD(rxrpc_idle_client_conns); + /* * We use machine-unique IDs for our client connections. */ DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_cull_active_client_conns(void); +static void rxrpc_discard_expired_client_conns(struct work_struct *); + +static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, + rxrpc_discard_expired_client_conns); + +const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { + [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", + [RXRPC_CONN_CLIENT_WAITING] = "Wait", + [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", + [RXRPC_CONN_CLIENT_CULLED] = "Cull", + [RXRPC_CONN_CLIENT_IDLE] = "Idle", +}; + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The - * epoch is changed if this wraps. - * - * TODO: The IDR tree gets very expensive on memory if the connection IDs are - * widely scattered throughout the number space, so we shall need to retire - * connections that have, say, an ID more than four times the maximum number of - * client conns away from the current allocation point to try and keep the IDs - * concentrated. We will also need to retire connections from an old epoch. + * epoch doesn't change until the client is rebooted (or, at least, unless the + * module is unloaded). */ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { - u32 epoch; int id; _enter(""); @@ -44,34 +129,18 @@ static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, idr_preload(gfp); spin_lock(&rxrpc_conn_id_lock); - epoch = rxrpc_epoch; - - /* We could use idr_alloc_cyclic() here, but we really need to know - * when the thing wraps so that we can advance the epoch. - */ - if (rxrpc_client_conn_ids.cur == 0) - rxrpc_client_conn_ids.cur = 1; - id = idr_alloc(&rxrpc_client_conn_ids, conn, - rxrpc_client_conn_ids.cur, 0x40000000, GFP_NOWAIT); - if (id < 0) { - if (id != -ENOSPC) - goto error; - id = idr_alloc(&rxrpc_client_conn_ids, conn, - 1, 0x40000000, GFP_NOWAIT); - if (id < 0) - goto error; - epoch++; - rxrpc_epoch = epoch; - } - rxrpc_client_conn_ids.cur = id + 1; + id = idr_alloc_cyclic(&rxrpc_client_conn_ids, conn, + 1, 0x40000000, GFP_NOWAIT); + if (id < 0) + goto error; spin_unlock(&rxrpc_conn_id_lock); idr_preload_end(); - conn->proto.epoch = epoch; + conn->proto.epoch = rxrpc_epoch; conn->proto.cid = id << RXRPC_CIDSHIFT; set_bit(RXRPC_CONN_HAS_IDR, &conn->flags); - _leave(" [CID %x:%x]", epoch, conn->proto.cid); + _leave(" [CID %x]", conn->proto.cid); return 0; error: @@ -114,8 +183,7 @@ void rxrpc_destroy_client_conn_ids(void) } /* - * Allocate a client connection. The caller must take care to clear any - * padding bytes in *cp. 
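
The cache states described in the header comment above form a small state machine. As a compact summary (shortened names; a reading aid, not the kernel's own definitions), the states and the transitions the rest of this file implements are:

    enum conn_cache_state {
            CONN_INACTIVE,  /* not on any list, not exposed (any more) */
            CONN_WAITING,   /* on waiting list until capacity frees up */
            CONN_ACTIVE,    /* on active list, may grant channels */
            CONN_CULLED,    /* evicted for capacity; existing calls run on */
            CONN_IDLE,      /* on idle list, exposed, awaiting expiry */
    };

    /* Transitions implemented below:
     *   INACTIVE/IDLE/CULLED -> ACTIVE   (animate, capacity available)
     *   INACTIVE/IDLE/CULLED -> WAITING  (animate, over capacity)
     *   WAITING  -> ACTIVE               (a conn is put, freeing capacity)
     *   ACTIVE   -> CULLED or WAITING    (cull when over the reap limit)
     *   ACTIVE   -> IDLE or INACTIVE     (last active channel goes quiet)
     *   IDLE     -> INACTIVE             (expiry; EXPOSED ref dropped)
     */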
+ * Allocate a client connection. */ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) @@ -131,6 +199,10 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) return ERR_PTR(-ENOMEM); } + atomic_set(&conn->usage, 1); + if (cp->exclusive) + __set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + conn->params = *cp; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; @@ -148,7 +220,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) goto error_2; write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); write_unlock(&rxrpc_connection_lock); /* We steal the caller's peer ref. */ @@ -156,6 +228,9 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) rxrpc_get_local(conn->params.local); key_get(conn->params.key); + trace_rxrpc_conn(conn, rxrpc_conn_new_client, atomic_read(&conn->usage), + __builtin_return_address(0)); + trace_rxrpc_client(conn, -1, rxrpc_client_alloc); _leave(" = %p", conn); return conn; @@ -170,32 +245,68 @@ error_0: } /* - * find a connection for a call - * - called in process context with IRQs enabled + * Determine if a connection may be reused. */ -int rxrpc_connect_call(struct rxrpc_call *call, - struct rxrpc_conn_parameters *cp, - struct sockaddr_rxrpc *srx, - gfp_t gfp) +static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) +{ + int id_cursor, id, distance, limit; + + if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) + goto dont_reuse; + + if (conn->proto.epoch != rxrpc_epoch) + goto mark_dont_reuse; + + /* The IDR tree gets very expensive on memory if the connection IDs are + * widely scattered throughout the number space, so we shall want to + * kill off connections that, say, have an ID more than about four + * times the maximum number of client conns away from the current + * allocation point to try and keep the IDs concentrated. + */ + id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); + id = conn->proto.cid >> RXRPC_CIDSHIFT; + distance = id - id_cursor; + if (distance < 0) + distance = -distance; + limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; + if (distance > limit) + goto mark_dont_reuse; + + return true; + +mark_dont_reuse: + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); +dont_reuse: + return false; +} + +/* + * Create or find a client connection to use for a call. + * + * If we return with a connection, the call will be on its waiting list. It's + * left to the caller to assign a channel and wake up the call. + */ +static int rxrpc_get_client_conn(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) { struct rxrpc_connection *conn, *candidate = NULL; struct rxrpc_local *local = cp->local; struct rb_node *p, **pp, *parent; long diff; - int chan; - - DECLARE_WAITQUEUE(myself, current); + int ret = -ENOMEM; _enter("{%d,%lx},", call->debug_id, call->user_call_ID); cp->peer = rxrpc_lookup_peer(cp->local, srx, gfp); if (!cp->peer) - return -ENOMEM; + goto error; + /* If the connection is not meant to be exclusive, search the available + * connections to see if the connection we want to use already exists. + */ if (!cp->exclusive) { - /* Search for a existing client connection unless this is going - * to be a connection that's used exclusively for a single call. 
- */ _debug("search 1"); spin_lock(&local->client_conns_lock); p = local->client_conns.rb_node; @@ -206,39 +317,56 @@ int rxrpc_connect_call(struct rxrpc_call *call, diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level)); - if (diff < 0) +#undef cmp + if (diff < 0) { p = p->rb_left; - else if (diff > 0) + } else if (diff > 0) { p = p->rb_right; - else - goto found_extant_conn; + } else { + if (rxrpc_may_reuse_conn(conn) && + rxrpc_get_connection_maybe(conn)) + goto found_extant_conn; + /* The connection needs replacing. It's better + * to effect that when we have something to + * replace it with so that we don't have to + * rebalance the tree twice. + */ + break; + } } spin_unlock(&local->client_conns_lock); } - /* We didn't find a connection or we want an exclusive one. */ - _debug("get new conn"); + /* There wasn't a connection yet or we need an exclusive connection. + * We need to create a candidate and then potentially redo the search + * in case we're racing with another thread also trying to connect on a + * shareable connection. + */ + _debug("new conn"); candidate = rxrpc_alloc_client_connection(cp, gfp); - if (!candidate) { - _leave(" = -ENOMEM"); - return -ENOMEM; + if (IS_ERR(candidate)) { + ret = PTR_ERR(candidate); + goto error_peer; } + /* Add the call to the new connection's waiting list in case we're + * going to have to wait for the connection to come live. It's our + * connection, so we want first dibs on the channel slots. We would + * normally have to take channel_lock but we do this before anyone else + * can see the connection. + */ + list_add_tail(&call->chan_wait_link, &candidate->waiting_calls); + if (cp->exclusive) { - /* Assign the call on an exclusive connection to channel 0 and - * don't add the connection to the endpoint's shareable conn - * lookup tree. - */ - _debug("exclusive chan 0"); - conn = candidate; - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); - chan = 0; - goto found_channel; + call->conn = candidate; + call->security_ix = candidate->security_ix; + _leave(" = 0 [exclusive %d]", candidate->debug_id); + return 0; } - /* We need to redo the search before attempting to add a new connection - * lest we race with someone else adding a conflicting instance. + /* Publish the new connection for userspace to find. We need to redo + * the search before doing this lest we race with someone else adding a + * conflicting instance. */ _debug("search 2"); spin_lock(&local->client_conns_lock); @@ -249,124 +377,711 @@ int rxrpc_connect_call(struct rxrpc_call *call, parent = *pp; conn = rb_entry(parent, struct rxrpc_connection, client_node); +#define cmp(X) ((long)conn->params.X - (long)candidate->params.X) diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level)); - if (diff < 0) +#undef cmp + if (diff < 0) { pp = &(*pp)->rb_left; - else if (diff > 0) + } else if (diff > 0) { pp = &(*pp)->rb_right; - else - goto found_extant_conn; + } else { + if (rxrpc_may_reuse_conn(conn) && + rxrpc_get_connection_maybe(conn)) + goto found_extant_conn; + /* The old connection is from an outdated epoch. */ + _debug("replace conn"); + clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); + rb_replace_node(&conn->client_node, + &candidate->client_node, + &local->client_conns); + trace_rxrpc_client(conn, -1, rxrpc_client_replace); + goto candidate_published; + } } - /* The second search also failed; simply add the new connection with - * the new call in channel 0. Note that we need to take the channel - * lock before dropping the client conn lock. 
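
The connect path here uses the optimistic-insert idiom: allocate a candidate without holding the lock, then redo the search under the lock and discard the candidate if somebody else won the race. A self-contained userspace sketch of the idiom, using a plain list in place of the rb-tree (all names invented):

    #include <pthread.h>
    #include <stdlib.h>

    struct conn {
            int key;
            int refs;
            struct conn *next;
    };

    static struct conn *conns;
    static pthread_mutex_t conns_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct conn *find_locked(int key)
    {
            struct conn *c;
            for (c = conns; c; c = c->next)
                    if (c->key == key)
                            return c;
            return NULL;
    }

    /* Allocate outside the lock, re-search under it in case we raced. */
    static struct conn *get_conn(int key)
    {
            struct conn *candidate, *c;

            candidate = calloc(1, sizeof(*candidate));
            if (!candidate)
                    return NULL;
            candidate->key = key;
            candidate->refs = 1;

            pthread_mutex_lock(&conns_lock);
            c = find_locked(key);
            if (c) {
                    c->refs++;                      /* reuse the extant conn */
                    pthread_mutex_unlock(&conns_lock);
                    free(candidate);                /* lost the race */
                    return c;
            }
            candidate->next = conns;                /* publish the candidate */
            conns = candidate;
            pthread_mutex_unlock(&conns_lock);
            return candidate;
    }

    int main(void)
    {
            struct conn *a = get_conn(1), *b = get_conn(1);
            return (a && a == b && a->refs == 2) ? 0 : 1;
    }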
- */ _debug("new conn"); - set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); rb_link_node(&candidate->client_node, parent, pp); rb_insert_color(&candidate->client_node, &local->client_conns); -attached: - conn = candidate; - candidate = NULL; - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS - 1); - spin_lock(&conn->channel_lock); +candidate_published: + set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); + call->conn = candidate; + call->security_ix = candidate->security_ix; spin_unlock(&local->client_conns_lock); - chan = 0; + _leave(" = 0 [new %d]", candidate->debug_id); + return 0; -found_channel: - _debug("found chan"); - call->conn = conn; - call->channel = chan; - call->epoch = conn->proto.epoch; - call->cid = conn->proto.cid | chan; - call->call_id = ++conn->channels[chan].call_counter; - conn->channels[chan].call_id = call->call_id; - rcu_assign_pointer(conn->channels[chan].call, call); + /* We come here if we found a suitable connection already in existence. + * Discard any candidate we may have allocated, and try to get a + * channel on this one. + */ +found_extant_conn: + _debug("found conn"); + spin_unlock(&local->client_conns_lock); - _net("CONNECT call %d on conn %d", call->debug_id, conn->debug_id); + if (candidate) { + trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); + rxrpc_put_connection(candidate); + candidate = NULL; + } + spin_lock(&conn->channel_lock); + call->conn = conn; + call->security_ix = conn->security_ix; + list_add(&call->chan_wait_link, &conn->waiting_calls); spin_unlock(&conn->channel_lock); + _leave(" = 0 [extant %d]", conn->debug_id); + return 0; + +error_peer: rxrpc_put_peer(cp->peer); cp->peer = NULL; - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Activate a connection. + */ +static void rxrpc_activate_conn(struct rxrpc_connection *conn) +{ + trace_rxrpc_client(conn, -1, rxrpc_client_to_active); + conn->cache_state = RXRPC_CONN_CLIENT_ACTIVE; + rxrpc_nr_active_client_conns++; + list_move_tail(&conn->cache_link, &rxrpc_active_client_conns); +} + +/* + * Attempt to animate a connection for a new call. + * + * If it's not exclusive, the connection is in the endpoint tree, and we're in + * the conn's list of those waiting to grab a channel. There is, however, a + * limit on the number of live connections allowed at any one time, so we may + * have to wait for capacity to become available. + * + * Note that a connection on the waiting queue might *also* have active + * channels if it has been culled to make space and then re-requested by a new + * call. 
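
The capacity rule that rxrpc_animate_client_conn() applies below reduces to a bounded counter: a conn may go ACTIVE only while the global count is under the limit, otherwise it queues as WAITING. A stripped-down sketch (single-threaded; the real code holds the cache lock and counts via a COUNTED flag):

    #include <stdbool.h>

    #define MAX_CLIENT_CONNS 1000

    static unsigned int nr_client_conns;

    /* Activate if there is capacity; otherwise the conn must wait. */
    static bool try_activate(void)
    {
            if (nr_client_conns >= MAX_CLIENT_CONNS)
                    return false;   /* join rxrpc_waiting_client_conns */
            nr_client_conns++;
            return true;
    }

    int main(void)
    {
            return try_activate() ? 0 : 1;
    }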
+ */ +static void rxrpc_animate_client_conn(struct rxrpc_connection *conn) +{ + unsigned int nr_conns; + + _enter("%d,%d", conn->debug_id, conn->cache_state); + + if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE) + goto out; + + spin_lock(&rxrpc_client_conn_cache_lock); + + nr_conns = rxrpc_nr_client_conns; + if (!test_and_set_bit(RXRPC_CONN_COUNTED, &conn->flags)) { + trace_rxrpc_client(conn, -1, rxrpc_client_count); + rxrpc_nr_client_conns = nr_conns + 1; + } + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + case RXRPC_CONN_CLIENT_WAITING: + break; + + case RXRPC_CONN_CLIENT_INACTIVE: + case RXRPC_CONN_CLIENT_CULLED: + case RXRPC_CONN_CLIENT_IDLE: + if (nr_conns >= rxrpc_max_client_connections) + goto wait_for_capacity; + goto activate_conn; + + default: + BUG(); + } + +out_unlock: + spin_unlock(&rxrpc_client_conn_cache_lock); +out: + _leave(" [%d]", conn->cache_state); + return; + +activate_conn: + _debug("activate"); + rxrpc_activate_conn(conn); + goto out_unlock; + +wait_for_capacity: + _debug("wait"); + trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting); + conn->cache_state = RXRPC_CONN_CLIENT_WAITING; + list_move_tail(&conn->cache_link, &rxrpc_waiting_client_conns); + goto out_unlock; +} + +/* + * Deactivate a channel. + */ +static void rxrpc_deactivate_one_channel(struct rxrpc_connection *conn, + unsigned int channel) +{ + struct rxrpc_channel *chan = &conn->channels[channel]; + + rcu_assign_pointer(chan->call, NULL); + conn->active_chans &= ~(1 << channel); +} + +/* + * Assign a channel to the call at the front of the queue and wake the call up. + * We don't increment the callNumber counter until this number has been exposed + * to the world. + */ +static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, + unsigned int channel) +{ + struct rxrpc_channel *chan = &conn->channels[channel]; + struct rxrpc_call *call = list_entry(conn->waiting_calls.next, + struct rxrpc_call, chan_wait_link); + u32 call_id = chan->call_counter + 1; + + trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate); + + write_lock_bh(&call->state_lock); + call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; + write_unlock_bh(&call->state_lock); + + rxrpc_see_call(call); + list_del_init(&call->chan_wait_link); + conn->active_chans |= 1 << channel; + call->peer = rxrpc_get_peer(conn->params.peer); + call->cid = conn->proto.cid | channel; + call->call_id = call_id; + + _net("CONNECT call %08x:%08x as call %d on conn %d", + call->cid, call->call_id, call->debug_id, conn->debug_id); - /* We found a potentially suitable connection already in existence. If - * we can reuse it (ie. its usage count hasn't been reduced to 0 by the - * reaper), discard any candidate we may have allocated, and try to get - * a channel on this one, otherwise we have to replace it. + /* Paired with the read barrier in rxrpc_wait_for_channel(). This + * orders cid and epoch in the connection wrt to call_id without the + * need to take the channel_lock. + * + * We provisionally assign a callNumber at this point, but we don't + * confirm it until the call is about to be exposed. + * + * TODO: Pair with a barrier in the data_ready handler when that looks + * at the call ID through a connection channel. 
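
The barrier pairing described above, reduced to a userspace skeleton: the producer stores the ordinary fields, issues a write barrier, then stores call_id; the consumer waits for call_id, issues a read barrier, then reads the fields. C11 fences stand in for smp_wmb()/smp_rmb() (names invented):

    #include <stdatomic.h>

    struct call {
            unsigned int cid;       /* written before the barrier */
            atomic_uint call_id;    /* written after; acts as the flag */
    };

    static void producer(struct call *call, unsigned int cid, unsigned int id)
    {
            call->cid = cid;                                /* plain store */
            atomic_thread_fence(memory_order_release);      /* smp_wmb() */
            atomic_store_explicit(&call->call_id, id, memory_order_relaxed);
    }

    static unsigned int consumer(struct call *call)
    {
            while (!atomic_load_explicit(&call->call_id, memory_order_relaxed))
                    ;                                       /* schedule() in the kernel */
            atomic_thread_fence(memory_order_acquire);      /* smp_rmb() */
            return call->cid;       /* guaranteed visible after the fence */
    }

    int main(void)
    {
            struct call call = { .cid = 0 };
            producer(&call, 0x12345678, 1);
            return consumer(&call) == 0x12345678 ? 0 : 1;
    }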
*/ -found_extant_conn: - _debug("found conn"); - if (!rxrpc_get_connection_maybe(conn)) { - set_bit(RXRPC_CONN_IN_CLIENT_CONNS, &candidate->flags); - rb_replace_node(&conn->client_node, - &candidate->client_node, - &local->client_conns); - clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags); - goto attached; + smp_wmb(); + chan->call_id = call_id; + rcu_assign_pointer(chan->call, call); + wake_up(&call->waitq); +} + +/* + * Assign channels and callNumbers to waiting calls with channel_lock + * held by caller. + */ +static void rxrpc_activate_channels_locked(struct rxrpc_connection *conn) +{ + u8 avail, mask; + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + mask = RXRPC_ACTIVE_CHANS_MASK; + break; + default: + return; } - spin_unlock(&local->client_conns_lock); + while (!list_empty(&conn->waiting_calls) && + (avail = ~conn->active_chans, + avail &= mask, + avail != 0)) + rxrpc_activate_one_channel(conn, __ffs(avail)); +} + +/* + * Assign channels and callNumbers to waiting calls. + */ +static void rxrpc_activate_channels(struct rxrpc_connection *conn) +{ + _enter("%d", conn->debug_id); - rxrpc_put_connection(candidate); + trace_rxrpc_client(conn, -1, rxrpc_client_activate_chans); + + if (conn->active_chans == RXRPC_ACTIVE_CHANS_MASK) + return; + + spin_lock(&conn->channel_lock); + rxrpc_activate_channels_locked(conn); + spin_unlock(&conn->channel_lock); + _leave(""); +} + +/* + * Wait for a callNumber and a channel to be granted to a call. + */ +static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp) +{ + int ret = 0; + + _enter("%d", call->debug_id); + + if (!call->call_id) { + DECLARE_WAITQUEUE(myself, current); - if (!atomic_add_unless(&conn->avail_chans, -1, 0)) { if (!gfpflags_allow_blocking(gfp)) { - rxrpc_put_connection(conn); - _leave(" = -EAGAIN"); - return -EAGAIN; + ret = -EAGAIN; + goto out; } - add_wait_queue(&conn->channel_wq, &myself); + add_wait_queue_exclusive(&call->waitq, &myself); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - if (atomic_add_unless(&conn->avail_chans, -1, 0)) + if (call->call_id) + break; + if (signal_pending(current)) { + ret = -ERESTARTSYS; break; - if (signal_pending(current)) - goto interrupted; + } schedule(); } - remove_wait_queue(&conn->channel_wq, &myself); + remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); } - /* The connection allegedly now has a free channel and we can now - * attach the call to it. - */ + /* Paired with the write barrier in rxrpc_activate_one_channel(). */ + smp_rmb(); + +out: + _leave(" = %d", ret); + return ret; +} + +/* + * find a connection for a call + * - called in process context with IRQs enabled + */ +int rxrpc_connect_call(struct rxrpc_call *call, + struct rxrpc_conn_parameters *cp, + struct sockaddr_rxrpc *srx, + gfp_t gfp) +{ + int ret; + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + + rxrpc_discard_expired_client_conns(NULL); + rxrpc_cull_active_client_conns(); + + ret = rxrpc_get_client_conn(call, cp, srx, gfp); + if (ret < 0) + return ret; + + rxrpc_animate_client_conn(call->conn); + rxrpc_activate_channels(call->conn); + + ret = rxrpc_wait_for_channel(call, gfp); + if (ret < 0) + rxrpc_disconnect_client_call(call); + + _leave(" = %d", ret); + return ret; +} + +/* + * Note that a connection is about to be exposed to the world. 
Once it is + * exposed, we maintain an extra ref on it that stops it from being summarily + * discarded before it's (a) had a chance to deal with retransmission and (b) + * had a chance at re-use (the per-connection security negotiation is + * expensive). + */ +static void rxrpc_expose_client_conn(struct rxrpc_connection *conn, + unsigned int channel) +{ + if (!test_and_set_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_exposed); + rxrpc_get_connection(conn); + } +} + +/* + * Note that a call, and thus a connection, is about to be exposed to the + * world. + */ +void rxrpc_expose_client_call(struct rxrpc_call *call) +{ + unsigned int channel = call->cid & RXRPC_CHANNELMASK; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_channel *chan = &conn->channels[channel]; + + if (!test_and_set_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + /* Mark the call ID as being used. If the callNumber counter + * exceeds ~2 billion, we kill the connection after its + * outstanding calls have finished so that the counter doesn't + * wrap. + */ + chan->call_counter++; + if (chan->call_counter >= INT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + rxrpc_expose_client_conn(conn, channel); + } +} + +/* + * Disconnect a client call. + */ +void rxrpc_disconnect_client_call(struct rxrpc_call *call) +{ + unsigned int channel = call->cid & RXRPC_CHANNELMASK; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_channel *chan = &conn->channels[channel]; + + trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); + call->conn = NULL; + spin_lock(&conn->channel_lock); - for (chan = 0; chan < RXRPC_MAXCALLS; chan++) - if (!conn->channels[chan].call) - goto found_channel; - BUG(); + /* Calls that have never actually been assigned a channel can simply be + * discarded. If the conn didn't get used either, it will follow + * immediately unless someone else grabs it in the meantime. + */ + if (!list_empty(&call->chan_wait_link)) { + _debug("call is waiting"); + ASSERTCMP(call->call_id, ==, 0); + ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); + list_del_init(&call->chan_wait_link); + + trace_rxrpc_client(conn, channel, rxrpc_client_chan_unstarted); + + /* We must deactivate or idle the connection if it's now + * waiting for nothing. + */ + spin_lock(&rxrpc_client_conn_cache_lock); + if (conn->cache_state == RXRPC_CONN_CLIENT_WAITING && + list_empty(&conn->waiting_calls) && + !conn->active_chans) + goto idle_connection; + goto out; + } + + ASSERTCMP(rcu_access_pointer(chan->call), ==, call); + + /* If a client call was exposed to the world, we save the result for + * retransmission. + * + * We use a barrier here so that the call number and abort code can be + * read without needing to take a lock. + * + * TODO: Make the incoming packet handler check this and handle + * terminal retransmission without requiring access to the call. + */ + if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { + _debug("exposed %u,%u", call->call_id, call->abort_code); + __rxrpc_disconnect_call(conn, call); + } + + /* See if we can pass the channel directly to another call. */ + if (conn->cache_state == RXRPC_CONN_CLIENT_ACTIVE && + !list_empty(&conn->waiting_calls)) { + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); + rxrpc_activate_one_channel(conn, channel); + goto out_2; + } + + /* Things are more complex and we need the cache lock. We might be + * able to simply idle the conn or it might now be lurking on the wait + * list. 
It might even get moved back to the active list whilst we're + * waiting for the lock. + */ + spin_lock(&rxrpc_client_conn_cache_lock); + + switch (conn->cache_state) { + case RXRPC_CONN_CLIENT_ACTIVE: + if (list_empty(&conn->waiting_calls)) { + rxrpc_deactivate_one_channel(conn, channel); + if (!conn->active_chans) { + rxrpc_nr_active_client_conns--; + goto idle_connection; + } + goto out; + } + + trace_rxrpc_client(conn, channel, rxrpc_client_chan_pass); + rxrpc_activate_one_channel(conn, channel); + goto out; -interrupted: - remove_wait_queue(&conn->channel_wq, &myself); - __set_current_state(TASK_RUNNING); + case RXRPC_CONN_CLIENT_CULLED: + rxrpc_deactivate_one_channel(conn, channel); + ASSERT(list_empty(&conn->waiting_calls)); + if (!conn->active_chans) + goto idle_connection; + goto out; + + case RXRPC_CONN_CLIENT_WAITING: + rxrpc_deactivate_one_channel(conn, channel); + goto out; + + default: + BUG(); + } + +out: + spin_unlock(&rxrpc_client_conn_cache_lock); +out_2: + spin_unlock(&conn->channel_lock); rxrpc_put_connection(conn); - rxrpc_put_peer(cp->peer); - cp->peer = NULL; - _leave(" = -ERESTARTSYS"); - return -ERESTARTSYS; + _leave(""); + return; + +idle_connection: + /* As no channels remain active, the connection gets deactivated + * immediately or moved to the idle list for a short while. + */ + if (test_bit(RXRPC_CONN_EXPOSED, &conn->flags)) { + trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); + conn->idle_timestamp = jiffies; + conn->cache_state = RXRPC_CONN_CLIENT_IDLE; + list_move_tail(&conn->cache_link, &rxrpc_idle_client_conns); + if (rxrpc_idle_client_conns.next == &conn->cache_link && + !rxrpc_kill_all_client_conns) + queue_delayed_work(rxrpc_workqueue, + &rxrpc_client_conn_reap, + rxrpc_conn_idle_client_expiry); + } else { + trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive); + conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; + list_del_init(&conn->cache_link); + } + goto out; } /* - * Remove a client connection from the local endpoint's tree, thereby removing - * it as a target for reuse for new client calls. + * Clean up a dead client connection. 
 */
-void rxrpc_unpublish_client_conn(struct rxrpc_connection *conn)
+static struct rxrpc_connection *
+rxrpc_put_one_client_conn(struct rxrpc_connection *conn)
 {
+	struct rxrpc_connection *next = NULL;
 	struct rxrpc_local *local = conn->params.local;
+	unsigned int nr_conns;
 
-	spin_lock(&local->client_conns_lock);
-	if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags))
-		rb_erase(&conn->client_node, &local->client_conns);
-	spin_unlock(&local->client_conns_lock);
+	trace_rxrpc_client(conn, -1, rxrpc_client_cleanup);
+
+	if (test_bit(RXRPC_CONN_IN_CLIENT_CONNS, &conn->flags)) {
+		spin_lock(&local->client_conns_lock);
+		if (test_and_clear_bit(RXRPC_CONN_IN_CLIENT_CONNS,
+				       &conn->flags))
+			rb_erase(&conn->client_node, &local->client_conns);
+		spin_unlock(&local->client_conns_lock);
+	}
 
 	rxrpc_put_client_connection_id(conn);
+
+	ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_INACTIVE);
+
+	if (test_bit(RXRPC_CONN_COUNTED, &conn->flags)) {
+		trace_rxrpc_client(conn, -1, rxrpc_client_uncount);
+		spin_lock(&rxrpc_client_conn_cache_lock);
+		nr_conns = --rxrpc_nr_client_conns;
+
+		if (nr_conns < rxrpc_max_client_connections &&
+		    !list_empty(&rxrpc_waiting_client_conns)) {
+			next = list_entry(rxrpc_waiting_client_conns.next,
+					  struct rxrpc_connection, cache_link);
+			rxrpc_get_connection(next);
+			rxrpc_activate_conn(next);
+		}
+
+		spin_unlock(&rxrpc_client_conn_cache_lock);
+	}
+
+	rxrpc_kill_connection(conn);
+	if (next)
+		rxrpc_activate_channels(next);
+
+	/* We need to get rid of the temporary ref we took upon next, but we
+	 * can't call rxrpc_put_connection() recursively.
+	 */
+	return next;
+}
+
+/*
+ * Clean up dead client connections.
+ */
+void rxrpc_put_client_conn(struct rxrpc_connection *conn)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	do {
+		n = atomic_dec_return(&conn->usage);
+		trace_rxrpc_conn(conn, rxrpc_conn_put_client, n, here);
+		if (n > 0)
+			return;
+		ASSERTCMP(n, >=, 0);
+
+		conn = rxrpc_put_one_client_conn(conn);
+	} while (conn);
+}
+
+/*
+ * Kill the longest-active client connections to make room for new ones.
+ */
+static void rxrpc_cull_active_client_conns(void)
+{
+	struct rxrpc_connection *conn;
+	unsigned int nr_conns = rxrpc_nr_client_conns;
+	unsigned int nr_active, limit;
+
+	_enter("");
+
+	ASSERTCMP(nr_conns, >=, 0);
+	if (nr_conns < rxrpc_max_client_connections) {
+		_leave(" [ok]");
+		return;
+	}
+	limit = rxrpc_reap_client_connections;
+
+	spin_lock(&rxrpc_client_conn_cache_lock);
+	nr_active = rxrpc_nr_active_client_conns;
+
+	while (nr_active > limit) {
+		ASSERT(!list_empty(&rxrpc_active_client_conns));
+		conn = list_entry(rxrpc_active_client_conns.next,
+				  struct rxrpc_connection, cache_link);
+		ASSERTCMP(conn->cache_state, ==, RXRPC_CONN_CLIENT_ACTIVE);
+
+		if (list_empty(&conn->waiting_calls)) {
+			trace_rxrpc_client(conn, -1, rxrpc_client_to_culled);
+			conn->cache_state = RXRPC_CONN_CLIENT_CULLED;
+			list_del_init(&conn->cache_link);
+		} else {
+			trace_rxrpc_client(conn, -1, rxrpc_client_to_waiting);
+			conn->cache_state = RXRPC_CONN_CLIENT_WAITING;
+			list_move_tail(&conn->cache_link,
+				       &rxrpc_waiting_client_conns);
+		}
+
+		nr_active--;
+	}
+
+	rxrpc_nr_active_client_conns = nr_active;
+	spin_unlock(&rxrpc_client_conn_cache_lock);
+	ASSERTCMP(nr_active, >=, 0);
+	_leave(" [culled]");
+}
+
+/*
+ * Discard expired client connections from the idle list. Each conn in the
+ * idle list has been exposed and holds an extra ref because of that.
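
The expiry selection used by the discard pass that follows comes down to picking one of two timeouts and doing a wraparound-safe time comparison, as time_after() does on jiffies. An isolated sketch with stand-in constants:

    #include <stdbool.h>

    #define IDLE_EXPIRY     (2 * 60 * 100)  /* jiffies-style ticks, HZ=100 */
    #define FAST_EXPIRY     (2 * 100)
    #define REAP_LIMIT      900

    /* Wraparound-safe "a is after b", mirroring the kernel's time_after() */
    static bool time_after(unsigned long a, unsigned long b)
    {
            return (long)(b - a) < 0;
    }

    static bool conn_expired(unsigned long idle_timestamp, unsigned long now,
                             unsigned int nr_conns)
    {
            /* Over the reap threshold, use the short "fast" timeout */
            unsigned long expiry = nr_conns > REAP_LIMIT ? FAST_EXPIRY
                                                         : IDLE_EXPIRY;

            return !time_after(idle_timestamp + expiry, now);
    }

    int main(void)
    {
            return conn_expired(0, IDLE_EXPIRY, 0) ? 0 : 1;
    }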
+ * + * This may be called from conn setup or from a work item so cannot be + * considered non-reentrant. + */ +static void rxrpc_discard_expired_client_conns(struct work_struct *work) +{ + struct rxrpc_connection *conn; + unsigned long expiry, conn_expires_at, now; + unsigned int nr_conns; + bool did_discard = false; + + _enter("%c", work ? 'w' : 'n'); + + if (list_empty(&rxrpc_idle_client_conns)) { + _leave(" [empty]"); + return; + } + + /* Don't double up on the discarding */ + if (!spin_trylock(&rxrpc_client_conn_discard_mutex)) { + _leave(" [already]"); + return; + } + + /* We keep an estimate of what the number of conns ought to be after + * we've discarded some so that we don't overdo the discarding. + */ + nr_conns = rxrpc_nr_client_conns; + +next: + spin_lock(&rxrpc_client_conn_cache_lock); + + if (list_empty(&rxrpc_idle_client_conns)) + goto out; + + conn = list_entry(rxrpc_idle_client_conns.next, + struct rxrpc_connection, cache_link); + ASSERT(test_bit(RXRPC_CONN_EXPOSED, &conn->flags)); + + if (!rxrpc_kill_all_client_conns) { + /* If the number of connections is over the reap limit, we + * expedite discard by reducing the expiry timeout. We must, + * however, have at least a short grace period to be able to do + * final-ACK or ABORT retransmission. + */ + expiry = rxrpc_conn_idle_client_expiry; + if (nr_conns > rxrpc_reap_client_connections) + expiry = rxrpc_conn_idle_client_fast_expiry; + + conn_expires_at = conn->idle_timestamp + expiry; + + now = READ_ONCE(jiffies); + if (time_after(conn_expires_at, now)) + goto not_yet_expired; + } + + trace_rxrpc_client(conn, -1, rxrpc_client_discard); + if (!test_and_clear_bit(RXRPC_CONN_EXPOSED, &conn->flags)) + BUG(); + conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE; + list_del_init(&conn->cache_link); + + spin_unlock(&rxrpc_client_conn_cache_lock); + + /* When we cleared the EXPOSED flag, we took on responsibility for the + * reference that that had on the usage count. We deal with that here. + * If someone re-sets the flag and re-gets the ref, that's fine. + */ + rxrpc_put_connection(conn); + did_discard = true; + nr_conns--; + goto next; + +not_yet_expired: + /* The connection at the front of the queue hasn't yet expired, so + * schedule the work item for that point if we discarded something. + * + * We don't worry if the work item is already scheduled - it can look + * after rescheduling itself at a later time. We could cancel it, but + * then things get messier. 
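
The expiry decision above shortens the grace period when the connection pool is over the reap limit. A userspace sketch of just that decision, re-modelling time_after() with the same signed arithmetic so a wrapping tick counter behaves; the HZ value and the two expiry constants are illustrative assumptions, not the kernel's tunables:

#include <stdbool.h>
#include <stdio.h>

#define HZ 100
static unsigned long normal_expiry = 2 * 60 * HZ;	/* assumption */
static unsigned long fast_expiry   = 2 * HZ;		/* assumption */

static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* wrap-safe "a is after b" */
}

static bool conn_still_alive(unsigned long idle_timestamp, unsigned long now,
			     unsigned int nr_conns, unsigned int reap_limit)
{
	unsigned long expiry =
		nr_conns > reap_limit ? fast_expiry : normal_expiry;

	return time_after(idle_timestamp + expiry, now);
}

int main(void)
{
	unsigned long now = 5000;

	printf("under limit, idle 30s: %d\n",
	       conn_still_alive(now - 30 * HZ, now, 10, 900));
	printf("over limit, idle 30s:  %d\n",
	       conn_still_alive(now - 30 * HZ, now, 1000, 900));
	return 0;
}
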
+	 */
+	_debug("not yet");
+	if (!rxrpc_kill_all_client_conns)
+		queue_delayed_work(rxrpc_workqueue,
+				   &rxrpc_client_conn_reap,
+				   conn_expires_at - now);
+
+out:
+	spin_unlock(&rxrpc_client_conn_cache_lock);
+	spin_unlock(&rxrpc_client_conn_discard_mutex);
+	_leave("");
+}
+
+/*
+ * Preemptively destroy all the client connection records rather than waiting
+ * for them to time out
+ */
+void __exit rxrpc_destroy_all_client_connections(void)
+{
+	_enter("");
+
+	spin_lock(&rxrpc_client_conn_cache_lock);
+	rxrpc_kill_all_client_conns = true;
+	spin_unlock(&rxrpc_client_conn_cache_lock);
+
+	cancel_delayed_work(&rxrpc_client_conn_reap);
+
+	if (!queue_delayed_work(rxrpc_workqueue, &rxrpc_client_conn_reap, 0))
+		_debug("destroy: queue failed");
+
+	_leave("");
+}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index cee0f35bc1cf..3f9d8d7ec632 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -15,20 +15,128 @@
 #include <linux/net.h>
 #include <linux/skbuff.h>
 #include <linux/errqueue.h>
-#include <linux/udp.h>
-#include <linux/in.h>
-#include <linux/in6.h>
-#include <linux/icmp.h>
 #include <net/sock.h>
 #include <net/af_rxrpc.h>
 #include <net/ip.h>
 #include "ar-internal.h"
 
 /*
+ * Retransmit terminal ACK or ABORT of the previous call.
+ */
+static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
+				       struct sk_buff *skb)
+{
+	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_channel *chan;
+	struct msghdr msg;
+	struct kvec iov;
+	struct {
+		struct rxrpc_wire_header whdr;
+		union {
+			struct {
+				__be32 code;
+			} abort;
+			struct {
+				struct rxrpc_ackpacket ack;
+				u8 padding[3];
+				struct rxrpc_ackinfo info;
+			};
+		};
+	} __attribute__((packed)) pkt;
+	size_t len;
+	u32 serial, mtu, call_id;
+
+	_enter("%d", conn->debug_id);
+
+	chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK];
+
+	/* If the last call got moved on whilst we were waiting to run, just
+	 * ignore this packet.
+ */ + call_id = READ_ONCE(chan->last_call); + /* Sync with __rxrpc_disconnect_call() */ + smp_rmb(); + if (call_id != sp->hdr.callNumber) + return; + + msg.msg_name = &conn->params.peer->srx.transport; + msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + pkt.whdr.epoch = htonl(sp->hdr.epoch); + pkt.whdr.cid = htonl(sp->hdr.cid); + pkt.whdr.callNumber = htonl(sp->hdr.callNumber); + pkt.whdr.seq = 0; + pkt.whdr.type = chan->last_type; + pkt.whdr.flags = conn->out_clientflag; + pkt.whdr.userStatus = 0; + pkt.whdr.securityIndex = conn->security_ix; + pkt.whdr._rsvd = 0; + pkt.whdr.serviceId = htons(chan->last_service_id); + + len = sizeof(pkt.whdr); + switch (chan->last_type) { + case RXRPC_PACKET_TYPE_ABORT: + pkt.abort.code = htonl(chan->last_abort); + len += sizeof(pkt.abort); + break; + + case RXRPC_PACKET_TYPE_ACK: + mtu = conn->params.peer->if_mtu; + mtu -= conn->params.peer->hdrsize; + pkt.ack.bufferSpace = 0; + pkt.ack.maxSkew = htons(skb->priority); + pkt.ack.firstPacket = htonl(chan->last_seq); + pkt.ack.previousPacket = htonl(chan->last_seq - 1); + pkt.ack.serial = htonl(sp->hdr.serial); + pkt.ack.reason = RXRPC_ACK_DUPLICATE; + pkt.ack.nAcks = 0; + pkt.info.rxMTU = htonl(rxrpc_rx_mtu); + pkt.info.maxMTU = htonl(mtu); + pkt.info.rwind = htonl(rxrpc_rx_window_size); + pkt.info.jumbo_max = htonl(rxrpc_rx_jumbo_max); + pkt.whdr.flags |= RXRPC_SLOW_START_OK; + len += sizeof(pkt.ack) + sizeof(pkt.info); + break; + } + + /* Resync with __rxrpc_disconnect_call() and check that the last call + * didn't get advanced whilst we were filling out the packets. + */ + smp_rmb(); + if (READ_ONCE(chan->last_call) != call_id) + return; + + iov.iov_base = &pkt; + iov.iov_len = len; + + serial = atomic_inc_return(&conn->serial); + pkt.whdr.serial = htonl(serial); + + switch (chan->last_type) { + case RXRPC_PACKET_TYPE_ABORT: + _proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort); + break; + case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0, + RXRPC_ACK_DUPLICATE, 0); + _proto("Tx ACK %%%u [re]", serial); + break; + } + + kernel_sendmsg(conn->params.local->socket, &msg, &iov, 1, len); + _leave(""); + return; +} + +/* * pass a connection-level abort onto all calls on that connection */ -static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, - u32 abort_code) +static void rxrpc_abort_calls(struct rxrpc_connection *conn, + enum rxrpc_call_completion compl, + u32 abort_code, int error) { struct rxrpc_call *call; int i; @@ -41,19 +149,15 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, call = rcu_dereference_protected( conn->channels[i].call, lockdep_is_held(&conn->channel_lock)); - write_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = state; - if (state == RXRPC_CALL_LOCALLY_ABORTED) { - call->local_abort = conn->local_abort; - set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - } else { - call->remote_abort = conn->remote_abort; - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - } - rxrpc_queue_call(call); + if (call) { + if (compl == RXRPC_CALL_LOCALLY_ABORTED) + trace_rxrpc_abort("CON", call->cid, + call->call_id, 0, + abort_code, error); + if (rxrpc_set_call_completion(call, compl, + abort_code, error)) + rxrpc_notify_socket(call); } - write_unlock_bh(&call->state_lock); } spin_unlock(&conn->channel_lock); @@ -78,17 +182,16 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, /* 
generate a connection-level abort */ spin_lock_bh(&conn->state_lock); - if (conn->state < RXRPC_CONN_REMOTELY_ABORTED) { - conn->state = RXRPC_CONN_LOCALLY_ABORTED; - conn->error = error; - spin_unlock_bh(&conn->state_lock); - } else { + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { spin_unlock_bh(&conn->state_lock); _leave(" = 0 [already dead]"); return 0; } - rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); + conn->state = RXRPC_CONN_LOCALLY_ABORTED; + spin_unlock_bh(&conn->state_lock); + + rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error); msg.msg_name = &conn->params.peer->srx.transport; msg.msg_namelen = conn->params.peer->srx.transport_len; @@ -132,17 +235,18 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, /* * mark a call as being on a now-secured channel - * - must be called with softirqs disabled + * - must be called with BH's disabled. */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { _enter("%p", call); if (call) { - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + write_lock_bh(&call->state_lock); + if (call->state == RXRPC_CALL_SERVER_SECURING) { + call->state = RXRPC_CALL_SERVER_ACCEPTING; + rxrpc_notify_socket(call); + } + write_unlock_bh(&call->state_lock); } } @@ -159,22 +263,28 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - kleave(" = -ECONNABORTED [%u]", conn->state); + _leave(" = -ECONNABORTED [%u]", conn->state); return -ECONNABORTED; } _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_conn_retransmit_call(conn, skb); + return 0; + case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) < 0) return -EPROTO; abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - abort_code); + abort_code, ECONNABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -199,14 +309,16 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; + spin_unlock(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->channel_lock))); + } else { + spin_unlock(&conn->state_lock); } - spin_unlock(&conn->state_lock); spin_unlock(&conn->channel_lock); return 0; @@ -269,7 +381,7 @@ void rxrpc_process_connection(struct work_struct *work) u32 abort_code = RX_PROTOCOL_ERROR; int ret; - _enter("{%d}", conn->debug_id); + rxrpc_see_connection(conn); if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events)) rxrpc_secure_connection(conn); @@ -277,6 +389,7 @@ void rxrpc_process_connection(struct work_struct *work) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -287,7 +400,7 @@ void 
rxrpc_process_connection(struct work_struct *work) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); break; } } @@ -304,91 +417,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [EPROTO]"); goto out; } - -/* - * put a packet up for transport-level abort - */ -void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - CHECK_SLAB_OKAY(&local->usage); - - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); -} - -/* - * reject packets through the local endpoint - */ -void rxrpc_reject_packets(struct rxrpc_local *local) -{ - union { - struct sockaddr sa; - struct sockaddr_in sin; - } sa; - struct rxrpc_skb_priv *sp; - struct rxrpc_wire_header whdr; - struct sk_buff *skb; - struct msghdr msg; - struct kvec iov[2]; - size_t size; - __be32 code; - - _enter("%d", local->debug_id); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = &code; - iov[1].iov_len = sizeof(code); - size = sizeof(whdr) + sizeof(code); - - msg.msg_name = &sa; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&sa, 0, sizeof(sa)); - sa.sa.sa_family = local->srx.transport.family; - switch (sa.sa.sa_family) { - case AF_INET: - msg.msg_namelen = sizeof(sa.sin); - break; - default: - msg.msg_namelen = 0; - break; - } - - memset(&whdr, 0, sizeof(whdr)); - whdr.type = RXRPC_PACKET_TYPE_ABORT; - - while ((skb = skb_dequeue(&local->reject_queue))) { - sp = rxrpc_skb(skb); - switch (sa.sa.sa_family) { - case AF_INET: - sa.sin.sin_port = udp_hdr(skb)->source; - sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; - code = htonl(skb->priority); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - kernel_sendmsg(local->socket, &msg, iov, 2, size); - break; - - default: - break; - } - - rxrpc_free_skb(skb); - } - - _leave(""); -} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 896d84493a05..e1e83af47866 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -1,6 +1,6 @@ -/* RxRPC virtual connection handler +/* RxRPC virtual connection handler, common bits. * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. 
* Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -15,8 +15,6 @@ #include <linux/slab.h> #include <linux/net.h> #include <linux/skbuff.h> -#include <net/sock.h> -#include <net/af_rxrpc.h> #include "ar-internal.h" /* @@ -27,9 +25,12 @@ unsigned int rxrpc_connection_expiry = 10 * 60; static void rxrpc_connection_reaper(struct work_struct *work); LIST_HEAD(rxrpc_connections); +LIST_HEAD(rxrpc_connection_proc_list); DEFINE_RWLOCK(rxrpc_connection_lock); static DECLARE_DELAYED_WORK(rxrpc_connection_reap, rxrpc_connection_reaper); +static void rxrpc_destroy_connection(struct rcu_head *); + /* * allocate a new connection */ @@ -41,21 +42,18 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) conn = kzalloc(sizeof(struct rxrpc_connection), gfp); if (conn) { + INIT_LIST_HEAD(&conn->cache_link); spin_lock_init(&conn->channel_lock); - init_waitqueue_head(&conn->channel_wq); + INIT_LIST_HEAD(&conn->waiting_calls); INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); skb_queue_head_init(&conn->rx_queue); conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); - /* We maintain an extra ref on the connection whilst it is - * on the rxrpc_connections list. - */ - atomic_set(&conn->usage, 2); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); - atomic_set(&conn->avail_chans, RXRPC_MAXCALLS); conn->size_align = 4; - conn->header_size = sizeof(struct rxrpc_wire_header); + conn->idle_timestamp = jiffies; } _leave(" = %p{%d}", conn, conn ? conn->debug_id : 0); @@ -135,6 +133,16 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, srx.transport.sin.sin_addr.s_addr) goto not_found; break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + if (peer->srx.transport.sin6.sin6_port != + srx.transport.sin6.sin6_port || + memcmp(&peer->srx.transport.sin6.sin6_addr, + &srx.transport.sin6.sin6_addr, + sizeof(struct in6_addr)) != 0) + goto not_found; + break; +#endif default: BUG(); } @@ -153,25 +161,32 @@ not_found: * terminates. The caller must hold the channel_lock and must release the * call's ref on the connection. */ -void __rxrpc_disconnect_call(struct rxrpc_call *call) +void __rxrpc_disconnect_call(struct rxrpc_connection *conn, + struct rxrpc_call *call) { - struct rxrpc_connection *conn = call->conn; - struct rxrpc_channel *chan = &conn->channels[call->channel]; + struct rxrpc_channel *chan = + &conn->channels[call->cid & RXRPC_CHANNELMASK]; - _enter("%d,%d", conn->debug_id, call->channel); + _enter("%d,%x", conn->debug_id, call->cid); if (rcu_access_pointer(chan->call) == call) { /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ - chan->last_result = call->local_abort; + chan->last_service_id = call->service_id; + if (call->abort_code) { + chan->last_abort = call->abort_code; + chan->last_type = RXRPC_PACKET_TYPE_ABORT; + } else { + chan->last_seq = call->rx_hard_ack; + chan->last_type = RXRPC_PACKET_TYPE_ACK; + } + /* Sync with rxrpc_conn_retransmit(). 
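
The smp_wmb() just below pairs with the smp_rmb() in rxrpc_conn_retransmit_call(): the channel's saved final state must be visible before last_call advances, and the reader re-checks last_call after copying. A userspace sketch of the same ordering, using C11 release/acquire in place of the kernel barriers; the struct and field names are simplified stand-ins:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct channel {
	uint32_t last_abort;		/* plain data, ordered by the atomics */
	_Atomic uint32_t last_call;
};

static void disconnect_call(struct channel *chan, uint32_t call_id,
			    uint32_t abort_code)
{
	chan->last_abort = abort_code;
	/* release pairs with the acquire loads in retransmit() */
	atomic_store_explicit(&chan->last_call, call_id, memory_order_release);
}

static bool retransmit(struct channel *chan, uint32_t pkt_call_id,
		       uint32_t *abort_code)
{
	if (atomic_load_explicit(&chan->last_call, memory_order_acquire)
	    != pkt_call_id)
		return false;	/* channel moved on; ignore the packet */
	*abort_code = chan->last_abort;
	/* re-check: the channel may have advanced while we copied */
	return atomic_load_explicit(&chan->last_call, memory_order_acquire)
	       == pkt_call_id;
}

int main(void)
{
	struct channel chan = { 0, 0 };
	uint32_t code;

	disconnect_call(&chan, 7, 17);
	return retransmit(&chan, 7, &code) ? 0 : 1;
}
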
*/ smp_wmb(); chan->last_call = chan->call_id; chan->call_id = chan->call_counter; rcu_assign_pointer(chan->call, NULL); - atomic_inc(&conn->avail_chans); - wake_up(&conn->channel_wq); } _leave(""); @@ -185,34 +200,122 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; + spin_lock_bh(&conn->params.peer->lock); + hlist_del_init(&call->error_link); + spin_unlock_bh(&conn->params.peer->lock); + + if (rxrpc_is_client_call(call)) + return rxrpc_disconnect_client_call(call); + spin_lock(&conn->channel_lock); - __rxrpc_disconnect_call(call); + __rxrpc_disconnect_call(conn, call); spin_unlock(&conn->channel_lock); call->conn = NULL; + conn->idle_timestamp = jiffies; rxrpc_put_connection(conn); } /* - * release a virtual connection + * Kill off a connection. */ -void rxrpc_put_connection(struct rxrpc_connection *conn) +void rxrpc_kill_connection(struct rxrpc_connection *conn) { - if (!conn) - return; + ASSERT(!rcu_access_pointer(conn->channels[0].call) && + !rcu_access_pointer(conn->channels[1].call) && + !rcu_access_pointer(conn->channels[2].call) && + !rcu_access_pointer(conn->channels[3].call)); + ASSERT(list_empty(&conn->cache_link)); - _enter("%p{u=%d,d=%d}", - conn, atomic_read(&conn->usage), conn->debug_id); + write_lock(&rxrpc_connection_lock); + list_del_init(&conn->proc_link); + write_unlock(&rxrpc_connection_lock); - ASSERTCMP(atomic_read(&conn->usage), >, 1); + /* Drain the Rx queue. Note that even though we've unpublished, an + * incoming packet could still be being added to our Rx queue, so we + * will need to drain it again in the RCU cleanup handler. + */ + rxrpc_purge_queue(&conn->rx_queue); - conn->put_time = ktime_get_seconds(); - if (atomic_dec_return(&conn->usage) == 1) { - _debug("zombie"); - rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); + /* Leave final destruction to RCU. The connection processor work item + * must carry a ref on the connection to prevent us getting here whilst + * it is queued or running. + */ + call_rcu(&conn->rcu, rxrpc_destroy_connection); +} + +/* + * Queue a connection's work processor, getting a ref to pass to the work + * queue. + */ +bool rxrpc_queue_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n == 0) + return false; + if (rxrpc_queue_work(&conn->processor)) + trace_rxrpc_conn(conn, rxrpc_conn_queued, n + 1, here); + else + rxrpc_put_connection(conn); + return true; +} + +/* + * Note the re-emergence of a connection. + */ +void rxrpc_see_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + if (conn) { + int n = atomic_read(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_seen, n, here); + } +} + +/* + * Get a ref on a connection. + */ +void rxrpc_get_connection(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(&conn->usage); + + trace_rxrpc_conn(conn, rxrpc_conn_got, n, here); +} + +/* + * Try to get a ref on a connection. 
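
rxrpc_queue_conn() and rxrpc_get_connection_maybe() above both rely on "take a ref only if the object is still live": increment usage unless it is already zero, because a zero count means teardown has begun and the object must not be resurrected. A compilable sketch where a C11 compare-exchange loop stands in for the kernel's __atomic_add_unless():

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_maybe(atomic_int *usage)
{
	int n = atomic_load(usage);

	do {
		if (n == 0)
			return false;	/* already dying; don't resurrect */
	} while (!atomic_compare_exchange_weak(usage, &n, n + 1));
	return true;
}

int main(void)
{
	atomic_int live = 1, dead = 0;

	printf("live: %d (usage now %d)\n", get_maybe(&live),
	       atomic_load(&live));
	printf("dead: %d\n", get_maybe(&dead));
	return 0;
}
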
+ */ +struct rxrpc_connection * +rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + + if (conn) { + int n = __atomic_add_unless(&conn->usage, 1, 0); + if (n > 0) + trace_rxrpc_conn(conn, rxrpc_conn_got, n + 1, here); + else + conn = NULL; } + return conn; +} - _leave(""); +/* + * Release a service connection + */ +void rxrpc_put_service_conn(struct rxrpc_connection *conn) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_dec_return(&conn->usage); + trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here); + ASSERTCMP(n, >=, 0); + if (n == 0) + rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); } /* @@ -242,19 +345,19 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) } /* - * reap dead connections + * reap dead service connections */ static void rxrpc_connection_reaper(struct work_struct *work) { struct rxrpc_connection *conn, *_p; - unsigned long reap_older_than, earliest, put_time, now; + unsigned long reap_older_than, earliest, idle_timestamp, now; LIST_HEAD(graveyard); _enter(""); - now = ktime_get_seconds(); - reap_older_than = now - rxrpc_connection_expiry; + now = jiffies; + reap_older_than = now - rxrpc_connection_expiry * HZ; earliest = ULONG_MAX; write_lock(&rxrpc_connection_lock); @@ -262,11 +365,17 @@ static void rxrpc_connection_reaper(struct work_struct *work) ASSERTCMP(atomic_read(&conn->usage), >, 0); if (likely(atomic_read(&conn->usage) > 1)) continue; + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) + continue; + + idle_timestamp = READ_ONCE(conn->idle_timestamp); + _debug("reap CONN %d { u=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->usage), + (long)reap_older_than - (long)idle_timestamp); - put_time = READ_ONCE(conn->put_time); - if (time_after(put_time, reap_older_than)) { - if (time_before(put_time, earliest)) - earliest = put_time; + if (time_after(idle_timestamp, reap_older_than)) { + if (time_before(idle_timestamp, earliest)) + earliest = idle_timestamp; continue; } @@ -277,7 +386,7 @@ static void rxrpc_connection_reaper(struct work_struct *work) continue; if (rxrpc_conn_is_client(conn)) - rxrpc_unpublish_client_conn(conn); + BUG(); else rxrpc_unpublish_service_conn(conn); @@ -287,9 +396,9 @@ static void rxrpc_connection_reaper(struct work_struct *work) if (earliest != ULONG_MAX) { _debug("reschedule reaper %ld", (long) earliest - now); - ASSERTCMP(earliest, >, now); + ASSERT(time_after(earliest, now)); rxrpc_queue_delayed_work(&rxrpc_connection_reap, - (earliest - now) * HZ); + earliest - now); } while (!list_empty(&graveyard)) { @@ -298,16 +407,15 @@ static void rxrpc_connection_reaper(struct work_struct *work) list_del_init(&conn->link); ASSERTCMP(atomic_read(&conn->usage), ==, 0); - skb_queue_purge(&conn->rx_queue); - call_rcu(&conn->rcu, rxrpc_destroy_connection); + rxrpc_kill_connection(conn); } _leave(""); } /* - * preemptively destroy all the connection records rather than waiting for them - * to time out + * preemptively destroy all the service connection records rather than + * waiting for them to time out */ void __exit rxrpc_destroy_all_connections(void) { @@ -316,6 +424,8 @@ void __exit rxrpc_destroy_all_connections(void) _enter(""); + rxrpc_destroy_all_client_connections(); + rxrpc_connection_expiry = 0; cancel_delayed_work(&rxrpc_connection_reap); rxrpc_queue_delayed_work(&rxrpc_connection_reap, 0); @@ -330,6 +440,8 @@ void __exit rxrpc_destroy_all_connections(void) write_unlock(&rxrpc_connection_lock); BUG_ON(leak); + 
ASSERT(list_empty(&rxrpc_connection_proc_list)); + /* Make sure the local and peer records pinned by any dying connections * are released. */ diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index fd9027ccba8f..eef551f40dc2 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -65,9 +65,8 @@ done: * Insert a service connection into a peer's tree, thereby making it a target * for incoming packets. */ -static struct rxrpc_connection * -rxrpc_publish_service_conn(struct rxrpc_peer *peer, - struct rxrpc_connection *conn) +static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, + struct rxrpc_connection *conn) { struct rxrpc_connection *cursor = NULL; struct rxrpc_conn_proto k = conn->proto; @@ -96,7 +95,7 @@ conn_published: set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); write_sequnlock_bh(&peer->service_conn_lock); _leave(" = %d [new]", conn->debug_id); - return conn; + return; found_extant_conn: if (atomic_read(&cursor->usage) == 0) @@ -119,100 +118,58 @@ replace_old_connection: } /* - * get a record of an incoming connection + * Preallocate a service connection. The connection is placed on the proc and + * reap lists so that we don't have to get the lock from BH context. */ -struct rxrpc_connection *rxrpc_incoming_connection(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb) +struct rxrpc_connection *rxrpc_prealloc_service_connection(gfp_t gfp) { - struct rxrpc_connection *conn; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - const char *new = "old"; - - _enter(""); + struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) { - _debug("no peer"); - return ERR_PTR(-EBUSY); - } + if (conn) { + /* We maintain an extra ref on the connection whilst it is on + * the rxrpc_connections list. + */ + conn->state = RXRPC_CONN_SERVICE_PREALLOC; + atomic_set(&conn->usage, 2); - ASSERT(sp->hdr.flags & RXRPC_CLIENT_INITIATED); - - rcu_read_lock(); - peer = rxrpc_lookup_peer_rcu(local, srx); - if (peer) { - conn = rxrpc_find_service_conn_rcu(peer, skb); - if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto security_mismatch_rcu; - if (rxrpc_get_connection_maybe(conn)) - goto found_extant_connection_rcu; - - /* The conn has expired but we can't remove it without - * the appropriate lock, so we attempt to replace it - * when we have a new candidate. - */ - } + write_lock(&rxrpc_connection_lock); + list_add_tail(&conn->link, &rxrpc_connections); + list_add_tail(&conn->proc_link, &rxrpc_connection_proc_list); + write_unlock(&rxrpc_connection_lock); - if (!rxrpc_get_peer_maybe(peer)) - peer = NULL; + trace_rxrpc_conn(conn, rxrpc_conn_new_service, + atomic_read(&conn->usage), + __builtin_return_address(0)); } - rcu_read_unlock(); - if (!peer) { - peer = rxrpc_lookup_peer(local, srx, GFP_NOIO); - if (!peer) - goto enomem; - } + return conn; +} - /* We don't have a matching record yet. */ - conn = rxrpc_alloc_connection(GFP_NOIO); - if (!conn) - goto enomem_peer; +/* + * Set up an incoming connection. This is called in BH context with the RCU + * read lock held. 
+ */ +void rxrpc_new_incoming_connection(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + _enter(""); conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - conn->params.local = local; - conn->params.peer = peer; conn->params.service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; - conn->state = RXRPC_CONN_SERVICE; - if (conn->params.service_id) + if (conn->security_ix) conn->state = RXRPC_CONN_SERVICE_UNSECURED; - - rxrpc_get_local(local); - - write_lock(&rxrpc_connection_lock); - list_add_tail(&conn->link, &rxrpc_connections); - write_unlock(&rxrpc_connection_lock); + else + conn->state = RXRPC_CONN_SERVICE; /* Make the connection a target for incoming packets. */ - rxrpc_publish_service_conn(peer, conn); - - new = "new"; - -success: - _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->proto.cid); - _leave(" = %p {u=%d}", conn, atomic_read(&conn->usage)); - return conn; - -found_extant_connection_rcu: - rcu_read_unlock(); - goto success; - -security_mismatch_rcu: - rcu_read_unlock(); - _leave(" = -EKEYREJECTED"); - return ERR_PTR(-EKEYREJECTED); + rxrpc_publish_service_conn(conn->params.peer, conn); -enomem_peer: - rxrpc_put_peer(peer); -enomem: - _leave(" = -ENOMEM"); - return ERR_PTR(-ENOMEM); + _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 70bb77818dea..3ad9f75031e3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1,6 +1,6 @@ /* RxRPC packet reception * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or @@ -27,550 +27,920 @@ #include <net/net_namespace.h> #include "ar-internal.h" +static void rxrpc_proto_abort(const char *why, + struct rxrpc_call *call, rxrpc_seq_t seq) +{ + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) { + set_bit(RXRPC_CALL_EV_ABORT, &call->events); + rxrpc_queue_call(call); + } +} + /* - * queue a packet for recvmsg to pass to userspace - * - the caller must hold a lock on call->lock - * - must not be called with interrupts disabled (sk_filter() disables BH's) - * - eats the packet whether successful or not - * - there must be just one reference to the packet, which the caller passes to - * this function + * Do TCP-style congestion management [RFC 5681]. 
*/ -int rxrpc_queue_rcv_skb(struct rxrpc_call *call, struct sk_buff *skb, - bool force, bool terminal) +static void rxrpc_congestion_management(struct rxrpc_call *call, + struct sk_buff *skb, + struct rxrpc_ack_summary *summary, + rxrpc_serial_t acked_serial) { - struct rxrpc_skb_priv *sp; - struct rxrpc_sock *rx = call->socket; - struct sock *sk; - int ret; + enum rxrpc_congest_change change = rxrpc_cong_no_change; + unsigned int cumulative_acks = call->cong_cumul_acks; + unsigned int cwnd = call->cong_cwnd; + bool resend = false; + + summary->flight_size = + (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + + if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { + summary->retrans_timeo = true; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = 1; + if (cwnd >= call->cong_ssthresh && + call->cong_mode == RXRPC_CALL_SLOW_START) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + cumulative_acks = 0; + } + } - _enter(",,%d,%d", force, terminal); + cumulative_acks += summary->nr_new_acks; + cumulative_acks += summary->nr_rot_new_acks; + if (cumulative_acks > 255) + cumulative_acks = 255; + + summary->mode = call->cong_mode; + summary->cwnd = call->cong_cwnd; + summary->ssthresh = call->cong_ssthresh; + summary->cumulative_acks = cumulative_acks; + summary->dup_acks = call->cong_dup_acks; + + switch (call->cong_mode) { + case RXRPC_CALL_SLOW_START: + if (summary->nr_nacks > 0) + goto packet_loss_detected; + if (summary->cumulative_acks > 0) + cwnd += 1; + if (cwnd >= call->cong_ssthresh) { + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_tstamp = skb->tstamp; + } + goto out; - ASSERT(!irqs_disabled()); + case RXRPC_CALL_CONGEST_AVOIDANCE: + if (summary->nr_nacks > 0) + goto packet_loss_detected; - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, call); - - /* if we've already posted the terminal message for a call, then we - * don't post any more */ - if (test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags)) { - _debug("already terminated"); - ASSERTCMP(call->state, >=, RXRPC_CALL_COMPLETE); - rxrpc_free_skb(skb); - return 0; - } - - sk = &rx->sk; - - if (!force) { - /* cast skb->rcvbuf to unsigned... It's pointless, but - * reduces number of warnings when compiling with -W - * --ANK */ -// ret = -ENOBUFS; -// if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= -// (unsigned int) sk->sk_rcvbuf) -// goto out; - - ret = sk_filter(sk, skb); - if (ret < 0) + /* We analyse the number of packets that get ACK'd per RTT + * period and increase the window if we managed to fill it. 
+ */ + if (call->peer->rtt_usage == 0) goto out; - } + if (ktime_before(skb->tstamp, + ktime_add_ns(call->cong_tstamp, + call->peer->rtt))) + goto out_no_clear_ca; + change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = skb->tstamp; + if (cumulative_acks >= cwnd) + cwnd++; + goto out; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (!test_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags) && - !test_bit(RXRPC_CALL_RELEASED, &call->flags) && - call->socket->sk.sk_state != RXRPC_CLOSE) { - skb->destructor = rxrpc_packet_destructor; - skb->dev = NULL; - skb->sk = sk; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); + case RXRPC_CALL_PACKET_LOSS: + if (summary->nr_nacks == 0) + goto resume_normality; - if (terminal) { - _debug("<<<< TERMINAL MESSAGE >>>>"); - set_bit(RXRPC_CALL_TERMINAL_MSG, &call->flags); + if (summary->new_low_nack) { + change = rxrpc_cong_new_low_nack; + call->cong_dup_acks = 1; + if (call->cong_extra > 1) + call->cong_extra = 1; + goto send_extra_data; } - /* allow interception by a kernel service */ - if (rx->interceptor) { - rx->interceptor(sk, call->user_call_ID, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); - } else { - _net("post skb %p", skb); - __skb_queue_tail(&sk->sk_receive_queue, skb); - spin_unlock_bh(&sk->sk_receive_queue.lock); + call->cong_dup_acks++; + if (call->cong_dup_acks < 3) + goto send_extra_data; + + change = rxrpc_cong_begin_retransmission; + call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; + call->cong_ssthresh = max_t(unsigned int, + summary->flight_size / 2, 2); + cwnd = call->cong_ssthresh + 3; + call->cong_extra = 0; + call->cong_dup_acks = 0; + resend = true; + goto out; - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + case RXRPC_CALL_FAST_RETRANSMIT: + if (!summary->new_low_nack) { + if (summary->nr_new_acks == 0) + cwnd += 1; + call->cong_dup_acks++; + if (call->cong_dup_acks == 2) { + change = rxrpc_cong_retransmit_again; + call->cong_dup_acks = 0; + resend = true; + } + } else { + change = rxrpc_cong_progress; + cwnd = call->cong_ssthresh; + if (summary->nr_nacks == 0) + goto resume_normality; } - skb = NULL; - } else { - spin_unlock_bh(&sk->sk_receive_queue.lock); + goto out; + + default: + BUG(); + goto out; } - ret = 0; +resume_normality: + change = rxrpc_cong_cleared_nacks; + call->cong_dup_acks = 0; + call->cong_extra = 0; + call->cong_tstamp = skb->tstamp; + if (cwnd < call->cong_ssthresh) + call->cong_mode = RXRPC_CALL_SLOW_START; + else + call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; out: - rxrpc_free_skb(skb); + cumulative_acks = 0; +out_no_clear_ca: + if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) + cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_cwnd = cwnd; + call->cong_cumul_acks = cumulative_acks; + trace_rxrpc_congest(call, summary, acked_serial, change); + if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + return; - _leave(" = %d", ret); - return ret; +packet_loss_detected: + change = rxrpc_cong_saw_nack; + call->cong_mode = RXRPC_CALL_PACKET_LOSS; + call->cong_dup_acks = 0; + goto send_extra_data; + +send_extra_data: + /* Send some previously unsent DATA if we have some to advance the ACK + * state. 
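
A condensed userspace model of the RFC 5681-style mode machine above: slow start grows cwnd per ACK until ssthresh, a NACK drops into a loss state, and three duplicate ACKs trigger fast retransmit with ssthresh = max(flight/2, 2). This is a deliberate simplification; the real code also gates window growth on RTT, probes with extra data, and picks slow start or avoidance on recovery depending on cwnd.

#include <stdio.h>

enum mode { SLOW_START, AVOIDANCE, PACKET_LOSS, FAST_RETRANSMIT };

struct cong {
	enum mode mode;
	unsigned cwnd, ssthresh, dup_acks, flight;
};

static unsigned max2(unsigned a, unsigned b) { return a > b ? a : b; }

static void on_ack(struct cong *c, unsigned nr_nacks)
{
	switch (c->mode) {
	case SLOW_START:
		if (nr_nacks) { c->mode = PACKET_LOSS; c->dup_acks = 0; break; }
		if (++c->cwnd >= c->ssthresh)
			c->mode = AVOIDANCE;
		break;
	case AVOIDANCE:
		if (nr_nacks) { c->mode = PACKET_LOSS; c->dup_acks = 0; }
		break;
	case PACKET_LOSS:
		if (!nr_nacks) { c->mode = SLOW_START; break; }
		if (++c->dup_acks >= 3) {
			c->ssthresh = max2(c->flight / 2, 2);
			c->cwnd = c->ssthresh + 3;
			c->mode = FAST_RETRANSMIT;	/* and resend */
		}
		break;
	case FAST_RETRANSMIT:
		if (!nr_nacks) { c->mode = SLOW_START; c->cwnd = c->ssthresh; }
		break;
	}
}

int main(void)
{
	struct cong c = { SLOW_START, 1, 8, 0, 10 };

	for (int i = 0; i < 5; i++)
		on_ack(&c, i >= 1);	/* one clean ACK, then NACKs */
	printf("mode=%d cwnd=%u ssthresh=%u\n", c.mode, c.cwnd, c.ssthresh);
	return 0;
}
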
+ */ + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST || + summary->nr_acks != call->tx_top - call->tx_hard_ack) { + call->cong_extra++; + wake_up(&call->waitq); + } + goto out_no_clear_ca; } /* - * process a DATA packet, posting the packet to the appropriate queue - * - eats the packet if successful + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. */ -static int rxrpc_fast_process_data(struct rxrpc_call *call, - struct sk_buff *skb, u32 seq) +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, + int skew) { - struct rxrpc_skb_priv *sp; - bool terminal; - int ret, ackbit, ack; - u32 serial; - u8 flags; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + ktime_t now = skb->tstamp; - _enter("{%u,%u},,{%u}", call->rx_data_post, call->rx_first_oos, seq); + if (call->peer->rtt_usage < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + true, true, + rxrpc_propose_ack_ping_for_params); +} - sp = rxrpc_skb(skb); - ASSERTCMP(sp->call, ==, NULL); - flags = sp->hdr.flags; - serial = sp->hdr.serial; +/* + * Apply a hard ACK by advancing the Tx window. + */ +static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, + struct rxrpc_ack_summary *summary) +{ + struct sk_buff *skb, *list = NULL; + int ix; + u8 annotation; + + if (call->acks_lowest_nak == call->tx_hard_ack) { + call->acks_lowest_nak = to; + } else if (before_eq(call->acks_lowest_nak, to)) { + summary->new_low_nack = true; + call->acks_lowest_nak = to; + } spin_lock(&call->lock); - if (call->state > RXRPC_CALL_COMPLETE) - goto discard; + while (before(call->tx_hard_ack, to)) { + call->tx_hard_ack++; + ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + annotation = call->rxtx_annotations[ix]; + rxrpc_see_skb(skb, rxrpc_skb_tx_rotated); + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + skb->next = list; + list = skb; + + if (annotation & RXRPC_TX_ANNO_LAST) + set_bit(RXRPC_CALL_TX_LAST, &call->flags); + if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) + summary->nr_rot_new_acks++; + } - ASSERTCMP(call->rx_data_expect, >=, call->rx_data_post); - ASSERTCMP(call->rx_data_post, >=, call->rx_data_recv); - ASSERTCMP(call->rx_data_recv, >=, call->rx_data_eaten); + spin_unlock(&call->lock); - if (seq < call->rx_data_post) { - _debug("dup #%u [-%u]", seq, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - ret = -ENOBUFS; - goto discard_and_ack; - } + trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ? + rxrpc_transmit_rotate_last : + rxrpc_transmit_rotate)); + wake_up(&call->waitq); - /* we may already have the packet in the out of sequence queue */ - ackbit = seq - (call->rx_data_eaten + 1); - ASSERTCMP(ackbit, >=, 0); - if (__test_and_set_bit(ackbit, call->ackr_window)) { - _debug("dup oos #%u [%u,%u]", - seq, call->rx_data_eaten, call->rx_data_post); - ack = RXRPC_ACK_DUPLICATE; - goto discard_and_ack; + while (list) { + skb = list; + list = skb->next; + skb->next = NULL; + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); } +} - if (seq >= call->ackr_win_top) { - _debug("exceed #%u [%u]", seq, call->ackr_win_top); - __clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_EXCEEDS_WINDOW; - goto discard_and_ack; - } +/* + * End the transmission phase of a call. + * + * This occurs when we get an ACKALL packet, the first DATA packet of a reply, + * or a final ACK packet. 
+ */ +static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, + const char *abort_why) +{ - if (seq == call->rx_data_expect) { - clear_bit(RXRPC_CALL_EXPECT_OOS, &call->flags); - call->rx_data_expect++; - } else if (seq > call->rx_data_expect) { - _debug("oos #%u [%u]", seq, call->rx_data_expect); - call->rx_data_expect = seq + 1; - if (test_and_set_bit(RXRPC_CALL_EXPECT_OOS, &call->flags)) { - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - goto enqueue_and_ack; - } - goto enqueue_packet; - } + ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - if (seq != call->rx_data_post) { - _debug("ahead #%u [%u]", seq, call->rx_data_post); - goto enqueue_packet; - } + write_lock(&call->state_lock); - if (test_bit(RXRPC_CALL_RCVD_LAST, &call->flags)) - goto protocol_error; + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + if (reply_begun) + call->state = RXRPC_CALL_CLIENT_RECV_REPLY; + else + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; - /* if the packet need security things doing to it, then it goes down - * the slow path */ - if (call->conn->security_ix) - goto enqueue_packet; + case RXRPC_CALL_SERVER_AWAIT_ACK: + __rxrpc_call_completed(call); + rxrpc_notify_socket(call); + break; - sp->call = call; - rxrpc_get_call(call); - atomic_inc(&call->skb_count); - terminal = ((flags & RXRPC_LAST_PACKET) && - !(flags & RXRPC_CLIENT_INITIATED)); - ret = rxrpc_queue_rcv_skb(call, skb, false, terminal); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOBUFS) { - __clear_bit(ackbit, call->ackr_window); - ack = RXRPC_ACK_NOSPACE; - goto discard_and_ack; - } - goto out; + default: + goto bad_state; } - skb = NULL; - sp = NULL; - - _debug("post #%u", seq); - ASSERTCMP(call->rx_data_post, ==, seq); - call->rx_data_post++; + write_unlock(&call->state_lock); + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, 0, false, true, + rxrpc_propose_ack_client_tx_end); + trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); + } else { + trace_rxrpc_transmit(call, rxrpc_transmit_end); + } + _leave(" = ok"); + return true; + +bad_state: + write_unlock(&call->state_lock); + kdebug("end_tx %s", rxrpc_call_states[call->state]); + rxrpc_proto_abort(abort_why, call, call->tx_top); + return false; +} - if (flags & RXRPC_LAST_PACKET) - set_bit(RXRPC_CALL_RCVD_LAST, &call->flags); +/* + * Begin the reply reception phase of a call. 
+ */ +static bool rxrpc_receiving_reply(struct rxrpc_call *call) +{ + struct rxrpc_ack_summary summary = { 0 }; + rxrpc_seq_t top = READ_ONCE(call->tx_top); + + if (call->ackr_reason) { + spin_lock_bh(&call->lock); + call->ackr_reason = 0; + call->resend_at = call->expire_at; + call->ack_at = call->expire_at; + spin_unlock_bh(&call->lock); + rxrpc_set_timer(call, rxrpc_timer_init_for_reply, + ktime_get_real()); + } - /* if we've reached an out of sequence packet then we need to drain - * that queue into the socket Rx queue now */ - if (call->rx_data_post == call->rx_first_oos) { - _debug("drain rx oos now"); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_rotate_tx_window(call, top, &summary); + if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_proto_abort("TXL", call, top); + return false; } + if (!rxrpc_end_tx_phase(call, true, "ETD")) + return false; + call->tx_phase = false; + return true; +} - spin_unlock(&call->lock); - atomic_inc(&call->ackr_not_idle); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false); - _leave(" = 0 [posted]"); - return 0; +/* + * Scan a jumbo packet to validate its structure and to work out how many + * subpackets it contains. + * + * A jumbo packet is a collection of consecutive packets glued together with + * little headers between that indicate how to change the initial header for + * each subpacket. + * + * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but + * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any + * size. + */ +static bool rxrpc_validate_jumbo(struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len; + int nr_jumbo = 1; + u8 flags = sp->hdr.flags; -protocol_error: - ret = -EBADMSG; -out: - spin_unlock(&call->lock); - _leave(" = %d", ret); - return ret; + do { + nr_jumbo++; + if (len - offset < RXRPC_JUMBO_SUBPKTLEN) + goto protocol_error; + if (flags & RXRPC_LAST_PACKET) + goto protocol_error; + offset += RXRPC_JUMBO_DATALEN; + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + goto protocol_error; + offset += sizeof(struct rxrpc_jumbo_header); + } while (flags & RXRPC_JUMBO_PACKET); -discard_and_ack: - _debug("discard and ACK packet %p", skb); - __rxrpc_propose_ACK(call, ack, serial, true); -discard: - spin_unlock(&call->lock); - rxrpc_free_skb(skb); - _leave(" = 0 [discarded]"); - return 0; + sp->nr_jumbo = nr_jumbo; + return true; -enqueue_and_ack: - __rxrpc_propose_ACK(call, ack, serial, true); -enqueue_packet: - _net("defer skb %p", skb); - spin_unlock(&call->lock); - skb_queue_tail(&call->rx_queue, skb); - atomic_inc(&call->ackr_not_idle); - read_lock(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) - rxrpc_queue_call(call); - read_unlock(&call->state_lock); - _leave(" = 0 [queued]"); - return 0; +protocol_error: + return false; } /* - * assume an implicit ACKALL of the transmission phase of a client socket upon - * reception of the first reply packet + * Handle reception of a duplicate packet. + * + * We have to take care to avoid an attack here whereby we're given a series of + * jumbograms, each with a sequence number one before the preceding one and + * filled up to maximum UDP size. 
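
The jumbo scan in rxrpc_validate_jumbo() above walks fixed-size slices glued together, each followed by a small header whose flags say whether another subpacket follows, and rejects LAST anywhere but the end. A userspace sketch of that walk; the sizes and flag values here are made-up stand-ins for RXRPC_JUMBO_DATALEN and the jumbo header, chosen only so the demo fits in a small buffer:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JUMBO_DATALEN	16	/* assumption; the real value is larger */
#define JUMBO_FLAG	0x20	/* assumed "another subpacket follows" bit */
#define LAST_FLAG	0x01

static bool validate_jumbo(const uint8_t *pkt, size_t len, uint8_t flags,
			   int *nr)
{
	size_t offset = 0;

	*nr = 1;
	while (flags & JUMBO_FLAG) {
		(*nr)++;
		if (len - offset < JUMBO_DATALEN + 1)
			return false;		/* truncated subpacket */
		if (flags & LAST_FLAG)
			return false;		/* LAST may only end it */
		offset += JUMBO_DATALEN;
		flags = pkt[offset];		/* next mini-header's flags */
		offset += 1;			/* assumed header size */
	}
	return true;
}

int main(void)
{
	uint8_t pkt[64];
	int nr;

	memset(pkt, 0, sizeof(pkt));
	pkt[JUMBO_DATALEN] = JUMBO_FLAG;	/* 3 subpackets total */
	printf("valid=%d nr=%d\n",
	       validate_jumbo(pkt, sizeof(pkt), JUMBO_FLAG, &nr), nr);
	return 0;
}
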
If they never send us the first packet in + * the sequence, they can cause us to have to hold on to around 2MiB of kernel + * space until the call times out. + * + * We limit the space usage by only accepting three duplicate jumbo packets per + * call. After that, we tell the other side we're no longer accepting jumbos + * (that information is encoded in the ACK packet). */ -static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial) +static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, + u8 annotation, bool *_jumbo_bad) { - write_lock_bh(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_AWAIT_REPLY: - call->state = RXRPC_CALL_CLIENT_RECV_REPLY; - call->acks_latest = serial; - - _debug("implicit ACKALL %%%u", call->acks_latest); - set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events); - write_unlock_bh(&call->state_lock); - - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_RESEND, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - } - break; + /* Discard normal packets that are duplicates. */ + if (annotation == 0) + return; - default: - write_unlock_bh(&call->state_lock); - break; + /* Skip jumbo subpackets that are duplicates. When we've had three or + * more partially duplicate jumbo packets, we refuse to take any more + * jumbos for this call. + */ + if (!*_jumbo_bad) { + call->nr_jumbo_bad++; + *_jumbo_bad = true; } } /* - * post an incoming packet to the nominated call to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process a DATA packet, adding the packet to the Rx ring. */ -void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) +static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 hi_serial, abort_code; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int ix; + rxrpc_serial_t serial = sp->hdr.serial, ack_serial = 0; + rxrpc_seq_t seq = sp->hdr.seq, hard_ack; + bool immediate_ack = false, jumbo_bad = false, queued; + u16 len; + u8 ack = 0, flags, annotation = 0; - _enter("%p,%p", call, skb); + _enter("{%u,%u},{%u,%u}", + call->rx_hard_ack, call->rx_top, skb->len, seq); - ASSERT(!irqs_disabled()); + _proto("Rx DATA %%%u { #%u f=%02x }", + sp->hdr.serial, seq, sp->hdr.flags); -#if 0 // INJECT RX ERROR - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - static int skip = 0; - if (++skip == 3) { - printk("DROPPED 3RD PACKET!!!!!!!!!!!!!\n"); - skip = 0; - goto free_packet; - } + if (call->state >= RXRPC_CALL_COMPLETE) + return; + + /* Received data implicitly ACKs all of the request packets we sent + * when we're acting as a client. 
+ */ + if ((call->state == RXRPC_CALL_CLIENT_SEND_REQUEST || + call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && + !rxrpc_receiving_reply(call)) + return; + + call->ackr_prev_seq = seq; + + hard_ack = READ_ONCE(call->rx_hard_ack); + if (after(seq, hard_ack + call->rx_winsize)) { + ack = RXRPC_ACK_EXCEEDS_WINDOW; + ack_serial = serial; + goto ack; } -#endif - /* track the latest serial number on this connection for ACK packet - * information */ - hi_serial = atomic_read(&call->conn->hi_serial); - while (sp->hdr.serial > hi_serial) - hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial, - sp->hdr.serial); + flags = sp->hdr.flags; + if (flags & RXRPC_JUMBO_PACKET) { + if (call->nr_jumbo_bad > 3) { + ack = RXRPC_ACK_NOSPACE; + ack_serial = serial; + goto ack; + } + annotation = 1; + } - /* request ACK generation for any ACK or DATA packet that requests - * it */ - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - _proto("ACK Requested on %%%u", sp->hdr.serial); - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false); +next_subpacket: + queued = false; + ix = seq & RXRPC_RXTX_BUFF_MASK; + len = skb->len; + if (flags & RXRPC_JUMBO_PACKET) + len = RXRPC_JUMBO_DATALEN; + + if (flags & RXRPC_LAST_PACKET) { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) + return rxrpc_proto_abort("LSN", call, seq); + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) + return rxrpc_proto_abort("LSA", call, seq); } - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_ABORT: - _debug("abort"); + if (before_eq(seq, hard_ack)) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; + goto skip; + } - if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0) - goto protocol_error; + if (flags & RXRPC_REQUEST_ACK && !ack) { + ack = RXRPC_ACK_REQUESTED; + ack_serial = serial; + } - abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - - write_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_REMOTELY_ABORTED; - call->remote_abort = abort_code; - set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); - rxrpc_queue_call(call); + if (call->rxtx_buffer[ix]) { + rxrpc_input_dup_data(call, seq, annotation, &jumbo_bad); + if (ack != RXRPC_ACK_DUPLICATE) { + ack = RXRPC_ACK_DUPLICATE; + ack_serial = serial; } - goto free_packet_unlock; + immediate_ack = true; + goto skip; + } - case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). 
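
The barrier comment above describes a classic publish order: fill the ring slot (annotation, then buffer pointer), then advance rx_top so that a reader who observes the new top also observes the slot contents. A userspace sketch with C11 release/acquire standing in for smp_wmb()/smp_store_release(); ring size and types are stand-ins, and the wrap-around comparison is omitted for brevity:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 8
#define RING_MASK (RING_SIZE - 1)

struct rx_ring {
	const char *buffer[RING_SIZE];
	uint8_t annotations[RING_SIZE];
	_Atomic uint32_t top;
};

static void producer_insert(struct rx_ring *r, uint32_t seq, const char *skb)
{
	unsigned ix = seq & RING_MASK;

	r->annotations[ix] = 1;
	r->buffer[ix] = skb;		/* slot filled before top moves */
	atomic_store_explicit(&r->top, seq, memory_order_release);
}

static const char *consumer_peek(struct rx_ring *r, uint32_t seq)
{
	if (atomic_load_explicit(&r->top, memory_order_acquire) < seq)
		return NULL;		/* not published yet */
	return r->buffer[seq & RING_MASK];
}

int main(void)
{
	struct rx_ring r = { { 0 }, { 0 }, 0 };

	producer_insert(&r, 1, "DATA #1");
	printf("%s\n", consumer_peek(&r, 1));
	return 0;
}
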
+ */ + rxrpc_get_skb(skb, rxrpc_skb_rx_got); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) { + smp_store_release(&call->rx_top, seq); + } else if (before(seq, call->rx_top)) { + /* Send an immediate ACK if we fill in a hole */ + if (!ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } + immediate_ack = true; + } + if (flags & RXRPC_LAST_PACKET) { + set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } + queued = true; - if (rxrpc_conn_is_service(call->conn)) - goto protocol_error; + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + ack = RXRPC_ACK_OUT_OF_SEQUENCE; + ack_serial = serial; + } + call->rx_expect_next = seq + 1; + } - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_SERVER_BUSY; - set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events); - rxrpc_queue_call(call); - case RXRPC_CALL_SERVER_BUSY: - goto free_packet_unlock; - default: - goto protocol_error_locked; +skip: + offset += len; + if (flags & RXRPC_JUMBO_PACKET) { + if (skb_copy_bits(skb, offset, &flags, 1) < 0) + return rxrpc_proto_abort("XJF", call, seq); + offset += sizeof(struct rxrpc_jumbo_header); + seq++; + serial++; + annotation++; + if (flags & RXRPC_JUMBO_PACKET) + annotation |= RXRPC_RX_ANNO_JLAST; + if (after(seq, hard_ack + call->rx_winsize)) { + ack = RXRPC_ACK_EXCEEDS_WINDOW; + ack_serial = serial; + if (!jumbo_bad) { + call->nr_jumbo_bad++; + jumbo_bad = true; + } + goto ack; } - default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); - goto protocol_error; + _proto("Rx DATA Jumbo %%%u", serial); + goto next_subpacket; + } - case RXRPC_PACKET_TYPE_DATA: - _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); + if (queued && flags & RXRPC_LAST_PACKET && !ack) { + ack = RXRPC_ACK_DELAY; + ack_serial = serial; + } - if (sp->hdr.seq == 0) - goto protocol_error; +ack: + if (ack) + rxrpc_propose_ACK(call, ack, skew, ack_serial, + immediate_ack, true, + rxrpc_propose_ack_input_data); - call->ackr_prev_seq = sp->hdr.seq; + if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) + rxrpc_notify_socket(call); + _leave(" [queued]"); +} - /* received data implicitly ACKs all of the request packets we - * sent when we're acting as a client */ - if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - rxrpc_assume_implicit_ackall(call, sp->hdr.serial); +/* + * Process a requested ACK. + */ +static void rxrpc_input_requested_ack(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + ktime_t sent_at; + int ix; + + for (ix = 0; ix < RXRPC_RXTX_BUFF_SIZE; ix++) { + skb = call->rxtx_buffer[ix]; + if (!skb) + continue; + + sp = rxrpc_skb(skb); + if (sp->hdr.serial != orig_serial) + continue; + smp_rmb(); + sent_at = skb->tstamp; + goto found; + } + return; - switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) { - case 0: - skb = NULL; - goto done; +found: + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_requested_ack, + orig_serial, ack_serial, sent_at, resp_time); +} - default: - BUG(); +/* + * Process a ping response. 
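
The requested-ACK path above yields an RTT sample whenever an ACK names the serial of a packet we sent with "request ACK" set: the sample is simply the ACK's arrival time minus that packet's transmit timestamp. A small sketch of that lookup; the timestamp plumbing and types are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

struct sent_pkt {
	uint32_t serial;
	uint64_t sent_at_ns;
};

static int64_t rtt_from_requested_ack(const struct sent_pkt *ring, int n,
				      uint32_t acked_serial, uint64_t resp_ns)
{
	for (int i = 0; i < n; i++)
		if (ring[i].serial == acked_serial)
			return (int64_t)(resp_ns - ring[i].sent_at_ns);
	return -1;	/* packet already rotated out; no sample */
}

int main(void)
{
	struct sent_pkt ring[] = { { 101, 1000000 }, { 102, 1500000 } };

	printf("rtt = %lld ns\n",
	       (long long)rtt_from_requested_ack(ring, 2, 102, 1800000));
	return 0;
}
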
+ */ +static void rxrpc_input_ping_response(struct rxrpc_call *call, + ktime_t resp_time, + rxrpc_serial_t orig_serial, + rxrpc_serial_t ack_serial) +{ + rxrpc_serial_t ping_serial; + ktime_t ping_time; - /* data packet received beyond the last packet */ - case -EBADMSG: - goto protocol_error; - } + ping_time = call->ackr_ping_time; + smp_rmb(); + ping_serial = call->ackr_ping; - case RXRPC_PACKET_TYPE_ACKALL: - case RXRPC_PACKET_TYPE_ACK: - /* ACK processing is done in process context */ - read_lock_bh(&call->state_lock); - if (call->state < RXRPC_CALL_DEAD) { - skb_queue_tail(&call->rx_queue, skb); - rxrpc_queue_call(call); - skb = NULL; - } - read_unlock_bh(&call->state_lock); - goto free_packet; + if (!test_bit(RXRPC_CALL_PINGING, &call->flags) || + before(orig_serial, ping_serial)) + return; + clear_bit(RXRPC_CALL_PINGING, &call->flags); + if (after(orig_serial, ping_serial)) + return; + + rxrpc_peer_add_rtt(call, rxrpc_rtt_rx_ping_response, + orig_serial, ack_serial, ping_time, resp_time); +} + +/* + * Process the extra information that may be appended to an ACK packet + */ +static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, + struct rxrpc_ackinfo *ackinfo) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_peer *peer; + unsigned int mtu; + u32 rwind = ntohl(ackinfo->rwind); + + _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", + sp->hdr.serial, + ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), + rwind, ntohl(ackinfo->jumbo_max)); + + if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) + rwind = RXRPC_RXTX_BUFF_SIZE - 1; + call->tx_winsize = rwind; + if (call->cong_ssthresh > rwind) + call->cong_ssthresh = rwind; + + mtu = min(ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU)); + + peer = call->peer; + if (mtu < peer->maxdata) { + spin_lock_bh(&peer->lock); + peer->maxdata = mtu; + peer->mtu = mtu + peer->hdrsize; + spin_unlock_bh(&peer->lock); + _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); } +} -protocol_error: - _debug("protocol error"); - write_lock_bh(&call->state_lock); -protocol_error_locked: - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); +/* + * Process individual soft ACKs. + * + * Each ACK in the array corresponds to one packet and can be either an ACK or + * a NAK. If we get find an explicitly NAK'd packet we resend immediately; + * packets that lie beyond the end of the ACK list are scheduled for resend by + * the timer on the basis that the peer might just not have processed them at + * the time the ACK was sent. 
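
Each byte of the soft-ACK array marks one in-flight packet as ACK or NACK, and the per-slot annotations remember what we already knew so that only new ACKs and NACKs are counted. A compilable sketch of that walk; the annotation values, wire encoding and ring mask are stand-ins for the kernel's constants:

#include <stdio.h>

enum anno { ANNO_UNACK, ANNO_ACK, ANNO_NAK };
enum wire { WIRE_NACK = 0, WIRE_ACK = 1 };	/* assumed encoding */

struct summary { int nr_acks, nr_new_acks, nr_nacks, nr_new_nacks; };

static void input_soft_acks(unsigned char *anno, const unsigned char *acks,
			    unsigned first_seq, int nr, struct summary *s)
{
	for (unsigned seq = first_seq; nr > 0; nr--, seq++) {
		unsigned ix = seq & 7;		/* ring mask stand-in */

		if (acks[seq - first_seq] == WIRE_ACK) {
			s->nr_acks++;
			if (anno[ix] != ANNO_ACK) {
				s->nr_new_acks++;
				anno[ix] = ANNO_ACK;
			}
		} else {
			s->nr_nacks++;
			if (anno[ix] != ANNO_NAK) {
				s->nr_new_nacks++;
				anno[ix] = ANNO_NAK;
			}
		}
	}
}

int main(void)
{
	unsigned char anno[8] = { 0 };
	const unsigned char acks[] = { 1, 1, 0, 1 };
	struct summary s = { 0 };

	input_soft_acks(anno, acks, 4, 4, &s);
	printf("acks=%d new=%d nacks=%d new_nacks=%d\n",
	       s.nr_acks, s.nr_new_acks, s.nr_nacks, s.nr_new_nacks);
	return 0;
}
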
+ */ +static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, + rxrpc_seq_t seq, int nr_acks, + struct rxrpc_ack_summary *summary) +{ + int ix; + u8 annotation, anno_type; + + for (; nr_acks > 0; nr_acks--, seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + annotation = call->rxtx_annotations[ix]; + anno_type = annotation & RXRPC_TX_ANNO_MASK; + annotation &= ~RXRPC_TX_ANNO_MASK; + switch (*acks++) { + case RXRPC_ACK_TYPE_ACK: + summary->nr_acks++; + if (anno_type == RXRPC_TX_ANNO_ACK) + continue; + summary->nr_new_acks++; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_ACK | annotation; + break; + case RXRPC_ACK_TYPE_NACK: + if (!summary->nr_nacks && + call->acks_lowest_nak != seq) { + call->acks_lowest_nak = seq; + summary->new_low_nack = true; + } + summary->nr_nacks++; + if (anno_type == RXRPC_TX_ANNO_NAK) + continue; + summary->nr_new_nacks++; + if (anno_type == RXRPC_TX_ANNO_RETRANS) + continue; + call->rxtx_annotations[ix] = + RXRPC_TX_ANNO_NAK | annotation; + break; + default: + return rxrpc_proto_abort("SFT", call, 0); + } } -free_packet_unlock: - write_unlock_bh(&call->state_lock); -free_packet: - rxrpc_free_skb(skb); -done: - _leave(""); } /* - * split up a jumbo data packet + * Process an ACK packet. + * + * ack.firstPacket is the sequence number of the first soft-ACK'd/NAK'd packet + * in the ACK array. Anything before that is hard-ACK'd and may be discarded. + * + * A hard-ACK means that a packet has been processed and may be discarded; a + * soft-ACK means that the packet may be discarded and retransmission + * requested. A phase is complete when all packets are hard-ACK'd. */ -static void rxrpc_process_jumbo_packet(struct rxrpc_call *call, - struct sk_buff *jumbo) +static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, + u16 skew) { - struct rxrpc_jumbo_header jhdr; - struct rxrpc_skb_priv *sp; - struct sk_buff *part; + struct rxrpc_ack_summary summary = { 0 }; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + union { + struct rxrpc_ackpacket ack; + struct rxrpc_ackinfo info; + u8 acks[RXRPC_MAXACKS]; + } buf; + rxrpc_serial_t acked_serial; + rxrpc_seq_t first_soft_ack, hard_ack; + int nr_acks, offset, ioffset; + + _enter(""); + + offset = sizeof(struct rxrpc_wire_header); + if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) { + _debug("extraction failure"); + return rxrpc_proto_abort("XAK", call, 0); + } + offset += sizeof(buf.ack); + + acked_serial = ntohl(buf.ack.serial); + first_soft_ack = ntohl(buf.ack.firstPacket); + hard_ack = first_soft_ack - 1; + nr_acks = buf.ack.nAcks; + summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? 
+ buf.ack.reason : RXRPC_ACK__INVALID); + + trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); + + _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + sp->hdr.serial, + ntohs(buf.ack.maxSkew), + first_soft_ack, + ntohl(buf.ack.previousPacket), + acked_serial, + rxrpc_ack_names[summary.ack_reason], + buf.ack.nAcks); + + if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_input_ping_response(call, skb->tstamp, acked_serial, + sp->hdr.serial); + if (buf.ack.reason == RXRPC_ACK_REQUESTED) + rxrpc_input_requested_ack(call, skb->tstamp, acked_serial, + sp->hdr.serial); + + if (buf.ack.reason == RXRPC_ACK_PING) { + _proto("Rx ACK %%%u PING Request", sp->hdr.serial); + rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ping); + } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, + skew, sp->hdr.serial, true, true, + rxrpc_propose_ack_respond_to_ack); + } - _enter(",{%u,%u}", jumbo->data_len, jumbo->len); + ioffset = offset + nr_acks + 3; + if (skb->len >= ioffset + sizeof(buf.info)) { + if (skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); + rxrpc_input_ackinfo(call, skb, &buf.info); + } - sp = rxrpc_skb(jumbo); + if (first_soft_ack == 0) + return rxrpc_proto_abort("AK0", call, 0); - do { - sp->hdr.flags &= ~RXRPC_JUMBO_PACKET; - - /* make a clone to represent the first subpacket in what's left - * of the jumbo packet */ - part = skb_clone(jumbo, GFP_ATOMIC); - if (!part) { - /* simply ditch the tail in the event of ENOMEM */ - pskb_trim(jumbo, RXRPC_JUMBO_DATALEN); - break; - } - rxrpc_new_skb(part); + /* Ignore ACKs unless we are or have just been transmitting. */ + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + case RXRPC_CALL_SERVER_SEND_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + break; + default: + return; + } - pskb_trim(part, RXRPC_JUMBO_DATALEN); + /* Discard any out-of-order or duplicate ACKs. 
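The duplicate-ACK discard below is done with before_eq() on 32-bit serial numbers; rxrpc's before()/after() helpers follow the TCP idiom of comparing via signed subtraction so the test survives counter wrap. A stand-alone sketch (the helper definitions are assumed, not copied from the tree):

#include <stdint.h>
#include <stdio.h>

/* Assumed shape of the helpers: unsigned subtraction wraps mod 2^32,
 * and the sign of the result says which value is "earlier". */
static int before(uint32_t a, uint32_t b)    { return (int32_t)(a - b) < 0; }
static int before_eq(uint32_t a, uint32_t b) { return (int32_t)(a - b) <= 0; }

int main(void)
{
	printf("%d\n", before(5, 10));		/* 1 */
	printf("%d\n", before(0xfffffffeu, 2));	/* 1: still "before" across wrap */
	printf("%d\n", before_eq(7, 7));	/* 1: a duplicate, so discarded */
	return 0;
}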
*/ + if (before_eq(sp->hdr.serial, call->acks_latest)) { + _debug("discard ACK %d <= %d", + sp->hdr.serial, call->acks_latest); + return; + } + call->acks_latest_ts = skb->tstamp; + call->acks_latest = sp->hdr.serial; + + if (before(hard_ack, call->tx_hard_ack) || + after(hard_ack, call->tx_top)) + return rxrpc_proto_abort("AKW", call, 0); + if (nr_acks > call->tx_top - hard_ack) + return rxrpc_proto_abort("AKN", call, 0); + + if (after(hard_ack, call->tx_hard_ack)) + rxrpc_rotate_tx_window(call, hard_ack, &summary); + + if (nr_acks > 0) { + if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) + return rxrpc_proto_abort("XSA", call, 0); + rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, + &summary); + } - if (!pskb_pull(jumbo, RXRPC_JUMBO_DATALEN)) - goto protocol_error; + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) { + rxrpc_end_tx_phase(call, false, "ETA"); + return; + } - if (skb_copy_bits(jumbo, 0, &jhdr, sizeof(jhdr)) < 0) - goto protocol_error; - if (!pskb_pull(jumbo, sizeof(jhdr))) - BUG(); + if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & + RXRPC_TX_ANNO_LAST && + summary.nr_acks == call->tx_top - hard_ack) + rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + false, true, + rxrpc_propose_ack_ping_for_lost_reply); - sp->hdr.seq += 1; - sp->hdr.serial += 1; - sp->hdr.flags = jhdr.flags; - sp->hdr._rsvd = ntohs(jhdr._rsvd); + return rxrpc_congestion_management(call, skb, &summary, acked_serial); +} - _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1); +/* + * Process an ACKALL packet. + */ +static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_ack_summary summary = { 0 }; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_fast_process_packet(call, part); - part = NULL; + _proto("Rx ACKALL %%%u", sp->hdr.serial); - } while (sp->hdr.flags & RXRPC_JUMBO_PACKET); + rxrpc_rotate_tx_window(call, call->tx_top, &summary); + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + rxrpc_end_tx_phase(call, false, "ETL"); +} - rxrpc_fast_process_packet(call, jumbo); - _leave(""); - return; +/* + * Process an ABORT packet. + */ +static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + __be32 wtmp; + u32 abort_code = RX_CALL_DEAD; -protocol_error: - _debug("protocol error"); - rxrpc_free_skb(part); - rxrpc_free_skb(jumbo); - write_lock_bh(&call->state_lock); - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = RX_PROTOCOL_ERROR; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); - } - write_unlock_bh(&call->state_lock); - _leave(""); + _enter(""); + + if (skb->len >= 4 && + skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) >= 0) + abort_code = ntohl(wtmp); + + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); + + if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + abort_code, ECONNABORTED)) + rxrpc_notify_socket(call); } /* - * post an incoming packet to the appropriate call/socket to deal with - * - must get rid of the sk_buff, either by freeing it or by queuing it + * Process an incoming call packet. 
*/ -static void rxrpc_post_packet_to_call(struct rxrpc_call *call, - struct sk_buff *skb) +static void rxrpc_input_call_packet(struct rxrpc_call *call, + struct sk_buff *skb, u16 skew) { - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); _enter("%p,%p", call, skb); - sp = rxrpc_skb(skb); + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + rxrpc_input_data(call, skb, skew); + break; - _debug("extant call [%d]", call->state); + case RXRPC_PACKET_TYPE_ACK: + rxrpc_input_ack(call, skb, skew); + break; + + case RXRPC_PACKET_TYPE_BUSY: + _proto("Rx BUSY %%%u", sp->hdr.serial); + + /* Just ignore BUSY packets from the server; the retry and + * lifespan timers will take care of business. BUSY packets + * from the client don't make sense. + */ + break; + + case RXRPC_PACKET_TYPE_ABORT: + rxrpc_input_abort(call, skb); + break; + + case RXRPC_PACKET_TYPE_ACKALL: + rxrpc_input_ackall(call, skb); + break; - read_lock(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_LOCALLY_ABORTED: - if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - rxrpc_queue_call(call); - goto free_unlock; - } - case RXRPC_CALL_REMOTELY_ABORTED: - case RXRPC_CALL_NETWORK_ERROR: - case RXRPC_CALL_DEAD: - goto dead_call; - case RXRPC_CALL_COMPLETE: - case RXRPC_CALL_CLIENT_FINAL_ACK: - /* complete server call */ - if (rxrpc_conn_is_service(call->conn)) - goto dead_call; - /* resend last packet of a completed call */ - _debug("final ack again"); - rxrpc_get_call(call); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - rxrpc_queue_call(call); - goto free_unlock; default: + _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); break; } - read_unlock(&call->state_lock); - rxrpc_get_call(call); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_process_jumbo_packet(call, skb); - else - rxrpc_fast_process_packet(call, skb); - - rxrpc_put_call(call); - goto done; - -dead_call: - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - skb->priority = RX_CALL_DEAD; - rxrpc_reject_packet(call->conn->params.local, skb); - goto unlock; - } -free_unlock: - rxrpc_free_skb(skb); -unlock: - read_unlock(&call->state_lock); -done: _leave(""); } /* * post connection-level events to the connection - * - this includes challenges, responses and some aborts + * - this includes challenges, responses, some aborts and call terminal packet + * retransmission. */ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, struct sk_buff *skb) @@ -595,6 +965,17 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, } /* + * put a packet up for transport-level abort + */ +static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + CHECK_SLAB_OKAY(&local->usage); + + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); +} + +/* * Extract the wire header from a packet and translate the byte order. */ static noinline @@ -605,8 +986,6 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return -EBADMSG; - if (!pskb_pull(skb, sizeof(whdr))) - BUG(); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); @@ -631,19 +1010,22 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) * shut down and the local endpoint from going away, thus sk_user_data will not * be cleared until this function returns. 
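The rewritten receive path below includes an optional loss-injection test: when CONFIG_AF_RXRPC_INJECT_LOSS is enabled, a static counter throws away every eighth packet via (lose++ & 7) == 7. The counter pattern in isolation:

#include <stdio.h>

/* Deterministic 1-in-8 drop: the low three bits of a running counter
 * select which packets to discard, as in the INJECT_LOSS code below. */
static int should_lose(void)
{
	static int lose;
	return (lose++ & 7) == 7;
}

int main(void)
{
	int i;

	for (i = 0; i < 16; i++)
		if (should_lose())
			printf("packet %d dropped\n", i);	/* 7 and 15 */
	return 0;
}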
*/ -void rxrpc_data_ready(struct sock *sk) +void rxrpc_data_ready(struct sock *udp_sk) { struct rxrpc_connection *conn; + struct rxrpc_channel *chan; + struct rxrpc_call *call; struct rxrpc_skb_priv *sp; - struct rxrpc_local *local = sk->sk_user_data; + struct rxrpc_local *local = udp_sk->sk_user_data; struct sk_buff *skb; - int ret; + unsigned int channel; + int ret, skew; - _enter("%p", sk); + _enter("%p", udp_sk); ASSERT(!irqs_disabled()); - skb = skb_recv_datagram(sk, 0, 1, &ret); + skb = skb_recv_datagram(udp_sk, 0, 1, &ret); if (!skb) { if (ret == -EAGAIN) return; @@ -651,13 +1033,13 @@ void rxrpc_data_ready(struct sock *sk) return; } - rxrpc_new_skb(skb); + rxrpc_new_skb(skb, rxrpc_skb_rx_received); _net("recv skb %p", skb); /* we'll probably need to checksum it (didn't call sock_recvmsg) */ if (skb_checksum_complete(skb)) { - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; @@ -671,13 +1053,21 @@ void rxrpc_data_ready(struct sock *sk) skb_orphan(skb); sp = rxrpc_skb(skb); - _net("Rx UDP packet from %08x:%04hu", - ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source)); - /* dig out the RxRPC connection details */ if (rxrpc_extract_header(sp, skb) < 0) goto bad_message; + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); + rxrpc_lose_skb(skb, rxrpc_skb_rx_lost); + return; + } + } + + trace_rxrpc_rx_packet(sp); + _net("Rx RxRPC %s ep=%x call=%x:%x", sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient", sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber); @@ -688,70 +1078,125 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } - if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: rxrpc_post_packet_to_local(local, skb); goto out; - } - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) - goto bad_message; + case RXRPC_PACKET_TYPE_BUSY: + if (sp->hdr.flags & RXRPC_CLIENT_INITIATED) + goto discard; + + case RXRPC_PACKET_TYPE_DATA: + if (sp->hdr.callNumber == 0) + goto bad_message; + if (sp->hdr.flags & RXRPC_JUMBO_PACKET && + !rxrpc_validate_jumbo(skb)) + goto bad_message; + break; + } rcu_read_lock(); conn = rxrpc_find_connection_rcu(local, skb); - if (!conn) - goto cant_route_call; + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; + + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } + + /* Note the serial number skew here */ + skew = (int)sp->hdr.serial - (int)conn->hi_serial; + if (skew >= 0) { + if (skew > 0) + conn->hi_serial = sp->hdr.serial; + } else { + skew = -skew; + skew = min(skew, 65535); + } - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - } else { /* Call-bound packets are routed by connection channel. 
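Two small calculations drive the call routing that follows. The channel index is the low bits of the connection ID (the mask is assumed here to be 3, i.e. four call channels per connection), and the skew is how far this packet's serial lags the highest serial seen, clamped to fit the 16-bit maxSkew field. A sketch of both:

#include <stdint.h>
#include <stdio.h>

#define CHANNELMASK 3	/* assumed: four call channels per connection */

int main(void)
{
	uint32_t cid = 0x12345678 | 2;		/* low bits pick the channel */
	uint32_t hi_serial = 1000, serial = 997;
	int skew;

	printf("channel %u\n", cid & CHANNELMASK);	/* 2 */

	skew = (int)serial - (int)hi_serial;
	if (skew >= 0) {
		if (skew > 0)
			hi_serial = serial;	/* newest serial seen so far */
	} else {
		skew = -skew;
		if (skew > 65535)
			skew = 65535;		/* clamp for 16-bit maxSkew */
	}
	printf("skew %d\n", skew);		/* 3 */
	return 0;
}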
*/ - unsigned int channel = sp->hdr.cid & RXRPC_CHANNELMASK; - struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_call *call = rcu_dereference(chan->call); + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; + + /* Ignore really old calls */ + if (sp->hdr.callNumber < chan->last_call) + goto discard_unlock; + + if (sp->hdr.callNumber == chan->last_call) { + /* For the previous service call, if completed successfully, we + * discard all further packets. + */ + if (rxrpc_conn_is_service(conn) && + (chan->last_type == RXRPC_PACKET_TYPE_ACK || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) + goto discard_unlock; + + /* But otherwise we need to retransmit the final packet from + * data cached in the connection record. + */ + rxrpc_post_packet_to_conn(conn, skb); + goto out_unlock; + } - if (!call || atomic_read(&call->usage) == 0) - goto cant_route_call; + call = rcu_dereference(chan->call); + } else { + skew = 0; + call = NULL; + } - rxrpc_post_packet_to_call(call, skb); + if (!call || atomic_read(&call->usage) == 0) { + if (!(sp->hdr.type & RXRPC_CLIENT_INITIATED) || + sp->hdr.callNumber == 0 || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + goto bad_message_unlock; + if (sp->hdr.seq != 1) + goto discard_unlock; + call = rxrpc_new_incoming_call(local, conn, skb); + if (!call) { + rcu_read_unlock(); + goto reject_packet; + } + rxrpc_send_ping(call, skb, skew); } + rxrpc_input_call_packet(call, skb, skew); + goto discard_unlock; + +discard_unlock: rcu_read_unlock(); +discard: + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); out: + trace_rxrpc_rx_done(0, 0); return; -cant_route_call: +out_unlock: rcu_read_unlock(); + goto out; - _debug("can't route call"); - if (sp->hdr.flags & RXRPC_CLIENT_INITIATED && - sp->hdr.type == RXRPC_PACKET_TYPE_DATA) { - if (sp->hdr.seq == 1) { - _debug("first packet"); - skb_queue_tail(&local->accept_queue, skb); - rxrpc_queue_work(&local->processor); - _leave(" [incoming]"); - return; - } - skb->priority = RX_INVALID_OPERATION; - } else { - skb->priority = RX_CALL_DEAD; - } - - if (sp->hdr.type != RXRPC_PACKET_TYPE_ABORT) { - _debug("reject type %d",sp->hdr.type); - rxrpc_reject_packet(local, skb); - } else { - rxrpc_free_skb(skb); - } - _leave(" [no call]"); - return; +wrong_security: + rcu_read_unlock(); + trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + skb->priority = RXKADINCONSISTENCY; + goto post_abort; +bad_message_unlock: + rcu_read_unlock(); bad_message: + trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); skb->priority = RX_PROTOCOL_ERROR; +post_abort: + skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; +reject_packet: + trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); _leave(" [badmsg]"); } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index c21ad213b337..7d4375e557e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -23,31 +23,36 @@ static int none_prime_packet_security(struct rxrpc_connection *conn) } static int none_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) + struct sk_buff *skb, + size_t data_size, + void *sechdr) { return 0; } -static int none_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq, u16 expected_cksum) { return 0; } +static void 
none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ +} + static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; } static int none_verify_response(struct rxrpc_connection *conn, - struct sk_buff *skb, - u32 *_abort_code) + struct sk_buff *skb, + u32 *_abort_code) { *_abort_code = RX_PROTOCOL_ERROR; return -EPROTO; @@ -78,6 +83,7 @@ const struct rxrpc_security rxrpc_no_security = { .prime_packet_security = none_prime_packet_security, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, + .locate_data = none_locate_data, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 31a3f86ef2f6..540d3955c1bc 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -15,8 +15,6 @@ #include <linux/net.h> #include <linux/skbuff.h> #include <linux/slab.h> -#include <linux/udp.h> -#include <linux/ip.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <generated/utsrelease.h> @@ -33,7 +31,7 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, { struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct sockaddr_in sin; + struct sockaddr_rxrpc srx; struct msghdr msg; struct kvec iov[2]; size_t len; @@ -41,12 +39,11 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, _enter(""); - sin.sin_family = AF_INET; - sin.sin_port = udp_hdr(skb)->source; - sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) + return; - msg.msg_name = &sin; - msg.msg_namelen = sizeof(sin); + msg.msg_name = &srx.transport; + msg.msg_namelen = srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -93,11 +90,13 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, 0, &v, 1) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &v, 1) < 0) return; _proto("Rx VERSION { %02x }", v); if (v == 0) @@ -109,7 +108,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); } _leave(""); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a753796fbe8f..ff4864d550b8 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -58,6 +58,17 @@ static long rxrpc_local_cmp_key(const struct rxrpc_local *local, memcmp(&local->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + /* If the choice of UDP6 port is left up to the transport, then + * the endpoint record doesn't match. 
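The IPv6 branch of this comparator (continued just below) chains its two keys with the GNU a ?: b extension: the port difference decides the ordering unless it is zero, in which case the address bytes decide. The idiom on its own (compile with GCC or Clang; the field sizes are illustrative):

#include <stdio.h>
#include <string.h>

struct endpoint {
	unsigned short port;
	unsigned char  addr[16];
};

/* GNU "a ?: b" yields a when a is non-zero, else b: ports are compared
 * first, addresses only on a tie. */
static long cmp_endpoint(const struct endpoint *x, const struct endpoint *y)
{
	return (long)(x->port - y->port) ?:
	       memcmp(x->addr, y->addr, sizeof(x->addr));
}

int main(void)
{
	struct endpoint a = { 7000, { 1 } }, b = { 7001, { 1 } };

	printf("%ld\n", cmp_endpoint(&a, &b));	/* negative: a sorts first */
	return 0;
}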
+		 */
+		return ((u16 __force)local->srx.transport.sin6.sin6_port -
+			(u16 __force)srx->transport.sin6.sin6_port) ?:
+			memcmp(&local->srx.transport.sin6.sin6_addr,
+			       &srx->transport.sin6.sin6_addr,
+			       sizeof(struct in6_addr));
+#endif
 	default:
 		BUG();
 	}
@@ -75,9 +86,7 @@ static struct rxrpc_local *rxrpc_alloc_local(const struct sockaddr_rxrpc *srx)
 		atomic_set(&local->usage, 1);
 		INIT_LIST_HEAD(&local->link);
 		INIT_WORK(&local->processor, rxrpc_local_processor);
-		INIT_LIST_HEAD(&local->services);
 		init_rwsem(&local->defrag_sem);
-		skb_queue_head_init(&local->accept_queue);
 		skb_queue_head_init(&local->reject_queue);
 		skb_queue_head_init(&local->event_queue);
 		local->client_conns = RB_ROOT;
@@ -101,11 +110,12 @@ static int rxrpc_open_socket(struct rxrpc_local *local)
 	struct sock *sock;
 	int ret, opt;
 
-	_enter("%p{%d}", local, local->srx.transport_type);
+	_enter("%p{%d,%d}",
+	       local, local->srx.transport_type, local->srx.transport.family);
 
 	/* create a socket to represent the local endpoint */
-	ret = sock_create_kern(&init_net, PF_INET, local->srx.transport_type,
-			       IPPROTO_UDP, &local->socket);
+	ret = sock_create_kern(&init_net, local->srx.transport.family,
+			       local->srx.transport_type, 0, &local->socket);
 	if (ret < 0) {
 		_leave(" = %d [socket]", ret);
 		return ret;
@@ -170,18 +180,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
 	long diff;
 	int ret;
 
-	if (srx->transport.family == AF_INET) {
-		_enter("{%d,%u,%pI4+%hu}",
-		       srx->transport_type,
-		       srx->transport.family,
-		       &srx->transport.sin.sin_addr,
-		       ntohs(srx->transport.sin.sin_port));
-	} else {
-		_enter("{%d,%u}",
-		       srx->transport_type,
-		       srx->transport.family);
-		return ERR_PTR(-EAFNOSUPPORT);
-	}
+	_enter("{%d,%d,%pISp}",
+	       srx->transport_type, srx->transport.family, &srx->transport);
 
 	mutex_lock(&rxrpc_local_mutex);
@@ -234,13 +234,8 @@ struct rxrpc_local *rxrpc_lookup_local(const struct sockaddr_rxrpc *srx)
 found:
 	mutex_unlock(&rxrpc_local_mutex);
 
-	_net("LOCAL %s %d {%d,%u,%pI4+%hu}",
-	     age,
-	     local->debug_id,
-	     local->srx.transport_type,
-	     local->srx.transport.family,
-	     &local->srx.transport.sin.sin_addr,
-	     ntohs(local->srx.transport.sin.sin_port));
+	_net("LOCAL %s %d {%pISp}",
+	     age, local->debug_id, &local->srx.transport);
 
 	_leave(" = %p", local);
 	return local;
@@ -296,7 +291,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
 	mutex_unlock(&rxrpc_local_mutex);
 
 	ASSERT(RB_EMPTY_ROOT(&local->client_conns));
-	ASSERT(list_empty(&local->services));
+	ASSERT(!local->service);
 
 	if (socket) {
 		local->socket = NULL;
@@ -308,7 +303,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local)
 	/* At this point, there should be no more packets coming in to the
 	 * local endpoint.
 	 */
-	rxrpc_purge_queue(&local->accept_queue);
 	rxrpc_purge_queue(&local->reject_queue);
 	rxrpc_purge_queue(&local->event_queue);
@@ -332,11 +326,6 @@ static void rxrpc_local_processor(struct work_struct *work)
 	if (atomic_read(&local->usage) == 0)
 		return rxrpc_local_destroyer(local);
 
-	if (!skb_queue_empty(&local->accept_queue)) {
-		rxrpc_accept_incoming_calls(local);
-		again = true;
-	}
-
 	if (!skb_queue_empty(&local->reject_queue)) {
 		rxrpc_reject_packets(local);
 		again = true;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index bdc5e42fe600..9d1c721bc4e8 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -21,28 +21,33 @@
 unsigned int rxrpc_max_backlog __read_mostly = 10;
 
 /*
+ * Maximum lifetime of a call (in ms).
+ */
+unsigned int rxrpc_max_call_lifetime = 60 * 1000;
+
+/*
  * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in jiffies).
+ * packet with RXRPC_REQUEST_ACK set (in ms).
  */
 unsigned int rxrpc_requested_ack_delay = 1;
 
 /*
- * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
+ * How long to wait before scheduling an ACK with subtype DELAY (in ms).
  *
  * We use this when we've received new data packets.  If those packets aren't
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned int rxrpc_soft_ack_delay = 1 * HZ;
+unsigned int rxrpc_soft_ack_delay = 1 * 1000;
 
 /*
- * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
+ * How long to wait before scheduling an ACK with subtype IDLE (in ms).
  *
  * We use this when we've consumed some previously soft-ACK'd packets when
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK to let the other end know that it can free up its Tx buffer space.
  */
-unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
+unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
@@ -50,7 +55,10 @@ unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
  * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
  * packets.
  */
-unsigned int rxrpc_rx_window_size = 32;
+unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE;
+#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE
+#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE
+#endif
 
 /*
  * Maximum Rx MTU size.  This indicates to the sender the size of jumbo packet
@@ -64,6 +72,11 @@
  */
 unsigned int rxrpc_rx_jumbo_max = 4;
 
+/*
+ * Time till packet resend (in milliseconds).
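The #if/#error pair added above turns a sizing mistake into a build failure: the advertised receive window has to fit inside the Rx/Tx ring. The same guard works for any pair of compile-time constants that must stay consistent; a stand-alone illustration with made-up names:

#include <stdio.h>

#define RING_SIZE      64	/* ring slots, a power of two */
#define INIT_RX_WINDOW 63	/* window we advertise */

/* Fail at compile time, not at runtime, if the window cannot fit. */
#if (RING_SIZE - 1) < INIT_RX_WINDOW
#error Need to reduce INIT_RX_WINDOW
#endif

int main(void)
{
	printf("window %d fits a ring of %d\n", INIT_RX_WINDOW, RING_SIZE);
	return 0;
}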
+ */
+unsigned int rxrpc_resend_timeout = 4 * 1000;
+
 const char *const rxrpc_pkts[] = {
 	"?00",
 	"DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG",
@@ -75,21 +88,152 @@
 const s8 rxrpc_ack_priority[] = {
 	[RXRPC_ACK_DELAY]		= 1,
 	[RXRPC_ACK_REQUESTED]		= 2,
 	[RXRPC_ACK_IDLE]		= 3,
-	[RXRPC_ACK_PING_RESPONSE]	= 4,
-	[RXRPC_ACK_DUPLICATE]		= 5,
-	[RXRPC_ACK_OUT_OF_SEQUENCE]	= 6,
-	[RXRPC_ACK_EXCEEDS_WINDOW]	= 7,
-	[RXRPC_ACK_NOSPACE]		= 8,
-};
-
-const char *rxrpc_acks(u8 reason)
-{
-	static const char *const str[] = {
-		"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
-		"IDL", "-?-"
-	};
-
-	if (reason >= ARRAY_SIZE(str))
-		reason = ARRAY_SIZE(str) - 1;
-	return str[reason];
-}
+	[RXRPC_ACK_DUPLICATE]		= 4,
+	[RXRPC_ACK_OUT_OF_SEQUENCE]	= 5,
+	[RXRPC_ACK_EXCEEDS_WINDOW]	= 6,
+	[RXRPC_ACK_NOSPACE]		= 7,
+	[RXRPC_ACK_PING_RESPONSE]	= 8,
+	[RXRPC_ACK_PING]		= 9,
+};
+
+const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = {
+	"---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY",
+	"IDL", "-?-"
+};
+
+const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = {
+	[rxrpc_skb_rx_cleaned]		= "Rx CLN",
+	[rxrpc_skb_rx_freed]		= "Rx FRE",
+	[rxrpc_skb_rx_got]		= "Rx GOT",
+	[rxrpc_skb_rx_lost]		= "Rx *L*",
+	[rxrpc_skb_rx_received]		= "Rx RCV",
+	[rxrpc_skb_rx_purged]		= "Rx PUR",
+	[rxrpc_skb_rx_rotated]		= "Rx ROT",
+	[rxrpc_skb_rx_seen]		= "Rx SEE",
+	[rxrpc_skb_tx_cleaned]		= "Tx CLN",
+	[rxrpc_skb_tx_freed]		= "Tx FRE",
+	[rxrpc_skb_tx_got]		= "Tx GOT",
+	[rxrpc_skb_tx_new]		= "Tx NEW",
+	[rxrpc_skb_tx_rotated]		= "Tx ROT",
+	[rxrpc_skb_tx_seen]		= "Tx SEE",
+};
+
+const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = {
+	[rxrpc_conn_new_client]		= "NWc",
+	[rxrpc_conn_new_service]	= "NWs",
+	[rxrpc_conn_queued]		= "QUE",
+	[rxrpc_conn_seen]		= "SEE",
+	[rxrpc_conn_got]		= "GOT",
+	[rxrpc_conn_put_client]		= "PTc",
+	[rxrpc_conn_put_service]	= "PTs",
+};
+
+const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = {
+	[rxrpc_client_activate_chans]	= "Activa",
+	[rxrpc_client_alloc]		= "Alloc ",
+	[rxrpc_client_chan_activate]	= "ChActv",
+	[rxrpc_client_chan_disconnect]	= "ChDisc",
+	[rxrpc_client_chan_pass]	= "ChPass",
+	[rxrpc_client_chan_unstarted]	= "ChUnst",
+	[rxrpc_client_cleanup]		= "Clean ",
+	[rxrpc_client_count]		= "Count ",
+	[rxrpc_client_discard]		= "Discar",
+	[rxrpc_client_duplicate]	= "Duplic",
+	[rxrpc_client_exposed]		= "Expose",
+	[rxrpc_client_replace]		= "Replac",
+	[rxrpc_client_to_active]	= "->Actv",
+	[rxrpc_client_to_culled]	= "->Cull",
+	[rxrpc_client_to_idle]		= "->Idle",
+	[rxrpc_client_to_inactive]	= "->Inac",
+	[rxrpc_client_to_waiting]	= "->Wait",
+	[rxrpc_client_uncount]		= "Uncoun",
+};
+
+const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = {
+	[rxrpc_transmit_wait]		= "WAI",
+	[rxrpc_transmit_queue]		= "QUE",
+	[rxrpc_transmit_queue_last]	= "QLS",
+	[rxrpc_transmit_rotate]		= "ROT",
+	[rxrpc_transmit_rotate_last]	= "RLS",
+	[rxrpc_transmit_await_reply]	= "AWR",
+	[rxrpc_transmit_end]		= "END",
+};
+
+const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = {
+	[rxrpc_receive_incoming]	= "INC",
+	[rxrpc_receive_queue]		= "QUE",
+	[rxrpc_receive_queue_last]	= "QLS",
+	[rxrpc_receive_front]		= "FRN",
+	[rxrpc_receive_rotate]		= "ROT",
+	[rxrpc_receive_end]		= "END",
+};
+
+const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = {
+	[rxrpc_recvmsg_enter]		= "ENTR",
+	[rxrpc_recvmsg_wait]		= "WAIT",
+	[rxrpc_recvmsg_dequeue]		= "DEQU",
+	[rxrpc_recvmsg_hole]		= "HOLE",
+	[rxrpc_recvmsg_next]		= "NEXT",
+	[rxrpc_recvmsg_cont]		= "CONT",
+
[rxrpc_recvmsg_full] = "FULL", + [rxrpc_recvmsg_data_return] = "DATA", + [rxrpc_recvmsg_terminal] = "TERM", + [rxrpc_recvmsg_to_be_accepted] = "TBAC", + [rxrpc_recvmsg_return] = "RETN", +}; + +const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { + [rxrpc_rtt_tx_ping] = "PING", + [rxrpc_rtt_tx_data] = "DATA", +}; + +const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { + [rxrpc_rtt_rx_ping_response] = "PONG", + [rxrpc_rtt_rx_requested_ack] = "RACK", +}; + +const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { + [rxrpc_timer_begin] = "Begin ", + [rxrpc_timer_expired] = "*EXPR*", + [rxrpc_timer_init_for_reply] = "IniRpl", + [rxrpc_timer_set_for_ack] = "SetAck", + [rxrpc_timer_set_for_send] = "SetTx ", + [rxrpc_timer_set_for_resend] = "SetRTx", +}; + +const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { + [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", + [rxrpc_propose_ack_input_data] = "DataIn ", + [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", + [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", + [rxrpc_propose_ack_ping_for_params] = "Params ", + [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", + [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", + [rxrpc_propose_ack_retry_tx] = "RetryTx", + [rxrpc_propose_ack_rotate_rx] = "RxAck ", + [rxrpc_propose_ack_terminal_ack] = "ClTerm ", +}; + +const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { + [rxrpc_propose_ack_use] = "", + [rxrpc_propose_ack_update] = " Update", + [rxrpc_propose_ack_subsume] = " Subsume", +}; + +const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { + [RXRPC_CALL_SLOW_START] = "SlowStart", + [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", + [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", + [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", +}; + +const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { + [rxrpc_cong_begin_retransmission] = " Retrans", + [rxrpc_cong_cleared_nacks] = " Cleared", + [rxrpc_cong_new_low_nack] = " NewLowN", + [rxrpc_cong_no_change] = "", + [rxrpc_cong_progress] = " Progres", + [rxrpc_cong_retransmit_again] = " ReTxAgn", + [rxrpc_cong_rtt_window_end] = " RttWinE", + [rxrpc_cong_saw_nack] = " SawNack", +}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f4bda06b7d2d..0d47db886f6e 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -14,336 +14,326 @@ #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> -#include <linux/circ_buf.h> #include <linux/export.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" -/* - * Time till packet resend (in jiffies). - */ -unsigned int rxrpc_resend_timeout = 4 * HZ; - -static int rxrpc_send_data(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct msghdr *msg, size_t len); +struct rxrpc_pkt_buffer { + struct rxrpc_wire_header whdr; + union { + struct { + struct rxrpc_ackpacket ack; + u8 acks[255]; + u8 pad[3]; + }; + __be32 abort_code; + }; + struct rxrpc_ackinfo ackinfo; +}; /* - * extract control messages from the sendmsg() control buffer + * Fill out an ACK packet. 
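The "Fill out an ACK packet" helper introduced further down walks the receive ring from hard_ack + 1 to top, emits one verdict byte per slot depending on whether the slot holds an skb, appends three zero bytes (presumably padding the byte array ahead of the ackinfo trailer, matching the pad[3] in the packet buffer above), and returns top - hard_ack + 3. A compilable model of that arithmetic, with an assumed ring size:

#include <stdint.h>
#include <stdio.h>

#define RING_SLOTS 64			/* assumed ring size */
#define RING_MASK  (RING_SLOTS - 1)

int main(void)
{
	int held;
	void *ring[RING_SLOTS] = { 0 };
	uint32_t hard_ack = 10, top = 14, seq;
	uint8_t acks[256], *ackp = acks;

	ring[11 & RING_MASK] = &held;	/* 11, 12 and 14 held; 13 missing */
	ring[12 & RING_MASK] = &held;
	ring[14 & RING_MASK] = &held;

	/* One byte per packet between the hard-ACK point and the top. */
	for (seq = hard_ack + 1; (int32_t)(seq - top) <= 0; seq++)
		*ackp++ = ring[seq & RING_MASK] ? 1 : 0;   /* ACK : NACK */

	*ackp++ = 0;			/* three bytes of padding */
	*ackp++ = 0;
	*ackp++ = 0;
	printf("nAcks=%u bytes=%td\n", top - hard_ack, ackp - acks); /* 4, 7 */
	return 0;
}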
*/ -static int rxrpc_sendmsg_cmsg(struct msghdr *msg, - unsigned long *user_call_ID, - enum rxrpc_command *command, - u32 *abort_code, - bool *_exclusive) +static size_t rxrpc_fill_out_ack(struct rxrpc_call *call, + struct rxrpc_pkt_buffer *pkt, + rxrpc_seq_t *_hard_ack, + rxrpc_seq_t *_top) { - struct cmsghdr *cmsg; - bool got_user_ID = false; - int len; - - *command = RXRPC_CMD_SEND_DATA; - - if (msg->msg_controllen == 0) - return -EINVAL; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - _debug("CMSG %d, %d, %d", - cmsg->cmsg_level, cmsg->cmsg_type, len); - - if (cmsg->cmsg_level != SOL_RXRPC) - continue; - - switch (cmsg->cmsg_type) { - case RXRPC_USER_CALL_ID: - if (msg->msg_flags & MSG_CMSG_COMPAT) { - if (len != sizeof(u32)) - return -EINVAL; - *user_call_ID = *(u32 *) CMSG_DATA(cmsg); - } else { - if (len != sizeof(unsigned long)) - return -EINVAL; - *user_call_ID = *(unsigned long *) - CMSG_DATA(cmsg); - } - _debug("User Call ID %lx", *user_call_ID); - got_user_ID = true; - break; - - case RXRPC_ABORT: - if (*command != RXRPC_CMD_SEND_DATA) - return -EINVAL; - *command = RXRPC_CMD_SEND_ABORT; - if (len != sizeof(*abort_code)) - return -EINVAL; - *abort_code = *(unsigned int *) CMSG_DATA(cmsg); - _debug("Abort %x", *abort_code); - if (*abort_code == 0) - return -EINVAL; - break; - - case RXRPC_ACCEPT: - if (*command != RXRPC_CMD_SEND_DATA) - return -EINVAL; - *command = RXRPC_CMD_ACCEPT; - if (len != 0) - return -EINVAL; - break; - - case RXRPC_EXCLUSIVE_CALL: - *_exclusive = true; - if (len != 0) - return -EINVAL; - break; - default: - return -EINVAL; - } + rxrpc_serial_t serial; + rxrpc_seq_t hard_ack, top, seq; + int ix; + u32 mtu, jmax; + u8 *ackp = pkt->acks; + + /* Barrier against rxrpc_input_data(). */ + serial = call->ackr_serial; + hard_ack = READ_ONCE(call->rx_hard_ack); + top = smp_load_acquire(&call->rx_top); + *_hard_ack = hard_ack; + *_top = top; + + pkt->ack.bufferSpace = htons(8); + pkt->ack.maxSkew = htons(call->ackr_skew); + pkt->ack.firstPacket = htonl(hard_ack + 1); + pkt->ack.previousPacket = htonl(call->ackr_prev_seq); + pkt->ack.serial = htonl(serial); + pkt->ack.reason = call->ackr_reason; + pkt->ack.nAcks = top - hard_ack; + + if (pkt->ack.reason == RXRPC_ACK_PING) + pkt->whdr.flags |= RXRPC_REQUEST_ACK; + + if (after(top, hard_ack)) { + seq = hard_ack + 1; + do { + ix = seq & RXRPC_RXTX_BUFF_MASK; + if (call->rxtx_buffer[ix]) + *ackp++ = RXRPC_ACK_TYPE_ACK; + else + *ackp++ = RXRPC_ACK_TYPE_NACK; + seq++; + } while (before_eq(seq, top)); } - if (!got_user_ID) - return -EINVAL; - _leave(" = 0"); - return 0; + mtu = call->conn->params.peer->if_mtu; + mtu -= call->conn->params.peer->hdrsize; + jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; + pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + pkt->ackinfo.maxMTU = htonl(mtu); + pkt->ackinfo.rwind = htonl(call->rx_winsize); + pkt->ackinfo.jumbo_max = htonl(jmax); + + *ackp++ = 0; + *ackp++ = 0; + *ackp++ = 0; + return top - hard_ack + 3; } /* - * abort a call, sending an ABORT packet to the peer + * Send an ACK or ABORT call packet. 
*/ -static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) +int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type) { - write_lock_bh(&call->state_lock); - - if (call->state <= RXRPC_CALL_COMPLETE) { - call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->local_abort = abort_code; - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - del_timer_sync(&call->resend_timer); - del_timer_sync(&call->ack_timer); - clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events); - clear_bit(RXRPC_CALL_EV_ACK, &call->events); - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - rxrpc_queue_call(call); + struct rxrpc_connection *conn = NULL; + struct rxrpc_pkt_buffer *pkt; + struct msghdr msg; + struct kvec iov[2]; + rxrpc_serial_t serial; + rxrpc_seq_t hard_ack, top; + size_t len, n; + bool ping = false; + int ioc, ret; + u32 abort_code; + + _enter("%u,%s", call->debug_id, rxrpc_pkts[type]); + + spin_lock_bh(&call->lock); + if (call->conn) + conn = rxrpc_get_connection_maybe(call->conn); + spin_unlock_bh(&call->lock); + if (!conn) + return -ECONNRESET; + + pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); + if (!pkt) { + rxrpc_put_connection(conn); + return -ENOMEM; } - write_unlock_bh(&call->state_lock); -} - -/* - * Create a new client call for sendmsg(). - */ -static struct rxrpc_call * -rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, - unsigned long user_call_ID, bool exclusive) -{ - struct rxrpc_conn_parameters cp; - struct rxrpc_call *call; - struct key *key; - - DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + pkt->whdr.epoch = htonl(conn->proto.epoch); + pkt->whdr.cid = htonl(call->cid); + pkt->whdr.callNumber = htonl(call->call_id); + pkt->whdr.seq = 0; + pkt->whdr.type = type; + pkt->whdr.flags = conn->out_clientflag; + pkt->whdr.userStatus = 0; + pkt->whdr.securityIndex = call->security_ix; + pkt->whdr._rsvd = 0; + pkt->whdr.serviceId = htons(call->service_id); + + iov[0].iov_base = pkt; + iov[0].iov_len = sizeof(pkt->whdr); + len = sizeof(pkt->whdr); + + switch (type) { + case RXRPC_PACKET_TYPE_ACK: + spin_lock_bh(&call->lock); + if (!call->ackr_reason) { + spin_unlock_bh(&call->lock); + ret = 0; + goto out; + } + ping = (call->ackr_reason == RXRPC_ACK_PING); + n = rxrpc_fill_out_ack(call, pkt, &hard_ack, &top); + call->ackr_reason = 0; - _enter(""); + spin_unlock_bh(&call->lock); - if (!msg->msg_name) - return ERR_PTR(-EDESTADDRREQ); - key = rx->key; - if (key && !rx->key->payload.data[0]) - key = NULL; + pkt->whdr.flags |= RXRPC_SLOW_START_OK; - memset(&cp, 0, sizeof(cp)); - cp.local = rx->local; - cp.key = rx->key; - cp.security_level = rx->min_sec_level; - cp.exclusive = rx->exclusive | exclusive; - cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + iov[0].iov_len += sizeof(pkt->ack) + n; + iov[1].iov_base = &pkt->ackinfo; + iov[1].iov_len = sizeof(pkt->ackinfo); + len += sizeof(pkt->ack) + n + sizeof(pkt->ackinfo); + ioc = 2; + break; - _leave(" = %p\n", call); - return call; -} + case RXRPC_PACKET_TYPE_ABORT: + abort_code = call->abort_code; + pkt->abort_code = htonl(abort_code); + iov[0].iov_len += sizeof(pkt->abort_code); + len += sizeof(pkt->abort_code); + ioc = 1; + break; -/* - * send a message forming part of a client call through an RxRPC socket - * - caller holds the socket locked - * - the socket may be either a 
client socket or a server socket - */ -int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) -{ - enum rxrpc_command cmd; - struct rxrpc_call *call; - unsigned long user_call_ID = 0; - bool exclusive = false; - u32 abort_code = 0; - int ret; - - _enter(""); - - ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, - &exclusive); - if (ret < 0) - return ret; - - if (cmd == RXRPC_CMD_ACCEPT) { - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) - return -EINVAL; - call = rxrpc_accept_call(rx, user_call_ID); - if (IS_ERR(call)) - return PTR_ERR(call); - rxrpc_put_call(call); - return 0; + default: + BUG(); + ret = -ENOANO; + goto out; } - call = rxrpc_find_call_by_user_ID(rx, user_call_ID); - if (!call) { - if (cmd != RXRPC_CMD_SEND_DATA) - return -EBADSLT; - call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, - exclusive); - if (IS_ERR(call)) - return PTR_ERR(call); + serial = atomic_inc_return(&conn->serial); + pkt->whdr.serial = htonl(serial); + switch (type) { + case RXRPC_PACKET_TYPE_ACK: + trace_rxrpc_tx_ack(call, serial, + ntohl(pkt->ack.firstPacket), + ntohl(pkt->ack.serial), + pkt->ack.reason, pkt->ack.nAcks); + break; } - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state >= RXRPC_CALL_COMPLETE) { - /* it's too late for this call */ - ret = -ECONNRESET; - } else if (cmd == RXRPC_CMD_SEND_ABORT) { - rxrpc_send_abort(call, abort_code); - ret = 0; - } else if (cmd != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else if (!call->in_clientflag && - call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { - /* request phase complete for this client call */ - ret = -EPROTO; - } else if (call->in_clientflag && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - /* Reply phase not begun or not complete for service call. */ - ret = -EPROTO; - } else { - ret = rxrpc_send_data(rx, call, msg, len); + if (ping) { + call->ackr_ping = serial; + smp_wmb(); + /* We need to stick a time in before we send the packet in case + * the reply gets back before kernel_sendmsg() completes - but + * asking UDP to send the packet can take a relatively long + * time, so we update the time after, on the assumption that + * the packet transmission is more likely to happen towards the + * end of the kernel_sendmsg() call. + */ + call->ackr_ping_time = ktime_get_real(); + set_bit(RXRPC_CALL_PINGING, &call->flags); + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_ping, serial); } - - rxrpc_put_call(call); - _leave(" = %d", ret); - return ret; -} - -/** - * rxrpc_kernel_send_data - Allow a kernel service to send data on a call - * @call: The call to send data through - * @msg: The data to send - * @len: The amount of data to send - * - * Allow a kernel service to send data on a call. The call must be in an state - * appropriate to sending data. No control data should be supplied in @msg, - * nor should an address be supplied. MSG_MORE should be flagged if there's - * more data to come, otherwise this data will end the transmission phase. 
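One subtlety a few lines up is the ping timestamping around kernel_sendmsg(): the time is stored before the possibly slow blocking send, so a fast response never races with an unset timestamp, then overwritten afterwards on the assumption that the actual transmission happened late in the call. The shape of the trick in plain user-space C:

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void blocking_send(void)	/* stands in for kernel_sendmsg() */
{
	usleep(2000);
}

int main(void)
{
	long long ping_time = now_ns(); /* set early: a reply may beat the send */

	blocking_send();
	ping_time = now_ns();		/* refine: transmission was probably late */
	printf("ping timed at %lld ns\n", ping_time);
	return 0;
}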
- */ -int rxrpc_kernel_send_data(struct rxrpc_call *call, struct msghdr *msg, - size_t len) -{ - int ret; - - _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); - - ASSERTCMP(msg->msg_name, ==, NULL); - ASSERTCMP(msg->msg_control, ==, NULL); - - lock_sock(&call->socket->sk); - - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state >= RXRPC_CALL_COMPLETE) { - ret = -ESHUTDOWN; /* it's too late for this call */ - } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && - call->state != RXRPC_CALL_SERVER_ACK_REQUEST && - call->state != RXRPC_CALL_SERVER_SEND_REPLY) { - ret = -EPROTO; /* request phase complete for this client call */ - } else { - ret = rxrpc_send_data(call->socket, call, msg, len); + ret = kernel_sendmsg(conn->params.local->socket, + &msg, iov, ioc, len); + if (ping) + call->ackr_ping_time = ktime_get_real(); + + if (type == RXRPC_PACKET_TYPE_ACK && + call->state < RXRPC_CALL_COMPLETE) { + if (ret < 0) { + clear_bit(RXRPC_CALL_PINGING, &call->flags); + rxrpc_propose_ACK(call, pkt->ack.reason, + ntohs(pkt->ack.maxSkew), + ntohl(pkt->ack.serial), + true, true, + rxrpc_propose_ack_retry_tx); + } else { + spin_lock_bh(&call->lock); + if (after(hard_ack, call->ackr_consumed)) + call->ackr_consumed = hard_ack; + if (after(top, call->ackr_seen)) + call->ackr_seen = top; + spin_unlock_bh(&call->lock); + } } - release_sock(&call->socket->sk); - _leave(" = %d", ret); +out: + rxrpc_put_connection(conn); + kfree(pkt); return ret; } -EXPORT_SYMBOL(rxrpc_kernel_send_data); - -/** - * rxrpc_kernel_abort_call - Allow a kernel service to abort a call - * @call: The call to be aborted - * @abort_code: The abort code to stick into the ABORT packet - * - * Allow a kernel service to abort a call, if it's still in an abortable state. 
- */ -void rxrpc_kernel_abort_call(struct rxrpc_call *call, u32 abort_code) -{ - _enter("{%d},%d", call->debug_id, abort_code); - - lock_sock(&call->socket->sk); - - _debug("CALL %d USR %lx ST %d on CONN %p", - call->debug_id, call->user_call_ID, call->state, call->conn); - - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_send_abort(call, abort_code); - - release_sock(&call->socket->sk); - _leave(""); -} - -EXPORT_SYMBOL(rxrpc_kernel_abort_call); - /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool retrans) { - struct kvec iov[1]; + struct rxrpc_connection *conn = call->conn; + struct rxrpc_wire_header whdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; + struct kvec iov[2]; + rxrpc_serial_t serial; + size_t len; + bool lost = false; int ret, opt; _enter(",{%d}", skb->len); - iov[0].iov_base = skb->head; - iov[0].iov_len = skb->len; + /* Each transmission of a Tx packet needs a new serial number */ + serial = atomic_inc_return(&conn->serial); + + whdr.epoch = htonl(conn->proto.epoch); + whdr.cid = htonl(call->cid); + whdr.callNumber = htonl(call->call_id); + whdr.seq = htonl(sp->hdr.seq); + whdr.serial = htonl(serial); + whdr.type = RXRPC_PACKET_TYPE_DATA; + whdr.flags = sp->hdr.flags; + whdr.userStatus = 0; + whdr.securityIndex = call->security_ix; + whdr._rsvd = htons(sp->hdr._rsvd); + whdr.serviceId = htons(call->service_id); + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = skb->head; + iov[1].iov_len = skb->len; + len = iov[0].iov_len + iov[1].iov_len; - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &call->peer->srx.transport; + msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (skb->len - sizeof(struct rxrpc_wire_header) < conn->params.peer->maxdata) { - down_read(&conn->params.local->defrag_sem); - /* send the packet by UDP - * - returns -EMSGSIZE if UDP would have to fragment the packet - * to go out of the interface - * - in which case, we'll have processed the ICMP error - * message and update the peer record - */ - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); + /* If our RTT cache needs working on, request an ACK. Also request + * ACKs if a DATA packet appears to have been lost. 
+ */ + if (retrans || + call->cong_mode == RXRPC_CALL_SLOW_START || + (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + whdr.flags |= RXRPC_REQUEST_ACK; + + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + ret = 0; + lost = true; + goto done; + } + } - up_read(&conn->params.local->defrag_sem); - if (ret == -EMSGSIZE) - goto send_fragmentable; + _proto("Tx DATA %%%u { #%u }", serial, sp->hdr.seq); - _leave(" = %d [%u]", ret, conn->params.peer->maxdata); - return ret; + /* send the packet with the don't fragment bit set if we currently + * think it's small enough */ + if (iov[1].iov_len >= call->peer->maxdata) + goto send_fragmentable; + + down_read(&conn->params.local->defrag_sem); + /* send the packet by UDP + * - returns -EMSGSIZE if UDP would have to fragment the packet + * to go out of the interface + * - in which case, we'll have processed the ICMP error + * message and update the peer record + */ + ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + + up_read(&conn->params.local->defrag_sem); + if (ret == -EMSGSIZE) + goto send_fragmentable; + +done: + trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, + retrans, lost); + if (ret >= 0) { + ktime_t now = ktime_get_real(); + skb->tstamp = now; + smp_wmb(); + sp->hdr.serial = serial; + if (whdr.flags & RXRPC_REQUEST_ACK) { + call->peer->rtt_last_req = now; + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial); + } } + _leave(" = %d [%u]", ret, call->peer->maxdata); + return ret; send_fragmentable: /* attempt to send this message with fragmentation enabled */ @@ -358,8 +348,8 @@ send_fragmentable: SOL_IP, IP_MTU_DISCOVER, (char *)&opt, sizeof(opt)); if (ret == 0) { - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, - iov[0].iov_len); + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 2, len); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -367,355 +357,82 @@ send_fragmentable: (char *)&opt, sizeof(opt)); } break; - } - - up_write(&conn->params.local->defrag_sem); - _leave(" = %d [frag %u]", ret, conn->params.peer->maxdata); - return ret; -} -/* - * wait for space to appear in the transmit/ACK window - * - caller holds the socket locked - */ -static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, - struct rxrpc_call *call, - long *timeo) -{ - DECLARE_WAITQUEUE(myself, current); - int ret; - - _enter(",{%d},%ld", - CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz), - *timeo); - - add_wait_queue(&call->tx_waitq, &myself); - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - ret = 0; - if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 0) - break; - if (signal_pending(current)) { - ret = sock_intr_errno(*timeo); - break; - } - - release_sock(&rx->sk); - *timeo = schedule_timeout(*timeo); - lock_sock(&rx->sk); - } - - remove_wait_queue(&call->tx_waitq, &myself); - set_current_state(TASK_RUNNING); - _leave(" = %d", ret); - return ret; -} - -/* - * attempt to schedule an instant Tx resend - */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call) -{ - read_lock_bh(&call->state_lock); - if (try_to_del_timer_sync(&call->resend_timer) >= 0) { - clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags); - if (call->state < RXRPC_CALL_COMPLETE && - !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events)) - rxrpc_queue_call(call); - } - 
read_unlock_bh(&call->state_lock); -} +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + opt = IPV6_PMTUDISC_DONT; + ret = kernel_setsockopt(conn->params.local->socket, + SOL_IPV6, IPV6_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 1, iov[0].iov_len); -/* - * queue a packet for transmission, set the resend timer and attempt - * to send the packet immediately - */ -static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, - bool last) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - int ret; - - _net("queue skb %p [%d]", skb, call->acks_head); - - ASSERT(call->acks_window != NULL); - call->acks_window[call->acks_head] = (unsigned long) skb; - smp_wmb(); - call->acks_head = (call->acks_head + 1) & (call->acks_winsz - 1); - - if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { - _debug("________awaiting reply/ACK__________"); - write_lock_bh(&call->state_lock); - switch (call->state) { - case RXRPC_CALL_CLIENT_SEND_REQUEST: - call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; - break; - case RXRPC_CALL_SERVER_ACK_REQUEST: - call->state = RXRPC_CALL_SERVER_SEND_REPLY; - if (!last) - break; - case RXRPC_CALL_SERVER_SEND_REPLY: - call->state = RXRPC_CALL_SERVER_AWAIT_ACK; - break; - default: - break; + opt = IPV6_PMTUDISC_DO; + kernel_setsockopt(conn->params.local->socket, + SOL_IPV6, IPV6_MTU_DISCOVER, + (char *)&opt, sizeof(opt)); } - write_unlock_bh(&call->state_lock); - } - - _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); - - sp->need_resend = false; - sp->resend_at = jiffies + rxrpc_resend_timeout; - if (!test_and_set_bit(RXRPC_CALL_RUN_RTIMER, &call->flags)) { - _debug("run timer"); - call->resend_timer.expires = sp->resend_at; - add_timer(&call->resend_timer); - } - - /* attempt to cancel the rx-ACK timer, deferring reply transmission if - * we're ACK'ing the request phase of an incoming call */ - ret = -EAGAIN; - if (try_to_del_timer_sync(&call->ack_timer) >= 0) { - /* the packet may be freed by rxrpc_process_call() before this - * returns */ - ret = rxrpc_send_data_packet(call->conn, skb); - _net("sent skb %p", skb); - } else { - _debug("failed to delete ACK timer"); - } - - if (ret < 0) { - _debug("need instant resend %d", ret); - sp->need_resend = true; - rxrpc_instant_resend(call); + break; +#endif } - _leave(""); -} - -/* - * Convert a host-endian header into a network-endian header. 
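The helper being deleted below did a field-by-field host-to-network conversion of the wire header; the replacement code in rxrpc_send_data_packet() now builds the header the same way on the stack. The technique in a self-contained form (the struct is abbreviated to a subset of the fields seen in this patch):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

struct wire_header {		/* abbreviated sketch, not the real layout */
	uint32_t epoch, cid, callNumber, seq, serial;
	uint8_t  type, flags;
	uint16_t serviceId;
};

/* Multi-byte fields pass through htonl()/htons(); single bytes copy as-is. */
static void to_wire(struct wire_header *w, const struct wire_header *h)
{
	w->epoch      = htonl(h->epoch);
	w->cid        = htonl(h->cid);
	w->callNumber = htonl(h->callNumber);
	w->seq        = htonl(h->seq);
	w->serial     = htonl(h->serial);
	w->type       = h->type;
	w->flags      = h->flags;
	w->serviceId  = htons(h->serviceId);
}

int main(void)
{
	struct wire_header h = { .epoch = 1, .serial = 42, .serviceId = 2 }, w;

	to_wire(&w, &h);
	printf("serial on the wire: %08x\n", w.serial);
	return 0;
}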
- */ -static void rxrpc_insert_header(struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = htonl(sp->hdr.serial); - whdr.type = sp->hdr.type; - whdr.flags = sp->hdr.flags; - whdr.userStatus = sp->hdr.userStatus; - whdr.securityIndex = sp->hdr.securityIndex; - whdr._rsvd = htons(sp->hdr._rsvd); - whdr.serviceId = htons(sp->hdr.serviceId); - - memcpy(skb->head, &whdr, sizeof(whdr)); + up_write(&conn->params.local->defrag_sem); + goto done; } /* - * send data through a socket - * - must be called in process context - * - caller holds the socket locked + * reject packets through the local endpoint */ -static int rxrpc_send_data(struct rxrpc_sock *rx, - struct rxrpc_call *call, - struct msghdr *msg, size_t len) +void rxrpc_reject_packets(struct rxrpc_local *local) { + struct sockaddr_rxrpc srx; struct rxrpc_skb_priv *sp; + struct rxrpc_wire_header whdr; struct sk_buff *skb; - struct sock *sk = &rx->sk; - long timeo; - bool more; - int ret, copied; - - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - - /* this should be in poll */ - sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - return -EPIPE; - - more = msg->msg_flags & MSG_MORE; - - skb = call->tx_pending; - call->tx_pending = NULL; - - copied = 0; - do { - if (!skb) { - size_t size, chunk, max, space; - - _debug("alloc"); - - if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) <= 0) { - ret = -EAGAIN; - if (msg->msg_flags & MSG_DONTWAIT) - goto maybe_error; - ret = rxrpc_wait_for_tx_window(rx, call, - &timeo); - if (ret < 0) - goto maybe_error; - } - - max = call->conn->params.peer->maxdata; - max -= call->conn->security_size; - max &= ~(call->conn->size_align - 1UL); - - chunk = max; - if (chunk > msg_data_left(msg) && !more) - chunk = msg_data_left(msg); - - space = chunk + call->conn->size_align; - space &= ~(call->conn->size_align - 1UL); - - size = space + call->conn->header_size; - - _debug("SIZE: %zu/%zu/%zu", chunk, space, size); - - /* create a buffer that we can retain until it's ACK'd */ - skb = sock_alloc_send_skb( - sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); - if (!skb) - goto maybe_error; - - rxrpc_new_skb(skb); - - _debug("ALLOC SEND %p", skb); - - ASSERTCMP(skb->mark, ==, 0); + struct msghdr msg; + struct kvec iov[2]; + size_t size; + __be32 code; - _debug("HS: %u", call->conn->header_size); - skb_reserve(skb, call->conn->header_size); - skb->len += call->conn->header_size; + _enter("%d", local->debug_id); - sp = rxrpc_skb(skb); - sp->remain = chunk; - if (sp->remain > skb_tailroom(skb)) - sp->remain = skb_tailroom(skb); + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = &code; + iov[1].iov_len = sizeof(code); + size = sizeof(whdr) + sizeof(code); - _net("skb: hr %d, tr %d, hl %d, rm %d", - skb_headroom(skb), - skb_tailroom(skb), - skb_headlen(skb), - sp->remain); + msg.msg_name = &srx.transport; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } + memset(&whdr, 0, sizeof(whdr)); + whdr.type = RXRPC_PACKET_TYPE_ABORT; - _debug("append"); + while ((skb = skb_dequeue(&local->reject_queue))) { + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); - /* append next segment of data to the current buffer */ - if 
(msg_data_left(msg) > 0) { - int copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > msg_data_left(msg)) - copy = msg_data_left(msg); - if (copy > sp->remain) - copy = sp->remain; - - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) - goto efault; - sp->remain -= copy; - skb->mark += copy; - copied += copy; - } + if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { + msg.msg_namelen = srx.transport_len; - /* check for the far side aborting the call or a network error - * occurring */ - if (call->state > RXRPC_CALL_COMPLETE) - goto call_aborted; - - /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || - (msg_data_left(msg) == 0 && !more)) { - struct rxrpc_connection *conn = call->conn; - uint32_t seq; - size_t pad; - - /* pad out if we're using security */ - if (conn->security_ix) { - pad = conn->security_size + skb->mark; - pad = conn->size_align - pad; - pad &= conn->size_align - 1; - _debug("pad %zu", pad); - if (pad) - memset(skb_put(skb, pad), 0, pad); - } - - seq = atomic_inc_return(&call->sequence); - - sp->hdr.epoch = conn->proto.epoch; - sp->hdr.cid = call->cid; - sp->hdr.callNumber = call->call_id; - sp->hdr.seq = seq; - sp->hdr.serial = atomic_inc_return(&conn->serial); - sp->hdr.type = RXRPC_PACKET_TYPE_DATA; - sp->hdr.userStatus = 0; - sp->hdr.securityIndex = conn->security_ix; - sp->hdr._rsvd = 0; - sp->hdr.serviceId = call->service_id; - - sp->hdr.flags = conn->out_clientflag; - if (msg_data_left(msg) == 0 && !more) - sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (CIRC_SPACE(call->acks_head, - ACCESS_ONCE(call->acks_tail), - call->acks_winsz) > 1) - sp->hdr.flags |= RXRPC_MORE_PACKETS; - if (more && seq & 1) - sp->hdr.flags |= RXRPC_REQUEST_ACK; - - ret = conn->security->secure_packet( - call, skb, skb->mark, - skb->head + sizeof(struct rxrpc_wire_header)); - if (ret < 0) - goto out; - - rxrpc_insert_header(skb); - rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); - skb = NULL; - } - } while (msg_data_left(msg) > 0); + code = htonl(skb->priority); -success: - ret = copied; -out: - call->tx_pending = skb; - _leave(" = %d", ret); - return ret; + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; -call_aborted: - rxrpc_free_skb(skb); - if (call->state == RXRPC_CALL_NETWORK_ERROR) - ret = call->error_report < RXRPC_LOCAL_ERROR_OFFSET ? 
- call->error_report : - call->error_report - RXRPC_LOCAL_ERROR_OFFSET; - else - ret = -ECONNABORTED; - _leave(" = %d", ret); - return ret; + kernel_sendmsg(local->socket, &msg, iov, 2, size); + } -maybe_error: - if (copied) - goto success; - goto out; + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + } -efault: - ret = -EFAULT; - goto out; + _leave(""); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 8940674b5e08..bf13b8470c9a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -66,6 +66,32 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, } break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + srx.transport.sin6.sin6_port = serr->port; + srx.transport_len = sizeof(struct sockaddr_in6); + switch (serr->ee.ee_origin) { + case SO_EE_ORIGIN_ICMP6: + _net("Rx ICMP6"); + memcpy(&srx.transport.sin6.sin6_addr, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in6_addr)); + break; + case SO_EE_ORIGIN_ICMP: + _net("Rx ICMP on v6 sock"); + memcpy(srx.transport.sin6.sin6_addr.s6_addr + 12, + skb_network_header(skb) + serr->addr_offset, + sizeof(struct in_addr)); + break; + default: + memcpy(&srx.transport.sin6.sin6_addr, + &ipv6_hdr(skb)->saddr, + sizeof(struct in6_addr)); + break; + } + break; +#endif + default: BUG(); } @@ -129,22 +155,21 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } + rxrpc_new_skb(skb, rxrpc_skb_rx_received); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - kfree_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); return; } - rxrpc_new_skb(skb); - rcu_read_lock(); peer = rxrpc_lookup_peer_icmp_rcu(local, skb); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _leave(" [no peer]"); return; } @@ -154,7 +179,7 @@ void rxrpc_error_report(struct sock *sk) serr->ee.ee_code == ICMP_FRAG_NEEDED)) { rxrpc_adjust_mtu(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); rxrpc_put_peer(peer); _leave(" [MTU update]"); return; @@ -162,7 +187,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); rcu_read_unlock(); - rxrpc_free_skb(skb); + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ rxrpc_queue_work(&peer->error_distributor); @@ -248,13 +273,20 @@ void rxrpc_peer_error_distributor(struct work_struct *work) struct rxrpc_peer *peer = container_of(work, struct rxrpc_peer, error_distributor); struct rxrpc_call *call; - int error_report; + enum rxrpc_call_completion compl; + int error; _enter(""); - error_report = READ_ONCE(peer->error_report); + error = READ_ONCE(peer->error_report); + if (error < RXRPC_LOCAL_ERROR_OFFSET) { + compl = RXRPC_CALL_NETWORK_ERROR; + } else { + compl = RXRPC_CALL_LOCAL_ERROR; + error -= RXRPC_LOCAL_ERROR_OFFSET; + } - _debug("ISSUE ERROR %d", error_report); + _debug("ISSUE ERROR %s %d", rxrpc_call_completions[compl], error); spin_lock_bh(&peer->lock); @@ -262,16 +294,10 @@ void rxrpc_peer_error_distributor(struct work_struct *work) call = hlist_entry(peer->error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); + rxrpc_see_call(call); - write_lock(&call->state_lock); - if (call->state != RXRPC_CALL_COMPLETE && - call->state < RXRPC_CALL_NETWORK_ERROR) { - call->error_report = error_report; - call->state = 
RXRPC_CALL_NETWORK_ERROR; - set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events); - rxrpc_queue_call(call); - } - write_unlock(&call->state_lock); + if (rxrpc_set_call_completion(call, compl, 0, error)) + rxrpc_notify_socket(call); } spin_unlock_bh(&peer->lock); @@ -279,3 +305,44 @@ void rxrpc_peer_error_distributor(struct work_struct *work) rxrpc_put_peer(peer); _leave(""); } + +/* + * Add RTT information to cache. This is called in softirq mode and has + * exclusive access to the peer RTT data. + */ +void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time) +{ + struct rxrpc_peer *peer = call->peer; + s64 rtt; + u64 sum = peer->rtt_sum, avg; + u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage; + + rtt = ktime_to_ns(ktime_sub(resp_time, send_time)); + if (rtt < 0) + return; + + /* Replace the oldest datum in the RTT buffer */ + sum -= peer->rtt_cache[cursor]; + sum += rtt; + peer->rtt_cache[cursor] = rtt; + peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1); + peer->rtt_sum = sum; + if (usage < RXRPC_RTT_CACHE_SIZE) { + usage++; + peer->rtt_usage = usage; + } + + /* Now recalculate the average */ + if (usage == RXRPC_RTT_CACHE_SIZE) { + avg = sum / RXRPC_RTT_CACHE_SIZE; + } else { + avg = sum; + do_div(avg, usage); + } + + peer->rtt = avg; + trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, + usage, avg); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 538e9831c699..941b724d523b 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -16,12 +16,14 @@ #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/in.h> +#include <linux/in6.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include <net/route.h> +#include <net/ip6_route.h> #include "ar-internal.h" static DEFINE_HASHTABLE(rxrpc_peer_hash, 10); @@ -50,6 +52,13 @@ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + hash_key += (u16 __force)srx->transport.sin.sin_port; + size = sizeof(srx->transport.sin6.sin6_addr); + p = (u16 *)&srx->transport.sin6.sin6_addr; + break; +#endif default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; @@ -93,6 +102,14 @@ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + return ((u16 __force)peer->srx.transport.sin6.sin6_port - + (u16 __force)srx->transport.sin6.sin6_port) ?: + memcmp(&peer->srx.transport.sin6.sin6_addr, + &srx->transport.sin6.sin6_addr, + sizeof(struct in6_addr)); +#endif default: BUG(); } @@ -130,17 +147,7 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer) { - switch (srx->transport.family) { - case AF_INET: - _net("PEER %d {%d,%u,%pI4+%hu}", - peer->debug_id, - peer->srx.transport_type, - peer->srx.transport.family, - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); - break; - } - + _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); } return peer; @@ -152,22 +159,53 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct 
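/* [Editorial aside -- not part of the patch.]  rxrpc_peer_add_rtt() above is
 * a classic fixed-size running average: subtract the datum about to be
 * overwritten from the sum, add the new one, and advance a cursor that wraps
 * by masking.  A hedged, self-contained sketch of the same structure
 * (made-up names, plain C, no kernel helpers):
 *
 *	#define N 8
 *	static unsigned long long cache[N], sum;
 *	static unsigned int cursor, used;
 *
 *	static unsigned long long add_sample(unsigned long long x)
 *	{
 *		sum += x - cache[cursor];
 *		cache[cursor] = x;
 *		cursor = (cursor + 1) & (N - 1);
 *		if (used < N)
 *			used++;
 *		return sum / used;
 *	}
 *
 * N must stay a power of two for the mask to wrap correctly (as
 * RXRPC_RTT_CACHE_SIZE does), and the kernel code uses do_div() for the
 * 64-bit division so it also works on 32-bit targets.
 */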
rxrpc_local *local, */ static void rxrpc_assess_MTU_size(struct rxrpc_peer *peer) { + struct dst_entry *dst; struct rtable *rt; - struct flowi4 fl4; + struct flowi fl; + struct flowi4 *fl4 = &fl.u.ip4; +#ifdef CONFIG_AF_RXRPC_IPV6 + struct flowi6 *fl6 = &fl.u.ip6; +#endif peer->if_mtu = 1500; - rt = ip_route_output_ports(&init_net, &fl4, NULL, - peer->srx.transport.sin.sin_addr.s_addr, 0, - htons(7000), htons(7001), - IPPROTO_UDP, 0, 0); - if (IS_ERR(rt)) { - _leave(" [route err %ld]", PTR_ERR(rt)); - return; + memset(&fl, 0, sizeof(fl)); + switch (peer->srx.transport.family) { + case AF_INET: + rt = ip_route_output_ports( + &init_net, fl4, NULL, + peer->srx.transport.sin.sin_addr.s_addr, 0, + htons(7000), htons(7001), IPPROTO_UDP, 0, 0); + if (IS_ERR(rt)) { + _leave(" [route err %ld]", PTR_ERR(rt)); + return; + } + dst = &rt->dst; + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + fl6->flowi6_iif = LOOPBACK_IFINDEX; + fl6->flowi6_scope = RT_SCOPE_UNIVERSE; + fl6->flowi6_proto = IPPROTO_UDP; + memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr, + sizeof(struct in6_addr)); + fl6->fl6_dport = htons(7001); + fl6->fl6_sport = htons(7000); + dst = ip6_route_output(&init_net, NULL, fl6); + if (IS_ERR(dst)) { + _leave(" [route err %ld]", PTR_ERR(dst)); + return; + } + break; +#endif + + default: + BUG(); } - peer->if_mtu = dst_mtu(&rt->dst); - dst_release(&rt->dst); + peer->if_mtu = dst_mtu(dst); + dst_release(dst); _leave(" [if_mtu %u]", peer->if_mtu); } @@ -199,6 +237,41 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) } /* + * Initialise peer record. + */ +static void rxrpc_init_peer(struct rxrpc_peer *peer, unsigned long hash_key) +{ + peer->hash_key = hash_key; + rxrpc_assess_MTU_size(peer); + peer->mtu = peer->if_mtu; + peer->rtt_last_req = ktime_get_real(); + + switch (peer->srx.transport.family) { + case AF_INET: + peer->hdrsize = sizeof(struct iphdr); + break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + peer->hdrsize = sizeof(struct ipv6hdr); + break; +#endif + default: + BUG(); + } + + switch (peer->srx.transport_type) { + case SOCK_DGRAM: + peer->hdrsize += sizeof(struct udphdr); + break; + default: + BUG(); + } + + peer->hdrsize += sizeof(struct rxrpc_wire_header); + peer->maxdata = peer->mtu - peer->hdrsize; +} + +/* * Set up a new peer. */ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, @@ -212,31 +285,40 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_local *local, peer = rxrpc_alloc_peer(local, gfp); if (peer) { - peer->hash_key = hash_key; memcpy(&peer->srx, srx, sizeof(*srx)); + rxrpc_init_peer(peer, hash_key); + } - rxrpc_assess_MTU_size(peer); - peer->mtu = peer->if_mtu; - - if (srx->transport.family == AF_INET) { - peer->hdrsize = sizeof(struct iphdr); - switch (srx->transport_type) { - case SOCK_DGRAM: - peer->hdrsize += sizeof(struct udphdr); - break; - default: - BUG(); - break; - } - } else { - BUG(); - } + _leave(" = %p", peer); + return peer; +} + +/* + * Set up a new incoming peer. The address is prestored in the preallocated + * peer. 
+ */ +struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, + struct rxrpc_peer *prealloc) +{ + struct rxrpc_peer *peer; + unsigned long hash_key; + + hash_key = rxrpc_peer_hash_key(local, &prealloc->srx); + prealloc->local = local; + rxrpc_init_peer(prealloc, hash_key); - peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + spin_lock(&rxrpc_peer_hash_lock); + + /* Need to check that we aren't racing with someone else */ + peer = __rxrpc_lookup_peer_rcu(local, &prealloc->srx, hash_key); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + peer = prealloc; + hash_add_rcu(rxrpc_peer_hash, &peer->hash_link, hash_key); } - _leave(" = %p", peer); + spin_unlock(&rxrpc_peer_hash_lock); return peer; } @@ -249,11 +331,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, *candidate; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); - _enter("{%d,%d,%pI4+%hu}", - srx->transport_type, - srx->transport_len, - &srx->transport.sin.sin_addr, - ntohs(srx->transport.sin.sin_port)); + _enter("{%pISp}", &srx->transport); /* search the peer list first */ rcu_read_lock(); @@ -272,7 +350,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -282,7 +360,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, hash_add_rcu(rxrpc_peer_hash, &candidate->hash_link, hash_key); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); if (peer) kfree(candidate); @@ -290,11 +368,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, peer = candidate; } - _net("PEER %d {%d,%pI4+%hu}", - peer->debug_id, - peer->srx.transport_type, - &peer->srx.transport.sin.sin_addr, - ntohs(peer->srx.transport.sin.sin_port)); + _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); return peer; @@ -307,9 +381,24 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) { ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxrpc_peer_hash_lock); + spin_lock_bh(&rxrpc_peer_hash_lock); hash_del_rcu(&peer->hash_link); - spin_unlock(&rxrpc_peer_hash_lock); + spin_unlock_bh(&rxrpc_peer_hash_lock); kfree_rcu(peer, rcu); } + +/** + * rxrpc_kernel_get_peer - Get the peer address of a call + * @sock: The socket on which the call is in progress. + * @call: The call to query + * @_srx: Where to place the result + * + * Get the address of the remote peer in a call. 
+ */ +void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, + struct sockaddr_rxrpc *_srx) +{ + *_srx = call->peer->srx; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index ced5f07444e5..65cd980767fa 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -17,12 +17,12 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { [RXRPC_CONN_UNUSED] = "Unused ", [RXRPC_CONN_CLIENT] = "Client ", + [RXRPC_CONN_SERVICE_PREALLOC] = "SvPrealc", [RXRPC_CONN_SERVICE_UNSECURED] = "SvUnsec ", [RXRPC_CONN_SERVICE_CHALLENGING] = "SvChall ", [RXRPC_CONN_SERVICE] = "SvSecure", [RXRPC_CONN_REMOTELY_ABORTED] = "RmtAbort", [RXRPC_CONN_LOCALLY_ABORTED] = "LocAbort", - [RXRPC_CONN_NETWORK_ERROR] = "NetError", }; /* @@ -30,6 +30,7 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) { + rcu_read_lock(); read_lock(&rxrpc_call_lock); return seq_list_start_head(&rxrpc_calls, *_pos); } @@ -42,17 +43,21 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) { read_unlock(&rxrpc_call_lock); + rcu_read_unlock(); } static int rxrpc_call_seq_show(struct seq_file *seq, void *v) { - struct rxrpc_connection *conn; + struct rxrpc_local *local; + struct rxrpc_sock *rx; + struct rxrpc_peer *peer; struct rxrpc_call *call; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + char lbuff[50], rbuff[50]; if (v == &rxrpc_calls) { seq_puts(seq, - "Proto Local Remote " + "Proto Local " + " Remote " " SvID ConnID CallID End Use State Abort " " UserID\n"); return 0; @@ -60,30 +65,35 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call = list_entry(v, struct rxrpc_call, link); - sprintf(lbuff, "%pI4:%u", - &call->local->srx.transport.sin.sin_addr, - ntohs(call->local->srx.transport.sin.sin_port)); + rx = rcu_dereference(call->socket); + if (rx) { + local = READ_ONCE(rx->local); + if (local) + sprintf(lbuff, "%pISpc", &local->srx.transport); + else + strcpy(lbuff, "no_local"); + } else { + strcpy(lbuff, "no_socket"); + } - conn = call->conn; - if (conn) - sprintf(rbuff, "%pI4:%u", - &conn->params.peer->srx.transport.sin.sin_addr, - ntohs(conn->params.peer->srx.transport.sin.sin_port)); + peer = call->peer; + if (peer) + sprintf(rbuff, "%pISpc", &peer->srx.transport); else strcpy(rbuff, "no_connection"); seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %08x %s %3u" + "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %lx\n", lbuff, rbuff, call->service_id, call->cid, call->call_id, - call->in_clientflag ? "Svc" : "Clt", + rxrpc_is_service_call(call) ? 
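/* [Editorial aside -- not part of the patch.]  The switch from hand-rolled
 * "%pI4:%u" formatting to the "%pISpc" specifier is what drives the column
 * widening throughout proc.c: %pIS prints a struct sockaddr (IPv4 or IPv6),
 * the trailing 'p' appends the port and 'c' requests compressed IPv6
 * notation.  A bracketed IPv6 address plus port can need up to roughly 47
 * characters, hence the lbuff[50]/rbuff[50] buffers and the "%-47.47s"
 * fields replacing the old 22-character ones.  Usage is just:
 *
 *	sprintf(lbuff, "%pISpc", &local->srx.transport);
 */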
"Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], - call->remote_abort ?: call->local_abort, + call->abort_code, call->user_call_ID); return 0; @@ -115,13 +125,13 @@ const struct file_operations rxrpc_call_seq_fops = { static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) { read_lock(&rxrpc_connection_lock); - return seq_list_start_head(&rxrpc_connections, *_pos); + return seq_list_start_head(&rxrpc_connection_proc_list, *_pos); } static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - return seq_list_next(v, &rxrpc_connections, pos); + return seq_list_next(v, &rxrpc_connection_proc_list, pos); } static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) @@ -132,29 +142,31 @@ static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) { struct rxrpc_connection *conn; - char lbuff[4 + 4 + 4 + 4 + 5 + 1], rbuff[4 + 4 + 4 + 4 + 5 + 1]; + char lbuff[50], rbuff[50]; - if (v == &rxrpc_connections) { + if (v == &rxrpc_connection_proc_list) { seq_puts(seq, - "Proto Local Remote " + "Proto Local " + " Remote " " SvID ConnID End Use State Key " " Serial ISerial\n" ); return 0; } - conn = list_entry(v, struct rxrpc_connection, link); - - sprintf(lbuff, "%pI4:%u", - &conn->params.local->srx.transport.sin.sin_addr, - ntohs(conn->params.local->srx.transport.sin.sin_port)); + conn = list_entry(v, struct rxrpc_connection, proc_link); + if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) { + strcpy(lbuff, "no_local"); + strcpy(rbuff, "no_connection"); + goto print; + } - sprintf(rbuff, "%pI4:%u", - &conn->params.peer->srx.transport.sin.sin_addr, - ntohs(conn->params.peer->srx.transport.sin.sin_port)); + sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport); + sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport); +print: seq_printf(seq, - "UDP %-22.22s %-22.22s %4x %08x %s %3u" + "UDP %-47.47s %-47.47s %4x %08x %s %3u" " %s %08x %08x %08x\n", lbuff, rbuff, @@ -165,7 +177,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) rxrpc_conn_states[conn->state], key_serial(conn->params.key), atomic_read(&conn->serial), - atomic_read(&conn->hi_serial)); + conn->hi_serial); return 0; } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9ed66d533002..f05ea0a88076 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -19,399 +19,645 @@ #include "ar-internal.h" /* - * removal a call's user ID from the socket tree to make the user ID available - * again and so that it won't be seen again in association with that call + * Post a call for attention by the socket or kernel service. Further + * notifications are suppressed by putting recvmsg_link on a dummy queue. 
*/ -void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call) +void rxrpc_notify_socket(struct rxrpc_call *call) { - _debug("RELEASE CALL %d", call->debug_id); + struct rxrpc_sock *rx; + struct sock *sk; + + _enter("%d", call->debug_id); + + if (!list_empty(&call->recvmsg_link)) + return; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + sk = &rx->sk; + if (rx && sk->sk_state < RXRPC_CLOSE) { + if (call->notify_rx) { + call->notify_rx(sk, call, call->user_call_ID); + } else { + write_lock_bh(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + rxrpc_get_call(call, rxrpc_call_got); + list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); + } + write_unlock_bh(&rx->recvmsg_lock); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - write_lock_bh(&rx->call_lock); - rb_erase(&call->sock_node, &call->socket->calls); - clear_bit(RXRPC_CALL_HAS_USERID, &call->flags); - write_unlock_bh(&rx->call_lock); + if (!sock_flag(sk, SOCK_DEAD)) { + _debug("call %ps", sk->sk_data_ready); + sk->sk_data_ready(sk); + } + } } - read_lock_bh(&call->state_lock); - if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) && - !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) - rxrpc_queue_call(call); - read_unlock_bh(&call->state_lock); + rcu_read_unlock(); + _leave(""); } /* - * receive a message from an RxRPC socket - * - we need to be careful about two or more threads calling recvmsg - * simultaneously + * Pass a call terminating message to userspace. */ -int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) { - struct rxrpc_skb_priv *sp; - struct rxrpc_call *call = NULL, *continue_call = NULL; - struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - struct sk_buff *skb; - long timeo; - int copy, ret, ullen, offset, copied = 0; - u32 abort_code; - - DEFINE_WAIT(wait); - - _enter(",,,%zu,%d", len, flags); - - if (flags & (MSG_OOB | MSG_TRUNC)) - return -EOPNOTSUPP; - - ullen = msg->msg_flags & MSG_CMSG_COMPAT ? 4 : sizeof(unsigned long); + u32 tmp = 0; + int ret; + + switch (call->completion) { + case RXRPC_CALL_SUCCEEDED: + ret = 0; + if (rxrpc_is_service_call(call)) + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); + break; + case RXRPC_CALL_REMOTELY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_LOCALLY_ABORTED: + tmp = call->abort_code; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); + break; + case RXRPC_CALL_NETWORK_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); + break; + case RXRPC_CALL_LOCAL_ERROR: + tmp = call->error; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); + break; + default: + pr_err("Invalid terminal call state %u\n", call->state); + BUG(); + break; + } - timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); - msg->msg_flags |= MSG_MORE; + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); + return ret; +} - lock_sock(&rx->sk); +/* + * Pass back notification of a new call. The call is added to the + * to-be-accepted list. This means that the next call to be accepted might not + * be the last call seen awaiting acceptance, but unless we leave this on the + * front of the queue and block all other messages until someone gives us a + * user_ID for it, there's not a lot we can do. 
+ */ +static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, int flags) +{ + int tmp = 0, ret; - for (;;) { - /* return immediately if a client socket has no outstanding - * calls */ - if (RB_EMPTY_ROOT(&rx->calls)) { - if (copied) - goto out; - if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) { - release_sock(&rx->sk); - if (continue_call) - rxrpc_put_call(continue_call); - return -ENODATA; - } - } + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); - /* get the next message on the Rx queue */ - skb = skb_peek(&rx->sk.sk_receive_queue); - if (!skb) { - /* nothing remains on the queue */ - if (copied && - (flags & MSG_PEEK || timeo == 0)) - goto out; + if (ret == 0 && !(flags & MSG_PEEK)) { + _debug("to be accepted"); + write_lock_bh(&rx->recvmsg_lock); + list_del_init(&call->recvmsg_link); + write_unlock_bh(&rx->recvmsg_lock); - /* wait for a message to turn up */ - release_sock(&rx->sk); - prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, - TASK_INTERRUPTIBLE); - ret = sock_error(&rx->sk); - if (ret) - goto wait_error; - - if (skb_queue_empty(&rx->sk.sk_receive_queue)) { - if (signal_pending(current)) - goto wait_interrupted; - timeo = schedule_timeout(timeo); - } - finish_wait(sk_sleep(&rx->sk), &wait); - lock_sock(&rx->sk); - continue; - } + rxrpc_get_call(call, rxrpc_call_got); + write_lock(&rx->call_lock); + list_add_tail(&call->accept_link, &rx->to_be_accepted); + write_unlock(&rx->call_lock); + } - peek_next_packet: - sp = rxrpc_skb(skb); - call = sp->call; - ASSERT(call != NULL); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); + return ret; +} - _debug("next pkt %s", rxrpc_pkts[sp->hdr.type]); +/* + * End the packet reception phase. + */ +static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) +{ + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - /* make sure we wait for the state to be updated in this call */ - spin_lock_bh(&call->lock); - spin_unlock_bh(&call->lock); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); + ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); - if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) { - _debug("packet from released call"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - continue; - } + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, + rxrpc_propose_ack_terminal_ack); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } - /* determine whether to continue last data receive */ - if (continue_call) { - _debug("maybe cont"); - if (call != continue_call || - skb->mark != RXRPC_SKB_MARK_DATA) { - release_sock(&rx->sk); - rxrpc_put_call(continue_call); - _leave(" = %d [noncont]", copied); - return copied; - } - } + write_lock_bh(&call->state_lock); - rxrpc_get_call(call); + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + __rxrpc_call_completed(call); + break; - /* copy the peer address and timestamp */ - if (!continue_call) { - if (msg->msg_name) { - size_t len = - sizeof(call->conn->params.peer->srx); - memcpy(msg->msg_name, - &call->conn->params.peer->srx, len); - msg->msg_namelen = len; - } - sock_recv_timestamp(msg, &rx->sk, skb); - } + case RXRPC_CALL_SERVER_RECV_REQUEST: + call->tx_phase = true; + call->state = RXRPC_CALL_SERVER_ACK_REQUEST; + break; + default: + break; + } - /* receive the message */ - if (skb->mark != RXRPC_SKB_MARK_DATA) - goto receive_non_data_message; + 
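/* [Editorial aside -- not part of the patch.]  The switch above is the
 * Rx-to-Tx handover point at the end of the receive phase.  Schematically:
 *
 *	CLIENT_RECV_REPLY   -> COMPLETE (whole reply received)
 *	SERVER_RECV_REQUEST -> SERVER_ACK_REQUEST, with tx_phase set to
 *			       true so the service side can start queueing
 *			       its reply
 *
 * Any other state is left untouched, and the whole transition happens under
 * the call's state lock.
 */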
write_unlock_bh(&call->state_lock); +} - _debug("recvmsg DATA #%u { %d, %d }", - sp->hdr.seq, skb->len, sp->offset); +/* + * Discard a packet we've used up and advance the Rx window by one. + */ +static void rxrpc_rotate_rx_window(struct rxrpc_call *call) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + rxrpc_serial_t serial; + rxrpc_seq_t hard_ack, top; + u8 flags; + int ix; + + _enter("%d", call->debug_id); + + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + ASSERT(before(hard_ack, top)); + + hard_ack++; + ix = hard_ack & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); + sp = rxrpc_skb(skb); + flags = sp->hdr.flags; + serial = sp->hdr.serial; + if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) + serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; + + call->rxtx_buffer[ix] = NULL; + call->rxtx_annotations[ix] = 0; + /* Barrier against rxrpc_input_data(). */ + smp_store_release(&call->rx_hard_ack, hard_ack); + + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + + _debug("%u,%u,%02x", hard_ack, top, flags); + trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); + if (flags & RXRPC_LAST_PACKET) { + rxrpc_end_rx_phase(call, serial); + } else { + /* Check to see if there's an ACK that needs sending. */ + if (after_eq(hard_ack, call->ackr_consumed + 2) || + after_eq(top, call->ackr_seen + 2) || + (hard_ack == top && after(hard_ack, call->ackr_consumed))) + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, + true, false, + rxrpc_propose_ack_rotate_rx); + if (call->ackr_reason) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + } +} - if (!continue_call) { - /* only set the control data once per recvmsg() */ - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); - } +/* + * Decrypt and verify a (sub)packet. The packet's length may be changed due to + * padding, but if this is the case, the packet length will be resident in the + * socket buffer. Note that we can't modify the master skb info as the skb may + * be the home to multiple subpackets. + */ +static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + u8 annotation, + unsigned int offset, unsigned int len) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t seq = sp->hdr.seq; + u16 cksum = sp->hdr.cksum; - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - call->rx_data_recv = sp->hdr.seq; + _enter(""); - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); + /* For all but the head jumbo subpacket, the security checksum is in a + * jumbo header immediately prior to the data. + */ + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { + __be16 tmp; + if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) + BUG(); + cksum = ntohs(tmp); + seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; + } - offset = sp->offset; - copy = skb->len - offset; - if (copy > len - copied) - copy = len - copied; + return call->conn->security->verify_packet(call, skb, offset, len, + seq, cksum); +} - ret = skb_copy_datagram_msg(skb, offset, msg, copy); +/* + * Locate the data within a packet. This is complicated by: + * + * (1) An skb may contain a jumbo packet - so we have to find the appropriate + * subpacket. 
+ * + * (2) The (sub)packets may be encrypted and, if so, the encrypted portion + * contains an extra header which includes the true length of the data, + * excluding any encrypted padding. + */ +static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + u8 *_annotation, + unsigned int *_offset, unsigned int *_len) +{ + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = *_len; + int ret; + u8 annotation = *_annotation; + + /* Locate the subpacket */ + len = skb->len - offset; + if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { + offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * + RXRPC_JUMBO_SUBPKTLEN); + len = (annotation & RXRPC_RX_ANNO_JLAST) ? + skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; + } + if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { + ret = rxrpc_verify_packet(call, skb, annotation, offset, len); if (ret < 0) - goto copy_error; - - /* handle piecemeal consumption of data packets */ - _debug("copied %d+%d", copy, copied); + return ret; + *_annotation |= RXRPC_RX_ANNO_VERIFIED; + } - offset += copy; - copied += copy; + *_offset = offset; + *_len = len; + call->conn->security->locate_data(call, skb, _offset, _len); + return 0; +} - if (!(flags & MSG_PEEK)) - sp->offset = offset; +/* + * Deliver messages to a call. This keeps processing packets until the buffer + * is filled and we find either more DATA (returns 0) or the end of the DATA + * (returns 1). If more packets are required, it returns -EAGAIN. + */ +static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, struct iov_iter *iter, + size_t len, int flags, size_t *_offset) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + rxrpc_seq_t hard_ack, top, seq; + size_t remain; + bool last; + unsigned int rx_pkt_offset, rx_pkt_len; + int ix, copy, ret = -EAGAIN, ret2; + + rx_pkt_offset = call->rx_pkt_offset; + rx_pkt_len = call->rx_pkt_len; + + if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { + seq = call->rx_hard_ack; + ret = 1; + goto done; + } - if (sp->offset < skb->len) { - _debug("buffer full"); - ASSERTCMP(copied, ==, len); + /* Barriers against rxrpc_input_data(). 
*/ + hard_ack = call->rx_hard_ack; + top = smp_load_acquire(&call->rx_top); + for (seq = hard_ack + 1; before_eq(seq, top); seq++) { + ix = seq & RXRPC_RXTX_BUFF_MASK; + skb = call->rxtx_buffer[ix]; + if (!skb) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); break; } + smp_rmb(); + rxrpc_see_skb(skb, rxrpc_skb_rx_seen); + sp = rxrpc_skb(skb); - /* we transferred the whole data packet */ if (!(flags & MSG_PEEK)) - rxrpc_kernel_data_consumed(call, skb); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - _debug("last"); - if (rxrpc_conn_is_client(call->conn)) { - /* last byte of reply received */ - ret = copied; - goto terminal_message; + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + + if (msg) + sock_recv_timestamp(msg, sock->sk, skb); + + if (rx_pkt_offset == 0) { + ret2 = rxrpc_locate_data(call, skb, + &call->rxtx_annotations[ix], + &rx_pkt_offset, &rx_pkt_len); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); + if (ret2 < 0) { + ret = ret2; + goto out; } + } else { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); + } - /* last bit of request received */ - if (!(flags & MSG_PEEK)) { - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != - skb) - BUG(); - rxrpc_free_skb(skb); + /* We have to handle short, empty and used-up DATA packets. */ + remain = len - *_offset; + copy = rx_pkt_len; + if (copy > remain) + copy = remain; + if (copy > 0) { + ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, + copy); + if (ret2 < 0) { + ret = ret2; + goto out; } - msg->msg_flags &= ~MSG_MORE; - break; + + /* handle piecemeal consumption of data packets */ + rx_pkt_offset += copy; + rx_pkt_len -= copy; + *_offset += copy; } - /* move on to the next data message */ - _debug("next"); - if (!continue_call) - continue_call = sp->call; - else - rxrpc_put_call(call); - call = NULL; - - if (flags & MSG_PEEK) { - _debug("peek next"); - skb = skb->next; - if (skb == (struct sk_buff *) &rx->sk.sk_receive_queue) - break; - goto peek_next_packet; + if (rx_pkt_len > 0) { + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); + ASSERTCMP(*_offset, ==, len); + ret = 0; + break; } - _debug("eat packet"); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); + /* The whole packet has been transferred. 
*/ + last = sp->hdr.flags & RXRPC_LAST_PACKET; + if (!(flags & MSG_PEEK)) + rxrpc_rotate_rx_window(call); + rx_pkt_offset = 0; + rx_pkt_len = 0; + + if (last) { + ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); + ret = 1; + goto out; + } } - /* end of non-terminal data packet reception for the moment */ - _debug("end rcv data"); out: - release_sock(&rx->sk); - if (call) - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d [data]", copied); - return copied; - - /* handle non-DATA messages such as aborts, incoming connections and - * final ACKs */ -receive_non_data_message: - _debug("non-data"); - - if (skb->mark == RXRPC_SKB_MARK_NEW_CALL) { - _debug("RECV NEW CALL"); - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &abort_code); - if (ret < 0) - goto copy_error; - if (!(flags & MSG_PEEK)) { - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - } - goto out; + if (!(flags & MSG_PEEK)) { + call->rx_pkt_offset = rx_pkt_offset; + call->rx_pkt_len = rx_pkt_len; } +done: + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); + return ret; +} - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - ullen, &call->user_call_ID); - if (ret < 0) - goto copy_error; - ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags)); +/* + * Receive a message from an RxRPC socket + * - we need to be careful about two or more threads calling recvmsg + * simultaneously + */ +int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct list_head *l; + size_t copied = 0; + long timeo; + int ret; + + DEFINE_WAIT(wait); + + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); + + if (flags & (MSG_OOB | MSG_TRUNC)) + return -EOPNOTSUPP; + + timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); + +try_again: + lock_sock(&rx->sk); + + /* Return immediately if a client socket has no outstanding calls */ + if (RB_EMPTY_ROOT(&rx->calls) && + list_empty(&rx->recvmsg_q) && + rx->sk.sk_state != RXRPC_SERVER_LISTENING) { + release_sock(&rx->sk); + return -ENODATA; + } + + if (list_empty(&rx->recvmsg_q)) { + ret = -EWOULDBLOCK; + if (timeo == 0) { + call = NULL; + goto error_no_call; + } + + release_sock(&rx->sk); + + /* Wait for something to happen */ + prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, + TASK_INTERRUPTIBLE); + ret = sock_error(&rx->sk); + if (ret) + goto wait_error; + + if (list_empty(&rx->recvmsg_q)) { + if (signal_pending(current)) + goto wait_interrupted; + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, + 0, 0, 0, 0); + timeo = schedule_timeout(timeo); + } + finish_wait(sk_sleep(&rx->sk), &wait); + goto try_again; + } - switch (skb->mark) { - case RXRPC_SKB_MARK_DATA: + /* Find the next call and dequeue it if we're not just peeking. If we + * do dequeue it, that comes with a ref that we will need to release. 
+ */ + write_lock_bh(&rx->recvmsg_lock); + l = rx->recvmsg_q.next; + call = list_entry(l, struct rxrpc_call, recvmsg_link); + if (!(flags & MSG_PEEK)) + list_del_init(&call->recvmsg_link); + else + rxrpc_get_call(call, rxrpc_call_got); + write_unlock_bh(&rx->recvmsg_lock); + + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); + + if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - case RXRPC_SKB_MARK_FINAL_ACK: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &abort_code); - break; - case RXRPC_SKB_MARK_BUSY: - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); - break; - case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->remote_abort; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_LOCAL_ABORT: - abort_code = call->local_abort; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); - break; - case RXRPC_SKB_MARK_NET_ERROR: - _debug("RECV NET ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &abort_code); + + if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), + &call->user_call_ID); + } + if (ret < 0) + goto error; + } + + if (msg->msg_name) { + size_t len = sizeof(call->conn->params.peer->srx); + memcpy(msg->msg_name, &call->conn->params.peer->srx, len); + msg->msg_namelen = len; + } + + switch (call->state) { + case RXRPC_CALL_SERVER_ACCEPTING: + ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; - case RXRPC_SKB_MARK_LOCAL_ERROR: - _debug("RECV LOCAL ERROR %d", sp->error); - abort_code = sp->error; - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, - &abort_code); + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, + flags, &copied); + if (ret == -EAGAIN) + ret = 0; + + if (after(call->rx_top, call->rx_hard_ack) && + call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK]) + rxrpc_notify_socket(call); break; default: - pr_err("Unknown packet mark %u\n", skb->mark); - BUG(); + ret = 0; break; } if (ret < 0) - goto copy_error; - -terminal_message: - _debug("terminal"); - msg->msg_flags &= ~MSG_MORE; - msg->msg_flags |= MSG_EOR; + goto error; - if (!(flags & MSG_PEEK)) { - _net("free terminal skb %p", skb); - if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) - BUG(); - rxrpc_free_skb(skb); - rxrpc_remove_user_ID(rx, call); + if (call->state == RXRPC_CALL_COMPLETE) { + ret = rxrpc_recvmsg_term(call, msg); + if (ret < 0) + goto error; + if (!(flags & MSG_PEEK)) + rxrpc_release_call(rx, call); + msg->msg_flags |= MSG_EOR; + ret = 1; } - release_sock(&rx->sk); - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d", ret); - return ret; + if (ret == 0) + msg->msg_flags |= MSG_MORE; + else + msg->msg_flags &= ~MSG_MORE; + ret = copied; -copy_error: - _debug("copy error"); +error: + rxrpc_put_call(call, rxrpc_call_put); +error_no_call: release_sock(&rx->sk); - rxrpc_put_call(call); - if (continue_call) - rxrpc_put_call(continue_call); - _leave(" = %d", ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); 
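/* [Editorial aside -- not part of the patch.]  Note the MSG_CMSG_COMPAT
 * handling earlier in rxrpc_recvmsg(): a 32-bit caller receives the user
 * call ID as a 4-byte cmsg, a native caller as sizeof(unsigned long).  A
 * hedged sketch of how user space might read it back -- illustrative only,
 * assuming a struct msghdr msg already filled in by recvmsg():
 *
 *	struct cmsghdr *c;
 *	unsigned long id = 0;
 *
 *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
 *		if (c->cmsg_level == SOL_RXRPC &&
 *		    c->cmsg_type == RXRPC_USER_CALL_ID)
 *			memcpy(&id, CMSG_DATA(c), sizeof(id));
 */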
- if (continue_call) - rxrpc_put_call(continue_call); - if (copied) - copied = ret; - _leave(" = %d [waitfail %d]", copied, ret); - return copied; - + call = NULL; + goto error_no_call; } /** - * rxrpc_kernel_is_data_last - Determine if data message is last one - * @skb: Message holding data + * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info + * @sock: The socket that the call exists on + * @call: The call to send data through + * @buf: The buffer to receive into + * @size: The size of the buffer, including data already read + * @_offset: The running offset into the buffer. + * @want_more: True if more data is expected to be read + * @_abort: Where the abort code is stored if -ECONNABORTED is returned * - * Determine if data message is last one for the parent call. + * Allow a kernel service to receive data and pick up information about the + * state of a call. Returns 0 if got what was asked for and there's more + * available, 1 if we got what was asked for and we're at the end of the data + * and -EAGAIN if we need more data. + * + * Note that we may return -EAGAIN to drain empty packets at the end of the + * data, even if we've already copied over the requested data. + * + * This function adds the amount it transfers to *_offset, so this should be + * precleared as appropriate. Note that the amount remaining in the buffer is + * taken to be size - *_offset. + * + * *_abort should also be initialised to 0. */ -bool rxrpc_kernel_is_data_last(struct sk_buff *skb) +int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, + void *buf, size_t size, size_t *_offset, + bool want_more, u32 *_abort) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct iov_iter iter; + struct kvec iov; + int ret; - ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_DATA); + _enter("{%d,%s},%zu/%zu,%d", + call->debug_id, rxrpc_call_states[call->state], + *_offset, size, want_more); - return sp->hdr.flags & RXRPC_LAST_PACKET; -} + ASSERTCMP(*_offset, <=, size); + ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); -EXPORT_SYMBOL(rxrpc_kernel_is_data_last); + iov.iov_base = buf + *_offset; + iov.iov_len = size - *_offset; + iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); -/** - * rxrpc_kernel_get_abort_code - Get the abort code from an RxRPC abort message - * @skb: Message indicating an abort - * - * Get the abort code from an RxRPC abort message. - */ -u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + lock_sock(sock->sk); + + switch (call->state) { + case RXRPC_CALL_CLIENT_RECV_REPLY: + case RXRPC_CALL_SERVER_RECV_REQUEST: + case RXRPC_CALL_SERVER_ACK_REQUEST: + ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, + _offset); + if (ret < 0) + goto out; + + /* We can only reach here with a partially full buffer if we + * have reached the end of the data. We must otherwise have a + * full buffer or have been given -EAGAIN. 
+ */ + if (ret == 1) { + if (*_offset < size) + goto short_data; + if (!want_more) + goto read_phase_complete; + ret = 0; + goto out; + } + + if (!want_more) + goto excess_data; + goto out; + + case RXRPC_CALL_COMPLETE: + goto call_complete; - switch (skb->mark) { - case RXRPC_SKB_MARK_REMOTE_ABORT: - return sp->call->remote_abort; - case RXRPC_SKB_MARK_LOCAL_ABORT: - return sp->call->local_abort; default: - BUG(); + ret = -EINPROGRESS; + goto out; } -} - -EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); -/** - * rxrpc_kernel_get_error - Get the error number from an RxRPC error message - * @skb: Message indicating an error - * - * Get the error number from an RxRPC error message. - */ -int rxrpc_kernel_get_error_number(struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); +read_phase_complete: + ret = 1; +out: + release_sock(sock->sk); + _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); + return ret; - return sp->error; +short_data: + ret = -EBADMSG; + goto out; +excess_data: + ret = -EMSGSIZE; + goto out; +call_complete: + *_abort = call->abort_code; + ret = call->error; + if (call->completion == RXRPC_CALL_SUCCEEDED) { + ret = 1; + if (size > 0) + ret = -ECONNRESET; + } + goto out; } - -EXPORT_SYMBOL(rxrpc_kernel_get_error_number); +EXPORT_SYMBOL(rxrpc_kernel_recv_data); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 63afa9e9cc08..627abed5f999 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -80,12 +80,10 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn) case RXRPC_SECURITY_AUTH: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level1_hdr); - conn->header_size += sizeof(struct rxkad_level1_hdr); break; case RXRPC_SECURITY_ENCRYPT: conn->size_align = 8; conn->security_size = sizeof(struct rxkad_level2_hdr); - conn->header_size += sizeof(struct rxkad_level2_hdr); break; default: ret = -EKEYREJECTED; @@ -161,7 +159,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; data_size |= (u32)check << 16; hdr.data_size = htonl(data_size); @@ -205,7 +203,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ sp->hdr.callNumber; + check = sp->hdr.seq ^ call->call_id; rxkhdr.data_size = htonl(data_size | (u32)check << 16); rxkhdr.checksum = 0; @@ -275,9 +273,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call, memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* calculate the security checksum */ - x = call->channel << (32 - RXRPC_CIDSHIFT); + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= sp->hdr.seq & 0x3fffffff; - call->crypto_buf[0] = htonl(sp->hdr.callNumber); + call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); @@ -316,12 +314,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call, /* * decrypt partial encryption on a packet (level 1 security) */ -static int rxkad_verify_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq) { struct rxkad_level1_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg[16]; @@ -332,15 +329,20 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, _enter(""); - sp = 
rxrpc_skb(skb); + if (len < 8) { + rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. + */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0 || nsg > 16) goto nomem; sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, 8); + skb_to_sgvec(skb, sg, offset, 8); /* start the decryption afresh */ memset(&iv, 0, sizeof(iv)); @@ -351,35 +353,35 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call, crypto_skcipher_decrypt(req); skcipher_request_zero(req); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) - skb->len = data_size; + if (data_size > len) { + rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -391,13 +393,12 @@ nomem: /* * wholly decrypt a packet (level 2 security) */ -static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; - struct rxrpc_skb_priv *sp; SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; @@ -408,9 +409,14 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, _enter(",{%d}", skb->len); - sp = rxrpc_skb(skb); + if (len < 8) { + rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); + goto protocol_error; + } - /* we want to decrypt the skbuff in-place */ + /* Decrypt the skbuff in-place. TODO: We really want to decrypt + * directly into the target buffer. 
+ */ nsg = skb_cow_data(skb, 0, &trailer); if (nsg < 0) goto nomem; @@ -423,7 +429,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, } sg_init_table(sg, nsg); - skb_to_sgvec(skb, sg, 0, skb->len); + skb_to_sgvec(skb, sg, offset, len); /* decrypt from the session key */ token = call->conn->params.key->payload.data[0]; @@ -431,41 +437,41 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call, skcipher_request_set_tfm(req, call->conn->cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x); + skcipher_request_set_crypt(req, sg, sg, len, iv.x); crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); - /* remove the decrypted packet length */ - if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0) - goto datalen_error; - if (!skb_pull(skb, sizeof(sechdr))) - BUG(); + /* Extract the decrypted packet length */ + if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { + rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } + offset += sizeof(sechdr); + len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; check = buf >> 16; - check ^= sp->hdr.seq ^ sp->hdr.callNumber; + check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - *_abort_code = RXKADSEALEDINCON; + rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO); goto protocol_error; } - /* shorten the packet to remove the padding */ - if (data_size > skb->len) - goto datalen_error; - else if (data_size < skb->len) - skb->len = data_size; + if (data_size > len) { + rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); + goto protocol_error; + } _leave(" = 0 [dlen=%x]", data_size); return 0; -datalen_error: - *_abort_code = RXKADDATALEN; protocol_error: + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO"); return -EPROTO; @@ -475,40 +481,31 @@ nomem: } /* - * verify the security on a received packet + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). 
*/ -static int rxkad_verify_packet(struct rxrpc_call *call, - struct sk_buff *skb, - u32 *_abort_code) +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int offset, unsigned int len, + rxrpc_seq_t seq, u16 expected_cksum) { SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); - struct rxrpc_skb_priv *sp; struct rxrpc_crypt iv; struct scatterlist sg; u16 cksum; u32 x, y; - int ret; - - sp = rxrpc_skb(skb); _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->params.key), sp->hdr.seq); + call->debug_id, key_serial(call->conn->params.key), seq); if (!call->conn->cipher) return 0; - if (sp->hdr.securityIndex != RXRPC_SECURITY_RXKAD) { - *_abort_code = RXKADINCONSISTENCY; - _leave(" = -EPROTO [not rxkad]"); - return -EPROTO; - } - /* continue encrypting from where we left off */ memcpy(&iv, call->conn->csum_iv.x, sizeof(iv)); /* validate the security checksum */ - x = call->channel << (32 - RXRPC_CIDSHIFT); - x |= sp->hdr.seq & 0x3fffffff; + x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); + x |= seq & 0x3fffffff; call->crypto_buf[0] = htonl(call->call_id); call->crypto_buf[1] = htonl(x); @@ -524,29 +521,69 @@ static int rxkad_verify_packet(struct rxrpc_call *call, if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ - if (sp->hdr.cksum != cksum) { - *_abort_code = RXKADSEALEDINCON; + if (cksum != expected_cksum) { + rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); _leave(" = -EPROTO [csum failed]"); return -EPROTO; } switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - ret = 0; - break; + return 0; case RXRPC_SECURITY_AUTH: - ret = rxkad_verify_packet_auth(call, skb, _abort_code); - break; + return rxkad_verify_packet_1(call, skb, offset, len, seq); case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_verify_packet_encrypt(call, skb, _abort_code); - break; + return rxkad_verify_packet_2(call, skb, offset, len, seq); default: - ret = -ENOANO; - break; + return -ENOANO; } +} - _leave(" = %d", ret); - return ret; +/* + * Locate the data contained in a packet that was partially encrypted. + */ +static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level1_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in a packet that was completely encrypted. + */ +static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + struct rxkad_level2_hdr sechdr; + + if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) + BUG(); + *_offset += sizeof(sechdr); + *_len = ntohl(sechdr.data_size) & 0xffff; +} + +/* + * Locate the data contained in an already decrypted packet. 
+ */ +static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len) +{ + switch (call->conn->params.security_level) { + case RXRPC_SECURITY_AUTH: + rxkad_locate_data_1(call, skb, _offset, _len); + return; + case RXRPC_SECURITY_ENCRYPT: + rxkad_locate_data_2(call, skb, _offset, _len); + return; + default: + return; + } } /* @@ -716,7 +753,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_challenge challenge; struct rxkad_response resp __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); u32 version, nonce, min_level, abort_code; int ret; @@ -734,8 +771,8 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, } abort_code = RXKADPACKETSHORT; - sp = rxrpc_skb(skb); - if (skb_copy_bits(skb, 0, &challenge, sizeof(challenge)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &challenge, sizeof(challenge)) < 0) goto protocol_error; version = ntohl(challenge.version); @@ -981,7 +1018,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, { struct rxkad_response response __attribute__((aligned(8))); /* must be aligned for crypto */ - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; time_t expiry; void *ticket; @@ -992,7 +1029,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, &response, sizeof(response)) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &response, sizeof(response)) < 0) goto protocol_error; if (!pskb_pull(skb, sizeof(response))) BUG(); @@ -1000,7 +1038,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, version = ntohl(response.version); ticket_len = ntohl(response.ticket_len); kvno = ntohl(response.kvno); - sp = rxrpc_skb(skb); _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); @@ -1022,7 +1059,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, return -ENOMEM; abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, 0, ticket, ticket_len) < 0) + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + ticket, ticket_len) < 0) goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, @@ -1147,6 +1185,7 @@ const struct rxrpc_security rxkad = { .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, + .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 814d285ff802..7d921e56e715 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -130,20 +130,20 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) } /* find the service */ - read_lock_bh(&local->services_lock); - list_for_each_entry(rx, &local->services, listen_link) { - if (rx->srx.srx_service == conn->params.service_id) - goto found_service; - } + read_lock(&local->services_lock); + rx = rcu_dereference_protected(local->service, + lockdep_is_held(&local->services_lock)); + if (rx && rx->srx.srx_service == conn->params.service_id) + goto found_service; /* the service appears to have died */ - 
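/* [Editorial aside -- not part of the patch.]  In the security.c hunk below,
 * local->service becomes a single RCU-managed pointer rather than a list,
 * and the _bh lock variants are dropped (presumably because this path no
 * longer needs to exclude softirq users of the lock).
 * rcu_dereference_protected() with lockdep_is_held() documents that the
 * lock, not an RCU read-side section, is what makes the dereference safe:
 *
 *	read_lock(&local->services_lock);
 *	rx = rcu_dereference_protected(local->service,
 *				       lockdep_is_held(&local->services_lock));
 *	...
 *	read_unlock(&local->services_lock);
 */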
read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOENT"); return -ENOENT; found_service: if (!rx->securities) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = -ENOKEY"); return -ENOKEY; } @@ -152,13 +152,13 @@ found_service: kref = keyring_search(make_key_ref(rx->securities, 1UL), &key_type_rxrpc_s, kdesc); if (IS_ERR(kref)) { - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); _leave(" = %ld [search]", PTR_ERR(kref)); return PTR_ERR(kref); } key = key_ref_to_ptr(kref); - read_unlock_bh(&local->services_lock); + read_unlock(&local->services_lock); conn->server_key = key; conn->security = sec; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c new file mode 100644 index 000000000000..3322543d460a --- /dev/null +++ b/net/rxrpc/sendmsg.c @@ -0,0 +1,606 @@ +/* AF_RXRPC sendmsg() implementation. + * + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/gfp.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +enum rxrpc_command { + RXRPC_CMD_SEND_DATA, /* send data message */ + RXRPC_CMD_SEND_ABORT, /* request abort generation */ + RXRPC_CMD_ACCEPT, /* [server] accept incoming call */ + RXRPC_CMD_REJECT_BUSY, /* [server] reject a call as busy */ +}; + +/* + * wait for space to appear in the transmit/ACK window + * - caller holds the socket locked + */ +static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, + struct rxrpc_call *call, + long *timeo) +{ + DECLARE_WAITQUEUE(myself, current); + int ret; + + _enter(",{%u,%u,%u}", + call->tx_hard_ack, call->tx_top, call->tx_winsize); + + add_wait_queue(&call->waitq, &myself); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + ret = 0; + if (call->tx_top - call->tx_hard_ack < + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) + break; + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -call->error; + break; + } + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + trace_rxrpc_transmit(call, rxrpc_transmit_wait); + release_sock(&rx->sk); + *timeo = schedule_timeout(*timeo); + lock_sock(&rx->sk); + } + + remove_wait_queue(&call->waitq, &myself); + set_current_state(TASK_RUNNING); + _leave(" = %d", ret); + return ret; +} + +/* + * Schedule an instant Tx resend. 
+ */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) +{ + spin_lock_bh(&call->lock); + + if (call->state < RXRPC_CALL_COMPLETE) { + call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + rxrpc_queue_call(call); + } + + spin_unlock_bh(&call->lock); +} + +/* + * Queue a DATA packet for transmission, set the resend timeout and send the + * packet immediately + */ +static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb, + bool last) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t seq = sp->hdr.seq; + int ret, ix; + u8 annotation = RXRPC_TX_ANNO_UNACK; + + _net("queue skb %p [%d]", skb, seq); + + ASSERTCMP(seq, ==, call->tx_top + 1); + + if (last) + annotation |= RXRPC_TX_ANNO_LAST; + + /* We have to set the timestamp before queueing as the retransmit + * algorithm can see the packet as soon as we queue it. + */ + skb->tstamp = ktime_get_real(); + + ix = seq & RXRPC_RXTX_BUFF_MASK; + rxrpc_get_skb(skb, rxrpc_skb_tx_got); + call->rxtx_annotations[ix] = annotation; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + call->tx_top = seq; + if (last) + trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + else + trace_rxrpc_transmit(call, rxrpc_transmit_queue); + + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { + _debug("________awaiting reply/ACK__________"); + write_lock_bh(&call->state_lock); + switch (call->state) { + case RXRPC_CALL_CLIENT_SEND_REQUEST: + call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; + break; + case RXRPC_CALL_SERVER_ACK_REQUEST: + call->state = RXRPC_CALL_SERVER_SEND_REPLY; + if (!last) + break; + case RXRPC_CALL_SERVER_SEND_REPLY: + call->state = RXRPC_CALL_SERVER_AWAIT_ACK; + break; + default: + break; + } + write_unlock_bh(&call->state_lock); + } + + if (seq == 1 && rxrpc_is_client_call(call)) + rxrpc_expose_client_call(call); + + ret = rxrpc_send_data_packet(call, skb, false); + if (ret < 0) { + _debug("need instant resend %d", ret); + rxrpc_instant_resend(call, ix); + } else { + ktime_t now = ktime_get_real(), resend_at; + + resend_at = ktime_add_ms(now, rxrpc_resend_timeout); + + if (ktime_before(resend_at, call->resend_at)) { + call->resend_at = resend_at; + rxrpc_set_timer(call, rxrpc_timer_set_for_send, now); + } + } + + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + _leave(""); +} + +/* + * send data through a socket + * - must be called in process context + * - caller holds the socket locked + */ +static int rxrpc_send_data(struct rxrpc_sock *rx, + struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + struct sock *sk = &rx->sk; + long timeo; + bool more; + int ret, copied; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + /* this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + return -EPIPE; + + more = msg->msg_flags & MSG_MORE; + + skb = call->tx_pending; + call->tx_pending = NULL; + rxrpc_see_skb(skb, rxrpc_skb_tx_seen); + + copied = 0; + do { + /* Check to see if there's a ping ACK to reply to. 
*/ + if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ACK); + + if (!skb) { + size_t size, chunk, max, space; + + _debug("alloc"); + + if (call->tx_top - call->tx_hard_ack >= + min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra)) { + ret = -EAGAIN; + if (msg->msg_flags & MSG_DONTWAIT) + goto maybe_error; + ret = rxrpc_wait_for_tx_window(rx, call, + &timeo); + if (ret < 0) + goto maybe_error; + } + + max = RXRPC_JUMBO_DATALEN; + max -= call->conn->security_size; + max &= ~(call->conn->size_align - 1UL); + + chunk = max; + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); + + space = chunk + call->conn->size_align; + space &= ~(call->conn->size_align - 1UL); + + size = space + call->conn->security_size; + + _debug("SIZE: %zu/%zu/%zu", chunk, space, size); + + /* create a buffer that we can retain until it's ACK'd */ + skb = sock_alloc_send_skb( + sk, size, msg->msg_flags & MSG_DONTWAIT, &ret); + if (!skb) + goto maybe_error; + + rxrpc_new_skb(skb, rxrpc_skb_tx_new); + + _debug("ALLOC SEND %p", skb); + + ASSERTCMP(skb->mark, ==, 0); + + _debug("HS: %u", call->conn->security_size); + skb_reserve(skb, call->conn->security_size); + skb->len += call->conn->security_size; + + sp = rxrpc_skb(skb); + sp->remain = chunk; + if (sp->remain > skb_tailroom(skb)) + sp->remain = skb_tailroom(skb); + + _net("skb: hr %d, tr %d, hl %d, rm %d", + skb_headroom(skb), + skb_tailroom(skb), + skb_headlen(skb), + sp->remain); + + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + _debug("append"); + sp = rxrpc_skb(skb); + + /* append next segment of data to the current buffer */ + if (msg_data_left(msg) > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); + if (copy > sp->remain) + copy = sp->remain; + + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; + } + + /* check for the far side aborting the call or a network error + * occurring */ + if (call->state == RXRPC_CALL_COMPLETE) + goto call_terminated; + + /* add the packet to the send queue if it's now full */ + if (sp->remain <= 0 || + (msg_data_left(msg) == 0 && !more)) { + struct rxrpc_connection *conn = call->conn; + uint32_t seq; + size_t pad; + + /* pad out if we're using security */ + if (conn->security_ix) { + pad = conn->security_size + skb->mark; + pad = conn->size_align - pad; + pad &= conn->size_align - 1; + _debug("pad %zu", pad); + if (pad) + memset(skb_put(skb, pad), 0, pad); + } + + seq = call->tx_top + 1; + + sp->hdr.seq = seq; + sp->hdr._rsvd = 0; + sp->hdr.flags = conn->out_clientflag; + + if (msg_data_left(msg) == 0 && !more) + sp->hdr.flags |= RXRPC_LAST_PACKET; + else if (call->tx_top - call->tx_hard_ack < + call->tx_winsize) + sp->hdr.flags |= RXRPC_MORE_PACKETS; + + ret = conn->security->secure_packet( + call, skb, skb->mark, skb->head); + if (ret < 0) + goto out; + + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); + skb = NULL; + } + } while (msg_data_left(msg) > 0); + +success: + ret = copied; +out: + call->tx_pending = skb; + _leave(" = %d", ret); + return ret; + +call_terminated: + rxrpc_free_skb(skb, rxrpc_skb_tx_freed); + _leave(" = %d", -call->error); + return -call->error; + +maybe_error: + if (copied) + goto success; + goto out; + +efault: + ret = -EFAULT; + goto out; +} + +/* + * extract control messages from the sendmsg() control 
buffer + */ +static int rxrpc_sendmsg_cmsg(struct msghdr *msg, + unsigned long *user_call_ID, + enum rxrpc_command *command, + u32 *abort_code, + bool *_exclusive) +{ + struct cmsghdr *cmsg; + bool got_user_ID = false; + int len; + + *command = RXRPC_CMD_SEND_DATA; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_USER_CALL_ID: + if (msg->msg_flags & MSG_CMSG_COMPAT) { + if (len != sizeof(u32)) + return -EINVAL; + *user_call_ID = *(u32 *) CMSG_DATA(cmsg); + } else { + if (len != sizeof(unsigned long)) + return -EINVAL; + *user_call_ID = *(unsigned long *) + CMSG_DATA(cmsg); + } + _debug("User Call ID %lx", *user_call_ID); + got_user_ID = true; + break; + + case RXRPC_ABORT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_SEND_ABORT; + if (len != sizeof(*abort_code)) + return -EINVAL; + *abort_code = *(unsigned int *) CMSG_DATA(cmsg); + _debug("Abort %x", *abort_code); + if (*abort_code == 0) + return -EINVAL; + break; + + case RXRPC_ACCEPT: + if (*command != RXRPC_CMD_SEND_DATA) + return -EINVAL; + *command = RXRPC_CMD_ACCEPT; + if (len != 0) + return -EINVAL; + break; + + case RXRPC_EXCLUSIVE_CALL: + *_exclusive = true; + if (len != 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + if (!got_user_ID) + return -EINVAL; + _leave(" = 0"); + return 0; +} + +/* + * Create a new client call for sendmsg(). + */ +static struct rxrpc_call * +rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + unsigned long user_call_ID, bool exclusive) +{ + struct rxrpc_conn_parameters cp; + struct rxrpc_call *call; + struct key *key; + + DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx, msg->msg_name); + + _enter(""); + + if (!msg->msg_name) + return ERR_PTR(-EDESTADDRREQ); + + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; + cp.key = rx->key; + cp.security_level = rx->min_sec_level; + cp.exclusive = rx->exclusive | exclusive; + cp.service_id = srx->srx_service; + call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, GFP_KERNEL); + + _leave(" = %p\n", call); + return call; +} + +/* + * send a message forming part of a client call through an RxRPC socket + * - caller holds the socket locked + * - the socket may be either a client socket or a server socket + */ +int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + enum rxrpc_command cmd; + struct rxrpc_call *call; + unsigned long user_call_ID = 0; + bool exclusive = false; + u32 abort_code = 0; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_cmsg(msg, &user_call_ID, &cmd, &abort_code, + &exclusive); + if (ret < 0) + return ret; + + if (cmd == RXRPC_CMD_ACCEPT) { + if (rx->sk.sk_state != RXRPC_SERVER_LISTENING) + return -EINVAL; + call = rxrpc_accept_call(rx, user_call_ID, NULL); + if (IS_ERR(call)) + return PTR_ERR(call); + rxrpc_put_call(call, rxrpc_call_put); + return 0; + } + + call = rxrpc_find_call_by_user_ID(rx, user_call_ID); + if (!call) { + if (cmd != RXRPC_CMD_SEND_DATA) + return -EBADSLT; + call = rxrpc_new_client_call_for_sendmsg(rx, msg, user_call_ID, + exclusive); + if (IS_ERR(call)) + return PTR_ERR(call); + } + + _debug("CALL %d USR %lx ST %d on CONN %p", + 
call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + /* it's too late for this call */ + ret = -ESHUTDOWN; + } else if (cmd == RXRPC_CMD_SEND_ABORT) { + ret = 0; + if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) + ret = rxrpc_send_call_packet(call, + RXRPC_PACKET_TYPE_ABORT); + } else if (cmd != RXRPC_CMD_SEND_DATA) { + ret = -EINVAL; + } else if (rxrpc_is_client_call(call) && + call->state != RXRPC_CALL_CLIENT_SEND_REQUEST) { + /* request phase complete for this client call */ + ret = -EPROTO; + } else if (rxrpc_is_service_call(call) && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + /* Reply phase not begun or not complete for service call. */ + ret = -EPROTO; + } else { + ret = rxrpc_send_data(rx, call, msg, len); + } + + rxrpc_put_call(call, rxrpc_call_put); + _leave(" = %d", ret); + return ret; +} + +/** + * rxrpc_kernel_send_data - Allow a kernel service to send data on a call + * @sock: The socket the call is on + * @call: The call to send data through + * @msg: The data to send + * @len: The amount of data to send + * + * Allow a kernel service to send data on a call. The call must be in a state + * appropriate to sending data. No control data should be supplied in @msg, + * nor should an address be supplied. MSG_MORE should be flagged if there's + * more data to come, otherwise this data will end the transmission phase. + */ +int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, + struct msghdr *msg, size_t len) +{ + int ret; + + _enter("{%d,%s},", call->debug_id, rxrpc_call_states[call->state]); + + ASSERTCMP(msg->msg_name, ==, NULL); + ASSERTCMP(msg->msg_control, ==, NULL); + + lock_sock(sock->sk); + + _debug("CALL %d USR %lx ST %d on CONN %p", + call->debug_id, call->user_call_ID, call->state, call->conn); + + if (call->state >= RXRPC_CALL_COMPLETE) { + ret = -ESHUTDOWN; /* it's too late for this call */ + } else if (call->state != RXRPC_CALL_CLIENT_SEND_REQUEST && + call->state != RXRPC_CALL_SERVER_ACK_REQUEST && + call->state != RXRPC_CALL_SERVER_SEND_REPLY) { + ret = -EPROTO; /* request phase complete for this client call */ + } else { + ret = rxrpc_send_data(rxrpc_sk(sock->sk), call, msg, len); + } + + release_sock(sock->sk); + _leave(" = %d", ret); + return ret; +} +EXPORT_SYMBOL(rxrpc_kernel_send_data); + +/** + * rxrpc_kernel_abort_call - Allow a kernel service to abort a call + * @sock: The socket the call is on + * @call: The call to be aborted + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: 3-char string indicating why. + * + * Allow a kernel service to abort a call, if it's still in an abortable state. + */ +void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, + u32 abort_code, int error, const char *why) +{ + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); + + lock_sock(sock->sk); + + if (rxrpc_abort_call(why, call, 0, abort_code, error)) + rxrpc_send_call_packet(call, RXRPC_PACKET_TYPE_ABORT); + + release_sock(sock->sk); + _leave(""); +} + +EXPORT_SYMBOL(rxrpc_kernel_abort_call); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 06c51d4b622d..67b02c45271b 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -18,148 +18,82 @@ #include <net/af_rxrpc.h> #include "ar-internal.h" +#define select_skb_count(op) (op >= rxrpc_skb_tx_cleaned ? 
&rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) + /* - * set up for the ACK at the end of the receive phase when we discard the final - * receive phase data packet - * - called with softirqs disabled + * Note the allocation or reception of a socket buffer. */ -static void rxrpc_request_final_ACK(struct rxrpc_call *call) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { - /* the call may be aborted before we have a chance to ACK it */ - write_lock(&call->state_lock); - - switch (call->state) { - case RXRPC_CALL_CLIENT_RECV_REPLY: - call->state = RXRPC_CALL_CLIENT_FINAL_ACK; - _debug("request final ACK"); - - /* get an extra ref on the call for the final-ACK generator to - * release */ - rxrpc_get_call(call); - set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events); - if (try_to_del_timer_sync(&call->ack_timer) >= 0) - rxrpc_queue_call(call); - break; - - case RXRPC_CALL_SERVER_RECV_REQUEST: - call->state = RXRPC_CALL_SERVER_ACK_REQUEST; - default: - break; - } - - write_unlock(&call->state_lock); + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } /* - * drop the bottom ACK off of the call ACK window and advance the window + * Note the re-emergence of a socket buffer from a queue or buffer. */ -static void rxrpc_hard_ACK_data(struct rxrpc_call *call, - struct rxrpc_skb_priv *sp) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { - int loop; - u32 seq; - - spin_lock_bh(&call->lock); - - _debug("hard ACK #%u", sp->hdr.seq); - - for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) { - call->ackr_window[loop] >>= 1; - call->ackr_window[loop] |= - call->ackr_window[loop + 1] << (BITS_PER_LONG - 1); - } - - seq = sp->hdr.seq; - ASSERTCMP(seq, ==, call->rx_data_eaten + 1); - call->rx_data_eaten = seq; - - if (call->ackr_win_top < UINT_MAX) - call->ackr_win_top++; - - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_post, >=, call->rx_data_recv); - ASSERTIFCMP(call->state <= RXRPC_CALL_COMPLETE, - call->rx_data_recv, >=, call->rx_data_eaten); - - if (sp->hdr.flags & RXRPC_LAST_PACKET) { - rxrpc_request_final_ACK(call); - } else if (atomic_dec_and_test(&call->ackr_not_idle) && - test_and_clear_bit(RXRPC_CALL_TX_SOFT_ACK, &call->flags)) { - /* We previously soft-ACK'd some received packets that have now - * been consumed, so send a hard-ACK if no more packets are - * immediately forthcoming to allow the transmitter to free up - * its Tx bufferage. - */ - _debug("send Rx idle ACK"); - __rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, sp->hdr.serial, - false); + const void *here = __builtin_return_address(0); + if (skb) { + int n = atomic_read(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); } - - spin_unlock_bh(&call->lock); } -/** - * rxrpc_kernel_data_consumed - Record consumption of data message - * @call: The call to which the message pertains. - * @skb: Message holding data - * - * Record the consumption of a data message and generate an ACK if appropriate. - * The call state is shifted if this was the final packet. The caller must be - * in process context with no spinlocks held. - * - * TODO: Actually generate the ACK here rather than punting this to the - * workqueue. +/* + * Note the addition of a ref on a socket buffer. 
*/ -void rxrpc_kernel_data_consumed(struct rxrpc_call *call, struct sk_buff *skb) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _enter("%d,%p{%u}", call->debug_id, skb, sp->hdr.seq); - - ASSERTCMP(sp->call, ==, call); - ASSERTCMP(sp->hdr.type, ==, RXRPC_PACKET_TYPE_DATA); - - /* TODO: Fix the sequence number tracking */ - ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv); - ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1); - ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten); - - call->rx_data_recv = sp->hdr.seq; - rxrpc_hard_ACK_data(call, sp); + const void *here = __builtin_return_address(0); + int n = atomic_inc_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + skb_get(skb); } -EXPORT_SYMBOL(rxrpc_kernel_data_consumed); /* - * Destroy a packet that has an RxRPC control buffer + * Note the destruction of a socket buffer. */ -void rxrpc_packet_destructor(struct sk_buff *skb) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_call *call = sp->call; - - _enter("%p{%p}", skb, call); - - if (call) { - if (atomic_dec_return(&call->skb_count) < 0) - BUG(); - rxrpc_put_call(call); - sp->call = NULL; + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); } +} - if (skb->sk) - sock_rfree(skb); - _leave(""); +/* + * Note the injected loss of a socket buffer. + */ +void rxrpc_lose_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +{ + const void *here = __builtin_return_address(0); + if (skb) { + int n; + CHECK_SLAB_OKAY(&skb->users); + n = atomic_dec_return(select_skb_count(op)); + trace_rxrpc_skb(skb, op, atomic_read(&skb->users), n, here); + kfree_skb(skb); + } } -/** - * rxrpc_kernel_free_skb - Free an RxRPC socket buffer - * @skb: The socket buffer to be freed - * - * Let RxRPC free its own socket buffer, permitting it to maintain debug - * accounting. +/* + * Clear a queue of socket buffers. */ -void rxrpc_kernel_free_skb(struct sk_buff *skb) +void rxrpc_purge_queue(struct sk_buff_head *list) { - rxrpc_free_skb(skb); + const void *here = __builtin_return_address(0); + struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) { + int n = atomic_dec_return(select_skb_count(rxrpc_skb_rx_purged)); + trace_rxrpc_skb(skb, rxrpc_skb_rx_purged, + atomic_read(&skb->users), n, here); + kfree_skb(skb); + } } -EXPORT_SYMBOL(rxrpc_kernel_free_skb); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 03ad08774d4e..34c706d2f79c 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -20,7 +20,7 @@ static const unsigned int one = 1; static const unsigned int four = 4; static const unsigned int thirtytwo = 32; static const unsigned int n_65535 = 65535; -static const unsigned int n_max_acks = RXRPC_MAXACKS; +static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; /* * RxRPC operating parameters. 
@@ -35,7 +35,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_requested_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&zero, }, { @@ -43,7 +43,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_soft_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -51,7 +51,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_idle_ack_delay, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, { @@ -59,6 +59,22 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_resend_timeout, .maxlen = sizeof(unsigned int), .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = (void *)&one, + }, + { + .procname = "idle_conn_expiry", + .data = &rxrpc_conn_idle_client_expiry, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + .extra1 = (void *)&one, + }, + { + .procname = "idle_conn_fast_expiry", + .data = &rxrpc_conn_idle_client_fast_expiry, + .maxlen = sizeof(unsigned int), + .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, .extra1 = (void *)&one, }, @@ -69,29 +85,28 @@ static struct ctl_table rxrpc_sysctl_table[] = { .data = &rxrpc_max_call_lifetime, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, + .proc_handler = proc_dointvec, .extra1 = (void *)&one, }, + + /* Non-time values */ { - .procname = "dead_call_expiry", - .data = &rxrpc_dead_call_expiry, + .procname = "max_client_conns", + .data = &rxrpc_max_client_connections, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - .extra1 = (void *)&one, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&rxrpc_reap_client_connections, }, - - /* Values measured in seconds */ { - .procname = "connection_expiry", - .data = &rxrpc_connection_expiry, + .procname = "reap_client_conns", + .data = &rxrpc_reap_client_connections, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)&one, + .extra2 = (void *)&rxrpc_max_client_connections, }, - - /* Non-time values */ { .procname = "max_backlog", .data = &rxrpc_max_backlog, diff --git a/net/rxrpc/utils.c b/net/rxrpc/utils.c index b88914d53ca5..ff7af71c4b49 100644 --- a/net/rxrpc/utils.c +++ b/net/rxrpc/utils.c @@ -30,6 +30,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) srx->transport.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; return 0; +#ifdef CONFIG_AF_RXRPC_IPV6 case ETH_P_IPV6: srx->transport_type = SOCK_DGRAM; srx->transport_len = sizeof(srx->transport.sin6); @@ -37,6 +38,7 @@ int rxrpc_extract_addr_from_skb(struct sockaddr_rxrpc *srx, struct sk_buff *skb) srx->transport.sin6.sin6_port = udp_hdr(skb)->source; srx->transport.sin6.sin6_addr = ipv6_hdr(skb)->saddr; return 0; +#endif default: pr_warn_ratelimited("AF_RXRPC: Unknown eth protocol %u\n", diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ccf931b3b94c..87956a768d1b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -749,6 +749,17 @@ config NET_ACT_CONNMARK To compile this code as a module, choose M here: the module will be called act_connmark. 
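A quick aside on the new action introduced just below: the skbmod action (full source in act_skbmod.c later in this diff) edits a packet's Ethernet header in place, overwriting the destination MAC, source MAC and/or ethertype, or swapping the two MAC addresses. The following is a minimal userspace C sketch of that edit, not kernel code; the flag and struct names are hypothetical stand-ins for SKBMOD_F_* and struct ethhdr, and the kernel version must additionally make the header writable with skb_ensure_writable() and fetch its parameters under RCU:

#include <stdint.h>
#include <string.h>

/* Illustrative stand-ins for the kernel's SKBMOD_F_* bits. */
#define F_DMAC    0x1ULL
#define F_SMAC    0x2ULL
#define F_ETYPE   0x4ULL
#define F_SWAPMAC 0x8ULL

struct eth_hdr {
	uint8_t  h_dest[6];
	uint8_t  h_source[6];
	uint16_t h_proto;		/* big-endian on the wire */
};

/* Apply the same header edit tcf_skbmod_run() performs, given an
 * already-writable Ethernet header. */
static void skbmod_edit(struct eth_hdr *eth, uint64_t flags,
			const uint8_t dmac[6], const uint8_t smac[6],
			uint16_t etype_be)
{
	uint8_t tmp[6];

	if (flags & F_DMAC)
		memcpy(eth->h_dest, dmac, 6);
	if (flags & F_SMAC)
		memcpy(eth->h_source, smac, 6);
	if (flags & F_ETYPE)
		eth->h_proto = etype_be;

	if (flags & F_SWAPMAC) {
		/* In the kernel, SKBMOD_F_SWAPMAC is set exclusively of the
		 * rewrite flags, so the swap is the only edit applied. */
		memcpy(tmp, eth->h_dest, 6);
		memcpy(eth->h_dest, eth->h_source, 6);
		memcpy(eth->h_source, tmp, 6);
	}
}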
+config NET_ACT_SKBMOD + tristate "skb data modification action" + depends on NET_CLS_ACT + ---help--- + Say Y here to allow modification of skb data + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_skbmod. + config NET_ACT_IFE tristate "Inter-FE action based on IETF ForCES InterFE LFB" depends on NET_CLS_ACT @@ -761,6 +772,17 @@ config NET_ACT_IFE To compile this code as a module, choose M here: the module will be called act_ife. +config NET_ACT_TUNNEL_KEY + tristate "IP tunnel metadata manipulation" + depends on NET_CLS_ACT + ---help--- + Say Y here to set/release ip tunnel metadata. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_tunnel_key. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE @@ -771,6 +793,11 @@ config NET_IFE_SKBPRIO depends on NET_ACT_IFE ---help--- +config NET_IFE_SKBTCINDEX + tristate "Support to encoding decoding skb tcindex on IFE action" + depends on NET_ACT_IFE + ---help--- + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index ae088a5a9d95..4bdda3634e0b 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -19,9 +19,12 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o +obj-$(CONFIG_NET_ACT_SKBMOD) += act_skbmod.o obj-$(CONFIG_NET_ACT_IFE) += act_ife.o obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o +obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o +obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d09d0687594b..c9102172ce3b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -592,9 +592,19 @@ err_out: return ERR_PTR(err); } -int tcf_action_init(struct net *net, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, - int bind, struct list_head *actions) +static void cleanup_a(struct list_head *actions, int ovr) +{ + struct tc_action *a; + + if (!ovr) + return; + + list_for_each_entry(a, actions, list) + a->tcfa_refcnt--; +} + +int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, + char *name, int ovr, int bind, struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -612,8 +622,15 @@ int tcf_action_init(struct net *net, struct nlattr *nla, goto err; } act->order = i; + if (ovr) + act->tcfa_refcnt++; list_add_tail(&act->list, actions); } + + /* Remove the temp refcnt which was necessary to protect against + * destroying an existing action which was being replaced + */ + cleanup_a(actions, ovr); return 0; err: @@ -883,6 +900,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; + if (event == RTM_GETACTION) + act->tcfa_refcnt++; list_add_tail(&act->list, &actions); } @@ -923,9 +942,8 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return err; } -static int -tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 portid, int ovr) +static int tcf_action_add(struct net *net, struct nlattr *nla, + struct nlmsghdr *n, u32 portid, int ovr) { int 
ret = 0; LIST_HEAD(actions); @@ -988,8 +1006,7 @@ replay: return ret; } -static struct nlattr * -find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(const struct nlmsghdr *n) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1016,8 +1033,7 @@ find_dump_kind(const struct nlmsghdr *n) return kind; } -static int -tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index bfa870731e74..1d3960033f61 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -39,13 +39,10 @@ static struct tc_action_ops act_bpf_ops; static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { + bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); struct bpf_prog *filter; int action, filter_res; - bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; - - if (unlikely(!skb_mac_header_was_set(skb))) - return TC_ACT_UNSPEC; tcf_lastuse_update(&prog->tcf_tm); bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b5dbf633a863..e0defcef376d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -116,8 +116,8 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, return (void *)(skb_network_header(skb) + ihl); } -static int tcf_csum_ipv4_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmphdr *icmph; @@ -152,8 +152,8 @@ static int tcf_csum_ipv4_igmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_icmp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct icmp6hdr *icmp6h; const struct ipv6hdr *ip6h; @@ -174,8 +174,8 @@ static int tcf_csum_ipv6_icmp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct iphdr *iph; @@ -195,8 +195,8 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv6_tcp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl) +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) { struct tcphdr *tcph; const struct ipv6hdr *ip6h; @@ -217,8 +217,8 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, return 1; } -static int tcf_csum_ipv4_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct iphdr *iph; @@ -270,8 +270,8 @@ ignore_obscure_skb: return 1; } -static int tcf_csum_ipv6_udp(struct sk_buff *skb, - unsigned int ihl, unsigned int ipl, int udplite) +static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl, int udplite) { struct udphdr *udph; const struct ipv6hdr *ip6h; @@ -380,8 +380,8 @@ fail: return 0; } -static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, - unsigned int ixhl, unsigned int *pl) +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl, + unsigned int *pl) { int off, 
len, optlen; unsigned char *xh = (void *)ip6xh; @@ -494,8 +494,8 @@ fail: return 0; } -static int tcf_csum(struct sk_buff *skb, - const struct tc_action *a, struct tcf_result *res) +static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); int action; @@ -531,8 +531,8 @@ drop: return TC_ACT_SHOT; } -static int tcf_csum_dump(struct sk_buff *skb, - struct tc_action *a, int bind, int ref) +static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_csum *p = to_tcf_csum(a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e24a4093d6f6..e0aa30f83c6c 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -156,7 +156,8 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, int action = READ_ONCE(gact->tcf_action); struct tcf_t *tm = &gact->tcf_tm; - _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets); + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, + packets); if (action == TC_ACT_SHOT) this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 4a60cd5e1875..95c463cbb9a6 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -63,6 +63,23 @@ int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval) } EXPORT_SYMBOL_GPL(ife_tlv_meta_encode); +int ife_encode_meta_u16(u16 metaval, void *skbdata, struct tcf_meta_info *mi) +{ + u16 edata = 0; + + if (mi->metaval) + edata = *(u16 *)mi->metaval; + else if (metaval) + edata = metaval; + + if (!edata) /* will not encode */ + return 0; + + edata = htons(edata); + return ife_tlv_meta_encode(skbdata, mi->metaid, 2, &edata); +} +EXPORT_SYMBOL_GPL(ife_encode_meta_u16); + int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi) { if (mi->metaval) @@ -81,6 +98,15 @@ int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_check_meta_u32); +int ife_check_meta_u16(u16 metaval, struct tcf_meta_info *mi) +{ + if (metaval || mi->metaval) + return 8; /* T+L+(V) == 2+2+(2+2bytepad) */ + + return 0; +} +EXPORT_SYMBOL_GPL(ife_check_meta_u16); + int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi) { u32 edata = metaval; diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c new file mode 100644 index 000000000000..3b35774ce890 --- /dev/null +++ b/net/sched/act_meta_skbtcindex.c @@ -0,0 +1,79 @@ +/* + * net/sched/act_meta_tc_index.c IFE skb->tc_index metadata module + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * copyright Jamal Hadi Salim (2016) + * +*/ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/module.h> +#include <linux/init.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <uapi/linux/tc_act/tc_ife.h> +#include <net/tc_act/tc_ife.h> +#include <linux/rtnetlink.h> + +static int skbtcindex_encode(struct sk_buff *skb, void *skbdata, + struct tcf_meta_info *e) +{ + u32 ifetc_index = skb->tc_index; + + return ife_encode_meta_u16(ifetc_index, skbdata, e); +} + +static int skbtcindex_decode(struct sk_buff *skb, void *data, u16 len) +{ + u16 ifetc_index = *(u16 *)data; + + skb->tc_index = ntohs(ifetc_index); + return 0; +} + +static int skbtcindex_check(struct sk_buff *skb, struct tcf_meta_info *e) +{ + return ife_check_meta_u16(skb->tc_index, e); +} + +static struct tcf_meta_ops ife_skbtcindex_ops = { + .metaid = IFE_META_TCINDEX, + .metatype = NLA_U16, + .name = "tc_index", + .synopsis = "skb tc_index 16 bit metadata", + .check_presence = skbtcindex_check, + .encode = skbtcindex_encode, + .decode = skbtcindex_decode, + .get = ife_get_meta_u16, + .alloc = ife_alloc_meta_u16, + .release = ife_release_meta_gen, + .validate = ife_validate_meta_u16, + .owner = THIS_MODULE, +}; + +static int __init ifetc_index_init_module(void) +{ + return register_ife_op(&ife_skbtcindex_ops); +} + +static void __exit ifetc_index_cleanup_module(void) +{ + unregister_ife_op(&ife_skbtcindex_ops); +} + +module_init(ifetc_index_init_module); +module_exit(ifetc_index_cleanup_module); + +MODULE_AUTHOR("Jamal Hadi Salim(2016)"); +MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6038c85d92f5..667dc382df82 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -204,7 +204,15 @@ out: return retval; } -static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse) +{ + tcf_lastuse_update(&a->tcfa_tm); + _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); +} + +static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, + int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); @@ -280,6 +288,7 @@ static struct tc_action_ops act_mirred_ops = { .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred, + .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8a3be1d99775..d1bd248fe146 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -249,6 +249,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + if (police->tcfp_result == TC_ACT_SHOT) + police->tcf_qstats.drops++; spin_unlock(&police->tcf_lock); return police->tcfp_result; } @@ -261,8 +263,8 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcf_action; } -static int -tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct 
tcf_police *police = to_police(a); @@ -347,14 +349,12 @@ static struct pernet_operations police_net_ops = { .size = sizeof(struct tc_action_net), }; -static int __init -police_init_module(void) +static int __init police_init_module(void) { return tcf_register_action(&act_police_ops, &police_net_ops); } -static void __exit -police_cleanup_module(void) +static void __exit police_cleanup_module(void) { tcf_unregister_action(&act_police_ops, &police_net_ops); } diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c new file mode 100644 index 000000000000..e7d96381c908 --- /dev/null +++ b/net/sched/act_skbmod.c @@ -0,0 +1,301 @@ +/* + * net/sched/act_skbmod.c skb data modifier + * + * Copyright (c) 2016 Jamal Hadi Salim <jhs@mojatatu.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. +*/ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> + +#include <linux/tc_act/tc_skbmod.h> +#include <net/tc_act/tc_skbmod.h> + +#define SKBMOD_TAB_MASK 15 + +static int skbmod_net_id; +static struct tc_action_ops act_skbmod_ops; + +#define MAX_EDIT_LEN ETH_HLEN +static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_skbmod *d = to_skbmod(a); + int action; + struct tcf_skbmod_params *p; + u64 flags; + int err; + + tcf_lastuse_update(&d->tcf_tm); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + /* XXX: if you are going to edit more fields beyond ethernet header + * (example when you add IP header replacement or vlan swap) + * then MAX_EDIT_LEN needs to change appropriately + */ + err = skb_ensure_writable(skb, MAX_EDIT_LEN); + if (unlikely(err)) { /* best policy is to drop on the floor */ + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; + } + + rcu_read_lock(); + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) { + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + rcu_read_unlock(); + return action; + } + + p = rcu_dereference(d->skbmod_p); + flags = p->flags; + if (flags & SKBMOD_F_DMAC) + ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); + if (flags & SKBMOD_F_SMAC) + ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); + if (flags & SKBMOD_F_ETYPE) + eth_hdr(skb)->h_proto = p->eth_type; + rcu_read_unlock(); + + if (flags & SKBMOD_F_SWAPMAC) { + u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ + /*XXX: I am sure we can come up with more efficient swapping*/ + ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest); + ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source); + ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); + } + + return action; +} + +static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { + [TCA_SKBMOD_PARMS] = { .len = sizeof(struct tc_skbmod) }, + [TCA_SKBMOD_DMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_SMAC] = { .len = ETH_ALEN }, + [TCA_SKBMOD_ETYPE] = { .type = NLA_U16 }, +}; + +static int tcf_skbmod_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct nlattr *tb[TCA_SKBMOD_MAX + 1]; + struct tcf_skbmod_params *p, *p_old; + struct 
tc_skbmod *parm; + struct tcf_skbmod *d; + bool exists = false; + u8 *daddr = NULL; + u8 *saddr = NULL; + u16 eth_type = 0; + u32 lflags = 0; + int ret = 0, err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy); + if (err < 0) + return err; + + if (!tb[TCA_SKBMOD_PARMS]) + return -EINVAL; + + if (tb[TCA_SKBMOD_DMAC]) { + daddr = nla_data(tb[TCA_SKBMOD_DMAC]); + lflags |= SKBMOD_F_DMAC; + } + + if (tb[TCA_SKBMOD_SMAC]) { + saddr = nla_data(tb[TCA_SKBMOD_SMAC]); + lflags |= SKBMOD_F_SMAC; + } + + if (tb[TCA_SKBMOD_ETYPE]) { + eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]); + lflags |= SKBMOD_F_ETYPE; + } + + parm = nla_data(tb[TCA_SKBMOD_PARMS]); + if (parm->flags & SKBMOD_F_SWAPMAC) + lflags = SKBMOD_F_SWAPMAC; + + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + if (!lflags) + return -EINVAL; + + if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_skbmod_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + d = to_skbmod(*a); + + ASSERT_RTNL(); + p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); + if (unlikely(!p)) { + if (ovr) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + p->flags = lflags; + d->tcf_action = parm->action; + + p_old = rtnl_dereference(d->skbmod_p); + + if (ovr) + spin_lock_bh(&d->tcf_lock); + + if (lflags & SKBMOD_F_DMAC) + ether_addr_copy(p->eth_dst, daddr); + if (lflags & SKBMOD_F_SMAC) + ether_addr_copy(p->eth_src, saddr); + if (lflags & SKBMOD_F_ETYPE) + p->eth_type = htons(eth_type); + + rcu_assign_pointer(d->skbmod_p, p); + if (ovr) + spin_unlock_bh(&d->tcf_lock); + + if (p_old) + kfree_rcu(p_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + return ret; +} + +static void tcf_skbmod_cleanup(struct tc_action *a, int bind) +{ + struct tcf_skbmod *d = to_skbmod(a); + struct tcf_skbmod_params *p; + + p = rcu_dereference_protected(d->skbmod_p, 1); + kfree_rcu(p, rcu); +} + +static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + struct tcf_skbmod *d = to_skbmod(a); + unsigned char *b = skb_tail_pointer(skb); + struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tc_skbmod opt = { + .index = d->tcf_index, + .refcnt = d->tcf_refcnt - ref, + .bindcnt = d->tcf_bindcnt - bind, + .action = d->tcf_action, + }; + struct tcf_t t; + + opt.flags = p->flags; + if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_DMAC) && + nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, p->eth_dst)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_SMAC) && + nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, p->eth_src)) + goto nla_put_failure; + if ((p->flags & SKBMOD_F_ETYPE) && + nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(p->eth_type))) + goto nla_put_failure; + + tcf_tm_dump(&t, &d->tcf_tm); + if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) + goto nla_put_failure; + + return skb->len; +nla_put_failure: + rcu_read_unlock(); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, 
skbmod_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct tc_action_ops act_skbmod_ops = { + .kind = "skbmod", + .type = TCA_ACT_SKBMOD, + .owner = THIS_MODULE, + .act = tcf_skbmod_run, + .dump = tcf_skbmod_dump, + .init = tcf_skbmod_init, + .cleanup = tcf_skbmod_cleanup, + .walk = tcf_skbmod_walker, + .lookup = tcf_skbmod_search, + .size = sizeof(struct tcf_skbmod), +}; + +static __net_init int skbmod_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK); +} + +static void __net_exit skbmod_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations skbmod_net_ops = { + .init = skbmod_init_net, + .exit = skbmod_exit_net, + .id = &skbmod_net_id, + .size = sizeof(struct tc_action_net), +}; + +MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>"); +MODULE_DESCRIPTION("SKB data mod-ing"); +MODULE_LICENSE("GPL"); + +static int __init skbmod_init_module(void) +{ + return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops); +} + +static void __exit skbmod_cleanup_module(void) +{ + tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops); +} + +module_init(skbmod_init_module); +module_exit(skbmod_cleanup_module); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c new file mode 100644 index 000000000000..af47bdf2f483 --- /dev/null +++ b/net/sched/act_tunnel_key.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016, Amir Vadai <amir@vadai.me> + * Copyright (c) 2016, Mellanox Technologies. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/dst.h> +#include <net/dst_metadata.h> + +#include <linux/tc_act/tc_tunnel_key.h> +#include <net/tc_act/tc_tunnel_key.h> + +#define TUNNEL_KEY_TAB_MASK 15 + +static int tunnel_key_net_id; +static struct tc_action_ops act_tunnel_key_ops; + +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + int action; + + rcu_read_lock(); + + params = rcu_dereference(t->params); + + tcf_lastuse_update(&t->tcf_tm); + bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); + action = params->action; + + switch (params->tcft_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + skb_dst_drop(skb); + break; + case TCA_TUNNEL_KEY_ACT_SET: + skb_dst_drop(skb); + skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); + break; + default: + WARN_ONCE(1, "Bad tunnel_key action %d.\n", + params->tcft_action); + break; + } + + rcu_read_unlock(); + + return action; +} + +static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { + [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, + [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, +}; + +static int tunnel_key_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; + struct tcf_tunnel_key_params *params_old; + struct tcf_tunnel_key_params *params_new; + struct metadata_dst *metadata = NULL; + struct tc_tunnel_key *parm; + struct tcf_tunnel_key *t; + bool exists = false; + __be64 key_id; + int ret = 0; + int err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); + exists = tcf_hash_check(tn, parm->index, a, bind); + if (exists && bind) + return 0; + + switch (parm->t_action) { + case TCA_TUNNEL_KEY_ACT_RELEASE: + break; + case TCA_TUNNEL_KEY_ACT_SET: + if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + ret = -EINVAL; + goto err_out; + } + + key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID])); + + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { + __be32 saddr; + __be32 daddr; + + saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); + daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); + + metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + TUNNEL_KEY, key_id, 0); + } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && + tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { + struct in6_addr saddr; + struct in6_addr daddr; + + saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); + daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); + + metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0, + TUNNEL_KEY, key_id, 0); + } + + if (!metadata) { + ret = -EINVAL; + goto err_out; + } + + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; + break; + default: + goto err_out; + } + + 
if (!exists) { + ret = tcf_hash_create(tn, parm->index, est, a, + &act_tunnel_key_ops, bind, true); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + tcf_hash_release(*a, bind); + if (!ovr) + return -EEXIST; + } + + t = to_tunnel_key(*a); + + ASSERT_RTNL(); + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_hash_release(*a, bind); + return -ENOMEM; + } + + params_old = rtnl_dereference(t->params); + + params_new->action = parm->action; + params_new->tcft_action = parm->t_action; + params_new->tcft_enc_metadata = metadata; + + rcu_assign_pointer(t->params, params_new); + + if (params_old) + kfree_rcu(params_old, rcu); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(tn, *a); + + return ret; + +err_out: + if (exists) + tcf_hash_release(*a, bind); + return ret; +} + +static void tunnel_key_release(struct tc_action *a, int bind) +{ + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + + params = rcu_dereference_protected(t->params, 1); + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) + dst_release(¶ms->tcft_enc_metadata->dst); + + kfree_rcu(params, rcu); +} + +static int tunnel_key_dump_addresses(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + unsigned short family = ip_tunnel_info_af(info); + + if (family == AF_INET) { + __be32 saddr = info->key.u.ipv4.src; + __be32 daddr = info->key.u.ipv4.dst; + + if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && + !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) + return 0; + } + + if (family == AF_INET6) { + const struct in6_addr *saddr6 = &info->key.u.ipv6.src; + const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; + + if (!nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && + !nla_put_in6_addr(skb, + TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) + return 0; + } + + return -EINVAL; +} + +static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_tunnel_key *t = to_tunnel_key(a); + struct tcf_tunnel_key_params *params; + struct tc_tunnel_key opt = { + .index = t->tcf_index, + .refcnt = t->tcf_refcnt - ref, + .bindcnt = t->tcf_bindcnt - bind, + }; + struct tcf_t tm; + + params = rtnl_dereference(t->params); + + opt.t_action = params->tcft_action; + opt.action = params->action; + + if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { + struct ip_tunnel_key *key = + ¶ms->tcft_enc_metadata->u.tun_info.key; + __be32 key_id = tunnel_id_to_key32(key->tun_id); + + if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || + tunnel_key_dump_addresses(skb, + ¶ms->tcft_enc_metadata->u.tun_info)) + goto nla_put_failure; + } + + tcf_tm_dump(&tm, &t->tcf_tm); + if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), + &tm, TCA_TUNNEL_KEY_PAD)) + goto nla_put_failure; + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int tunnel_key_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops); +} + +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_hash_search(tn, a, index); +} + +static struct 
tc_action_ops act_tunnel_key_ops = { + .kind = "tunnel_key", + .type = TCA_ACT_TUNNEL_KEY, + .owner = THIS_MODULE, + .act = tunnel_key_act, + .dump = tunnel_key_dump, + .init = tunnel_key_init, + .cleanup = tunnel_key_release, + .walk = tunnel_key_walker, + .lookup = tunnel_key_search, + .size = sizeof(struct tcf_tunnel_key), +}; + +static __net_init int tunnel_key_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tc_action_net_init(tn, &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK); +} + +static void __net_exit tunnel_key_exit_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + tc_action_net_exit(tn); +} + +static struct pernet_operations tunnel_key_net_ops = { + .init = tunnel_key_init_net, + .exit = tunnel_key_exit_net, + .id = &tunnel_key_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init tunnel_key_init_module(void) +{ + return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +static void __exit tunnel_key_cleanup_module(void) +{ + tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); +} + +module_init(tunnel_key_init_module); +module_exit(tunnel_key_cleanup_module); + +MODULE_AUTHOR("Amir Vadai <amir@vadai.me>"); +MODULE_DESCRIPTION("ip tunnel manipulation actions"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 691409de3e1a..b57fcbcefea1 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -30,12 +30,19 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, struct tcf_vlan *v = to_vlan(a); int action; int err; + u16 tci; spin_lock(&v->tcf_lock); tcf_lastuse_update(&v->tcf_tm); bstats_update(&v->tcf_bstats, skb); action = v->tcf_action; + /* Ensure 'data' points at mac_header prior calling vlan manipulating + * functions. 
+ */ + if (skb_at_tc_ingress(skb)) + skb_push_rcsum(skb, skb->mac_len); + switch (v->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); @@ -43,10 +50,35 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_VLAN_ACT_PUSH: - err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid); + err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid | + (v->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; + case TCA_VLAN_ACT_MODIFY: + /* No-op if no vlan tag (either hw-accel or in-payload) */ + if (!skb_vlan_tagged(skb)) + goto unlock; + /* extract existing tag (and guarantee no hw-accel tag) */ + if (skb_vlan_tag_present(skb)) { + tci = skb_vlan_tag_get(skb); + skb->vlan_tci = 0; + } else { + /* in-payload vlan tag, pop it */ + err = __skb_vlan_pop(skb, &tci); + if (err) + goto drop; + } + /* replace the vid */ + tci = (tci & ~VLAN_VID_MASK) | v->tcfv_push_vid; + /* replace prio bits, if tcfv_push_prio specified */ + if (v->tcfv_push_prio) { + tci &= ~VLAN_PRIO_MASK; + tci |= v->tcfv_push_prio << VLAN_PRIO_SHIFT; + } + /* put updated tci as hwaccel tag */ + __vlan_hwaccel_put_tag(skb, v->tcfv_push_proto, tci); + break; default: BUG(); } @@ -57,6 +89,9 @@ drop: action = TC_ACT_SHOT; v->tcf_qstats.drops++; unlock: + if (skb_at_tc_ingress(skb)) + skb_pull_rcsum(skb, skb->mac_len); + spin_unlock(&v->tcf_lock); return action; } @@ -65,6 +100,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, + [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, @@ -78,6 +114,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, int action; __be16 push_vid = 0; __be16 push_proto = 0; + u8 push_prio = 0; bool exists = false; int ret = 0, err; @@ -99,6 +136,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: + case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_hash_release(*a, bind); @@ -123,6 +161,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } else { push_proto = htons(ETH_P_8021Q); } + + if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) + push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; default: if (exists) @@ -150,6 +191,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v->tcfv_action = action; v->tcfv_push_vid = push_vid; + v->tcfv_push_prio = push_prio; v->tcfv_push_proto = push_proto; v->tcf_action = parm->action; @@ -178,10 +220,13 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + if ((v->tcfv_action == TCA_VLAN_ACT_PUSH || + v->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, - v->tcfv_push_proto))) + v->tcfv_push_proto) || + (nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, + v->tcfv_push_prio)))) goto nla_put_failure; tcf_tm_dump(&t, &v->tcf_tm); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index a7c5645373af..11da7da0b7c4 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -344,13 +344,15 @@ replay: if (err == 0) { struct tcf_proto *next = rtnl_dereference(tp->next); - tfilter_notify(net, skb, n, tp, 
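The TCA_VLAN_ACT_MODIFY branch above rewrites the 16-bit TCI in place: bits 0-11 carry the VID, bits 13-15 the priority (PCP). A self-contained sketch of the same bit surgery (mask values as in <linux/if_vlan.h>):

    #include <stdint.h>
    #include <stdio.h>

    #define VLAN_VID_MASK   0x0fff  /* VLAN identifier */
    #define VLAN_PRIO_MASK  0xe000  /* priority code point */
    #define VLAN_PRIO_SHIFT 13

    /* Replace the VID, and the PCP only when a non-zero priority was
     * requested, mirroring the modify action above.
     */
    static uint16_t vlan_modify_tci(uint16_t tci, uint16_t vid, uint8_t prio)
    {
        tci = (tci & ~VLAN_VID_MASK) | (vid & VLAN_VID_MASK);
        if (prio) {
            tci &= ~VLAN_PRIO_MASK;
            tci |= (uint16_t)prio << VLAN_PRIO_SHIFT;
        }
        return tci;
    }

    int main(void)
    {
        /* TCI 0x2064 = PCP 1, VID 100; rewrite to VID 200, PCP 5 */
        printf("0x%04x\n", vlan_modify_tci(0x2064, 200, 5)); /* 0xa0c8 */
        return 0;
    }

Note the asymmetry inherited from the action: a zero tcfv_push_prio means "leave the priority bits alone", so a priority of 0 cannot be forced via modify.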
fh, RTM_DELTFILTER); + tfilter_notify(net, skb, n, tp, fh, + RTM_DELTFILTER); if (tcf_destroy(tp, false)) RCU_INIT_POINTER(*back, next); } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER); + err = tfilter_notify(net, skb, n, tp, fh, + RTM_NEWTFILTER); goto errout; default: err = -EINVAL; @@ -448,7 +450,8 @@ static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct net *net = sock_net(a->skb->sk); return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTFILTER); } /* called with RTNL */ @@ -552,7 +555,7 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) + struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr) { #ifdef CONFIG_NET_CLS_ACT { @@ -560,8 +563,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tb[exts->police], rate_tlv, - "police", ovr, - TCA_ACT_BIND); + "police", ovr, TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); @@ -573,8 +575,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, int err, i = 0; err = tcf_action_init(net, tb[exts->action], rate_tlv, - NULL, ovr, - TCA_ACT_BIND, &actions); + NULL, ovr, TCA_ACT_BIND, + &actions); if (err) return err; list_for_each_entry(act, &actions, list) diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 0b8c3ace671f..eb219b78cd49 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -138,10 +138,12 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; struct tcf_ematch_tree t; - tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t); if (err < 0) @@ -189,7 +191,10 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE); + if (err < 0) + goto errout; + err = -EINVAL; if (handle) { fnew->handle = handle; @@ -226,6 +231,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c3002c2c68bb..bb1d5a487081 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -27,6 +27,8 @@ MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 +#define CLS_BPF_SUPPORTED_GEN_FLAGS \ + (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW) struct cls_bpf_head { struct list_head plist; @@ -39,6 +41,8 @@ struct cls_bpf_prog { struct list_head link; struct tcf_result res; bool exts_integrated; + bool offloaded; + u32 gen_flags; struct tcf_exts exts; u32 handle; union { @@ -54,8 +58,10 @@ struct cls_bpf_prog { static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, + 
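A pattern repeated across the classifier hunks (cls_basic above, cls_bpf and the rest below): tcf_exts_init() can now fail because it allocates, so every call site checks its return value and pairs it with tcf_exts_destroy() on the unwind path. The resulting shape, condensed into a kernel-style sketch using the same helpers the diff does (MY_ACT/MY_POLICE are placeholders):

    struct tcf_exts e;
    int err;

    err = tcf_exts_init(&e, MY_ACT, MY_POLICE);
    if (err < 0)
        return err;
    err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
    if (err < 0)
        goto errout;
    /* ... commit e into the filter ... */
    return 0;

    errout:
    tcf_exts_destroy(&e); /* releases what init allocated */
    return err;

The same pairing explains the new errout labels threaded through cls_basic, cls_cgroup, cls_flow, and friends: any filter whose exts were initialized must destroy them before kfree().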
[TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, - [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, + [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, + .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -83,9 +89,6 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_bpf_prog *prog; int ret = -1; - if (unlikely(!skb_mac_header_was_set(skb))) - return -1; - /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { @@ -93,7 +96,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, qdisc_skb_cb(skb)->tc_classid = prog->res.classid; - if (at_ingress) { + if (tc_skip_sw(prog->gen_flags)) { + filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0; + } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_end(skb); @@ -140,6 +145,91 @@ static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) return !prog->bpf_ops; } +static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, + enum tc_clsbpf_command cmd) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_bpf_offload bpf_offload = {}; + struct tc_to_netdev offload; + + offload.type = TC_SETUP_CLSBPF; + offload.cls_bpf = &bpf_offload; + + bpf_offload.command = cmd; + bpf_offload.exts = &prog->exts; + bpf_offload.prog = prog->filter; + bpf_offload.name = prog->bpf_name; + bpf_offload.exts_integrated = prog->exts_integrated; + bpf_offload.gen_flags = prog->gen_flags; + + return dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); +} + +static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, + struct cls_bpf_prog *oldprog) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct cls_bpf_prog *obj = prog; + enum tc_clsbpf_command cmd; + bool skip_sw; + int ret; + + skip_sw = tc_skip_sw(prog->gen_flags) || + (oldprog && tc_skip_sw(oldprog->gen_flags)); + + if (oldprog && oldprog->offloaded) { + if (tc_should_offload(dev, tp, prog->gen_flags)) { + cmd = TC_CLSBPF_REPLACE; + } else if (!tc_skip_sw(prog->gen_flags)) { + obj = oldprog; + cmd = TC_CLSBPF_DESTROY; + } else { + return -EINVAL; + } + } else { + if (!tc_should_offload(dev, tp, prog->gen_flags)) + return skip_sw ? -EINVAL : 0; + cmd = TC_CLSBPF_ADD; + } + + ret = cls_bpf_offload_cmd(tp, obj, cmd); + if (ret) + return skip_sw ? 
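cls_bpf_offload() below reconciles three inputs: whether the previous program is in hardware, whether the device will take the new one, and whether TCA_CLS_FLAGS_SKIP_SW makes hardware mandatory. Its decision table, condensed into plain C (enum names illustrative):

    #include <stdbool.h>

    enum cmd { CMD_NONE, CMD_ADD, CMD_REPLACE, CMD_DESTROY, CMD_ERR };

    /* Mirrors the branch structure of cls_bpf_offload(). */
    static enum cmd offload_decision(bool old_offloaded, bool can_offload,
                                     bool skip_sw)
    {
        if (old_offloaded) {
            if (can_offload)
                return CMD_REPLACE;
            /* hardware gone: fall back to software if permitted */
            return skip_sw ? CMD_ERR : CMD_DESTROY;
        }
        if (!can_offload)
            return skip_sw ? CMD_ERR : CMD_NONE;
        return CMD_ADD;
    }

CMD_DESTROY targets the old program (obj = oldprog in the original), which is why a destroy command is issued even though the new program stays software-only.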
ret : 0; + + obj->offloaded = true; + if (oldprog) + oldprog->offloaded = false; + + return 0; +} + +static void cls_bpf_stop_offload(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + int err; + + if (!prog->offloaded) + return; + + err = cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_DESTROY); + if (err) { + pr_err("Stopping hardware offload failed: %d\n", err); + return; + } + + prog->offloaded = false; +} + +static void cls_bpf_offload_update_stats(struct tcf_proto *tp, + struct cls_bpf_prog *prog) +{ + if (!prog->offloaded) + return; + + cls_bpf_offload_cmd(tp, prog, TC_CLSBPF_STATS); +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -179,6 +269,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) arg; + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -195,6 +286,7 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(prog, tmp, &head->plist, link) { + cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); @@ -304,6 +396,7 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, { bool is_bpf, is_ebpf, have_exts = false; struct tcf_exts exts; + u32 gen_flags = 0; int ret; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; @@ -311,30 +404,39 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + ret = tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) return ret; + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + goto errout; if (tb[TCA_BPF_FLAGS]) { u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { - tcf_exts_destroy(&exts); - return -EINVAL; + ret = -EINVAL; + goto errout; } have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout; + } + } prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; ret = is_bpf ? 
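The TCA_BPF_FLAGS_GEN validation above accepts only the two skip bits and then defers to tc_flags_valid(), which in this tree also rejects setting both at once, since skipping hardware and software together leaves the program nowhere to run. An equivalent standalone check (bit values match the uapi at this point, but treat them as illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    #define FLAG_SKIP_HW    (1U << 0)
    #define FLAG_SKIP_SW    (1U << 1)
    #define FLAGS_SUPPORTED (FLAG_SKIP_HW | FLAG_SKIP_SW)

    static bool gen_flags_valid(uint32_t flags)
    {
        if (flags & ~FLAGS_SUPPORTED) /* unknown bits are an error */
            return false;
        /* both skips set would disable every execution path */
        return (flags & FLAGS_SUPPORTED) != FLAGS_SUPPORTED;
    }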
cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, tp); - if (ret < 0) { - tcf_exts_destroy(&exts); - return ret; - } + if (ret < 0) + goto errout; if (tb[TCA_BPF_CLASSID]) { prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); @@ -343,6 +445,10 @@ static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &prog->exts, &exts); return 0; + +errout: + tcf_exts_destroy(&exts); + return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -388,7 +494,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!prog) return -ENOBUFS; - tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE); + if (ret < 0) + goto errout; if (oldprog) { if (handle && oldprog->handle != handle) { @@ -406,10 +514,17 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; } - ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr); + ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], + ovr); if (ret < 0) goto errout; + ret = cls_bpf_offload(tp, prog, oldprog); + if (ret) { + cls_bpf_delete_prog(tp, prog); + return ret; + } + if (oldprog) { list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); @@ -420,9 +535,10 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) prog; return 0; + errout: + tcf_exts_destroy(&prog->exts); kfree(prog); - return ret; } @@ -470,6 +586,8 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, tm->tcm_handle = prog->handle; + cls_bpf_offload_update_stats(tp, prog); + nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; @@ -492,6 +610,9 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; + if (prog->gen_flags && + nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) + goto nla_put_failure; nla_nest_end(skb, nest); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 4c85bd3a750c..85233c470035 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -93,7 +93,9 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOBUFS; - tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + err = tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); + if (err < 0) + goto errout; new->handle = handle; new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], @@ -101,10 +103,14 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE); if (err < 0) goto errout; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) { + tcf_exts_destroy(&e); + goto errout; + } err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t); if (err < 0) { @@ -120,6 +126,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, call_rcu(&head->rcu, cls_cgroup_destroy_rcu); return 0; errout: + tcf_exts_destroy(&new->exts); kfree(new); return err; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index fbfec6a18839..e39672394c7b 100644 --- a/net/sched/cls_flow.c +++ 
b/net/sched/cls_flow.c @@ -29,7 +29,7 @@ #include <net/route.h> #include <net/flow_dissector.h> -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif @@ -87,12 +87,14 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto(const struct sk_buff *skb, + const struct flow_keys *flow) { return flow->basic.ip_proto; } -static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.src); @@ -100,7 +102,8 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys return addr_fold(skb->sk); } -static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { if (flow->ports.ports) return ntohs(flow->ports.dst); @@ -125,14 +128,14 @@ static u32 flow_get_mark(const struct sk_buff *skb) static u32 flow_get_nfct(const struct sk_buff *skb) { -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) return addr_fold(skb->nfct); #else return 0; #endif } -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#if IS_ENABLED(CONFIG_NF_CONNTRACK) #define CTTUPLE(skb, member) \ ({ \ enum ip_conntrack_info ctinfo; \ @@ -149,7 +152,8 @@ static u32 flow_get_nfct(const struct sk_buff *skb) }) #endif -static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_src(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -161,7 +165,8 @@ fallback: return flow_get_src(skb, flow); } -static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { switch (tc_skb_protocol(skb)) { case htons(ETH_P_IP): @@ -173,14 +178,16 @@ fallback: return flow_get_dst(skb, flow); } -static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, src.u.all)); fallback: return flow_get_proto_src(skb, flow); } -static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) +static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, + const struct flow_keys *flow) { return ntohs(CTTUPLE(skb, dst.u.all)); fallback: @@ -418,10 +425,12 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; } - tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if (err < 0) + goto err1; err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); if (err < 0) - return err; + goto err1; err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t); if (err < 0) @@ -432,13 +441,15 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (!fnew) goto err2; - tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE); + if 
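The cls_flow hunks above swap the open-coded defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) for IS_ENABLED(), the kconfig helper that is true for both built-in (=y) and modular (=m) options:

    #include <linux/kconfig.h>

    #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* =y or =m */
    #include <net/netfilter/nf_conntrack.h>
    #endif

IS_BUILTIN() and IS_MODULE() exist for the cases where the distinction matters; here it does not, so IS_ENABLED() is the idiomatic spelling.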
(err < 0) + goto err3; fold = (struct flow_filter *)*arg; if (fold) { err = -EINVAL; if (fold->handle != handle && handle) - goto err2; + goto err3; /* Copy fold into fnew */ fnew->tp = fold->tp; @@ -458,31 +469,31 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (mode == FLOW_MODE_HASH) perturb_period = fold->perturb_period; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } } else { err = -EINVAL; if (!handle) - goto err2; + goto err3; if (!tb[TCA_FLOW_KEYS]) - goto err2; + goto err3; mode = FLOW_MODE_MAP; if (tb[TCA_FLOW_MODE]) mode = nla_get_u32(tb[TCA_FLOW_MODE]); if (mode != FLOW_MODE_HASH && nkeys > 1) - goto err2; + goto err3; if (tb[TCA_FLOW_PERTURB]) { if (mode != FLOW_MODE_HASH) - goto err2; + goto err3; perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } @@ -542,6 +553,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, call_rcu(&fold->rcu, flow_destroy_filter); return 0; +err3: + tcf_exts_destroy(&fnew->exts); err2: tcf_em_tree_destroy(&t); kfree(fnew); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5060801a2f6d..f6f40fba599b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -23,17 +23,26 @@ #include <net/ip.h> #include <net/flow_dissector.h> +#include <net/dst.h> +#include <net/dst_metadata.h> + struct fl_flow_key { int indev_ifindex; struct flow_dissector_key_control control; + struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; - struct flow_dissector_key_addrs ipaddrs; + struct flow_dissector_key_vlan vlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; + struct flow_dissector_key_keyid enc_key_id; + union { + struct flow_dissector_key_ipv4_addrs enc_ipv4; + struct flow_dissector_key_ipv6_addrs enc_ipv6; + }; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -123,11 +132,31 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; + struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; fl_clear_masked_range(&skb_key, &head->mask); + + info = skb_tunnel_info(skb); + if (info) { + struct ip_tunnel_key *key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_key.enc_ipv4.src = key->u.ipv4.src; + skb_key.enc_ipv4.dst = key->u.ipv4.dst; + break; + case AF_INET6: + skb_key.enc_ipv6.src = key->u.ipv6.src; + skb_key.enc_ipv6.dst = key->u.ipv6.dst; + break; + } + + skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); + } + skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. 
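The new fl_flow_key members above sit inside a struct marked __aligned(BITS_PER_LONG / 8) precisely so that masking and comparing keys can proceed one machine word at a time. A userspace rendering of that masking step (flower's fl_set_masked_key is the real thing; this is only the shape):

    #include <stddef.h>

    /* dst = key & mask, a long at a time; assumes all three buffers
     * are long-aligned and span nlongs words.
     */
    static void mask_key(unsigned long *dst, const unsigned long *key,
                         const unsigned long *mask, size_t nlongs)
    {
        size_t i;

        for (i = 0; i < nlongs; i++)
            dst[i] = key[i] & mask[i];
    }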
@@ -212,7 +241,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc.type = TC_SETUP_CLSFLOWER; tc.cls_flower = &offload; - err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, + &tc); if (tc_skip_sw(flags)) return err; @@ -293,6 +323,22 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, + [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -308,9 +354,29 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static void fl_set_key_vlan(struct nlattr **tb, + struct flow_dissector_key_vlan *key_val, + struct flow_dissector_key_vlan *key_mask) +{ +#define VLAN_PRIORITY_MASK 0x7 + + if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + key_val->vlan_id = + nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + key_mask->vlan_id = VLAN_VID_MASK; + } + if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + key_val->vlan_priority = + nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + VLAN_PRIORITY_MASK; + key_mask->vlan_priority = VLAN_PRIORITY_MASK; + } +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { + __be16 ethertype; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); @@ -328,9 +394,20 @@ static int fl_set_key(struct net *net, struct nlattr **tb, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); - fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); + + if (ethertype == htons(ETH_P_8021Q)) { + fl_set_key_vlan(tb, &key->vlan, &mask->vlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + &mask->basic.n_proto, TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } + } if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { @@ -359,20 +436,54 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, 
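fl_set_key() below special-cases ETH_TYPE: when the outer ethertype is 802.1Q, the VID/PCP match comes from the new VLAN attributes and the real L3 protocol from TCA_FLOWER_KEY_VLAN_ETH_TYPE, because a tagged frame nests the payload ethertype behind the tag. In wire terms (runnable sketch):

    #include <stdint.h>
    #include <stdio.h>

    #define ETH_P_8021Q 0x8100
    #define ETH_P_IP    0x0800

    /* Tagged frame: outer ethertype 0x8100, 16-bit TCI, then the
     * inner ethertype that flower actually matches as n_proto.
     */
    struct tagged {
        uint16_t outer_proto;
        uint16_t tci;
        uint16_t inner_proto;
    };

    int main(void)
    {
        struct tagged f = { ETH_P_8021Q, 0x2064, ETH_P_IP };
        uint16_t n_proto = f.outer_proto == ETH_P_8021Q ?
                           f.inner_proto : f.outer_proto;

        printf("n_proto 0x%04x\n", n_proto); /* 0x0800 */
        return 0;
    }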
TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); } + if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + fl_set_key_val(tb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, + &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)); + fl_set_key_val(tb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, + &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)); + } + + if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || + tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { + key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + fl_set_key_val(tb, &key->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC, + &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)); + fl_set_key_val(tb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)); + } + + fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC, + sizeof(key->enc_key_id.keyid)); + return 0; } @@ -404,12 +515,10 @@ static int fl_init_hashtable(struct cls_fl_head *head, #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) #define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) -#define FL_KEY_MEMBER_END_OFFSET(member) \ - (FL_KEY_MEMBER_OFFSET(member) + FL_KEY_MEMBER_SIZE(member)) -#define FL_KEY_IN_RANGE(mask, member) \ - (FL_KEY_MEMBER_OFFSET(member) <= (mask)->range.end && \ - FL_KEY_MEMBER_END_OFFSET(member) >= (mask)->range.start) +#define FL_KEY_IS_MASKED(mask, member) \ + memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ + 0, FL_KEY_MEMBER_SIZE(member)) \ #define FL_KEY_SET(keys, cnt, id, member) \ do { \ @@ -418,9 +527,9 @@ static int fl_init_hashtable(struct cls_fl_head *head, cnt++; \ } while(0); -#define FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, id, member) \ +#define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \ do { \ - if (FL_KEY_IN_RANGE(mask, member)) \ + if (FL_KEY_IS_MASKED(mask, member)) \ FL_KEY_SET(keys, cnt, id, member); \ } while(0); @@ -432,14 +541,16 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, - FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_PORTS, tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + 
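FL_KEY_IS_MASKED above asks "is any byte of this member's mask non-zero?" via memchr_inv(), replacing the old start/end range arithmetic. A userspace equivalent of that predicate:

    #include <stdbool.h>
    #include <stddef.h>

    /* Same answer as the kernel's memchr_inv(p, 0, len) != NULL. */
    static bool any_byte_set(const void *p, size_t len)
    {
        const unsigned char *c = p;
        size_t i;

        for (i = 0; i < len; i++)
            if (c[i])
                return true;
        return false;
    }

The payoff is in fl_init_dissector(): a dissector key is registered only when its mask is actually in use, regardless of where the member sits in the struct, which the old range overlap test could get wrong for interleaved members.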
FLOW_DISSECTOR_KEY_VLAN, vlan); skb_flow_dissector_init(&head->dissector, keys, cnt); } @@ -478,10 +589,12 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; int err; - tcf_exts_init(&e, TCA_FLOWER_ACT, 0); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_FLOWER_ACT, 0); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); @@ -550,7 +663,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!fnew) return -ENOBUFS; - tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); + if (err < 0) + goto errout; if (!handle) { handle = fl_grab_new_handle(tp, head); @@ -614,6 +729,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } @@ -668,6 +784,29 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_vlan(struct sk_buff *skb, + struct flow_dissector_key_vlan *vlan_key, + struct flow_dissector_key_vlan *vlan_mask) +{ + int err; + + if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) + return 0; + if (vlan_mask->vlan_id) { + err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + vlan_key->vlan_id); + if (err) + return err; + } + if (vlan_mask->vlan_priority) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + vlan_key->vlan_priority); + if (err) + return err; + } + return 0; +} + static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { @@ -712,6 +851,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto))) goto nla_put_failure; + + if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, @@ -738,21 +881,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (key->basic.ip_proto == IPPROTO_TCP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, - &mask->tp.src, TCA_FLOWER_UNSPEC, + &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, - &mask->tp.dst, TCA_FLOWER_UNSPEC, + &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; + if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, + sizeof(key->enc_ipv4.src)) || + fl_dump_key_val(skb, &key->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, + sizeof(key->enc_ipv4.dst)))) + goto nla_put_failure; + else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && + (fl_dump_key_val(skb, &key->enc_ipv6.src, + 
TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, + sizeof(key->enc_ipv6.src)) || + fl_dump_key_val(skb, &key->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST, + &mask->enc_ipv6.dst, + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, + sizeof(key->enc_ipv6.dst)))) + goto nla_put_failure; + + if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, + &mask->enc_key_id, TCA_FLOWER_UNSPEC, + sizeof(key->enc_key_id))) + goto nla_put_failure; + nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags); if (tcf_exts_dump(skb, &f->exts)) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index f23a3b68bba6..9dc63d54e167 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -57,7 +57,7 @@ static u32 fw_hash(u32 handle) } static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) + struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; @@ -188,17 +188,20 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f, - struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr) + struct nlattr **tb, struct nlattr **tca, unsigned long base, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct tcf_exts e; u32 mask; int err; - tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout; if (tb[TCA_FW_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]); @@ -235,9 +238,8 @@ errout: static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + u32 handle, struct nlattr **tca, unsigned long *arg, + bool ovr) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *) *arg; @@ -270,10 +272,15 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, #endif /* CONFIG_NET_CLS_IND */ fnew->tp = f->tp; - tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) { + kfree(fnew); + return err; + } err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr); if (err < 0) { + tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } @@ -313,7 +320,9 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) return -ENOBUFS; - tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + err = tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE); + if (err < 0) + goto errout; f->id = handle; f->tp = tp; @@ -328,6 +337,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 08a3b0a6f5ab..455fc8f83d0a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -268,8 +268,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static void -route4_delete_filter(struct rcu_head *head) +static void route4_delete_filter(struct rcu_head *head) { struct route4_filter *f = container_of(head, struct route4_filter, rcu); @@ -383,17 +382,19 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *est, int new, bool ovr) { - int err; u32 id 
= 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; unsigned int h1; struct route4_bucket *b; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_ROUTE4_TO]) { @@ -472,10 +473,8 @@ errout: } static int route4_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct tcf_proto *tp, unsigned long base, u32 handle, + struct nlattr **tca, unsigned long *arg, bool ovr) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -503,7 +502,10 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (!f) goto errout; - tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + err = tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); + if (err < 0) + goto errout; + if (fold) { f->id = fold->id; f->iif = fold->iif; @@ -557,6 +559,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + if (f) + tcf_exts_destroy(&f->exts); kfree(f); return err; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index f9c9fc075fe6..4f05a19fb073 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -487,10 +487,12 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + err = tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr); + if (err < 0) + goto errout2; f = (struct rsvp_filter *)*arg; if (f) { @@ -506,7 +508,11 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, goto errout2; } - tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) { + kfree(n); + goto errout2; + } if (tb[TCA_RSVP_CLASSID]) { n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); @@ -530,7 +536,9 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (f == NULL) goto errout2; - tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + err = tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE); + if (err < 0) + goto errout; h2 = 16; if (tb[TCA_RSVP_SRC]) { memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); @@ -627,6 +635,7 @@ insert: goto insert; errout: + tcf_exts_destroy(&f->exts); kfree(f); errout2: tcf_exts_destroy(&e); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 944c8ff45055..96144bdf30db 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -50,14 +50,13 @@ struct tcindex_data { struct rcu_head rcu; }; -static inline int -tcindex_filter_is_set(struct tcindex_filter_result *r) +static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) { return tcf_exts_is_predicative(&r->exts) || r->res.classid; } -static struct tcindex_filter_result * -tcindex_lookup(struct tcindex_data *p, u16 key) +static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, + u16 key) { if (p->perfect) { struct tcindex_filter_result *f = p->perfect + key; @@ -144,7 +143,8 @@ static void tcindex_destroy_rexts(struct rcu_head *head) static void 
tcindex_destroy_fexts(struct rcu_head *head) { - struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + struct tcindex_filter *f = container_of(head, struct tcindex_filter, + rcu); tcf_exts_destroy(&f->result.exts); kfree(f); @@ -219,10 +219,10 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, }; -static void tcindex_filter_result_init(struct tcindex_filter_result *r) +static int tcindex_filter_result_init(struct tcindex_filter_result *r) { memset(r, 0, sizeof(*r)); - tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + return tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } static void __tcindex_partial_destroy(struct rcu_head *head) @@ -233,23 +233,57 @@ static void __tcindex_partial_destroy(struct rcu_head *head) kfree(p); } +static void tcindex_free_perfect_hash(struct tcindex_data *cp) +{ + int i; + + for (i = 0; i < cp->hash; i++) + tcf_exts_destroy(&cp->perfect[i].exts); + kfree(cp->perfect); +} + +static int tcindex_alloc_perfect_hash(struct tcindex_data *cp) +{ + int i, err = 0; + + cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), + GFP_KERNEL); + if (!cp->perfect) + return -ENOMEM; + + for (i = 0; i < cp->hash; i++) { + err = tcf_exts_init(&cp->perfect[i].exts, + TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + if (err < 0) + goto errout; + } + + return 0; + +errout: + tcindex_free_perfect_hash(cp); + return err; +} + static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, struct nlattr *est, bool ovr) { - int err, balloc = 0; struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_filter_result cr; - struct tcindex_data *cp, *oldp; + struct tcindex_data *cp = NULL, *oldp; struct tcindex_filter *f = NULL; /* make gcc behave */ + int err, balloc = 0; struct tcf_exts e; - tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -ENOMEM; /* tcindex_data attributes must look atomic to classifier/lookup so @@ -270,19 +304,20 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (p->perfect) { int i; - cp->perfect = kmemdup(p->perfect, - sizeof(*r) * cp->hash, GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout; for (i = 0; i < cp->hash; i++) - tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); + cp->perfect[i].res = p->perfect[i].res; balloc = 1; } cp->h = p->h; - tcindex_filter_result_init(&new_filter_result); - tcindex_filter_result_init(&cr); + err = tcindex_filter_result_init(&new_filter_result); + if (err < 0) + goto errout1; + err = tcindex_filter_result_init(&cr); + if (err < 0) + goto errout1; if (old_r) cr.res = r->res; @@ -338,15 +373,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = -ENOMEM; if (!cp->perfect && !cp->h) { if (valid_perfect_hash(cp)) { - int i; - - cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL); - if (!cp->perfect) + if (tcindex_alloc_perfect_hash(cp) < 0) goto errout_alloc; - for (i = 0; i < cp->hash; i++) - tcf_exts_init(&cp->perfect[i].exts, - TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); balloc = 1; } else { struct tcindex_filter __rcu 
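tcindex_alloc_perfect_hash() and tcindex_free_perfect_hash() above are the classic "allocate an array, initialize each slot, unwind on first failure" shape. The generic pattern, with stand-in types (the kernel variant simply frees the whole table since unused slots are zero-initialized and safe to destroy):

    #include <stdlib.h>

    struct slot { void *priv; }; /* stand-in for tcindex_filter_result */

    static int slot_init(struct slot *s)
    {
        s->priv = malloc(16);
        return s->priv ? 0 : -1;
    }

    static void slot_destroy(struct slot *s)
    {
        free(s->priv);
    }

    static int alloc_slots(struct slot **out, size_t n)
    {
        struct slot *arr = calloc(n, sizeof(*arr));
        size_t i;

        if (!arr)
            return -1;
        for (i = 0; i < n; i++)
            if (slot_init(&arr[i]) < 0)
                goto unwind;
        *out = arr;
        return 0;

    unwind:
        while (i--) /* destroy only the slots that were initialized */
            slot_destroy(&arr[i]);
        free(arr);
        return -1;
    }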
**hash; @@ -373,8 +401,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (!f) goto errout_alloc; f->key = handle; - tcindex_filter_result_init(&f->result); f->next = NULL; + err = tcindex_filter_result_init(&f->result); + if (err < 0) { + kfree(f); + goto errout_alloc; + } } if (tb[TCA_TCINDEX_CLASSID]) { @@ -387,8 +419,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, else tcf_exts_change(tp, &cr.exts, &e); - if (old_r && old_r != r) - tcindex_filter_result_init(old_r); + if (old_r && old_r != r) { + err = tcindex_filter_result_init(old_r); + if (err < 0) { + kfree(f); + goto errout_alloc; + } + } oldp = p; r->res = cr.res; @@ -415,9 +452,12 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, errout_alloc: if (balloc == 1) - kfree(cp->perfect); + tcindex_free_perfect_hash(cp); else if (balloc == 2) kfree(cp->h); +errout1: + tcf_exts_destroy(&cr.exts); + tcf_exts_destroy(&new_filter_result.exts); errout: kfree(cp); tcf_exts_destroy(&e); @@ -510,7 +550,7 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ffe593efe930..ae83c3aec308 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -104,7 +104,8 @@ static inline unsigned int u32_hash_fold(__be32 key, return h; } -static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) +static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { struct { struct tc_u_knode *knode; @@ -256,8 +257,7 @@ deadloop: return -1; } -static struct tc_u_hnode * -u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +static struct tc_u_hnode *u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -270,8 +270,7 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static struct tc_u_knode * -u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +static struct tc_u_knode *u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { unsigned int sel; struct tc_u_knode *n = NULL; @@ -360,8 +359,7 @@ static int u32_init(struct tcf_proto *tp) return 0; } -static int u32_destroy_key(struct tcf_proto *tp, - struct tc_u_knode *n, +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, bool free_pf) { tcf_exts_destroy(&n->exts); @@ -448,9 +446,8 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) } } -static int u32_replace_hw_hnode(struct tcf_proto *tp, - struct tc_u_hnode *h, - u32 flags) +static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -496,9 +493,8 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) } } -static int u32_replace_hw_knode(struct tcf_proto *tp, - struct tc_u_knode *n, - u32 flags) +static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, + u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; @@ -709,13 +705,15 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, struct tc_u_knode *n, struct nlattr **tb, struct nlattr 
*est, bool ovr) { - int err; struct tcf_exts e; + int err; - tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); - err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + err = tcf_exts_init(&e, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) return err; + err = tcf_exts_validate(net, tp, tb, est, &e, ovr); + if (err < 0) + goto errout; err = -EINVAL; if (tb[TCA_U32_LINK]) { @@ -761,8 +759,7 @@ errout: return err; } -static void u32_replace_knode(struct tcf_proto *tp, - struct tc_u_common *tp_c, +static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, struct tc_u_knode *n) { struct tc_u_knode __rcu **ins; @@ -833,15 +830,17 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp, new->tp = tp; memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); - tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) { + kfree(new); + return NULL; + } return new; } static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, - unsigned long *arg, bool ovr) + struct nlattr **tca, unsigned long *arg, bool ovr) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; @@ -985,9 +984,12 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; n->flags = flags; - tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); n->tp = tp; + err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE); + if (err < 0) + goto errout; + #ifdef CONFIG_CLS_U32_MARK n->pcpu_success = alloc_percpu(u32); if (!n->pcpu_success) { @@ -1028,9 +1030,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, errhw: #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); -errout: #endif +errout: + tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF free_percpu(n->pf); #endif @@ -1079,7 +1082,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, - struct sk_buff *skb, struct tcmsg *t) + struct sk_buff *skb, struct tcmsg *t) { struct tc_u_knode *n = (struct tc_u_knode *)fh; struct tc_u_hnode *ht_up, *ht_down; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 12ebde845523..206dc24add3a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -29,6 +29,7 @@ #include <linux/hrtimer.h> #include <linux/lockdep.h> #include <linux/slab.h> +#include <linux/hashtable.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -259,37 +260,40 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; + if (!qdisc_dev(root)) + return (root->handle == handle ? 
root : NULL); + if (!(root->flags & TCQ_F_BUILTIN) && root->handle == handle) return root; - list_for_each_entry_rcu(q, &root->list, list) { + hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) { if (q->handle == handle) return q; } return NULL; } -void qdisc_list_add(struct Qdisc *q) +void qdisc_hash_add(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { struct Qdisc *root = qdisc_dev(q)->qdisc; WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); - list_add_tail_rcu(&q->list, &root->list); + hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); } } -EXPORT_SYMBOL(qdisc_list_add); +EXPORT_SYMBOL(qdisc_hash_add); -void qdisc_list_del(struct Qdisc *q) +void qdisc_hash_del(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); - list_del_rcu(&q->list); + hash_del_rcu(&q->hash); } } -EXPORT_SYMBOL(qdisc_list_del); +EXPORT_SYMBOL(qdisc_hash_del); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { @@ -385,7 +389,8 @@ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) static struct qdisc_rate_table *qdisc_rtab_list; -struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab) +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, + struct nlattr *tab) { struct qdisc_rate_table *rtab; @@ -537,7 +542,8 @@ nla_put_failure: return -1; } -void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct qdisc_size_table *stab) { int pkt_len, slot; @@ -884,10 +890,10 @@ static struct lock_class_key qdisc_rx_lock; Parameters are passed via opt. */ -static struct Qdisc * -qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, - struct Qdisc *p, u32 parent, u32 handle, - struct nlattr **tca, int *errp) +static struct Qdisc *qdisc_create(struct net_device *dev, + struct netdev_queue *dev_queue, + struct Qdisc *p, u32 parent, u32 handle, + struct nlattr **tca, int *errp) { int err; struct nlattr *kind = tca[TCA_KIND]; @@ -998,7 +1004,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out4; } - qdisc_list_add(sch); + qdisc_hash_add(sch); return sch; } @@ -1069,7 +1075,8 @@ struct check_loop_arg { int depth; }; -static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); +static int check_loop_fn(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *w); static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) { @@ -1431,10 +1438,11 @@ err_out: static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, - int *q_idx_p, int s_q_idx) + int *q_idx_p, int s_q_idx, bool recur) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; + int b; if (!root) return 0; @@ -1445,18 +1453,30 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, } else { if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } - list_for_each_entry(q, &root->list, list) { + + /* If dumping singletons, there is no qdisc_dev(root) and the singleton + * itself has already been dumped. 
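The sch_api conversion above retires the per-root linked list in favor of the generic <linux/hashtable.h> table keyed by handle (qdisc_list_add/del become qdisc_hash_add/del), so qdisc_lookup() probes one bucket instead of walking every qdisc on the device. The API shape, sketched with a toy entry type (readers need rcu_read_lock(); the real storage lives in struct net_device):

    #include <linux/hashtable.h>

    static DEFINE_HASHTABLE(qdisc_tbl, 4); /* 2^4 buckets, illustrative */

    struct entry {
        u32 handle;
        struct hlist_node hash;
    };

    static void insert(struct entry *e)
    {
        hash_add_rcu(qdisc_tbl, &e->hash, e->handle);
    }

    static struct entry *find(u32 handle)
    {
        struct entry *e;

        hash_for_each_possible_rcu(qdisc_tbl, e, hash, handle)
            if (e->handle == handle) /* buckets may collide */
                return e;
        return NULL;
    }

hash_del_rcu() on the node is the counterpart, which is all qdisc_hash_del() does once the list-membership test passes.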
+ * + * If we've already dumped the top-level (ingress) qdisc above and the global + * qdisc hashtable, we don't want to hit it again + */ + if (!qdisc_dev(root) || !recur) + goto out; + + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (q_idx < s_q_idx) { q_idx++; continue; } if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWQDISC) <= 0) goto done; q_idx++; } @@ -1490,13 +1510,14 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) + if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + true) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, - &q_idx, s_q_idx) < 0) + &q_idx, s_q_idx, false) < 0) goto done; cont: @@ -1625,7 +1646,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) if (cops->delete) err = cops->delete(q, cl); if (err == 0) - tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS); + tclass_notify(net, skb, n, q, cl, + RTM_DELTCLASS); goto out; case RTM_GETTCLASS: err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); @@ -1723,12 +1745,14 @@ struct qdisc_dump_args { struct netlink_callback *cb; }; -static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, + struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, - a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWTCLASS); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, @@ -1765,6 +1789,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, int *t_p, int s_t) { struct Qdisc *q; + int b; if (!root) return 0; @@ -1772,7 +1797,10 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; - list_for_each_entry(q, &root->list, list) { + if (!qdisc_dev(root)) + return 0; + + hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; } @@ -1957,10 +1985,12 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL); + rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, + NULL); rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL); + rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, + NULL); return 0; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 4002df3c7d9f..5bfa79ee657c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -69,7 +69,7 @@ struct codel_sched_data { static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { struct Qdisc *sch = ctx; - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); if (skb) sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -172,7 
+172,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index baeed6a78d28..1e37247656f8 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -31,7 +31,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); return qdisc_drop(skb, sch, to_free); @@ -42,7 +42,7 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; - if (likely(skb_queue_len(&sch->q) < sch->limit)) + if (likely(sch->q.qlen < sch->limit)) return qdisc_enqueue_tail(skb, sch); prev_backlog = sch->qstats.backlog; diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index e5458b99e09c..18e752439f6f 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -86,6 +86,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ u32 quantum; @@ -94,6 +95,7 @@ struct fq_sched_data { u32 flow_max_rate; /* optional max rate per flow */ u32 flow_plimit; /* max packets per flow */ u32 orphan_mask; /* mask for orphaned skb */ + u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; @@ -407,11 +409,19 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static void fq_check_throttled(struct fq_sched_data *q, u64 now) { + unsigned long sample; struct rb_node *p; if (q->time_next_delayed_flow > now) return; + /* Update unthrottle latency EWMA. + * This is cheap and can help diagnosing timer/latency problems. + */ + sample = (unsigned long)(now - q->time_next_delayed_flow); + q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3; + q->unthrottle_latency_ns += sample >> 3; + q->time_next_delayed_flow = ~0ULL; while ((p = rb_first(&q->delayed)) != NULL) { struct fq_flow *f = container_of(p, struct fq_flow, rate_node); @@ -433,7 +443,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) struct fq_flow_head *head; struct sk_buff *skb; struct fq_flow *f; - u32 rate; + u32 rate, plen; skb = fq_dequeue_head(sch, &q->internal); if (skb) @@ -482,7 +492,7 @@ begin: prefetch(&skb->end); f->credit -= qdisc_pkt_len(skb); - if (f->credit > 0 || !q->rate_enable) + if (!q->rate_enable) goto out; /* Do not pace locally generated ack packets */ @@ -493,8 +503,15 @@ begin: if (skb->sk) rate = min(skb->sk->sk_pacing_rate, rate); + if (rate <= q->low_rate_threshold) { + f->credit = 0; + plen = qdisc_pkt_len(skb); + } else { + plen = max(qdisc_pkt_len(skb), q->quantum); + if (f->credit > 0) + goto out; + } if (rate != ~0U) { - u32 plen = max(qdisc_pkt_len(skb), q->quantum); u64 len = (u64)plen * NSEC_PER_SEC; if (likely(rate)) @@ -507,7 +524,12 @@ begin: len = NSEC_PER_SEC; q->stat_pkts_too_long++; } - + /* Account for schedule/timers drifts. + * f->time_next_packet was set when prior packet was sent, + * and current time (@now) can be too late by tens of us. 
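The unthrottle-latency counter introduced in the sch_fq hunk above is a fixed-point EWMA with alpha = 1/8, done entirely with shifts: subtract an eighth of the old average, add an eighth of the new sample. Standalone version:

    #include <stdio.h>

    /* ewma <- ewma - ewma/8 + sample/8, i.e. alpha = 1/8 */
    static unsigned long ewma_add(unsigned long ewma, unsigned long sample)
    {
        ewma -= ewma >> 3;
        ewma += sample >> 3;
        return ewma;
    }

    int main(void)
    {
        unsigned long e = 0;
        unsigned long samples[] = { 800, 800, 800, 80 };
        unsigned int i;

        for (i = 0; i < 4; i++) {
            e = ewma_add(e, samples[i]);
            printf("ewma=%lu\n", e); /* 100, 188, 265, 242 */
        }
        return 0;
    }

Shifts rather than division keep the update cheap on the dequeue path, and the 1/8 weight smooths one-off timer hiccups while still tracking sustained drift.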
+ */ + if (f->time_next_packet) + len -= min(len/2, now - f->time_next_packet); f->time_next_packet = now + len; } out: @@ -662,6 +684,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 }, [TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 }, [TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 }, + [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt) @@ -716,6 +739,10 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_MAX_RATE]) q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]); + if (tb[TCA_FQ_LOW_RATE_THRESHOLD]) + q->low_rate_threshold = + nla_get_u32(tb[TCA_FQ_LOW_RATE_THRESHOLD]); + if (tb[TCA_FQ_RATE_ENABLE]) { u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]); @@ -774,6 +801,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch)); q->flow_refill_delay = msecs_to_jiffies(40); q->flow_max_rate = ~0U; + q->time_next_delayed_flow = ~0ULL; q->rate_enable = 1; q->new_flows.first = NULL; q->old_flows.first = NULL; @@ -781,6 +809,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt) q->fq_root = NULL; q->fq_trees_log = ilog2(1024); q->orphan_mask = 1024 - 1; + q->low_rate_threshold = 550000 / 8; qdisc_watchdog_init(&q->watchdog, sch); if (opt) @@ -811,6 +840,8 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY, jiffies_to_usecs(q->flow_refill_delay)) || nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) || + nla_put_u32(skb, TCA_FQ_LOW_RATE_THRESHOLD, + q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log)) goto nla_put_failure; @@ -823,20 +854,24 @@ nla_put_failure: static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct fq_sched_data *q = qdisc_priv(sch); - u64 now = ktime_get_ns(); - struct tc_fq_qd_stats st = { - .gc_flows = q->stat_gc_flows, - .highprio_packets = q->stat_internal_packets, - .tcp_retrans = q->stat_tcp_retrans, - .throttled = q->stat_throttled, - .flows_plimit = q->stat_flows_plimit, - .pkts_too_long = q->stat_pkts_too_long, - .allocation_errors = q->stat_allocation_errors, - .flows = q->flows, - .inactive_flows = q->inactive_flows, - .throttled_flows = q->throttled_flows, - .time_next_delayed_flow = q->time_next_delayed_flow - now, - }; + struct tc_fq_qd_stats st; + + sch_tree_lock(sch); + + st.gc_flows = q->stat_gc_flows; + st.highprio_packets = q->stat_internal_packets; + st.tcp_retrans = q->stat_tcp_retrans; + st.throttled = q->stat_throttled; + st.flows_plimit = q->stat_flows_plimit; + st.pkts_too_long = q->stat_pkts_too_long; + st.allocation_errors = q->stat_allocation_errors; + st.time_next_delayed_flow = q->time_next_delayed_flow - ktime_get_ns(); + st.flows = q->flows; + st.inactive_flows = q->inactive_flows; + st.throttled_flows = q->throttled_flows; + st.unthrottle_latency_ns = min_t(unsigned long, + q->unthrottle_latency_ns, ~0U); + sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 657c13362b19..6cfb6e9038c2 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -423,7 +423,6 @@ struct Qdisc noop_qdisc = { .dequeue = noop_dequeue, .flags = TCQ_F_BUILTIN, .ops = &noop_qdisc_ops, - .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, .running = SEQCNT_ZERO(noop_qdisc.running), @@ 
-467,7 +466,7 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { */ struct pfifo_fast_priv { u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct qdisc_skb_head q[PFIFO_FAST_BANDS]; }; /* @@ -478,7 +477,7 @@ struct pfifo_fast_priv { */ static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, +static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, int band) { return priv->q + band; @@ -487,10 +486,10 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { + if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *list = band2list(priv, band); priv->bitmap |= (1 << band); qdisc->q.qlen++; @@ -506,11 +505,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + struct qdisc_skb_head *qh = band2list(priv, band); + struct sk_buff *skb = __qdisc_dequeue_head(qh); + + if (likely(skb != NULL)) { + qdisc_qstats_backlog_dec(qdisc, skb); + qdisc_bstats_update(qdisc, skb); + } qdisc->q.qlen--; - if (skb_queue_empty(list)) + if (qh->qlen == 0) priv->bitmap &= ~(1 << band); return skb; @@ -525,9 +529,9 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) int band = bitmap2band[priv->bitmap]; if (band >= 0) { - struct sk_buff_head *list = band2list(priv, band); + struct qdisc_skb_head *qh = band2list(priv, band); - return skb_peek(list); + return qh->head; } return NULL; @@ -565,7 +569,7 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) struct pfifo_fast_priv *priv = qdisc_priv(qdisc); for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __skb_queue_head_init(band2list(priv, prio)); + qdisc_skb_head_init(band2list(priv, prio)); /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; @@ -613,8 +617,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } - INIT_LIST_HEAD(&sch->list); - skb_queue_head_init(&sch->q); + qdisc_skb_head_init(&sch->q); + spin_lock_init(&sch->q.lock); spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, @@ -701,7 +705,7 @@ void qdisc_destroy(struct Qdisc *qdisc) return; #ifdef CONFIG_NET_SCHED - qdisc_list_del(qdisc); + qdisc_hash_del(qdisc); qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif @@ -789,6 +793,10 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } +#ifdef CONFIG_NET_SCHED + if (dev->qdisc) + qdisc_hash_add(dev->qdisc); +#endif } static void transition_one_qdisc(struct net_device *dev, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ddc7bd74ecb..000f1d36128e 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -142,8 +142,6 @@ struct hfsc_class { link-sharing, max(myf, cfmin) */ u64 cl_myf; /* my fit-time (calculated from this class's own upperlimit curve) */ - u64 cl_myfadj; /* my fit-time adjustment (to cancel - history dependence) */ u64 cl_cfmin; /* earliest children's fit-time 
(used with cl_myf to obtain cl_f) */ u64 cl_cvtmin; /* minimal virtual time among the @@ -151,11 +149,8 @@ struct hfsc_class { (monotonic within a period) */ u64 cl_vtadj; /* intra-period cumulative vt adjustment */ - u64 cl_vtoff; /* inter-period cumulative vt offset */ - u64 cl_cvtmax; /* max child's vt in the last period */ - u64 cl_cvtoff; /* cumulative cvtmax of all periods */ - u64 cl_pcvtoff; /* parent's cvtoff at initialization - time */ + u64 cl_cvtoff; /* largest virtual time seen among + the children */ struct internal_sc cl_rsc; /* internal real-time service curve */ struct internal_sc cl_fsc; /* internal fair service curve */ @@ -701,28 +696,16 @@ init_vf(struct hfsc_class *cl, unsigned int len) } else { /* * first child for a new parent backlog period. - * add parent's cvtmax to cvtoff to make a new - * vt (vtoff + vt) larger than the vt in the - * last period for all children. + * initialize cl_vt to the highest value seen + * among the siblings. this is analogous to + * what cur_time would provide in realtime case. */ - vt = cl->cl_parent->cl_cvtmax; - cl->cl_parent->cl_cvtoff += vt; - cl->cl_parent->cl_cvtmax = 0; + cl->cl_vt = cl->cl_parent->cl_cvtoff; cl->cl_parent->cl_cvtmin = 0; - cl->cl_vt = 0; } - cl->cl_vtoff = cl->cl_parent->cl_cvtoff - - cl->cl_pcvtoff; - /* update the virtual curve */ - vt = cl->cl_vt + cl->cl_vtoff; - rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, - cl->cl_total); - if (cl->cl_virtual.x == vt) { - cl->cl_virtual.x -= cl->cl_vtoff; - cl->cl_vtoff = 0; - } + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_vtadj = 0; cl->cl_vtperiod++; /* increment vt period */ @@ -745,7 +728,6 @@ init_vf(struct hfsc_class *cl, unsigned int len) /* compute myf */ cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); - cl->cl_myfadj = 0; } } @@ -779,8 +761,7 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) go_passive = 0; /* update vt */ - cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) - - cl->cl_vtoff + cl->cl_vtadj; + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + cl->cl_vtadj; /* * if vt of the class is smaller than cvtmin, @@ -795,9 +776,9 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) if (go_passive) { /* no more active child, going passive */ - /* update cvtmax of the parent class */ - if (cl->cl_vt > cl->cl_parent->cl_cvtmax) - cl->cl_parent->cl_cvtmax = cl->cl_vt; + /* update cvtoff of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtoff) + cl->cl_parent->cl_cvtoff = cl->cl_vt; /* remove this class from the vt tree */ vttree_remove(cl); @@ -813,9 +794,10 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) /* update f */ if (cl->cl_flags & HFSC_USC) { + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, cl->cl_total); +#if 0 cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, cl->cl_total); -#if 0 /* * This code causes classes to stay way under their * limit when multiple classes are used at gigabit @@ -940,7 +922,7 @@ static void hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) { sc2isc(fsc, &cl->cl_fsc); - rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vtoff + cl->cl_vt, cl->cl_total); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); cl->cl_flags |= HFSC_FSC; } @@ -1094,7 +1076,6 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (parent->level == 0) hfsc_purge_queue(sch, parent); hfsc_adjust_levels(parent); - cl->cl_pcvtoff = parent->cl_cvtoff; sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); @@ 
-1482,16 +1463,12 @@ hfsc_reset_class(struct hfsc_class *cl) cl->cl_e = 0; cl->cl_vt = 0; cl->cl_vtadj = 0; - cl->cl_vtoff = 0; cl->cl_cvtmin = 0; - cl->cl_cvtmax = 0; cl->cl_cvtoff = 0; - cl->cl_pcvtoff = 0; cl->cl_vtperiod = 0; cl->cl_parentperiod = 0; cl->cl_f = 0; cl->cl_myf = 0; - cl->cl_myfadj = 0; cl->cl_cfmin = 0; cl->cl_nactive = 0; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 53dbfa187870..c798d0de8a9d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -162,7 +162,7 @@ struct htb_sched { struct work_struct work; /* non shaped skbs; let them go directly thru */ - struct sk_buff_head direct_queue; + struct qdisc_skb_head direct_queue; long direct_pkts; struct qdisc_watchdog watchdog; @@ -570,6 +570,22 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) list_del_init(&cl->un.leaf.drop_list); } +static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, + struct qdisc_skb_head *qh) +{ + struct sk_buff *last = qh->tail; + + if (last) { + skb->next = NULL; + last->next = skb; + qh->tail = skb; + } else { + qh->tail = skb; + qh->head = skb; + } + qh->qlen++; +} + static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -580,7 +596,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (cl == HTB_DIRECT) { /* enqueue to helper queue */ if (q->direct_queue.qlen < q->direct_qlen) { - __skb_queue_tail(&q->direct_queue, skb); + htb_enqueue_tail(skb, sch, &q->direct_queue); q->direct_pkts++; } else { return qdisc_drop(skb, sch, to_free); @@ -888,7 +904,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) unsigned long start_at; /* try to dequeue direct packets as high prio (!) to minimize cpu work */ - skb = __skb_dequeue(&q->direct_queue); + skb = __qdisc_dequeue_head(&q->direct_queue); if (skb != NULL) { ok: qdisc_bstats_update(sch, skb); @@ -1019,7 +1035,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); - __skb_queue_head_init(&q->direct_queue); + qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index b9439827c172..2bc8d7f8df16 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -88,7 +88,7 @@ static void mq_attach(struct Qdisc *sch) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); #endif } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 549c66359924..b5c502c78143 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -182,7 +182,7 @@ static void mqprio_attach(struct Qdisc *sch) if (old) qdisc_destroy(old); if (ntx < dev->real_num_tx_queues) - qdisc_list_add(qdisc); + qdisc_hash_add(qdisc); } kfree(priv->qdiscs); priv->qdiscs = NULL; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index aaaf02175338..9f7b380cf0a3 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -413,6 +413,16 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, return segs; } +static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb) +{ + skb->next = qh->head; + + if (!qh->head) + qh->tail = skb; + qh->head = skb; + qh->qlen++; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. 
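The queue conversions running through sch_codel, sch_fifo, sch_generic, sch_htb, sch_netem and sch_pie in this series all make the same swap: struct sk_buff_head, a doubly-linked list that also embeds a spinlock the qdisc never takes (the qdisc root lock already serializes access), becomes the slimmer struct qdisc_skb_head, which is just head, tail and qlen. A minimal user-space model of the three list operations the hunks open-code — tail append as in htb_enqueue_tail(), head insert as in netem_enqueue_skb_head(), head dequeue as in __qdisc_dequeue_head(); the struct and function names here are illustrative, not the kernel's:

#include <stddef.h>

struct node {
	struct node *next;
};

struct skb_head_model {		/* models struct qdisc_skb_head */
	struct node *head;
	struct node *tail;
	unsigned int qlen;
};

/* Append at the tail, as htb_enqueue_tail() does for the direct queue. */
static void model_enqueue_tail(struct skb_head_model *qh, struct node *n)
{
	n->next = NULL;
	if (qh->tail)
		qh->tail->next = n;
	else
		qh->head = n;
	qh->tail = n;
	qh->qlen++;
}

/* Push at the head, as netem_enqueue_skb_head() does for requeues. */
static void model_enqueue_head(struct skb_head_model *qh, struct node *n)
{
	n->next = qh->head;
	if (!qh->head)
		qh->tail = n;
	qh->head = n;
	qh->qlen++;
}

/* Pop from the head, modelling __qdisc_dequeue_head(). */
static struct node *model_dequeue_head(struct skb_head_model *qh)
{
	struct node *n = qh->head;

	if (n) {
		qh->head = n->next;
		n->next = NULL;
		if (!qh->head)
			qh->tail = NULL;
		qh->qlen--;
	}
	return n;
}

Because qlen is kept as a plain counter, the skb_queue_len(&sch->q) and skb_queue_empty(&sch->q) calls in these hunks can become direct sch->q.qlen tests.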
@@ -502,7 +512,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<(prandom_u32() % 8); } - if (unlikely(skb_queue_len(&sch->q) >= sch->limit)) + if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); qdisc_qstats_backlog_inc(sch, skb); @@ -522,8 +532,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->rate) { struct sk_buff *last; - if (!skb_queue_empty(&sch->q)) - last = skb_peek_tail(&sch->q); + if (sch->q.qlen) + last = sch->q.tail; else last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { @@ -552,7 +562,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, cb->time_to_send = psched_get_time(); q->counter = 0; - __skb_queue_head(&sch->q, skb); + netem_enqueue_skb_head(&sch->q, skb); sch->qstats.requeues++; } @@ -587,7 +597,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct rb_node *p; tfifo_dequeue: - skb = __skb_dequeue(&sch->q); + skb = __qdisc_dequeue_head(&sch->q); if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index a570b0bb254c..5c3a99d6aa82 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -231,7 +231,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __skb_dequeue(&sch->q); + struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); @@ -511,7 +511,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct sk_buff *skb; - skb = __qdisc_dequeue_head(sch, &sch->q); + skb = qdisc_dequeue_head(sch); if (!skb) return NULL; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 1c23060c41a6..f10d3397f917 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1408,7 +1408,7 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) transports) { if (t->pmtu_pending && t->dst) { sctp_transport_update_pmtu(sk, t, - WORD_TRUNC(dst_mtu(t->dst))); + SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 912eb1685a5d..f99d4855d3de 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -48,7 +48,7 @@ static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) +#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, .hmac_name = "hmac(sha256)", diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 0a3dbec0a8fb..7a1cdf43e49d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -70,6 +70,19 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) return msg; } +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + struct sctp_chunk *chunk; + + /* This doesn't have to be a _safe variant because + * sctp_chunk_free() only drops the refs. + */ + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_chunk_free(chunk); + + sctp_datamsg_put(msg); +} + /* Final destruction of datamsg memory. 
*/ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { @@ -187,9 +200,10 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, /* This is the biggest possible DATA chunk that can fit into * the packet */ - max_data = (asoc->pathmtu - - sctp_sk(asoc->base.sk)->pf->af->net_header_len - - sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk)) & ~3; + max_data = asoc->pathmtu - + sctp_sk(asoc->base.sk)->pf->af->net_header_len - + sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); + max_data = SCTP_TRUNC4(max_data); max = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks @@ -200,8 +214,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) - max_data -= WORD_ROUND(sizeof(sctp_auth_chunk_t) + - hmac_desc->hmac_len); + max_data -= SCTP_PAD4(sizeof(sctp_auth_chunk_t) + + hmac_desc->hmac_len); } /* Now, check if we need to reduce our max */ @@ -221,7 +235,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max) - max_data -= WORD_ROUND(sizeof(sctp_sack_chunk_t)); + max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) diff --git a/net/sctp/input.c b/net/sctp/input.c index 1555fb8c68e0..a2ea1d1cc06a 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -605,7 +605,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) /* PMTU discovery (RFC1191) */ if (ICMP_FRAG_NEEDED == code) { sctp_icmp_frag_needed(sk, asoc, transport, - WORD_TRUNC(info)); + SCTP_TRUNC4(info)); goto out_unlock; } else { if (ICMP_PROT_UNREACH == code) { @@ -673,7 +673,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = offset + WORD_ROUND(ntohs(ch->length)); + ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; @@ -1128,7 +1128,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; @@ -1197,7 +1197,7 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ - if (WORD_ROUND(ntohs(ch->length)) > skb->len) + if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 6437aa97cfd7..f731de3e8428 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -213,7 +213,7 @@ new_skb: } chunk->chunk_hdr = ch; - chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 31b7bc35895d..2a5c1896d18f 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -180,7 +180,6 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, int one_packet, gfp_t gfp) { sctp_xmit_t retval; - int error = 0; pr_debug("%s: packet:%p size:%Zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? 
chunk->skb->len : -1); @@ -188,6 +187,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { + int error = 0; + error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; @@ -296,7 +297,7 @@ static sctp_xmit_t __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { sctp_xmit_t retval = SCTP_XMIT_OK; - __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); @@ -441,14 +442,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * time. Application may notice this error. */ pr_err_once("Trying to GSO but underlying device doesn't support it."); - goto nomem; + goto err; } } else { pkt_size = packet->size; } head = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!head) - goto nomem; + goto err; if (gso) { NAPI_GRO_CB(head)->last = head; skb_shinfo(head)->gso_type = sk->sk_gso_type; @@ -469,8 +470,12 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } dst = dst_clone(tp->dst); - if (!dst) - goto no_route; + if (!dst) { + if (asoc) + IP_INC_STATS(sock_net(asoc->base.sk), + IPSTATS_MIB_OUTNOROUTES); + goto nodst; + } skb_dst_set(head, dst); /* Build the SCTP header. */ @@ -503,7 +508,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (gso) { pkt_size = packet->overhead; list_for_each_entry(chunk, &packet->chunk_list, list) { - int padded = WORD_ROUND(chunk->skb->len); + int padded = SCTP_PAD4(chunk->skb->len); if (pkt_size + padded > tp->pathmtu) break; @@ -533,7 +538,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * included in the chunk length field. The sender should * never pad with more than 3 bytes. * - * [This whole comment explains WORD_ROUND() below.] + * [This whole comment explains SCTP_PAD4() below.] */ pkt_size -= packet->overhead; @@ -555,7 +560,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) has_data = 1; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) memset(skb_put(chunk->skb, padding), 0, padding); @@ -582,7 +587,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * acknowledged or have failed. * Re-queue auth chunks if needed. 
*/ - pkt_size -= WORD_ROUND(chunk->skb->len); + pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); @@ -621,8 +626,10 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) if (!gso) break; - if (skb_gro_receive(&head, nskb)) + if (skb_gro_receive(&head, nskb)) { + kfree_skb(nskb); goto nomem; + } nskb = NULL; if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= sk->sk_gso_max_segs)) @@ -716,18 +723,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } head->ignore_df = packet->ipfragok; tp->af_specific->sctp_xmit(head, tp); + goto out; -out: - sctp_packet_reset(packet); - return err; -no_route: - kfree_skb(head); - if (nskb != head) - kfree_skb(nskb); - - if (asoc) - IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); +nomem: + if (packet->auth && list_empty(&packet->auth->list)) + sctp_chunk_free(packet->auth); +nodst: /* FIXME: Returning the 'err' will effect all the associations * associated with a socket, although only one of the paths of the * association is unreachable. @@ -736,22 +738,18 @@ no_route: * required. */ /* err = -EHOSTUNREACH; */ -err: - /* Control chunks are unreliable so just drop them. DATA chunks - * will get resent or dropped later. - */ + kfree_skb(head); +err: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } - goto out; -nomem: - if (packet->auth && list_empty(&packet->auth->list)) - sctp_chunk_free(packet->auth); - err = -ENOMEM; - goto err; + +out: + sctp_packet_reset(packet); + return err; } /******************************************************************** @@ -913,7 +911,7 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, */ maxsize = pmtu - packet->overhead; if (packet->auth) - maxsize -= WORD_ROUND(packet->auth->skb->len); + maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 107233da5cc9..582585393d35 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q, static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. */ static inline void sctp_outq_head_data(struct sctp_outq *q, @@ -285,10 +285,9 @@ void sctp_outq_free(struct sctp_outq *q) } /* Put a new chunk in an sctp_outq. */ -int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) +void sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; pr_debug("%s: outq:%p, chunk:%p[%s]\n", __func__, q, chunk, chunk && chunk->chunk_hdr ? @@ -299,54 +298,26 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp) * immediately. */ if (sctp_chunk_is_data(chunk)) { - /* Is it OK to queue data chunks? */ - /* From 9. Termination of Association - * - * When either endpoint performs a shutdown, the - * association on each peer will stop accepting new - * data from its user and only deliver data in queue - * at the time of sending or receiving the SHUTDOWN - * chunk. 
- */ - switch (q->asoc->state) { - case SCTP_STATE_CLOSED: - case SCTP_STATE_SHUTDOWN_PENDING: - case SCTP_STATE_SHUTDOWN_SENT: - case SCTP_STATE_SHUTDOWN_RECEIVED: - case SCTP_STATE_SHUTDOWN_ACK_SENT: - /* Cannot send after transport endpoint shutdown */ - error = -ESHUTDOWN; - break; - - default: - pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", - __func__, q, chunk, chunk && chunk->chunk_hdr ? - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : - "illegal chunk"); - - sctp_chunk_hold(chunk); - sctp_outq_tail_data(q, chunk); - if (chunk->asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - chunk->asoc->sent_cnt_removable++; - if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) - SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); - else - SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); - break; - } + pr_debug("%s: outqueueing: outq:%p, chunk:%p[%s])\n", + __func__, q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : + "illegal chunk"); + + sctp_outq_tail_data(q, chunk); + if (chunk->asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) + chunk->asoc->sent_cnt_removable++; + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS); + else + SCTP_INC_STATS(net, SCTP_MIB_OUTORDERCHUNKS); } else { list_add_tail(&chunk->list, &q->control_chunk_list); SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); } - if (error < 0) - return error; - if (!q->cork) - error = sctp_outq_flush(q, 0, gfp); - - return error; + sctp_outq_flush(q, 0, gfp); } /* Insert a chunk into the sorted list based on the TSNs. The retransmit list @@ -559,7 +530,6 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, sctp_retransmit_reason_t reason) { struct net *net = sock_net(q->asoc->base.sk); - int error = 0; switch (reason) { case SCTP_RTXR_T3_RTX: @@ -603,10 +573,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * will be flushed at the end. */ if (reason != SCTP_RTXR_FAST_RTX) - error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); - - if (error) - q->asoc->base.sk->sk_err = -error; + sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC); } /* @@ -778,12 +745,12 @@ redo: } /* Cork the outqueue so queued chunks are really queued. */ -int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) +void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) { if (q->cork) q->cork = 0; - return sctp_outq_flush(q, 0, gfp); + sctp_outq_flush(q, 0, gfp); } @@ -796,7 +763,7 @@ int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) * locking concerns must be made. Today we use the sock lock to protect * this function. 
*/ -static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) { struct sctp_packet *packet; struct sctp_packet singleton; @@ -919,8 +886,10 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) sctp_packet_config(&singleton, vtag, 0); sctp_packet_append_chunk(&singleton, chunk); error = sctp_packet_transmit(&singleton, gfp); - if (error < 0) - return error; + if (error < 0) { + asoc->base.sk->sk_err = -error; + return; + } break; case SCTP_CID_ABORT: @@ -1018,6 +987,8 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) retran: error = sctp_outq_flush_rtx(q, packet, rtx_timeout, &start_timer); + if (error < 0) + asoc->base.sk->sk_err = -error; if (start_timer) { sctp_transport_reset_t3_rtx(transport); @@ -1192,14 +1163,15 @@ sctp_flush_out: struct sctp_transport, send_ready); packet = &t->packet; - if (!sctp_packet_empty(packet)) + if (!sctp_packet_empty(packet)) { error = sctp_packet_transmit(packet, gfp); + if (error < 0) + asoc->base.sk->sk_err = -error; + } /* Clear the burst limited state, if any */ sctp_transport_burst_reset(t); } - - return error; } /* Update unack_data based on the incoming SACK chunk */ @@ -1747,7 +1719,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) { int i; sctp_sack_variable_t *frags; - __u16 gap; + __u16 tsn_offset, blocks; __u32 ctsn = ntohl(sack->cum_tsn_ack); if (TSN_lte(tsn, ctsn)) @@ -1766,10 +1738,11 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) */ frags = sack->variable; - gap = tsn - ctsn; - for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { - if (TSN_lte(ntohs(frags[i].gab.start), gap) && - TSN_lte(gap, ntohs(frags[i].gab.end))) + blocks = ntohs(sack->num_gap_ack_blocks); + tsn_offset = tsn - ctsn; + for (i = 0; i < blocks; ++i) { + if (tsn_offset >= ntohs(frags[i].gab.start) && + tsn_offset <= ntohs(frags[i].gab.end)) goto pass; } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ef8ba77a5bea..206377fe91ec 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -73,13 +73,17 @@ static const struct snmp_mib sctp_snmp_list[] = { /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). 
*/ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { + unsigned long buff[SCTP_MIB_MAX]; struct net *net = seq->private; int i; - for (i = 0; sctp_snmp_list[i].name != NULL; i++) + memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); + + snmp_get_cpu_field_batch(buff, sctp_snmp_list, + net->sctp.sctp_statistics); + for (i = 0; sctp_snmp_list[i].name; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, - snmp_fold_field(net->sctp.sctp_statistics, - sctp_snmp_list[i].entry)); + buff[i]); return 0; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index cef0cee182d4..048954eee984 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -106,7 +106,8 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + const struct nlmsghdr *unlh, + bool net_admin) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct list_head *addr_list; @@ -133,7 +134,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, r->idiag_retrans = 0; } - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) goto errout; if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { @@ -203,6 +204,7 @@ struct sctp_comm_param { struct netlink_callback *cb; const struct inet_diag_req_v2 *r; const struct nlmsghdr *nlh; + bool net_admin; }; static size_t inet_assoc_attr_size(struct sctp_association *asoc) @@ -219,6 +221,7 @@ static size_t inet_assoc_attr_size(struct sctp_association *asoc) + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + nla_total_size(1) /* INET_DIAG_TOS */ + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + nla_total_size(addrlen * asoc->peer.transport_count) + nla_total_size(addrlen * addrcnt) + nla_total_size(sizeof(struct inet_diag_meminfo)) @@ -256,7 +259,8 @@ static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) err = inet_sctp_diag_fill(sk, assoc, rep, req, sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh); + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); release_sock(sk); if (err < 0) { WARN_ON(err == -EMSGSIZE); @@ -299,7 +303,8 @@ static int sctp_sock_dump(struct sock *sk, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh) < 0) { + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { cb->args[3] = 1; err = 1; goto release; @@ -309,7 +314,8 @@ static int sctp_sock_dump(struct sock *sk, void *p) if (inet_sctp_diag_fill(sk, assoc, skb, r, sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { err = 1; goto release; } @@ -389,7 +395,7 @@ static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh) < 0) { + cb->nlh, commp->net_admin) < 0) { err = 2; goto out; } @@ -426,6 +432,7 @@ static int sctp_diag_dump_one(struct sk_buff *in_skb, .skb = in_skb, .r = req, .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), }; if (req->sdiag_family == AF_INET) { @@ -461,6 +468,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .skb = skb, .cb = cb, .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; /* eps hashtable dumps 
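Much of the SCTP churn in this merge is a mechanical rename: WORD_ROUND()/WORD_TRUNC() become SCTP_PAD4()/SCTP_TRUNC4(), which states explicitly that SCTP aligns lengths to 4-byte boundaries. A self-contained sketch of what the two macros compute, assuming the usual round-up/round-down-to-4 definitions (which is what the old helpers did):

#include <assert.h>

#define SCTP_PAD4(s)   (((s) + 3) & ~3U)	/* round up to 4-byte multiple */
#define SCTP_TRUNC4(s) ((s) & ~3U)		/* truncate to 4-byte multiple */

int main(void)
{
	/* Chunk lengths are padded up so chunks stay aligned on the wire. */
	assert(SCTP_PAD4(5) == 8);
	assert(SCTP_PAD4(8) == 8);

	/* MTU-derived limits are truncated down so a padded chunk still fits. */
	assert(SCTP_TRUNC4(1500) == 1500);
	assert(SCTP_TRUNC4(1499) == 1496);
	return 0;
}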
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 46ffecc57214..9e9690b7afe1 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -253,7 +253,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; - chunksize += WORD_ROUND(SCTP_SAT_LEN(num_types)); + chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); chunksize += sizeof(ecap_param); if (asoc->prsctp_enable) @@ -283,14 +283,14 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* Add HMACS parameter length if any were defined */ auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -300,8 +300,8 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, /* If we have any extensions to report, account for that */ if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * @@ -443,13 +443,13 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, auth_hmacs = (sctp_paramhdr_t *)asoc->c.auth_hmacs; if (auth_hmacs->length) - chunksize += WORD_ROUND(ntohs(auth_hmacs->length)); + chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (sctp_paramhdr_t *)asoc->c.auth_chunks; if (auth_chunks->length) - chunksize += WORD_ROUND(ntohs(auth_chunks->length)); + chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; @@ -458,8 +458,8 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, } if (num_ext) - chunksize += WORD_ROUND(sizeof(sctp_supported_ext_param_t) + - num_ext); + chunksize += SCTP_PAD4(sizeof(sctp_supported_ext_param_t) + + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); @@ -1375,7 +1375,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, struct sock *sk; /* No need to allocate LL here, as this is only a chunk. */ - skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp); + skb = alloc_skb(SCTP_PAD4(sizeof(sctp_chunkhdr_t) + paylen), gfp); if (!skb) goto nodata; @@ -1467,7 +1467,7 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) void *target; void *padding; int chunklen = ntohs(chunk->chunk_hdr->length); - int padlen = WORD_ROUND(chunklen) - chunklen; + int padlen = SCTP_PAD4(chunklen) - chunklen; padding = skb_put(chunk->skb, padlen); target = skb_put(chunk->skb, len); @@ -1885,7 +1885,7 @@ static int sctp_process_missing_param(const struct sctp_association *asoc, struct __sctp_missing report; __u16 len; - len = WORD_ROUND(sizeof(report)); + len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. 
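Several of the converted call sites — sctp_rcv_ootb() and __sctp_rcv_walk_lookup() in net/sctp/input.c above, sctp_sf_ootb() in sm_statefuns.c below — use SCTP_PAD4() for the same job: stepping through the chunks of a received packet by padded length while rejecting bogus length fields. A compact user-space sketch of that walk, with illustrative names rather than the kernel's:

#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>

#define PAD4(s) (((s) + 3) & ~3U)

struct chunkhdr {		/* laid out like sctp_chunkhdr_t */
	uint8_t  type;
	uint8_t  flags;
	uint16_t length;	/* network byte order, excludes padding */
};

/* Walk chunks the way sctp_rcv_ootb() does: advance by the padded
 * length, and stop if a chunk claims a length smaller than its own
 * header or would run past the end of the buffer. */
static int walk_chunks(const uint8_t *buf, size_t len,
		       int (*cb)(const struct chunkhdr *ch))
{
	size_t offset = 0;

	while (offset + sizeof(struct chunkhdr) <= len) {
		const struct chunkhdr *ch =
			(const struct chunkhdr *)(buf + offset);
		size_t ch_end;

		if (ntohs(ch->length) < sizeof(*ch))
			return -1;	/* malformed: shorter than header */
		ch_end = offset + PAD4(ntohs(ch->length));
		if (ch_end > len)
			return -1;	/* malformed: overflows the buffer */
		if (cb(ch))
			break;		/* callback consumed what it needed */
		offset = ch_end;
	}
	return 0;
}

The two sanity checks mirror the kernel's: a chunk shorter than its own header cannot be stepped over safely, and a padded end past the buffer means the length field cannot be trusted.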
@@ -2083,9 +2083,9 @@ static sctp_ierror_t sctp_process_unk_param(const struct sctp_association *asoc, if (*errp) { if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - WORD_ROUND(ntohs(param.p->length)))) + SCTP_PAD4(ntohs(param.p->length)))) sctp_addto_chunk_fixed(*errp, - WORD_ROUND(ntohs(param.p->length)), + SCTP_PAD4(ntohs(param.p->length)), param.v); } else { /* If there is no memory for generating the ERROR diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 12d45193357c..c345bf153bed 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1020,19 +1020,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, * This way the whole message is queued up and bundling if * encouraged for small fragments. */ -static int sctp_cmd_send_msg(struct sctp_association *asoc, - struct sctp_datamsg *msg, gfp_t gfp) +static void sctp_cmd_send_msg(struct sctp_association *asoc, + struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; - int error = 0; - - list_for_each_entry(chunk, &msg->chunks, frag_list) { - error = sctp_outq_tail(&asoc->outqueue, chunk, gfp); - if (error) - break; - } - return error; + list_for_each_entry(chunk, &msg->chunks, frag_list) + sctp_outq_tail(&asoc->outqueue, chunk, gfp); } @@ -1427,8 +1421,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, local_cork = 1; } /* Send a chunk to our peer. */ - error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, - gfp); + sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: @@ -1682,7 +1675,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; @@ -1709,7 +1702,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_outq_cork(&asoc->outqueue); local_cork = 1; } - error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); + sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_SEND_NEXT_ASCONF: sctp_cmd_send_asconf(asoc); @@ -1739,9 +1732,9 @@ out: */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) - error = sctp_outq_uncork(&asoc->outqueue, gfp); + sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d88bb2b0b699..026e3bca4a94 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3454,7 +3454,7 @@ sctp_disposition_t sctp_sf_ootb(struct net *net, } /* Report violation if chunk len overflows */ - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); @@ -4185,7 +4185,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, @@ -4203,7 +4203,7 @@ sctp_disposition_t sctp_sf_unk_chunk(struct net *net, hdr = unk_chunk->chunk_hdr; err_chunk = sctp_make_op_error(asoc, 
unk_chunk, SCTP_ERROR_UNKNOWN_CHUNK, hdr, - WORD_ROUND(ntohs(hdr->length)), + SCTP_PAD4(ntohs(hdr->length)), 0); if (err_chunk) { sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 8ed2d99bde6d..fb02c7033307 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1958,6 +1958,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + /* Do accounting for the write space. */ sctp_set_owner_w(chunk); @@ -1970,13 +1972,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * breaks. */ err = sctp_primitive_SEND(net, asoc, datamsg); - sctp_datamsg_put(datamsg); /* Did the lower layer accept the chunk? */ - if (err) + if (err) { + sctp_datamsg_free(datamsg); goto out_free; + } pr_debug("%s: we sent primitively\n", __func__); + sctp_datamsg_put(datamsg); err = msg_len; if (unlikely(wait_connect)) { diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 81b86678be4d..ce54dce13ddb 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -233,7 +233,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); } else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -287,7 +287,7 @@ void sctp_transport_route(struct sctp_transport *transport, return; } if (transport->dst) { - transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst)); + transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d85b803da11d..bea00058ce35 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -383,7 +383,7 @@ sctp_ulpevent_make_remote_error(const struct sctp_association *asoc, ch = (sctp_errhdr_t *)(chunk->skb->data); cause = ch->cause; - elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + elen = SCTP_PAD4(ntohs(ch->length)) - sizeof(sctp_errhdr_t); /* Pull off the ERROR header. */ skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); @@ -688,7 +688,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, * MUST ignore the padding bytes. */ len = ntohs(chunk->chunk_hdr->length); - padding = WORD_ROUND(len) - len; + padding = SCTP_PAD4(len) - len; /* Fixup cloned skb with just this chunks data. 
*/ skb_trim(skb, chunk->chunk_end - padding - skb->data); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 877e55066f89..84d0fdaf7de9 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -140,11 +140,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) * we can go ahead and clear out the lobby in one shot */ if (!skb_queue_empty(&sp->pd_lobby)) { - struct list_head *list; skb_queue_splice_tail_init(&sp->pd_lobby, &sk->sk_receive_queue); - list = (struct list_head *)&sctp_sk(sk)->pd_lobby; - INIT_LIST_HEAD(list); return 1; } } else { diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig new file mode 100644 index 000000000000..6cff3f6d0c3a --- /dev/null +++ b/net/strparser/Kconfig @@ -0,0 +1,4 @@ + +config STREAM_PARSER + tristate + default n diff --git a/net/strparser/Makefile b/net/strparser/Makefile new file mode 100644 index 000000000000..858a126ebaa0 --- /dev/null +++ b/net/strparser/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_STREAM_PARSER) += strparser.o diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c new file mode 100644 index 000000000000..5c7549b5b92c --- /dev/null +++ b/net/strparser/strparser.c @@ -0,0 +1,510 @@ +/* + * Stream Parser + * + * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include <linux/bpf.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <net/strparser.h> +#include <net/netns/generic.h> +#include <net/sock.h> + +static struct workqueue_struct *strp_wq; + +struct _strp_rx_msg { + /* Internal cb structure. struct strp_rx_msg must be first for passing + * to upper layer. 
+ */ + struct strp_rx_msg strp; + int accum_len; + int early_eaten; +}; + +static inline struct _strp_rx_msg *_strp_rx_msg(struct sk_buff *skb) +{ + return (struct _strp_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +/* Lower lock held */ +static void strp_abort_rx_strp(struct strparser *strp, int err) +{ + struct sock *csk = strp->sk; + + /* Unrecoverable error in receive */ + + del_timer(&strp->rx_msg_timer); + + if (strp->rx_stopped) + return; + + strp->rx_stopped = 1; + + /* Report an error on the lower socket */ + csk->sk_err = err; + csk->sk_error_report(csk); +} + +static void strp_start_rx_timer(struct strparser *strp) +{ + if (strp->sk->sk_rcvtimeo) + mod_timer(&strp->rx_msg_timer, strp->sk->sk_rcvtimeo); +} + +/* Lower lock held */ +static void strp_parser_err(struct strparser *strp, int err, + read_descriptor_t *desc) +{ + desc->error = err; + kfree_skb(strp->rx_skb_head); + strp->rx_skb_head = NULL; + strp->cb.abort_parser(strp, err); +} + +static inline int strp_peek_len(struct strparser *strp) +{ + struct socket *sock = strp->sk->sk_socket; + + return sock->ops->peek_len(sock); +} + +/* Lower socket lock held */ +static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct strparser *strp = (struct strparser *)desc->arg.data; + struct _strp_rx_msg *rxm; + struct sk_buff *head, *skb; + size_t eaten = 0, cand_len; + ssize_t extra; + int err; + bool cloned_orig = false; + + if (strp->rx_paused) + return 0; + + head = strp->rx_skb_head; + if (head) { + /* Message already in progress */ + + rxm = _strp_rx_msg(head); + if (unlikely(rxm->early_eaten)) { + /* Some bytes of receive sock data were already + * saved in rx_skb_head; just indicate that they + * are consumed. + */ + eaten = orig_len <= rxm->early_eaten ? + orig_len : rxm->early_eaten; + rxm->early_eaten -= eaten; + + return eaten; + } + + if (unlikely(orig_offset)) { + /* Getting data with a non-zero offset when a message is + * in progress is not expected. If it does happen, we + * need to clone and pull since we can't deal with + * offsets in the skbs for a message except in the head. + */ + orig_skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!orig_skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + if (!pskb_pull(orig_skb, orig_offset)) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + kfree_skb(orig_skb); + desc->error = -ENOMEM; + return 0; + } + cloned_orig = true; + orig_offset = 0; + } + + if (!strp->rx_skb_nextp) { + /* We are going to append to the frag_list of head. + * Need to unshare the frag_list. + */ + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = err; + return 0; + } + + if (unlikely(skb_shinfo(head)->frag_list)) { + /* We can't append to an sk_buff that already + * has a frag_list. We create a new head, point + * the frag_list of that to the old head, and + * then are able to use the old head->next for + * appending to the message. 
+ */ + if (WARN_ON(head->next)) { + desc->error = -EINVAL; + return 0; + } + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + skb->len = head->len; + skb->data_len = head->len; + skb->truesize = head->truesize; + *_strp_rx_msg(skb) = *_strp_rx_msg(head); + strp->rx_skb_nextp = &head->next; + skb_shinfo(skb)->frag_list = head; + strp->rx_skb_head = skb; + head = skb; + } else { + strp->rx_skb_nextp = + &skb_shinfo(head)->frag_list; + } + } + } + + while (eaten < orig_len) { + /* Always clone since we will consume something */ + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = -ENOMEM; + break; + } + + cand_len = orig_len - eaten; + + head = strp->rx_skb_head; + if (!head) { + head = skb; + strp->rx_skb_head = head; + /* Will set rx_skb_nextp on next packet if needed */ + strp->rx_skb_nextp = NULL; + rxm = _strp_rx_msg(head); + memset(rxm, 0, sizeof(*rxm)); + rxm->strp.offset = orig_offset + eaten; + } else { + /* Unclone since we may be appending to an skb that we + * already share a frag_list with. + */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + STRP_STATS_INCR(strp->stats.rx_mem_fail); + desc->error = err; + break; + } + + rxm = _strp_rx_msg(head); + *strp->rx_skb_nextp = skb; + strp->rx_skb_nextp = &skb->next; + head->data_len += skb->len; + head->len += skb->len; + head->truesize += skb->truesize; + } + + if (!rxm->strp.full_len) { + ssize_t len; + + len = (*strp->cb.parse_msg)(strp, head); + + if (!len) { + /* Need more header to determine length */ + if (!rxm->accum_len) { + /* Start RX timer for new message */ + strp_start_rx_timer(strp); + } + rxm->accum_len += cand_len; + eaten += cand_len; + STRP_STATS_INCR(strp->stats.rx_need_more_hdr); + WARN_ON(eaten != orig_len); + break; + } else if (len < 0) { + if (len == -ESTRPIPE && rxm->accum_len) { + len = -ENODATA; + strp->rx_unrecov_intr = 1; + } else { + strp->rx_interrupted = 1; + } + strp_parser_err(strp, len, desc); + break; + } else if (len > strp->sk->sk_rcvbuf) { + /* Message length exceeds maximum allowed */ + STRP_STATS_INCR(strp->stats.rx_msg_too_big); + strp_parser_err(strp, -EMSGSIZE, desc); + break; + } else if (len <= (ssize_t)head->len - + skb->len - rxm->strp.offset) { + /* Length must be into new skb (and also + * greater than zero) + */ + STRP_STATS_INCR(strp->stats.rx_bad_hdr_len); + strp_parser_err(strp, -EPROTO, desc); + break; + } + + rxm->strp.full_len = len; + } + + extra = (ssize_t)(rxm->accum_len + cand_len) - + rxm->strp.full_len; + + if (extra < 0) { + /* Message not complete yet. */ + if (rxm->strp.full_len - rxm->accum_len > + strp_peek_len(strp)) { + /* Don't have the whole message in the socket + * buffer. Set strp->rx_need_bytes to wait for + * the rest of the message. Also, set "early + * eaten" since we've already buffered the skb + * but don't consume yet per strp_read_sock. 
+ */ + + if (!rxm->accum_len) { + /* Start RX timer for new message */ + strp_start_rx_timer(strp); + } + + strp->rx_need_bytes = rxm->strp.full_len - + rxm->accum_len; + rxm->accum_len += cand_len; + rxm->early_eaten = cand_len; + STRP_STATS_ADD(strp->stats.rx_bytes, cand_len); + desc->count = 0; /* Stop reading socket */ + break; + } + rxm->accum_len += cand_len; + eaten += cand_len; + WARN_ON(eaten != orig_len); + break; + } + + /* Positive extra indicates more bytes than needed for the + * message + */ + + WARN_ON(extra > cand_len); + + eaten += (cand_len - extra); + + /* Hurray, we have a new message! */ + del_timer(&strp->rx_msg_timer); + strp->rx_skb_head = NULL; + STRP_STATS_INCR(strp->stats.rx_msgs); + + /* Give skb to upper layer */ + strp->cb.rcv_msg(strp, head); + + if (unlikely(strp->rx_paused)) { + /* Upper layer paused strp */ + break; + } + } + + if (cloned_orig) + kfree_skb(orig_skb); + + STRP_STATS_ADD(strp->stats.rx_bytes, eaten); + + return eaten; +} + +static int default_read_sock_done(struct strparser *strp, int err) +{ + return err; +} + +/* Called with lock held on lower socket */ +static int strp_read_sock(struct strparser *strp) +{ + struct socket *sock = strp->sk->sk_socket; + read_descriptor_t desc; + + desc.arg.data = strp; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + /* sk should be locked here, so okay to do read_sock */ + sock->ops->read_sock(strp->sk, &desc, strp_recv); + + desc.error = strp->cb.read_sock_done(strp, desc.error); + + return desc.error; +} + +/* Lower sock lock held */ +void strp_data_ready(struct strparser *strp) +{ + if (unlikely(strp->rx_stopped)) + return; + + /* This check is needed to synchronize with do_strp_rx_work. + * do_strp_rx_work acquires a process lock (lock_sock) whereas + * the lock held here is bh_lock_sock. The two locks can be + * held by different threads at the same time, but bh_lock_sock + * allows a thread in BH context to safely check if the process + * lock is held. In this case, if the lock is held, queue work. + */ + if (sock_owned_by_user(strp->sk)) { + queue_work(strp_wq, &strp->rx_work); + return; + } + + if (strp->rx_paused) + return; + + if (strp->rx_need_bytes) { + if (strp_peek_len(strp) >= strp->rx_need_bytes) + strp->rx_need_bytes = 0; + else + return; + } + + if (strp_read_sock(strp) == -ENOMEM) + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_data_ready); + +static void do_strp_rx_work(struct strparser *strp) +{ + read_descriptor_t rd_desc; + struct sock *csk = strp->sk; + + /* We need the read lock to synchronize with strp_data_ready. We + * need the socket lock for calling strp_read_sock. 
+ */ + lock_sock(csk); + + if (unlikely(strp->rx_stopped)) + goto out; + + if (strp->rx_paused) + goto out; + + rd_desc.arg.data = strp; + + if (strp_read_sock(strp) == -ENOMEM) + queue_work(strp_wq, &strp->rx_work); + +out: + release_sock(csk); +} + +static void strp_rx_work(struct work_struct *w) +{ + do_strp_rx_work(container_of(w, struct strparser, rx_work)); +} + +static void strp_rx_msg_timeout(unsigned long arg) +{ + struct strparser *strp = (struct strparser *)arg; + + /* Message assembly timed out */ + STRP_STATS_INCR(strp->stats.rx_msg_timeouts); + lock_sock(strp->sk); + strp->cb.abort_parser(strp, ETIMEDOUT); + release_sock(strp->sk); +} + +int strp_init(struct strparser *strp, struct sock *csk, + struct strp_callbacks *cb) +{ + struct socket *sock = csk->sk_socket; + + if (!cb || !cb->rcv_msg || !cb->parse_msg) + return -EINVAL; + + if (!sock->ops->read_sock || !sock->ops->peek_len) + return -EAFNOSUPPORT; + + memset(strp, 0, sizeof(*strp)); + + strp->sk = csk; + + setup_timer(&strp->rx_msg_timer, strp_rx_msg_timeout, + (unsigned long)strp); + + INIT_WORK(&strp->rx_work, strp_rx_work); + + strp->cb.rcv_msg = cb->rcv_msg; + strp->cb.parse_msg = cb->parse_msg; + strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; + strp->cb.abort_parser = cb->abort_parser ? : strp_abort_rx_strp; + + return 0; +} +EXPORT_SYMBOL_GPL(strp_init); + +void strp_unpause(struct strparser *strp) +{ + strp->rx_paused = 0; + + /* Sync setting rx_paused with RX work */ + smp_mb(); + + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_unpause); + +/* strp must already be stopped so that strp_recv will no longer be called. + * Note that strp_done is not called with the lower socket held. + */ +void strp_done(struct strparser *strp) +{ + WARN_ON(!strp->rx_stopped); + + del_timer_sync(&strp->rx_msg_timer); + cancel_work_sync(&strp->rx_work); + + if (strp->rx_skb_head) { + kfree_skb(strp->rx_skb_head); + strp->rx_skb_head = NULL; + } +} +EXPORT_SYMBOL_GPL(strp_done); + +void strp_stop(struct strparser *strp) +{ + strp->rx_stopped = 1; +} +EXPORT_SYMBOL_GPL(strp_stop); + +void strp_check_rcv(struct strparser *strp) +{ + queue_work(strp_wq, &strp->rx_work); +} +EXPORT_SYMBOL_GPL(strp_check_rcv); + +static int __init strp_mod_init(void) +{ + strp_wq = create_singlethread_workqueue("kstrp"); + + return 0; +} + +static void __exit strp_mod_exit(void) +{ +} +module_init(strp_mod_init); +module_exit(strp_mod_exit); +MODULE_LICENSE("GPL"); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index a5fc9dd24aa9..02beb35f577f 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -21,7 +21,6 @@ #include <linux/workqueue.h> #include <linux/if_vlan.h> #include <linux/rtnetlink.h> -#include <net/ip_fib.h> #include <net/switchdev.h> /** @@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj) switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_VLAN: return sizeof(struct switchdev_obj_port_vlan); - case SWITCHDEV_OBJ_ID_IPV4_FIB: - return sizeof(struct switchdev_obj_ipv4_fib); case SWITCHDEV_OBJ_ID_PORT_FDB: return sizeof(struct switchdev_obj_port_fdb); case SWITCHDEV_OBJ_ID_PORT_MDB: @@ -1042,7 +1039,7 @@ static int switchdev_port_fdb_dump_cb(struct switchdev_obj *obj) struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[0]) + if (dump->idx < dump->cb->args[2]) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, @@ -1089,7 +1086,7 @@ nla_put_failure: */ int switchdev_port_fdb_dump(struct sk_buff *skb, 
struct netlink_callback *cb, struct net_device *dev, - struct net_device *filter_dev, int idx) + struct net_device *filter_dev, int *idx) { struct switchdev_fdb_dump dump = { .fdb.obj.orig_dev = dev, @@ -1097,207 +1094,27 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .dev = dev, .skb = skb, .cb = cb, - .idx = idx, + .idx = *idx, }; int err; err = switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); - cb->args[1] = err; - return dump.idx; + *idx = dump.idx; + return err; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - struct net_device *lower_dev; - struct net_device *port_dev; - struct list_head *iter; - - /* Recusively search down until we find a sw port dev. - * (A sw port dev supports switchdev_port_attr_get). - */ - - if (ops && ops->switchdev_port_attr_get) - return dev; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = switchdev_get_lowest_dev(lower_dev); - if (port_dev) - return port_dev; - } - - return NULL; -} - -static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) -{ - struct switchdev_attr attr = { - .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - }; - struct switchdev_attr prev_attr; - struct net_device *dev = NULL; - int nhsel; - - ASSERT_RTNL(); - - /* For this route, all nexthop devs must be on the same switch. */ - - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (!nh->nh_dev) - return NULL; - - dev = switchdev_get_lowest_dev(nh->nh_dev); - if (!dev) - return NULL; - - attr.orig_dev = dev; - if (switchdev_port_attr_get(dev, &attr)) - return NULL; - - if (nhsel > 0 && - !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) - return NULL; - - prev_attr = attr; - } - - return dev; -} - -/** - * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @nlflags: netlink flags passed in (NLM_F_*) - * @tb_id: route table ID - * - * Add/modify switch IPv4 route entry. - */ -int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = nlflags, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - /* Don't offload route if using custom ip rules or if - * IPv4 FIB offloading has been disabled completely. - */ - -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (fi->fib_net->ipv4.fib_has_custom_rules) - return 0; -#endif - - if (fi->fib_net->ipv4.fib_offload_disabled) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_add(dev, &ipv4_fib.obj); - if (!err) - fi->fib_flags |= RTNH_F_OFFLOAD; - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); - -/** - * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch - * - * @dst: route's IPv4 destination address - * @dst_len: destination address length (prefix length) - * @fi: route FIB info structure - * @tos: route TOS - * @type: route type - * @tb_id: route table ID - * - * Delete IPv4 route entry from switch device. 
- */ -int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) -{ - struct switchdev_obj_ipv4_fib ipv4_fib = { - .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB, - .dst = dst, - .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .nlflags = 0, - .tb_id = tb_id, - }; - struct net_device *dev; - int err = 0; - - if (!(fi->fib_flags & RTNH_F_OFFLOAD)) - return 0; - - dev = switchdev_get_dev_by_nhs(fi); - if (!dev) - return 0; - - ipv4_fib.obj.orig_dev = dev; - err = switchdev_port_obj_del(dev, &ipv4_fib.obj); - if (!err) - fi->fib_flags &= ~RTNH_F_OFFLOAD; - - return err == -EOPNOTSUPP ? 0 : err; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); - -/** - * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation - * - * @fi: route FIB info structure - */ -void switchdev_fib_ipv4_abort(struct fib_info *fi) -{ - /* There was a problem installing this route to the offload - * device. For now, until we come up with more refined - * policy handling, abruptly end IPv4 fib offloading for - * for entire net by flushing offload device(s) of all - * IPv4 routes, and mark IPv4 fib offloading broken from - * this point forward. - */ - - fib_flush_external(fi->fib_net); - fi->fib_net->ipv4.fib_offload_disabled = true; -} -EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); - bool switchdev_port_same_parent_id(struct net_device *a, struct net_device *b) { struct switchdev_attr a_attr = { .orig_dev = a, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, }; struct switchdev_attr b_attr = { .orig_dev = b, .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, - .flags = SWITCHDEV_F_NO_RECURSE, }; if (switchdev_port_attr_get(a, &a_attr) || @@ -1306,89 +1123,4 @@ bool switchdev_port_same_parent_id(struct net_device *a, return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); } - -static u32 switchdev_port_fwd_mark_get(struct net_device *dev, - struct net_device *group_dev) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev == dev) - continue; - if (switchdev_port_same_parent_id(dev, lower_dev)) - return lower_dev->offload_fwd_mark; - return switchdev_port_fwd_mark_get(dev, lower_dev); - } - - return dev->ifindex; -} EXPORT_SYMBOL_GPL(switchdev_port_same_parent_id); - -static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, - u32 old_mark, u32 *reset_mark) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(group_dev, lower_dev, iter) { - if (lower_dev->offload_fwd_mark == old_mark) { - if (!*reset_mark) - *reset_mark = lower_dev->ifindex; - lower_dev->offload_fwd_mark = *reset_mark; - } - switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); - } -} - -/** - * switchdev_port_fwd_mark_set - Set port offload forwarding mark - * - * @dev: port device - * @group_dev: containing device - * @joining: true if dev is joining group; false if leaving group - * - * An ungrouped port's offload mark is just its ifindex. A grouped - * port's (member of a bridge, for example) offload mark is the ifindex - * of one of the ports in the group with the same parent (switch) ID. - * Ports on the same device in the same group will have the same mark. 
- * - * Example: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=5 - * sw2p2 ifindex=5 mark=5 - * - * If sw2p2 leaves the bridge, we'll have: - * - * br0 ifindex=9 - * sw1p1 ifindex=2 mark=2 - * sw1p2 ifindex=3 mark=2 - * sw2p1 ifindex=4 mark=4 - * sw2p2 ifindex=5 mark=5 - */ -void switchdev_port_fwd_mark_set(struct net_device *dev, - struct net_device *group_dev, - bool joining) -{ - u32 mark = dev->ifindex; - u32 reset_mark = 0; - - if (group_dev) { - ASSERT_RTNL(); - if (joining) - mark = switchdev_port_fwd_mark_get(dev, group_dev); - else if (dev->offload_fwd_mark == mark) - /* Ohoh, this port was the mark reference port, - * but it's leaving the group, so reset the - * mark for the remaining ports in the group. - */ - switchdev_port_fwd_mark_reset(group_dev, mark, - &reset_mark); - } - - dev->offload_fwd_mark = mark; -} -EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 46a71c701e7c..e0c71bd8f7cf 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header *head, struct ctl_table *table) { struct net *net = container_of(head->set, struct net, sysctls); - kuid_t root_uid = make_kuid(net->user_ns, 0); - kgid_t root_gid = make_kgid(net->user_ns, 0); /* Allow network administrator to have same access as root. */ - if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN) || - uid_eq(root_uid, current_euid())) { + if (ns_capable_noaudit(net->user_ns, CAP_NET_ADMIN)) { int mode = (table->mode >> 6) & 7; return (mode << 6) | (mode << 3) | mode; } - /* Allow netns root group to have the same access as the root group */ - if (in_egroup_p(root_gid)) { - int mode = (table->mode >> 3) & 7; - return (mode << 3) | mode; - } + return table->mode; } +static void net_ctl_set_ownership(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid) +{ + struct net *net = container_of(head->set, struct net, sysctls); + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ns_root_uid = make_kuid(net->user_ns, 0); + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + ns_root_gid = make_kgid(net->user_ns, 0); + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; +} + static struct ctl_table_root net_sysctl_root = { .lookup = net_ctl_header_lookup, .permissions = net_ctl_permissions, + .set_ownership = net_ctl_set_ownership, }; static int __net_init sysctl_net_init(struct net *net) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index ae469b37d852..753f774cb46f 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -269,18 +269,19 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked) * * RCU is locked, no other locks set */ -void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, - struct tipc_msg *hdr) +int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, + struct tipc_msg *hdr) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; struct sk_buff_head xmitq; + int rc = 0; __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (msg_type(hdr) == STATE_MSG) { tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); - tipc_link_bc_sync_rcv(l, hdr, &xmitq); + rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq); } else { tipc_link_bc_init_rcv(l, hdr); } @@ -291,6 +292,7 @@ void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, /* Any socket wakeup messages ? 
*/ if (!skb_queue_empty(inputq)) tipc_sk_rcv(net, inputq); + return rc; } /* tipc_bcast_add_peer - add a peer node to broadcast link and bearer diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index d5e79b3767fd..5ffe34472ccd 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -56,8 +56,8 @@ int tipc_bcast_get_mtu(struct net *net); int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list); int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked); -void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, - struct tipc_msg *hdr); +int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, + struct tipc_msg *hdr); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); int tipc_bclink_reset_stats(struct net *net); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 65b1bbf133bd..975dbeb60ab0 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -42,6 +42,7 @@ #include "monitor.h" #include "bcast.h" #include "netlink.h" +#include "udp_media.h" #define MAX_ADDR_STR 60 @@ -56,6 +57,13 @@ static struct tipc_media * const media_info_array[] = { NULL }; +static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) +{ + struct tipc_net *tn = tipc_net(net); + + return rcu_dereference_rtnl(tn->bearer_list[bearer_id]); +} + static void bearer_disable(struct net *net, struct tipc_bearer *b); /** @@ -323,6 +331,7 @@ restart: b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = priority; + test_and_set_bit_lock(0, &b->up); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { @@ -360,15 +369,24 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) */ void tipc_bearer_reset_all(struct net *net) { - struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; int i; for (i = 0; i < MAX_BEARERS; i++) { - b = rcu_dereference_rtnl(tn->bearer_list[i]); + b = bearer_get(net, i); + if (b) + clear_bit_unlock(0, &b->up); + } + for (i = 0; i < MAX_BEARERS; i++) { + b = bearer_get(net, i); if (b) tipc_reset_bearer(net, b); } + for (i = 0; i < MAX_BEARERS; i++) { + b = bearer_get(net, i); + if (b) + test_and_set_bit_lock(0, &b->up); + } } /** @@ -382,8 +400,9 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) int bearer_id = b->identity; pr_info("Disabling bearer <%s>\n", b->name); - b->media->disable_media(b); + clear_bit_unlock(0, &b->up); tipc_node_delete_links(net, bearer_id); + b->media->disable_media(b); RCU_INIT_POINTER(b->media_ptr, NULL); if (b->link_req) tipc_disc_delete(b->link_req); @@ -440,22 +459,16 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, { struct net_device *dev; int delta; - void *tipc_ptr; dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); if (!dev) return 0; - /* Send RESET message even if bearer is detached from device */ - tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr); - if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb)))) - goto drop; - - delta = dev->hard_header_len - skb_headroom(skb); - if ((delta > 0) && - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) - goto drop; - + delta = SKB_DATA_ALIGN(dev->hard_header_len - skb_headroom(skb)); + if ((delta > 0) && pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) { + kfree_skb(skb); + return 0; + } skb_reset_network_header(skb); skb->dev = dev; skb->protocol = htons(ETH_P_TIPC); @@ -463,9 +476,6 @@ int tipc_l2_send_msg(struct net *net, 
struct sk_buff *skb, dev->dev_addr, skb->len); dev_queue_xmit(skb); return 0; -drop: - kfree_skb(skb); - return 0; } int tipc_bearer_mtu(struct net *net, u32 bearer_id) @@ -487,12 +497,12 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, struct sk_buff *skb, struct tipc_media_addr *dest) { - struct tipc_net *tn = tipc_net(net); + struct tipc_msg *hdr = buf_msg(skb); struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (likely(b)) + b = bearer_get(net, bearer_id); + if (likely(b && (test_bit(0, &b->up) || msg_is_reset(hdr)))) b->media->send_msg(net, skb, b, dest); else kfree_skb(skb); @@ -505,7 +515,6 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr *dst) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_bearer *b; struct sk_buff *skb, *tmp; @@ -513,12 +522,15 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, return; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = bearer_get(net, bearer_id); if (unlikely(!b)) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, dst); + if (likely(test_bit(0, &b->up) || msg_is_reset(buf_msg(skb)))) + b->media->send_msg(net, skb, b, dst); + else + kfree_skb(skb); } rcu_read_unlock(); } @@ -535,8 +547,8 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, struct tipc_msg *hdr; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (unlikely(!b)) + b = bearer_get(net, bearer_id); + if (unlikely(!b || !test_bit(0, &b->up))) __skb_queue_purge(xmitq); skb_queue_walk_safe(xmitq, skb, tmp) { hdr = buf_msg(skb); @@ -566,7 +578,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); b = rcu_dereference_rtnl(dev->tipc_ptr); - if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) { + if (likely(b && test_bit(0, &b->up) && + (skb->pkt_type <= PACKET_BROADCAST))) { skb->next = NULL; tipc_rcv(dev_net(dev), skb, b); rcu_read_unlock(); @@ -591,18 +604,9 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; - int i; b = rtnl_dereference(dev->tipc_ptr); - if (!b) { - for (i = 0; i < MAX_BEARERS; b = NULL, i++) { - b = rtnl_dereference(tn->bearer_list[i]); - if (b && (b->media_ptr == dev)) - break; - } - } if (!b) return NOTIFY_DONE; @@ -613,11 +617,10 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, if (netif_carrier_ok(dev)) break; case NETDEV_UP: - rcu_assign_pointer(dev->tipc_ptr, b); + test_and_set_bit_lock(0, &b->up); break; case NETDEV_GOING_DOWN: - RCU_INIT_POINTER(dev->tipc_ptr, NULL); - synchronize_net(); + clear_bit_unlock(0, &b->up); tipc_reset_bearer(net, b); break; case NETDEV_CHANGEMTU: @@ -709,6 +712,14 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; nla_nest_end(msg->skb, prop); + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) { + if (tipc_udp_nl_add_bearer_data(msg, bearer)) + goto attr_msg_full; + } +#endif + nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -895,6 +906,49 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) return 0; } +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct 
tipc_bearer *b; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + struct net *net = sock_net(skb->sk); + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(net, name); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + +#ifdef CONFIG_TIPC_MEDIA_UDP + if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { + err = tipc_udp_nl_bearer_add(b, + attrs[TIPC_NLA_BEARER_UDP_OPTS]); + if (err) { + rtnl_unlock(); + return err; + } + } +#endif + rtnl_unlock(); + + return 0; +} + int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { int err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 43757f1f9cb3..78892e2f53e3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -150,6 +150,7 @@ struct tipc_bearer { u32 identity; struct tipc_link_req *link_req; char net_plane; + unsigned long up; }; struct tipc_bearer_names { @@ -180,6 +181,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/link.c b/net/tipc/link.c index 877d94f34814..b36e16cdc945 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -181,7 +181,10 @@ struct tipc_link { u16 acked; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; - int nack_state; + unsigned long prev_retr; + u16 prev_from; + u16 prev_to; + u8 nack_state; bool bc_peer_is_up; /* Statistics */ @@ -202,6 +205,8 @@ enum { BC_NACK_SND_SUPPRESS, }; +#define TIPC_BC_RETR_LIMIT 10 /* [ms] */ + /* * Interval between NACKs when packets arrive out of order */ @@ -237,8 +242,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq); static void link_print(struct tipc_link *l, const char *str); -static void tipc_link_build_nack_msg(struct tipc_link *l, - struct sk_buff_head *xmitq); +static int tipc_link_build_nack_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static bool tipc_link_release_pkts(struct tipc_link *l, u16 to); @@ -367,6 +372,18 @@ int tipc_link_bc_peers(struct tipc_link *l) return l->ackers; } +u16 link_bc_rcv_gap(struct tipc_link *l) +{ + struct sk_buff *skb = skb_peek(&l->deferdq); + u16 gap = 0; + + if (more(l->snd_nxt, l->rcv_nxt)) + gap = l->snd_nxt - l->rcv_nxt; + if (skb) + gap = buf_seqno(skb) - l->rcv_nxt; + return gap; +} + void tipc_link_set_mtu(struct tipc_link *l, int mtu) { l->mtu = mtu; @@ -807,7 +824,7 @@ void link_prepare_wakeup(struct tipc_link *l) skb_queue_walk_safe(&l->wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; - lim = l->window + l->backlog[imp].limit; + lim = l->backlog[imp].limit; pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; if ((pnd[imp] + l->backlog[imp].len) >= lim) break; @@ -873,9 +890,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff *skb, 
*_skb, *bskb; /* Match msg importance against this and all higher backlog limits: */ - for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { - if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) - return link_schedule_user(l, list); + if (!skb_queue_empty(backlogq)) { + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) + return link_schedule_user(l, list); + } } if (unlikely(msg_size(hdr) > mtu)) { skb_queue_purge(list); @@ -1133,7 +1152,10 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) return 0; l->rcv_unacked = 0; - return TIPC_LINK_SND_BC_ACK; + + /* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */ + l->snd_nxt = l->rcv_nxt; + return TIPC_LINK_SND_STATE; } /* Unicast ACK */ @@ -1162,17 +1184,26 @@ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) } /* tipc_link_build_nack_msg: prepare link nack message for transmission + * Note that sending of broadcast NACK is coordinated among nodes, to + * reduce the risk of NACK storms towards the sender */ -static void tipc_link_build_nack_msg(struct tipc_link *l, - struct sk_buff_head *xmitq) +static int tipc_link_build_nack_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; + int match1, match2; - if (link_is_bc_rcvlink(l)) - return; + if (link_is_bc_rcvlink(l)) { + match1 = def_cnt & 0xf; + match2 = tipc_own_addr(l->net) & 0xf; + if (match1 == match2) + return TIPC_LINK_SND_STATE; + return 0; + } if ((skb_queue_len(&l->deferdq) == 1) || !(def_cnt % TIPC_NACK_INTV)) tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + return 0; } /* tipc_link_rcv - process TIPC packets/messages arriving from off-node @@ -1223,7 +1254,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { __tipc_skb_queue_sorted(defq, seqno, skb); - tipc_link_build_nack_msg(l, xmitq); + rc |= tipc_link_build_nack_msg(l, xmitq); break; } @@ -1234,7 +1265,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, rc |= tipc_link_input(l, skb, l->inputq); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) rc |= tipc_link_build_state_msg(l, xmitq); - if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK)) + if (unlikely(rc & ~TIPC_LINK_SND_STATE)) break; } while ((skb = __skb_dequeue(defq))); @@ -1248,10 +1279,11 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) { + struct tipc_link *bcl = l->bc_rcvlink; struct sk_buff *skb; struct tipc_msg *hdr; struct sk_buff_head *dfq = &l->deferdq; - bool node_up = link_is_up(l->bc_rcvlink); + bool node_up = link_is_up(bcl); struct tipc_mon_state *mstate = &l->mon_state; int dlen = 0; void *data; @@ -1279,7 +1311,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_net_plane(hdr, l->net_plane); msg_set_next_sent(hdr, l->snd_nxt); msg_set_ack(hdr, l->rcv_nxt - 1); - msg_set_bcast_ack(hdr, l->bc_rcvlink->rcv_nxt - 1); + msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1); msg_set_link_tolerance(hdr, tolerance); msg_set_linkprio(hdr, priority); @@ -1289,6 +1321,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (mtyp == STATE_MSG) { msg_set_seq_gap(hdr, rcvgap); + msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); 
msg_set_probe(hdr, probe); tipc_mon_prep(l->net, data, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + dlen); @@ -1571,51 +1604,107 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) l->rcv_nxt = peers_snd_nxt; } +/* link_bc_retr_eval() - check if the indicated range can be retransmitted now + * - Adjust permitted range if there is overlap with previous retransmission + */ +static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to) +{ + unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr); + + if (less(*to, *from)) + return false; + + /* New retransmission request */ + if ((elapsed > TIPC_BC_RETR_LIMIT) || + less(*to, l->prev_from) || more(*from, l->prev_to)) { + l->prev_from = *from; + l->prev_to = *to; + l->prev_retr = jiffies; + return true; + } + + /* Inside range of previous retransmit */ + if (!less(*from, l->prev_from) && !more(*to, l->prev_to)) + return false; + + /* Fully or partially outside previous range => exclude overlap */ + if (less(*from, l->prev_from)) { + *to = l->prev_from - 1; + l->prev_from = *from; + } + if (more(*to, l->prev_to)) { + *from = l->prev_to + 1; + l->prev_to = *to; + } + l->prev_retr = jiffies; + return true; +} + /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ -void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, - struct sk_buff_head *xmitq) +int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, + struct sk_buff_head *xmitq) { + struct tipc_link *snd_l = l->bc_sndlink; u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); + u16 from = msg_bcast_ack(hdr) + 1; + u16 to = from + msg_bc_gap(hdr) - 1; + int rc = 0; if (!link_is_up(l)) - return; + return rc; if (!msg_peer_node_is_up(hdr)) - return; + return rc; /* Open when peer acknowledges our bcast init msg (pkt #1) */ if (msg_ack(hdr)) l->bc_peer_is_up = true; if (!l->bc_peer_is_up) - return; + return rc; + + l->stats.recv_nacks++; /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) - return; + return rc; + + if (link_bc_retr_eval(snd_l, &from, &to)) + rc = tipc_link_retrans(snd_l, from, to, xmitq); + + l->snd_nxt = peers_snd_nxt; + if (link_bc_rcv_gap(l)) + rc |= TIPC_LINK_SND_STATE; + + /* Return now if sender supports nack via STATE messages */ + if (l->peer_caps & TIPC_BCAST_STATE_NACK) + return rc; + + /* Otherwise, be backwards compatible */ if (!more(peers_snd_nxt, l->rcv_nxt)) { l->nack_state = BC_NACK_SND_CONDITIONAL; - return; + return 0; } /* Don't NACK if one was recently sent or peeked */ if (l->nack_state == BC_NACK_SND_SUPPRESS) { l->nack_state = BC_NACK_SND_UNCONDITIONAL; - return; + return 0; } /* Conditionally delay NACK sending until next synch rcv */ if (l->nack_state == BC_NACK_SND_CONDITIONAL) { l->nack_state = BC_NACK_SND_UNCONDITIONAL; if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN) - return; + return 0; } /* Send NACK now but suppress next one */ tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq); l->nack_state = BC_NACK_SND_SUPPRESS; + return 0; } void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, @@ -1652,6 +1741,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, } /* tipc_link_bc_nack_rcv(): receive broadcast nack message + * This function is here for backwards compatibility, since + * no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5. 
*/ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) @@ -1692,10 +1783,10 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = win / 2; - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = win; - l->backlog[TIPC_HIGH_IMPORTANCE].limit = win / 2 * 3; - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = win * 2; + l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); + l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } diff --git a/net/tipc/link.h b/net/tipc/link.h index d7e9d42fcb2d..d1bd1787a768 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -63,7 +63,7 @@ enum { enum { TIPC_LINK_UP_EVT = 1, TIPC_LINK_DOWN_EVT = (1 << 1), - TIPC_LINK_SND_BC_ACK = (1 << 2) + TIPC_LINK_SND_STATE = (1 << 2) }; /* Starting value for maximum packet size negotiation on unicast links @@ -138,8 +138,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); -void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, - struct sk_buff_head *xmitq); +int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, + struct sk_buff_head *xmitq); int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 7cf52fb39bee..c3832cdf2278 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -719,6 +719,16 @@ static inline char *msg_media_addr(struct tipc_msg *m) return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } +static inline u32 msg_bc_gap(struct tipc_msg *m) +{ + return msg_bits(m, 8, 0, 0x3ff); +} + +static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 8, 0, 0x3ff, n); +} + /* * Word 9 */ diff --git a/net/tipc/net.h b/net/tipc/net.h index 77a7a118911d..c7c254902873 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -39,6 +39,8 @@ #include <net/genetlink.h> +extern const struct nla_policy tipc_nl_net_policy[]; + int tipc_net_start(struct net *net, u32 addr); void tipc_net_stop(struct net *net); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index a84daec0afe9..3200059d14b2 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -41,6 +41,7 @@ #include "link.h" #include "node.h" #include "net.h" +#include "udp_media.h" #include <net/genetlink.h> static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { @@ -161,6 +162,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .policy = tipc_nl_policy, }, { + .cmd = TIPC_NL_BEARER_ADD, + .doit = tipc_nl_bearer_add, + .policy = tipc_nl_policy, + }, + { .cmd = TIPC_NL_BEARER_SET, .doit = tipc_nl_bearer_set, .policy = tipc_nl_policy, @@ -238,6 +244,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .dumpit = tipc_nl_node_dump_monitor_peer, .policy = tipc_nl_policy, }, + { + .cmd = TIPC_NL_PEER_REMOVE, + .doit = tipc_nl_peer_rm, + .policy = tipc_nl_policy, + }, +#ifdef CONFIG_TIPC_MEDIA_UDP + { + .cmd = TIPC_NL_UDP_GET_REMOTEIP, + .dumpit = tipc_udp_nl_dump_remoteip, + .policy = tipc_nl_policy, + }, +#endif }; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr 
***attr) diff --git a/net/tipc/node.c b/net/tipc/node.c index 21974191e425..7ef14e2d2356 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1262,6 +1262,34 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) kfree_skb(skb); } +static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, + int bearer_id, struct sk_buff_head *xmitq) +{ + struct tipc_link *ucl; + int rc; + + rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr); + + if (rc & TIPC_LINK_DOWN_EVT) { + tipc_bearer_reset_all(n->net); + return; + } + + if (!(rc & TIPC_LINK_SND_STATE)) + return; + + /* If probe message, a STATE response will be sent anyway */ + if (msg_probe(hdr)) + return; + + /* Produce a STATE message carrying broadcast NACK */ + tipc_node_read_lock(n); + ucl = n->links[bearer_id].link; + if (ucl) + tipc_link_build_state_msg(ucl, xmitq); + tipc_node_read_unlock(n); +} + /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace @@ -1298,7 +1326,7 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ - if (rc & TIPC_LINK_SND_BC_ACK) { + if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); @@ -1505,7 +1533,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) - tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr); + tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack); @@ -1553,6 +1581,69 @@ discard: kfree_skb(skb); } +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + struct tipc_node *peer; + u32 addr; + int err; + int i; + + /* We identify the peer by its net */ + if (!info->attrs[TIPC_NLA_NET]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_NET_ADDR]) + return -EINVAL; + + addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + + if (in_own_node(net, addr)) + return -ENOTSUPP; + + spin_lock_bh(&tn->node_list_lock); + peer = tipc_node_find(net, addr); + if (!peer) { + spin_unlock_bh(&tn->node_list_lock); + return -ENXIO; + } + + tipc_node_write_lock(peer); + if (peer->state != SELF_DOWN_PEER_DOWN && + peer->state != SELF_DOWN_PEER_LEAVING) { + tipc_node_write_unlock(peer); + err = -EBUSY; + goto err_out; + } + + for (i = 0; i < MAX_BEARERS; i++) { + struct tipc_link_entry *le = &peer->links[i]; + + if (le->link) { + kfree(le->link); + le->link = NULL; + peer->link_cnt--; + } + } + tipc_node_write_unlock(peer); + tipc_node_delete(peer); + + err = 0; +err_out: + tipc_node_put(peer); + spin_unlock_bh(&tn->node_list_lock); + + return err; +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index d69fdfcc0ec9..39ef54c1f2ad 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -1,7 +1,7 @@ /* * net/tipc/node.h: Include file for TIPC node management routines * - * Copyright (c) 2000-2006, 2014-2015, Ericsson AB + * Copyright (c) 2000-2006, 2014-2016, 
Ericsson AB * Copyright (c) 2005, 2010-2014, Wind River Systems * All rights reserved. * @@ -45,11 +45,14 @@ /* Optional capabilities supported by this code version */ enum { - TIPC_BCAST_SYNCH = (1 << 1), - TIPC_BLOCK_FLOWCTL = (2 << 1) + TIPC_BCAST_SYNCH = (1 << 1), + TIPC_BCAST_STATE_NACK = (1 << 2), + TIPC_BLOCK_FLOWCTL = (1 << 3) }; -#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL) +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ + TIPC_BCAST_STATE_NACK | \ + TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); @@ -77,6 +80,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info); int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index ae7e14cae085..d80cd3f7503f 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -49,6 +49,7 @@ #include "core.h" #include "bearer.h" #include "netlink.h" +#include "msg.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 @@ -70,6 +71,13 @@ struct udp_media_addr { }; }; +/* struct udp_replicast - container for UDP remote addresses */ +struct udp_replicast { + struct udp_media_addr addr; + struct rcu_head rcu; + struct list_head list; +}; + /** * struct udp_bearer - ip/udp bearer data structure * @bearer: associated generic tipc bearer @@ -82,8 +90,20 @@ struct udp_bearer { struct socket *ubsock; u32 ifindex; struct work_struct work; + struct udp_replicast rcast; }; +static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr) +{ + if (ntohs(addr->proto) == ETH_P_IP) + return ipv4_is_multicast(addr->ipv4.s_addr); +#if IS_ENABLED(CONFIG_IPV6) + else + return ipv6_addr_is_multicast(&addr->ipv6); +#endif + return 0; +} + /* udp_media_addr_set - convert a ip/udp address to a TIPC media address */ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, struct udp_media_addr *ua) @@ -91,15 +111,9 @@ static void tipc_udp_media_addr_set(struct tipc_media_addr *addr, memset(addr, 0, sizeof(struct tipc_media_addr)); addr->media_id = TIPC_MEDIA_TYPE_UDP; memcpy(addr->value, ua, sizeof(struct udp_media_addr)); - if (ntohs(ua->proto) == ETH_P_IP) { - if (ipv4_is_multicast(ua->ipv4.s_addr)) - addr->broadcast = 1; - } else if (ntohs(ua->proto) == ETH_P_IPV6) { - if (ipv6_addr_type(&ua->ipv6) & IPV6_ADDR_MULTICAST) - addr->broadcast = 1; - } else { - pr_err("Invalid UDP media address\n"); - } + + if (tipc_udp_is_mcast_addr(ua)) + addr->broadcast = 1; } /* tipc_udp_addr2str - convert ip/udp address to string */ @@ -140,28 +154,13 @@ static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a) } /* tipc_send_msg - enqueue a send request */ -static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, - struct tipc_bearer *b, - struct tipc_media_addr *dest) +static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, + struct udp_bearer *ub, struct udp_media_addr *src, + struct udp_media_addr *dst) { int ttl, err = 0; - struct udp_bearer *ub; - struct udp_media_addr *dst = (struct udp_media_addr *)&dest->value; - struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; struct rtable *rt; - 
if (skb_headroom(skb) < UDP_MIN_HEADROOM) { - err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); - if (err) - goto tx_error; - } - - skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); - ub = rcu_dereference_rtnl(b->media_ptr); - if (!ub) { - err = -ENODEV; - goto tx_error; - } if (dst->proto == htons(ETH_P_IP)) { struct flowi4 fl = { .daddr = dst->ipv4.s_addr, @@ -207,29 +206,178 @@ tx_error: return err; } +static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, + struct tipc_bearer *b, + struct tipc_media_addr *addr) +{ + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value; + struct udp_replicast *rcast; + struct udp_bearer *ub; + int err = 0; + + if (skb_headroom(skb) < UDP_MIN_HEADROOM) { + err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); + if (err) + goto out; + } + + skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + err = -ENODEV; + goto out; + } + + if (!addr->broadcast || list_empty(&ub->rcast.list)) + return tipc_udp_xmit(net, skb, ub, src, dst); + + /* Replicast, send an skb to each configured IP address */ + list_for_each_entry_rcu(rcast, &ub->rcast.list, list) { + struct sk_buff *_skb; + + _skb = pskb_copy(skb, GFP_ATOMIC); + if (!_skb) { + err = -ENOMEM; + goto out; + } + + err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr); + if (err) { + kfree_skb(_skb); + goto out; + } + } + err = 0; +out: + kfree_skb(skb); + return err; +} + +static bool tipc_udp_is_known_peer(struct tipc_bearer *b, + struct udp_media_addr *addr) +{ + struct udp_replicast *rcast, *tmp; + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + pr_err_ratelimited("UDP bearer instance not found\n"); + return false; + } + + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr))) + return true; + } + + return false; +} + +static int tipc_udp_rcast_add(struct tipc_bearer *b, + struct udp_media_addr *addr) +{ + struct udp_replicast *rcast; + struct udp_bearer *ub; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC); + if (!rcast) + return -ENOMEM; + + memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr)); + + if (ntohs(addr->proto) == ETH_P_IP) + pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4); +#if IS_ENABLED(CONFIG_IPV6) + else if (ntohs(addr->proto) == ETH_P_IPV6) + pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6); +#endif + + list_add_rcu(&rcast->list, &ub->rcast.list); + return 0; +} + +static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb) +{ + struct udp_media_addr src = {0}; + struct udp_media_addr *dst; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (tipc_udp_is_mcast_addr(dst)) + return 0; + + src.port = udp_hdr(skb)->source; + + if (ip_hdr(skb)->version == 4) { + struct iphdr *iphdr = ip_hdr(skb); + + src.proto = htons(ETH_P_IP); + src.ipv4.s_addr = iphdr->saddr; + if (ipv4_is_multicast(iphdr->daddr)) + return 0; +#if IS_ENABLED(CONFIG_IPV6) + } else if (ip_hdr(skb)->version == 6) { + struct ipv6hdr *iphdr = ipv6_hdr(skb); + + src.proto = htons(ETH_P_IPV6); + src.ipv6 = iphdr->saddr; + if (ipv6_addr_is_multicast(&iphdr->daddr)) + return 0; +#endif + } else { + return 0; + } + + if (likely(tipc_udp_is_known_peer(b, &src))) + return 0; + + return tipc_udp_rcast_add(b, &src); +} + /* tipc_udp_recv - read data 
from bearer socket */ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb) { struct udp_bearer *ub; struct tipc_bearer *b; + struct tipc_msg *hdr; + int err; ub = rcu_dereference_sk_user_data(sk); if (!ub) { pr_err_ratelimited("Failed to get UDP bearer reference"); - kfree_skb(skb); - return 0; + goto out; } - skb_pull(skb, sizeof(struct udphdr)); + hdr = buf_msg(skb); + rcu_read_lock(); b = rcu_dereference_rtnl(ub->bearer); + if (!b) + goto rcu_out; - if (b) { + if (b && test_bit(0, &b->up)) { tipc_rcv(sock_net(sk), skb, b); rcu_read_unlock(); return 0; } + + if (unlikely(msg_user(hdr) == LINK_CONFIG)) { + err = tipc_udp_rcast_disc(b, skb); + if (err) + goto rcu_out; + } + + tipc_rcv(sock_net(sk), skb, b); rcu_read_unlock(); + return 0; + +rcu_out: + rcu_read_unlock(); +out: kfree_skb(skb); return 0; } @@ -241,15 +389,11 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) struct sock *sk = ub->ubsock->sk; if (ntohs(remote->proto) == ETH_P_IP) { - if (!ipv4_is_multicast(remote->ipv4.s_addr)) - return 0; mreqn.imr_multiaddr = remote->ipv4; mreqn.imr_ifindex = ub->ifindex; err = ip_mc_join_group(sk, &mreqn); #if IS_ENABLED(CONFIG_IPV6) } else { - if (!ipv6_addr_is_multicast(&remote->ipv6)) - return 0; err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); #endif @@ -257,75 +401,234 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) return err; } -/** - * parse_options - build local/remote addresses from configuration - * @attrs: netlink config data - * @ub: UDP bearer instance - * @local: local bearer IP address/port - * @remote: peer or multicast IP/port - */ -static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, - struct udp_media_addr *local, - struct udp_media_addr *remote) +static int __tipc_nl_add_udp_addr(struct sk_buff *skb, + struct udp_media_addr *addr, int nla_t) { - struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; - struct sockaddr_storage sa_local, sa_remote; + if (ntohs(addr->proto) == ETH_P_IP) { + struct sockaddr_in ip4; - if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) - goto err; - if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, - attrs[TIPC_NLA_BEARER_UDP_OPTS], - tipc_nl_udp_policy)) - goto err; - if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { - nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL], - sizeof(sa_local)); - nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE], - sizeof(sa_remote)); + ip4.sin_family = AF_INET; + ip4.sin_port = addr->port; + ip4.sin_addr.s_addr = addr->ipv4.s_addr; + if (nla_put(skb, nla_t, sizeof(ip4), &ip4)) + return -EMSGSIZE; + +#if IS_ENABLED(CONFIG_IPV6) + } else if (ntohs(addr->proto) == ETH_P_IPV6) { + struct sockaddr_in6 ip6; + + ip6.sin6_family = AF_INET6; + ip6.sin6_port = addr->port; + memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr)); + if (nla_put(skb, nla_t, sizeof(ip6), &ip6)) + return -EMSGSIZE; +#endif + } + + return 0; +} + +int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) +{ + u32 bid = cb->args[0]; + u32 skip_cnt = cb->args[1]; + u32 portid = NETLINK_CB(cb->skb).portid; + struct udp_replicast *rcast, *tmp; + struct tipc_bearer *b; + struct udp_bearer *ub; + void *hdr; + int err; + int i; + + if (!bid && !skip_cnt) { + struct net *net = sock_net(skb->sk); + struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1]; + struct nlattr **attrs; + char *bname; + + err = tipc_nlmsg_parse(cb->nlh, &attrs); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(battrs, 
TIPC_NLA_BEARER_MAX, + attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!battrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + + bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(net, bname); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + bid = b->identity; } else { -err: - pr_err("Invalid UDP bearer configuration"); + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = net_generic(net, tipc_net_id); + + rtnl_lock(); + b = rtnl_dereference(tn->bearer_list[bid]); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + } + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) { + rtnl_unlock(); return -EINVAL; } - if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { - struct sockaddr_in *ip4; - - ip4 = (struct sockaddr_in *)&sa_local; - local->proto = htons(ETH_P_IP); - local->port = ip4->sin_port; - local->ipv4.s_addr = ip4->sin_addr.s_addr; - - ip4 = (struct sockaddr_in *)&sa_remote; - remote->proto = htons(ETH_P_IP); - remote->port = ip4->sin_port; - remote->ipv4.s_addr = ip4->sin_addr.s_addr; + + i = 0; + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + if (i < skip_cnt) + goto count; + + hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq, + &tipc_genl_family, NLM_F_MULTI, + TIPC_NL_BEARER_GET); + if (!hdr) + goto done; + + err = __tipc_nl_add_udp_addr(skb, &rcast->addr, + TIPC_NLA_UDP_REMOTE); + if (err) { + genlmsg_cancel(skb, hdr); + goto done; + } + genlmsg_end(skb, hdr); +count: + i++; + } +done: + rtnl_unlock(); + cb->args[0] = bid; + cb->args[1] = i; + + return skb->len; +} + +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) +{ + struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value; + struct udp_media_addr *dst; + struct udp_bearer *ub; + struct nlattr *nest; + + ub = rcu_dereference_rtnl(b->media_ptr); + if (!ub) + return -ENODEV; + + nest = nla_nest_start(msg->skb, TIPC_NLA_BEARER_UDP_OPTS); + if (!nest) + goto msg_full; + + if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL)) + goto msg_full; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE)) + goto msg_full; + + if (!list_empty(&ub->rcast.list)) { + if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP)) + goto msg_full; + } + + nla_nest_end(msg->skb, nest); + return 0; +msg_full: + nla_nest_cancel(msg->skb, nest); + return -EMSGSIZE; +} + +/** + * tipc_parse_udp_addr - build udp media address from netlink data + * @nlattr: netlink attribute containing sockaddr storage aligned address + * @addr: tipc media address to fill with address, port and protocol type + * @scope_id: IPv6 scope id pointer, not NULL indicates it's required + */ + +static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr, + u32 *scope_id) +{ + struct sockaddr_storage sa; + + nla_memcpy(&sa, nla, sizeof(sa)); + if (sa.ss_family == AF_INET) { + struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa; + + addr->proto = htons(ETH_P_IP); + addr->port = ip4->sin_port; + addr->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) - } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { - int atype; - struct sockaddr_in6 *ip6; + } else if (sa.ss_family == AF_INET6) { + struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa; - ip6 = (struct sockaddr_in6 *)&sa_local; - atype = ipv6_addr_type(&ip6->sin6_addr); - if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) - return 
-EINVAL; + addr->proto = htons(ETH_P_IPV6); + addr->port = ip6->sin6_port; + memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); + + /* Scope ID is only interesting for local addresses */ + if (scope_id) { + int atype; - local->proto = htons(ETH_P_IPV6); - local->port = ip6->sin6_port; - memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); - ub->ifindex = ip6->sin6_scope_id; + atype = ipv6_addr_type(&ip6->sin6_addr); + if (__ipv6_addr_needs_scope_id(atype) && + !ip6->sin6_scope_id) { + return -EINVAL; + } + + *scope_id = ip6->sin6_scope_id ? : 0; + } - ip6 = (struct sockaddr_in6 *)&sa_remote; - remote->proto = htons(ETH_P_IPV6); - remote->port = ip6->sin6_port; - memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif } return -EADDRNOTAVAIL; } +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) +{ + int err; + struct udp_media_addr addr = {0}; + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + struct udp_media_addr *dst; + + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy)) + return -EINVAL; + + if (!opts[TIPC_NLA_UDP_REMOTE]) + return -EINVAL; + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL); + if (err) + return err; + + dst = (struct udp_media_addr *)&b->bcast_addr.value; + if (tipc_udp_is_mcast_addr(dst)) { + pr_err("Can't add remote ip to TIPC UDP multicast bearer\n"); + return -EINVAL; + } + + if (tipc_udp_is_known_peer(b, &addr)) + return 0; + + return tipc_udp_rcast_add(b, &addr); +} + /** * tipc_udp_enable - callback to create a new udp bearer instance * @net: network namespace @@ -340,18 +643,38 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, { int err = -EINVAL; struct udp_bearer *ub; - struct udp_media_addr *remote; + struct udp_media_addr remote = {0}; struct udp_media_addr local = {0}; struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; + struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) return -ENOMEM; - remote = (struct udp_media_addr *)&b->bcast_addr.value; - memset(remote, 0, sizeof(struct udp_media_addr)); - err = parse_options(attrs, ub, &local, remote); + INIT_LIST_HEAD(&ub->rcast.list); + + if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) + goto err; + + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, + attrs[TIPC_NLA_BEARER_UDP_OPTS], + tipc_nl_udp_policy)) + goto err; + + if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { + pr_err("Invalid UDP bearer configuration"); + err = -EINVAL; + goto err; + } + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local, + &ub->ifindex); + if (err) + goto err; + + err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL); if (err) goto err; @@ -396,9 +719,18 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, tuncfg.encap_destroy = NULL; setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg); - err = enable_mcast(ub, remote); + /** + * The bcast media address port is used for all peers and the ip + * is used if it's a multicast address. 
+ */ + memcpy(&b->bcast_addr.value, &remote, sizeof(remote)); + if (tipc_udp_is_mcast_addr(&remote)) + err = enable_mcast(ub, &remote); + else + err = tipc_udp_rcast_add(b, &remote); if (err) goto err; + return 0; err: if (ub->ubsock) @@ -411,6 +743,12 @@ err: static void cleanup_bearer(struct work_struct *work) { struct udp_bearer *ub = container_of(work, struct udp_bearer, work); + struct udp_replicast *rcast, *tmp; + + list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) { + list_del_rcu(&rcast->list); + kfree_rcu(rcast, rcu); + } if (ub->ubsock) udp_tunnel_sock_release(ub->ubsock); diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h new file mode 100644 index 000000000000..281bbae87726 --- /dev/null +++ b/net/tipc/udp_media.h @@ -0,0 +1,46 @@ +/* + * net/tipc/udp_media.h: Include file for UDP bearer media + * + * Copyright (c) 1996-2006, 2013-2016, Ericsson AB + * Copyright (c) 2005, 2010-2011, Wind River Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef CONFIG_TIPC_MEDIA_UDP +#ifndef _TIPC_UDP_MEDIA_H +#define _TIPC_UDP_MEDIA_H + +int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); +int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b); +int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); + +#endif +#endif diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 0f506220a3bd..5497d022fada 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -372,6 +372,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: @@ -946,6 +947,7 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: /* these interface types don't really have a channel */ return; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/net/wireless/core.c b/net/wireless/core.c index 7645e97362c0..8201e6d7449e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -225,6 +225,23 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, } } +void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + ASSERT_RTNL(); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_NAN)) + return; + + if (!wdev->nan_started) + return; + + rdev_stop_nan(rdev, wdev); + wdev->nan_started = false; + + rdev->opencount--; +} + void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -242,6 +259,9 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) case NL80211_IFTYPE_P2P_DEVICE: cfg80211_stop_p2p_device(rdev, wdev); break; + case NL80211_IFTYPE_NAN: + cfg80211_stop_nan(rdev, wdev); + break; default: break; } @@ -537,6 +557,11 @@ static int wiphy_verify_combinations(struct wiphy *wiphy) c->limits[j].max > 1)) return -EINVAL; + /* Only a single NAN can be allowed */ + if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && + c->limits[j].max > 1)) + return -EINVAL; + cnt += c->limits[j].max; /* * Don't advertise an unsupported type @@ -579,6 +604,11 @@ int wiphy_register(struct wiphy *wiphy) !rdev->ops->tdls_cancel_channel_switch))) return -EINVAL; + if (WARN_ON((wiphy->interface_modes & BIT(NL80211_IFTYPE_NAN)) && + (!rdev->ops->start_nan || !rdev->ops->stop_nan || + !rdev->ops->add_nan_func || !rdev->ops->del_nan_func))) + return -EINVAL; + /* * if a wiphy has unsupported modes for regulatory channel enforcement, * opt-out of enforcement checking @@ -589,6 +619,7 @@ int wiphy_register(struct wiphy *wiphy) BIT(NL80211_IFTYPE_P2P_GO) | BIT(NL80211_IFTYPE_ADHOC) | BIT(NL80211_IFTYPE_P2P_DEVICE) | + BIT(NL80211_IFTYPE_NAN) | BIT(NL80211_IFTYPE_AP_VLAN) | BIT(NL80211_IFTYPE_MONITOR))) wiphy->regulatory_flags |= REGULATORY_IGNORE_STALE_KICKOFF; @@ -906,6 +937,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) if (WARN_ON(wdev->netdev)) return; + nl80211_notify_iface(rdev, wdev, NL80211_CMD_DEL_INTERFACE); + list_del_rcu(&wdev->list); rdev->devlist_generation++; @@ -914,6 +947,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) cfg80211_mlme_purge_registrations(wdev); cfg80211_stop_p2p_device(rdev, wdev); break; + case NL80211_IFTYPE_NAN: + cfg80211_stop_nan(rdev, wdev); + break; default: WARN_ON_ONCE(1); break; @@ -977,6 +1013,7 @@ void __cfg80211_leave(struct 
cfg80211_registered_device *rdev, /* must be handled by mac80211/driver, has no APIs */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: /* cannot happen, has no netdev */ break; case NL80211_IFTYPE_AP_VLAN: @@ -1079,6 +1116,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; + + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); break; case NETDEV_GOING_DOWN: cfg80211_leave(rdev, wdev); @@ -1157,6 +1196,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, * remove and clean it up. */ if (!list_empty(&wdev->list)) { + nl80211_notify_iface(rdev, wdev, + NL80211_CMD_DEL_INTERFACE); sysfs_remove_link(&dev->dev.kobj, "phy80211"); list_del_rcu(&wdev->list); rdev->devlist_generation++; @@ -1246,7 +1287,7 @@ static int __init cfg80211_init(void) if (err) goto out_fail_reg; - cfg80211_wq = create_singlethread_workqueue("cfg80211"); + cfg80211_wq = alloc_ordered_workqueue("cfg80211", WQ_MEM_RECLAIM); if (!cfg80211_wq) { err = -ENOMEM; goto out_fail_wq; diff --git a/net/wireless/core.h b/net/wireless/core.h index eee91443924d..08d2e948c9ad 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -249,9 +249,9 @@ struct cfg80211_event { }; struct cfg80211_cached_keys { - struct key_params params[6]; - u8 data[6][WLAN_MAX_KEY_LEN]; - int def, defmgmt; + struct key_params params[CFG80211_MAX_WEP_KEYS]; + u8 data[CFG80211_MAX_WEP_KEYS][WLAN_KEY_LEN_WEP104]; + int def; }; enum cfg80211_chan_mode { @@ -488,6 +488,9 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev); + #define CFG80211_MAX_NUM_DIFFERENT_CHANNELS 10 #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 4a4dda53bdf1..364f900a3dc4 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -43,7 +43,8 @@ void __cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); wdev->current_bss = bss_from_pub(bss); - cfg80211_upload_connect_keys(wdev); + if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) + cfg80211_upload_connect_keys(wdev); nl80211_send_ibss_bssid(wiphy_to_rdev(wdev->wiphy), dev, bssid, GFP_KERNEL); @@ -114,6 +115,9 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, } } + if (WARN_ON(connkeys && connkeys->def < 0)) + return -EINVAL; + if (WARN_ON(wdev->connect_keys)) kzfree(wdev->connect_keys); wdev->connect_keys = connkeys; @@ -284,18 +288,16 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, if (!netif_running(wdev->netdev)) return 0; - if (wdev->wext.keys) { + if (wdev->wext.keys) wdev->wext.keys->def = wdev->wext.default_key; - wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key; - } wdev->wext.ibss.privacy = wdev->wext.default_key != -1; - if (wdev->wext.keys) { + if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; - for (i = 0; i < 6; i++) + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) ck->params[i].key = ck->data[i]; } err = __cfg80211_join_ibss(rdev, wdev->netdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index c284d883c349..cbb48e26a871 100644 --- a/net/wireless/mlme.c +++ 
b/net/wireless/mlme.c @@ -222,7 +222,7 @@ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, ASSERT_WDEV_LOCK(wdev); if (auth_type == NL80211_AUTHTYPE_SHARED_KEY) - if (!key || !key_len || key_idx < 0 || key_idx > 4) + if (!key || !key_len || key_idx < 0 || key_idx > 3) return -EINVAL; if (wdev->current_bss && @@ -634,6 +634,7 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, * fall through, P2P device only supports * public action frames */ + case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4809f4d2cdcc..c510810f0b7c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -56,6 +56,7 @@ enum nl80211_multicast_groups { NL80211_MCGRP_REGULATORY, NL80211_MCGRP_MLME, NL80211_MCGRP_VENDOR, + NL80211_MCGRP_NAN, NL80211_MCGRP_TESTMODE /* keep last - ifdef! */ }; @@ -65,6 +66,7 @@ static const struct genl_multicast_group nl80211_mcgrps[] = { [NL80211_MCGRP_REGULATORY] = { .name = NL80211_MULTICAST_GROUP_REG }, [NL80211_MCGRP_MLME] = { .name = NL80211_MULTICAST_GROUP_MLME }, [NL80211_MCGRP_VENDOR] = { .name = NL80211_MULTICAST_GROUP_VENDOR }, + [NL80211_MCGRP_NAN] = { .name = NL80211_MULTICAST_GROUP_NAN }, #ifdef CONFIG_NL80211_TESTMODE [NL80211_MCGRP_TESTMODE] = { .name = NL80211_MULTICAST_GROUP_TESTMODE } #endif @@ -409,6 +411,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { .len = ETH_ALEN }, + [NL80211_ATTR_NAN_MASTER_PREF] = { .type = NLA_U8 }, + [NL80211_ATTR_NAN_DUAL] = { .type = NLA_U8 }, + [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -502,6 +507,39 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = { }, }; +/* policy for NAN function attributes */ +static const struct nla_policy +nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { + [NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SERVICE_ID_LEN }, + [NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { .len = ETH_ALEN }, + [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, + [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, + [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SERVICE_SPEC_INFO_MAX_LEN }, + [NL80211_NAN_FUNC_SRF] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_RX_MATCH_FILTER] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_TX_MATCH_FILTER] = { .type = NLA_NESTED }, + [NL80211_NAN_FUNC_INSTANCE_ID] = { .type = NLA_U8 }, + [NL80211_NAN_FUNC_TERM_REASON] = { .type = NLA_U8 }, +}; + +/* policy for Service Response Filter attributes */ +static const struct nla_policy +nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { + [NL80211_NAN_SRF_INCLUDE] = { .type = NLA_FLAG }, + [NL80211_NAN_SRF_BF] = { .type = NLA_BINARY, + .len = NL80211_NAN_FUNC_SRF_MAX_LEN }, + [NL80211_NAN_SRF_BF_IDX] = { .type = NLA_U8 }, + [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -848,13 +886,21 @@ nl80211_parse_connkeys(struct 
cfg80211_registered_device *rdev, struct nlattr *key; struct cfg80211_cached_keys *result; int rem, err, def = 0; + bool have_key = false; + + nla_for_each_nested(key, keys, rem) { + have_key = true; + break; + } + + if (!have_key) + return NULL; result = kzalloc(sizeof(*result), GFP_KERNEL); if (!result) return ERR_PTR(-ENOMEM); result->def = -1; - result->defmgmt = -1; nla_for_each_nested(key, keys, rem) { memset(&parse, 0, sizeof(parse)); @@ -866,7 +912,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, err = -EINVAL; if (!parse.p.key) goto error; - if (parse.idx < 0 || parse.idx > 4) + if (parse.idx < 0 || parse.idx > 3) goto error; if (parse.def) { if (def) @@ -881,16 +927,24 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, parse.idx, false, NULL); if (err) goto error; + if (parse.p.cipher != WLAN_CIPHER_SUITE_WEP40 && + parse.p.cipher != WLAN_CIPHER_SUITE_WEP104) { + err = -EINVAL; + goto error; + } result->params[parse.idx].cipher = parse.p.cipher; result->params[parse.idx].key_len = parse.p.key_len; result->params[parse.idx].key = result->data[parse.idx]; memcpy(result->data[parse.idx], parse.p.key, parse.p.key_len); - if (parse.p.cipher == WLAN_CIPHER_SUITE_WEP40 || - parse.p.cipher == WLAN_CIPHER_SUITE_WEP104) { - if (no_ht) - *no_ht = true; - } + /* must be WEP key if we got here */ + if (no_ht) + *no_ht = true; + } + + if (result->def < 0) { + err = -EINVAL; + goto error; } return result; @@ -918,6 +972,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: case NUM_NL80211_IFTYPES: @@ -2525,10 +2580,35 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * int if_idx = 0; int wp_start = cb->args[0]; int if_start = cb->args[1]; + int filter_wiphy = -1; struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rtnl_lock(); + if (!cb->args[2]) { + struct nl80211_dump_wiphy_state state = { + .filter_wiphy = -1, + }; + int ret; + + ret = nl80211_dump_wiphy_parse(skb, cb, &state); + if (ret) + return ret; + + filter_wiphy = state.filter_wiphy; + + /* + * if filtering, set cb->args[2] to +1 since 0 is the default + * value needed to determine that parsing is necessary. 
+ */ + if (filter_wiphy >= 0) + cb->args[2] = filter_wiphy + 1; + else + cb->args[2] = -1; + } else if (cb->args[2] > 0) { + filter_wiphy = cb->args[2] - 1; + } + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { if (!net_eq(wiphy_net(&rdev->wiphy), sock_net(skb->sk))) continue; @@ -2536,6 +2616,10 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * wp_idx++; continue; } + + if (filter_wiphy >= 0 && filter_wiphy != rdev->wiphy_idx) + continue; + if_idx = 0; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { @@ -2751,7 +2835,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct vif_params params; struct wireless_dev *wdev; - struct sk_buff *msg, *event; + struct sk_buff *msg; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; u32 flags; @@ -2774,7 +2858,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.interface_modes & (1 << type))) return -EOPNOTSUPP; - if ((type == NL80211_IFTYPE_P2P_DEVICE || + if ((type == NL80211_IFTYPE_P2P_DEVICE || type == NL80211_IFTYPE_NAN || rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) && info->attrs[NL80211_ATTR_MAC]) { nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC], @@ -2830,9 +2914,10 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev->mesh_id_up_len); wdev_unlock(wdev); break; + case NL80211_IFTYPE_NAN: case NL80211_IFTYPE_P2P_DEVICE: /* - * P2P Device doesn't have a netdev, so doesn't go + * P2P Device and NAN do not have a netdev, so don't go * through the netdev notifier and must be added here */ mutex_init(&wdev->mtx); @@ -2855,20 +2940,15 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return -ENOBUFS; } - event = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (event) { - if (nl80211_send_iface(event, 0, 0, 0, - rdev, wdev, false) < 0) { - nlmsg_free(event); - goto out; - } - - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), - event, 0, NL80211_MCGRP_CONFIG, - GFP_KERNEL); - } + /* + * For wdevs which have no associated netdev object (e.g. of type + * NL80211_IFTYPE_P2P_DEVICE), emit the NEW_INTERFACE event here. 
+ * For all other types, the event will be generated from the + * netdev notifier + */ + if (!wdev->netdev) + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); -out: return genlmsg_reply(msg, info); } @@ -2876,18 +2956,10 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; - struct sk_buff *msg; - int status; if (!rdev->ops->del_virtual_intf) return -EOPNOTSUPP; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (msg && nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, true) < 0) { - nlmsg_free(msg); - msg = NULL; - } - /* * If we remove a wireless device without a netdev then clear * user_ptr[1] so that nl80211_post_doit won't dereference it @@ -2898,15 +2970,7 @@ static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) if (!wdev->netdev) info->user_ptr[1] = NULL; - status = rdev_del_virtual_intf(rdev, wdev); - if (status >= 0 && msg) - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), - msg, 0, NL80211_MCGRP_CONFIG, - GFP_KERNEL); - else - nlmsg_free(msg); - - return status; + return rdev_del_virtual_intf(rdev, wdev); } static int nl80211_set_noack_map(struct sk_buff *skb, struct genl_info *info) @@ -3316,6 +3380,291 @@ static int nl80211_set_mac_acl(struct sk_buff *skb, struct genl_info *info) return err; } +static u32 rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len) +{ + u8 i; + u32 mask = 0; + + for (i = 0; i < rates_len; i++) { + int rate = (rates[i] & 0x7f) * 5; + int ridx; + + for (ridx = 0; ridx < sband->n_bitrates; ridx++) { + struct ieee80211_rate *srate = + &sband->bitrates[ridx]; + if (rate == srate->bitrate) { + mask |= 1 << ridx; + break; + } + } + if (ridx == sband->n_bitrates) + return 0; /* rate not found */ + } + + return mask; +} + +static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, + u8 *rates, u8 rates_len, + u8 mcs[IEEE80211_HT_MCS_MASK_LEN]) +{ + u8 i; + + memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN); + + for (i = 0; i < rates_len; i++) { + int ridx, rbit; + + ridx = rates[i] / 8; + rbit = BIT(rates[i] % 8); + + /* check validity */ + if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN)) + return false; + + /* check availability */ + if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) + mcs[ridx] |= rbit; + else + return false; + } + + return true; +} + +static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map) +{ + u16 mcs_mask = 0; + + switch (vht_mcs_map) { + case IEEE80211_VHT_MCS_NOT_SUPPORTED: + break; + case IEEE80211_VHT_MCS_SUPPORT_0_7: + mcs_mask = 0x00FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_8: + mcs_mask = 0x01FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_9: + mcs_mask = 0x03FF; + break; + default: + break; + } + + return mcs_mask; +} + +static void vht_build_mcs_mask(u16 vht_mcs_map, + u16 vht_mcs_mask[NL80211_VHT_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) { + vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03); + vht_mcs_map >>= 2; + } +} + +static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, + struct nl80211_txrate_vht *txrate, + u16 mcs[NL80211_VHT_NSS_MAX]) +{ + u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {}; + u8 i; + + if (!sband->vht_cap.vht_supported) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX); + + /* Build vht_mcs_mask from VHT capabilities */ + 
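/*
 * Background: the 16-bit VHT MCS map packs one 2-bit field per spatial
 * stream: 0 = MCS 0-7, 1 = MCS 0-8, 2 = MCS 0-9, 3 = stream not
 * supported.  vht_build_mcs_mask() expands each field into a
 * per-stream bitmask via vht_mcs_map_to_mcs_mask() above.  For
 * example, a 1x1 device supporting MCS 0-9 only on its single stream
 * advertises a map of 0xfffe, which expands to
 * { 0x03ff, 0, 0, 0, 0, 0, 0, 0 }.
 */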
vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)}, + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + +static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct cfg80211_bitrate_mask *mask) +{ + struct nlattr *tb[NL80211_TXRATE_MAX + 1]; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int rem, i; + struct nlattr *tx_rates; + struct ieee80211_supported_band *sband; + u16 vht_tx_mcs_map; + + memset(mask, 0, sizeof(*mask)); + /* Default to all rates enabled */ + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sband = rdev->wiphy.bands[i]; + + if (!sband) + continue; + + mask->control[i].legacy = (1 << sband->n_bitrates) - 1; + memcpy(mask->control[i].ht_mcs, + sband->ht_cap.mcs.rx_mask, + sizeof(mask->control[i].ht_mcs)); + + if (!sband->vht_cap.vht_supported) + continue; + + vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); + vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + } + + /* if no rates are given set it back to the defaults */ + if (!info->attrs[NL80211_ATTR_TX_RATES]) + goto out; + + /* The nested attribute uses enum nl80211_band as the index. This maps + * directly to the enum nl80211_band values used in cfg80211. + */ + BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); + nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + enum nl80211_band band = nla_type(tx_rates); + int err; + + if (band < 0 || band >= NUM_NL80211_BANDS) + return -EINVAL; + sband = rdev->wiphy.bands[band]; + if (sband == NULL) + return -EINVAL; + err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), + nla_len(tx_rates), nl80211_txattr_policy); + if (err) + return err; + if (tb[NL80211_TXRATE_LEGACY]) { + mask->control[band].legacy = rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_LEGACY]), + nla_len(tb[NL80211_TXRATE_LEGACY])); + if ((mask->control[band].legacy == 0) && + nla_len(tb[NL80211_TXRATE_LEGACY])) + return -EINVAL; + } + if (tb[NL80211_TXRATE_HT]) { + if (!ht_rateset_to_mask( + sband, + nla_data(tb[NL80211_TXRATE_HT]), + nla_len(tb[NL80211_TXRATE_HT]), + mask->control[band].ht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_VHT]) { + if (!vht_set_mcs_mask( + sband, + nla_data(tb[NL80211_TXRATE_VHT]), + mask->control[band].vht_mcs)) + return -EINVAL; + } + if (tb[NL80211_TXRATE_GI]) { + mask->control[band].gi = + nla_get_u8(tb[NL80211_TXRATE_GI]); + if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI) + return -EINVAL; + } + + if (mask->control[band].legacy == 0) { + /* don't allow empty legacy rates if HT or VHT + * are not even supported. 
+ */ + if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || + rdev->wiphy.bands[band]->vht_cap.vht_supported)) + return -EINVAL; + + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) + if (mask->control[band].ht_mcs[i]) + goto out; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + if (mask->control[band].vht_mcs[i]) + goto out; + + /* legacy and mcs rates may not be both empty */ + return -EINVAL; + } + } + +out: + return 0; +} + +static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, + enum nl80211_band band, + struct cfg80211_bitrate_mask *beacon_rate) +{ + u32 count_ht, count_vht, i; + u32 rate = beacon_rate->control[band].legacy; + + /* Allow only one rate */ + if (hweight32(rate) > 1) + return -EINVAL; + + count_ht = 0; + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { + if (hweight8(beacon_rate->control[band].ht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].ht_mcs[i]) { + count_ht++; + if (count_ht > 1) + return -EINVAL; + } + if (count_ht && rate) + return -EINVAL; + } + + count_vht = 0; + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].vht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].vht_mcs[i]) { + count_vht++; + if (count_vht > 1) + return -EINVAL; + } + if (count_vht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht) || (!rate && !count_ht && !count_vht)) + return -EINVAL; + + if (rate && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) + return -EINVAL; + if (count_ht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_HT)) + return -EINVAL; + if (count_vht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_VHT)) + return -EINVAL; + + return 0; +} + static int nl80211_parse_beacon(struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) { @@ -3545,6 +3894,17 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->iftype)) return -EINVAL; + if (info->attrs[NL80211_ATTR_TX_RATES]) { + err = nl80211_parse_tx_bitrate_mask(info, ¶ms.beacon_rate); + if (err) + return err; + + err = validate_beacon_tx_rate(rdev, params.chandef.chan->band, + ¶ms.beacon_rate); + if (err) + return err; + } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { params.smps_mode = nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); @@ -5374,6 +5734,18 @@ static int nl80211_check_s32(const struct nlattr *nla, s32 min, s32 max, s32 *ou return 0; } +static int nl80211_check_power_mode(const struct nlattr *nla, + enum nl80211_mesh_power_mode min, + enum nl80211_mesh_power_mode max, + enum nl80211_mesh_power_mode *out) +{ + u32 val = nla_get_u32(nla); + if (val < min || val > max) + return -EINVAL; + *out = val; + return 0; +} + static int nl80211_parse_mesh_config(struct genl_info *info, struct mesh_config *cfg, u32 *mask_out) @@ -5518,7 +5890,7 @@ do { \ NL80211_MESH_POWER_ACTIVE, NL80211_MESH_POWER_MAX, mask, NL80211_MESHCONF_POWER_MODE, - nl80211_check_u32); + nl80211_check_power_mode); FILL_IN_MESH_PARAM_IF_SET(tb, cfg, dot11MeshAwakeWindowDuration, 0, 65535, mask, NL80211_MESHCONF_AWAKE_WINDOW, nl80211_check_u16); @@ -6102,6 +6474,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) wiphy = &rdev->wiphy; + if (wdev->iftype == NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + if (!rdev->ops->scan) return -EOPNOTSUPP; @@ -7368,7 +7743,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) (key.p.cipher != WLAN_CIPHER_SUITE_WEP104 || 
key.p.key_len != WLAN_KEY_LEN_WEP104)) return -EINVAL; - if (key.idx > 4) + if (key.idx > 3) return -EINVAL; } else { key.p.key_len = 0; @@ -7773,12 +8148,13 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.beacon_interval = 100; - if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { + if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) ibss.beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - if (ibss.beacon_interval < 1 || ibss.beacon_interval > 10000) - return -EINVAL; - } + + err = cfg80211_validate_beacon_int(rdev, ibss.beacon_interval); + if (err) + return err; if (!rdev->ops->join_ibss) return -EOPNOTSUPP; @@ -7985,6 +8361,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, } data = nla_nest_start(skb, attr); + if (!data) + goto nla_put_failure; ((void **)skb->cb)[0] = rdev; ((void **)skb->cb)[1] = hdr; @@ -8602,238 +8980,21 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, return rdev_cancel_remain_on_channel(rdev, wdev, cookie); } -static u32 rateset_to_mask(struct ieee80211_supported_band *sband, - u8 *rates, u8 rates_len) -{ - u8 i; - u32 mask = 0; - - for (i = 0; i < rates_len; i++) { - int rate = (rates[i] & 0x7f) * 5; - int ridx; - - for (ridx = 0; ridx < sband->n_bitrates; ridx++) { - struct ieee80211_rate *srate = - &sband->bitrates[ridx]; - if (rate == srate->bitrate) { - mask |= 1 << ridx; - break; - } - } - if (ridx == sband->n_bitrates) - return 0; /* rate not found */ - } - - return mask; -} - -static bool ht_rateset_to_mask(struct ieee80211_supported_band *sband, - u8 *rates, u8 rates_len, - u8 mcs[IEEE80211_HT_MCS_MASK_LEN]) -{ - u8 i; - - memset(mcs, 0, IEEE80211_HT_MCS_MASK_LEN); - - for (i = 0; i < rates_len; i++) { - int ridx, rbit; - - ridx = rates[i] / 8; - rbit = BIT(rates[i] % 8); - - /* check validity */ - if ((ridx < 0) || (ridx >= IEEE80211_HT_MCS_MASK_LEN)) - return false; - - /* check availability */ - if (sband->ht_cap.mcs.rx_mask[ridx] & rbit) - mcs[ridx] |= rbit; - else - return false; - } - - return true; -} - -static u16 vht_mcs_map_to_mcs_mask(u8 vht_mcs_map) -{ - u16 mcs_mask = 0; - - switch (vht_mcs_map) { - case IEEE80211_VHT_MCS_NOT_SUPPORTED: - break; - case IEEE80211_VHT_MCS_SUPPORT_0_7: - mcs_mask = 0x00FF; - break; - case IEEE80211_VHT_MCS_SUPPORT_0_8: - mcs_mask = 0x01FF; - break; - case IEEE80211_VHT_MCS_SUPPORT_0_9: - mcs_mask = 0x03FF; - break; - default: - break; - } - - return mcs_mask; -} - -static void vht_build_mcs_mask(u16 vht_mcs_map, - u16 vht_mcs_mask[NL80211_VHT_NSS_MAX]) -{ - u8 nss; - - for (nss = 0; nss < NL80211_VHT_NSS_MAX; nss++) { - vht_mcs_mask[nss] = vht_mcs_map_to_mcs_mask(vht_mcs_map & 0x03); - vht_mcs_map >>= 2; - } -} - -static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, - struct nl80211_txrate_vht *txrate, - u16 mcs[NL80211_VHT_NSS_MAX]) -{ - u16 tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - u16 tx_mcs_mask[NL80211_VHT_NSS_MAX] = {}; - u8 i; - - if (!sband->vht_cap.vht_supported) - return false; - - memset(mcs, 0, sizeof(u16) * NL80211_VHT_NSS_MAX); - - /* Build vht_mcs_mask from VHT capabilities */ - vht_build_mcs_mask(tx_mcs_map, tx_mcs_mask); - - for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { - if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) - mcs[i] = txrate->mcs[i]; - else - return false; - } - - return true; -} - -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - 
[NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { .len = sizeof(struct nl80211_txrate_vht)}, - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *tb[NL80211_TXRATE_MAX + 1]; - struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct cfg80211_bitrate_mask mask; - int rem, i; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - struct nlattr *tx_rates; - struct ieee80211_supported_band *sband; - u16 vht_tx_mcs_map; + int err; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - memset(&mask, 0, sizeof(mask)); - /* Default to all rates enabled */ - for (i = 0; i < NUM_NL80211_BANDS; i++) { - sband = rdev->wiphy.bands[i]; - - if (!sband) - continue; - - mask.control[i].legacy = (1 << sband->n_bitrates) - 1; - memcpy(mask.control[i].ht_mcs, - sband->ht_cap.mcs.rx_mask, - sizeof(mask.control[i].ht_mcs)); - - if (!sband->vht_cap.vht_supported) - continue; - - vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); - vht_build_mcs_mask(vht_tx_mcs_map, mask.control[i].vht_mcs); - } - - /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) - goto out; - - /* - * The nested attribute uses enum nl80211_band as the index. This maps - * directly to the enum nl80211_band values used in cfg80211. - */ - BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { - enum nl80211_band band = nla_type(tx_rates); - int err; - - if (band < 0 || band >= NUM_NL80211_BANDS) - return -EINVAL; - sband = rdev->wiphy.bands[band]; - if (sband == NULL) - return -EINVAL; - err = nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), - nla_len(tx_rates), nl80211_txattr_policy); - if (err) - return err; - if (tb[NL80211_TXRATE_LEGACY]) { - mask.control[band].legacy = rateset_to_mask( - sband, - nla_data(tb[NL80211_TXRATE_LEGACY]), - nla_len(tb[NL80211_TXRATE_LEGACY])); - if ((mask.control[band].legacy == 0) && - nla_len(tb[NL80211_TXRATE_LEGACY])) - return -EINVAL; - } - if (tb[NL80211_TXRATE_HT]) { - if (!ht_rateset_to_mask( - sband, - nla_data(tb[NL80211_TXRATE_HT]), - nla_len(tb[NL80211_TXRATE_HT]), - mask.control[band].ht_mcs)) - return -EINVAL; - } - if (tb[NL80211_TXRATE_VHT]) { - if (!vht_set_mcs_mask( - sband, - nla_data(tb[NL80211_TXRATE_VHT]), - mask.control[band].vht_mcs)) - return -EINVAL; - } - if (tb[NL80211_TXRATE_GI]) { - mask.control[band].gi = - nla_get_u8(tb[NL80211_TXRATE_GI]); - if (mask.control[band].gi > NL80211_TXRATE_FORCE_LGI) - return -EINVAL; - } - - if (mask.control[band].legacy == 0) { - /* don't allow empty legacy rates if HT or VHT - * are not even supported. 
- */ - if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || - rdev->wiphy.bands[band]->vht_cap.vht_supported)) - return -EINVAL; - - for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) - if (mask.control[band].ht_mcs[i]) - goto out; - - for (i = 0; i < NL80211_VHT_NSS_MAX; i++) - if (mask.control[band].vht_mcs[i]) - goto out; - - /* legacy and mcs rates may not be both empty */ - return -EINVAL; - } - } + err = nl80211_parse_tx_bitrate_mask(info, &mask); + if (err) + return err; -out: return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); } @@ -8859,6 +9020,7 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -8904,6 +9066,7 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) case NL80211_IFTYPE_MESH_POINT: case NL80211_IFTYPE_P2P_GO: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -9020,6 +9183,7 @@ static int nl80211_tx_mgmt_cancel_wait(struct sk_buff *skb, struct genl_info *in case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: break; + case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } @@ -9252,9 +9416,10 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_BEACON_INTERVAL]) { setup.beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - if (setup.beacon_interval < 10 || - setup.beacon_interval > 10000) - return -EINVAL; + + err = cfg80211_validate_beacon_int(rdev, setup.beacon_interval); + if (err) + return err; } if (info->attrs[NL80211_ATTR_DTIM_PERIOD]) { @@ -9300,6 +9465,17 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) return err; } + if (info->attrs[NL80211_ATTR_TX_RATES]) { + err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + if (err) + return err; + + err = validate_beacon_tx_rate(rdev, setup.chandef.chan->band, + &setup.beacon_rate); + if (err) + return err; + } + return cfg80211_join_mesh(rdev, dev, &setup, &cfg); } @@ -9413,18 +9589,27 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (!freqs) return -ENOBUFS; - for (i = 0; i < req->n_channels; i++) - nla_put_u32(msg, i, req->channels[i]->center_freq); + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + return -ENOBUFS; + } nla_nest_end(msg, freqs); if (req->n_match_sets) { matches = nla_nest_start(msg, NL80211_ATTR_SCHED_SCAN_MATCH); + if (!matches) + return -ENOBUFS; + for (i = 0; i < req->n_match_sets; i++) { match = nla_nest_start(msg, i); - nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, - req->match_sets[i].ssid.ssid_len, - req->match_sets[i].ssid.ssid); + if (!match) + return -ENOBUFS; + + if (nla_put(msg, NL80211_SCHED_SCAN_MATCH_ATTR_SSID, + req->match_sets[i].ssid.ssid_len, + req->match_sets[i].ssid.ssid)) + return -ENOBUFS; nla_nest_end(msg, match); } nla_nest_end(msg, matches); @@ -9436,6 +9621,9 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, for (i = 0; i < req->n_scan_plans; i++) { scan_plan = nla_nest_start(msg, i + 1); + if (!scan_plan) + return -ENOBUFS; + if (!scan_plan || nla_put_u32(msg, NL80211_SCHED_SCAN_PLAN_INTERVAL, req->scan_plans[i].interval) || @@ -10362,6 +10550,549 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + 
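/*
 * NAN start-up mirrors the P2P-device model: the wdev has no netdev,
 * both NL80211_ATTR_NAN_MASTER_PREF and NL80211_ATTR_NAN_DUAL are
 * mandatory in this command, and a successful start bumps
 * rdev->opencount so the accounting matches the decrement done in
 * cfg80211_stop_nan().
 */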
struct wireless_dev *wdev = info->user_ptr[1]; + struct cfg80211_nan_conf conf = {}; + int err; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (wdev->nan_started) + return -EEXIST; + + if (rfkill_blocked(rdev->rfkill)) + return -ERFKILL; + + if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_DUAL]) + return -EINVAL; + + conf.master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + if (!conf.master_pref) + return -EINVAL; + + conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + + err = rdev_start_nan(rdev, wdev, &conf); + if (err) + return err; + + wdev->nan_started = true; + rdev->opencount++; + + return 0; +} + +static int nl80211_stop_nan(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + cfg80211_stop_nan(rdev, wdev); + + return 0; +} + +static int validate_nan_filter(struct nlattr *filter_attr) +{ + struct nlattr *attr; + int len = 0, n_entries = 0, rem; + + nla_for_each_nested(attr, filter_attr, rem) { + len += nla_len(attr); + n_entries++; + } + + if (len >= U8_MAX) + return -EINVAL; + + return n_entries; +} + +static int handle_nan_filter(struct nlattr *attr_filter, + struct cfg80211_nan_func *func, + bool tx) +{ + struct nlattr *attr; + int n_entries, rem, i; + struct cfg80211_nan_func_filter *filter; + + n_entries = validate_nan_filter(attr_filter); + if (n_entries < 0) + return n_entries; + + BUILD_BUG_ON(sizeof(*func->rx_filters) != sizeof(*func->tx_filters)); + + filter = kcalloc(n_entries, sizeof(*func->rx_filters), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + i = 0; + nla_for_each_nested(attr, attr_filter, rem) { + filter[i].filter = kmemdup(nla_data(attr), nla_len(attr), + GFP_KERNEL); + filter[i].len = nla_len(attr); + i++; + } + if (tx) { + func->num_tx_filters = n_entries; + func->tx_filters = filter; + } else { + func->num_rx_filters = n_entries; + func->rx_filters = filter; + } + + return 0; +} + +static int nl80211_nan_add_func(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct nlattr *tb[NUM_NL80211_NAN_FUNC_ATTR], *func_attr; + struct cfg80211_nan_func *func; + struct sk_buff *msg = NULL; + void *hdr = NULL; + int err = 0; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (!info->attrs[NL80211_ATTR_NAN_FUNC]) + return -EINVAL; + + if (wdev->owner_nlportid && + wdev->owner_nlportid != info->snd_portid) + return -ENOTCONN; + + err = nla_parse(tb, NL80211_NAN_FUNC_ATTR_MAX, + nla_data(info->attrs[NL80211_ATTR_NAN_FUNC]), + nla_len(info->attrs[NL80211_ATTR_NAN_FUNC]), + nl80211_nan_func_policy); + if (err) + return err; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return -ENOMEM; + + func->cookie = wdev->wiphy->cookie_counter++; + + if (!tb[NL80211_NAN_FUNC_TYPE] || + nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]) > NL80211_NAN_FUNC_MAX_TYPE) { + err = -EINVAL; + goto out; + } + + + func->type = nla_get_u8(tb[NL80211_NAN_FUNC_TYPE]); + + if (!tb[NL80211_NAN_FUNC_SERVICE_ID]) { + err = -EINVAL; + goto out; + } + + memcpy(func->service_id, nla_data(tb[NL80211_NAN_FUNC_SERVICE_ID]), + sizeof(func->service_id)); + + func->close_range = + nla_get_flag(tb[NL80211_NAN_FUNC_CLOSE_RANGE]); + + if 
(tb[NL80211_NAN_FUNC_SERVICE_INFO]) { + func->serv_spec_info_len = + nla_len(tb[NL80211_NAN_FUNC_SERVICE_INFO]); + func->serv_spec_info = + kmemdup(nla_data(tb[NL80211_NAN_FUNC_SERVICE_INFO]), + func->serv_spec_info_len, + GFP_KERNEL); + if (!func->serv_spec_info) { + err = -ENOMEM; + goto out; + } + } + + if (tb[NL80211_NAN_FUNC_TTL]) + func->ttl = nla_get_u32(tb[NL80211_NAN_FUNC_TTL]); + + switch (func->type) { + case NL80211_NAN_FUNC_PUBLISH: + if (!tb[NL80211_NAN_FUNC_PUBLISH_TYPE]) { + err = -EINVAL; + goto out; + } + + func->publish_type = + nla_get_u8(tb[NL80211_NAN_FUNC_PUBLISH_TYPE]); + func->publish_bcast = + nla_get_flag(tb[NL80211_NAN_FUNC_PUBLISH_BCAST]); + + if ((!(func->publish_type & NL80211_NAN_SOLICITED_PUBLISH)) && + func->publish_bcast) { + err = -EINVAL; + goto out; + } + break; + case NL80211_NAN_FUNC_SUBSCRIBE: + func->subscribe_active = + nla_get_flag(tb[NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE]); + break; + case NL80211_NAN_FUNC_FOLLOW_UP: + if (!tb[NL80211_NAN_FUNC_FOLLOW_UP_ID] || + !tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]) { + err = -EINVAL; + goto out; + } + + func->followup_id = + nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_ID]); + func->followup_reqid = + nla_get_u8(tb[NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID]); + memcpy(func->followup_dest.addr, + nla_data(tb[NL80211_NAN_FUNC_FOLLOW_UP_DEST]), + sizeof(func->followup_dest.addr)); + if (func->ttl) { + err = -EINVAL; + goto out; + } + break; + default: + err = -EINVAL; + goto out; + } + + if (tb[NL80211_NAN_FUNC_SRF]) { + struct nlattr *srf_tb[NUM_NL80211_NAN_SRF_ATTR]; + + err = nla_parse(srf_tb, NL80211_NAN_SRF_ATTR_MAX, + nla_data(tb[NL80211_NAN_FUNC_SRF]), + nla_len(tb[NL80211_NAN_FUNC_SRF]), NULL); + if (err) + goto out; + + func->srf_include = + nla_get_flag(srf_tb[NL80211_NAN_SRF_INCLUDE]); + + if (srf_tb[NL80211_NAN_SRF_BF]) { + if (srf_tb[NL80211_NAN_SRF_MAC_ADDRS] || + !srf_tb[NL80211_NAN_SRF_BF_IDX]) { + err = -EINVAL; + goto out; + } + + func->srf_bf_len = + nla_len(srf_tb[NL80211_NAN_SRF_BF]); + func->srf_bf = + kmemdup(nla_data(srf_tb[NL80211_NAN_SRF_BF]), + func->srf_bf_len, GFP_KERNEL); + if (!func->srf_bf) { + err = -ENOMEM; + goto out; + } + + func->srf_bf_idx = + nla_get_u8(srf_tb[NL80211_NAN_SRF_BF_IDX]); + } else { + struct nlattr *attr, *mac_attr = + srf_tb[NL80211_NAN_SRF_MAC_ADDRS]; + int n_entries, rem, i = 0; + + if (!mac_attr) { + err = -EINVAL; + goto out; + } + + n_entries = validate_acl_mac_addrs(mac_attr); + if (n_entries <= 0) { + err = -EINVAL; + goto out; + } + + func->srf_num_macs = n_entries; + func->srf_macs = + kzalloc(sizeof(*func->srf_macs) * n_entries, + GFP_KERNEL); + if (!func->srf_macs) { + err = -ENOMEM; + goto out; + } + + nla_for_each_nested(attr, mac_attr, rem) + memcpy(func->srf_macs[i++].addr, nla_data(attr), + sizeof(*func->srf_macs)); + } + } + + if (tb[NL80211_NAN_FUNC_TX_MATCH_FILTER]) { + err = handle_nan_filter(tb[NL80211_NAN_FUNC_TX_MATCH_FILTER], + func, true); + if (err) + goto out; + } + + if (tb[NL80211_NAN_FUNC_RX_MATCH_FILTER]) { + err = handle_nan_filter(tb[NL80211_NAN_FUNC_RX_MATCH_FILTER], + func, false); + if (err) + goto out; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out; + } + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_ADD_NAN_FUNCTION); + /* This can't really happen - we just allocated 4KB */ + if (WARN_ON(!hdr)) { + err = -ENOMEM; + goto out; + } + + err = rdev_add_nan_func(rdev, wdev, func); +out: + if (err < 0) { + cfg80211_free_nan_func(func); + nlmsg_free(msg); + 
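/*
 * On any failure we still own func, so cfg80211_free_nan_func()
 * releases everything parsed so far (service info, SRF, match
 * filters) along with the unsent reply skb.  On success, ownership
 * of func passes to the driver, presumably until it reports
 * cfg80211_nan_func_terminated() for this instance.
 */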
return err; + } + + /* propagate the instance id and cookie to userspace */ + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, func->cookie, + NL80211_ATTR_PAD)) + goto nla_put_failure; + + func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + if (!func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, + func->instance_id)) + goto nla_put_failure; + + nla_nest_end(msg, func_attr); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +static int nl80211_nan_del_func(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + u64 cookie; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (!info->attrs[NL80211_ATTR_COOKIE]) + return -EINVAL; + + if (wdev->owner_nlportid && + wdev->owner_nlportid != info->snd_portid) + return -ENOTCONN; + + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + + rdev_del_nan_func(rdev, wdev, cookie); + + return 0; +} + +static int nl80211_nan_change_config(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = info->user_ptr[1]; + struct cfg80211_nan_conf conf = {}; + u32 changed = 0; + + if (wdev->iftype != NL80211_IFTYPE_NAN) + return -EOPNOTSUPP; + + if (!wdev->nan_started) + return -ENOTCONN; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf.master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + if (conf.master_pref <= 1 || conf.master_pref == 255) + return -EINVAL; + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_NAN_DUAL]) { + conf.dual = nla_get_u8(info->attrs[NL80211_ATTR_NAN_DUAL]); + changed |= CFG80211_NAN_CONF_CHANGED_DUAL; + } + + if (!changed) + return -EINVAL; + + return rdev_nan_change_conf(rdev, wdev, &conf, changed); +} + +void cfg80211_nan_match(struct wireless_dev *wdev, + struct cfg80211_nan_match_params *match, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct nlattr *match_attr, *local_func_attr, *peer_func_attr; + struct sk_buff *msg; + void *hdr; + + if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_MATCH); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, match->cookie, + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, match->addr)) + goto nla_put_failure; + + match_attr = nla_nest_start(msg, NL80211_ATTR_NAN_MATCH); + if (!match_attr) + goto nla_put_failure; + + local_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_LOCAL); + if (!local_func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->inst_id)) + goto nla_put_failure; + + nla_nest_end(msg, local_func_attr); + + peer_func_attr = nla_nest_start(msg, NL80211_NAN_MATCH_FUNC_PEER); + if (!peer_func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, 
NL80211_NAN_FUNC_TYPE, match->type) || + nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, match->peer_inst_id)) + goto nla_put_failure; + + if (match->info && match->info_len && + nla_put(msg, NL80211_NAN_FUNC_SERVICE_INFO, match->info_len, + match->info)) + goto nla_put_failure; + + nla_nest_end(msg, peer_func_attr); + nla_nest_end(msg, match_attr); + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->owner_nlportid); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_match); + +void cfg80211_nan_func_terminated(struct wireless_dev *wdev, + u8 inst_id, + enum nl80211_nan_func_term_reason reason, + u64 cookie, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + struct nlattr *func_attr; + void *hdr; + + if (WARN_ON(!inst_id)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_DEL_NAN_FUNCTION); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, + wdev->netdev->ifindex)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) + goto nla_put_failure; + + func_attr = nla_nest_start(msg, NL80211_ATTR_NAN_FUNC); + if (!func_attr) + goto nla_put_failure; + + if (nla_put_u8(msg, NL80211_NAN_FUNC_INSTANCE_ID, inst_id) || + nla_put_u8(msg, NL80211_NAN_FUNC_TERM_REASON, reason)) + goto nla_put_failure; + + nla_nest_end(msg, func_attr); + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->owner_nlportid); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_func_terminated); + static int nl80211_get_protocol_features(struct sk_buff *skb, struct genl_info *info) { @@ -11063,7 +11794,14 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, dev_hold(dev); } else if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP) { - if (!wdev->p2p_started) { + if (wdev->iftype == NL80211_IFTYPE_P2P_DEVICE && + !wdev->p2p_started) { + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } + if (wdev->iftype == NL80211_IFTYPE_NAN && + !wdev->nan_started) { if (rtnl) rtnl_unlock(); return -ENETDOWN; @@ -11697,6 +12435,46 @@ static const struct genl_ops nl80211_ops[] = { NL80211_FLAG_NEED_RTNL, }, { + .cmd = NL80211_CMD_START_NAN, + .doit = nl80211_start_nan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_STOP_NAN, + .doit = nl80211_stop_nan, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_ADD_NAN_FUNCTION, + .doit = nl80211_nan_add_func, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_NAN_FUNCTION, + .doit = nl80211_nan_del_func, + .policy = nl80211_policy, + .flags = 
GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_CHANGE_NAN_CONFIG, + .doit = nl80211_nan_change_config, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { .cmd = NL80211_CMD_SET_MCAST_RATE, .doit = nl80211_set_mcast_rate, .policy = nl80211_policy, @@ -11847,6 +12625,29 @@ void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, NL80211_MCGRP_CONFIG, GFP_KERNEL); } +void nl80211_notify_iface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_commands cmd) +{ + struct sk_buff *msg; + + WARN_ON(cmd != NL80211_CMD_NEW_INTERFACE && + cmd != NL80211_CMD_DEL_INTERFACE); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + if (nl80211_send_iface(msg, 0, 0, 0, rdev, wdev, + cmd == NL80211_CMD_DEL_INTERFACE) < 0) { + nlmsg_free(msg); + return; + } + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_CONFIG, GFP_KERNEL); +} + static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a63f402b10b7..7e3821d7fcc5 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -7,6 +7,9 @@ int nl80211_init(void); void nl80211_exit(void); void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, enum nl80211_commands cmd); +void nl80211_notify_iface(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + enum nl80211_commands cmd); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 85ff30bee2b9..11cf83c8ad4f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -887,6 +887,64 @@ static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, trace_rdev_return_void(&rdev->wiphy); } +static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf) +{ + int ret; + + trace_rdev_start_nan(&rdev->wiphy, wdev, conf); + ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev) +{ + trace_rdev_stop_nan(&rdev->wiphy, wdev); + rdev->ops->stop_nan(&rdev->wiphy, wdev); + trace_rdev_return_void(&rdev->wiphy); +} + +static inline int +rdev_add_nan_func(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_func *nan_func) +{ + int ret; + + trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); + ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, u64 cookie) +{ + trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); + rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); + trace_rdev_return_void(&rdev->wiphy); +} + +static inline int +rdev_nan_change_conf(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, u32 changes) +{ + int ret; + + trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); + if 
(rdev->ops->nan_change_conf) + ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, + changes); + else + ret = -ENOTSUPP; + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0358e12be54b..b5bd58d0f731 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -352,52 +352,48 @@ void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } -const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) +const u8 *cfg80211_find_ie_match(u8 eid, const u8 *ies, int len, + const u8 *match, int match_len, + int match_offset) { - while (len > 2 && ies[0] != eid) { + /* match_offset can't be smaller than 2, unless match_len is + * zero, in which case match_offset must be zero as well. + */ + if (WARN_ON((match_len && match_offset < 2) || + (!match_len && match_offset))) + return NULL; + + while (len >= 2 && len >= ies[1] + 2) { + if ((ies[0] == eid) && + (ies[1] + 2 >= match_offset + match_len) && + !memcmp(ies + match_offset, match, match_len)) + return ies; + len -= ies[1] + 2; ies += ies[1] + 2; } - if (len < 2) - return NULL; - if (len < 2 + ies[1]) - return NULL; - return ies; + + return NULL; } -EXPORT_SYMBOL(cfg80211_find_ie); +EXPORT_SYMBOL(cfg80211_find_ie_match); const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, int len) { - struct ieee80211_vendor_ie *ie; - const u8 *pos = ies, *end = ies + len; - int ie_oui; + const u8 *ie; + u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; + int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; - while (pos < end) { - pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, - end - pos); - if (!pos) - return NULL; - - ie = (struct ieee80211_vendor_ie *)pos; - - /* make sure we can access ie->len */ - BUILD_BUG_ON(offsetof(struct ieee80211_vendor_ie, len) != 1); + ie = cfg80211_find_ie_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, + match, match_len, 2); - if (ie->len < sizeof(*ie)) - goto cont; + if (ie && (ie[1] < 4)) + return NULL; - ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; - if (ie_oui == oui && - (oui_type < 0 || ie->oui_type == oui_type)) - return pos; -cont: - pos += 2 + ie->len; - } - return NULL; + return ie; } EXPORT_SYMBOL(cfg80211_find_vendor_ie); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index add6824c44fd..a77db333927e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -726,7 +726,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, wdev->current_bss = bss_from_pub(bss); - cfg80211_upload_connect_keys(wdev); + if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) + cfg80211_upload_connect_keys(wdev); rcu_read_lock(); country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY); @@ -1043,6 +1044,12 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, connect->crypto.ciphers_pairwise[0] = cipher; } } + + connect->crypto.wep_keys = connkeys->params; + connect->crypto.wep_tx_key = connkeys->def; + } else { + if (WARN_ON(connkeys)) + return -EINVAL; } wdev->connect_keys = connkeys; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index e46469bc130f..0082f4b01795 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -57,7 +57,7 @@ static ssize_t addresses_show(struct device *dev, return sprintf(buf, "%pM\n", 
wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) - buf += sprintf(buf, "%pM\n", &wiphy->addresses[i].addr); + buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 72b5255cefe2..a3d0a91b1e09 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1889,6 +1889,96 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_p2p_device, TP_ARGS(wiphy, wdev) ); +TRACE_EVENT(rdev_start_nan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf), + TP_ARGS(wiphy, wdev, conf), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, master_pref) + __field(u8, dual); + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", master preference: %u, dual: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, + __entry->dual) +); + +TRACE_EVENT(rdev_nan_change_conf, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + struct cfg80211_nan_conf *conf, u32 changes), + TP_ARGS(wiphy, wdev, conf, changes), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, master_pref) + __field(u8, dual); + __field(u32, changes); + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->master_pref = conf->master_pref; + __entry->dual = conf->dual; + __entry->changes = changes; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", master preference: %u, dual: %d, changes: %x", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->master_pref, + __entry->dual, __entry->changes) +); + +DEFINE_EVENT(wiphy_wdev_evt, rdev_stop_nan, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), + TP_ARGS(wiphy, wdev) +); + +TRACE_EVENT(rdev_add_nan_func, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + const struct cfg80211_nan_func *func), + TP_ARGS(wiphy, wdev, func), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u8, func_type) + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->func_type = func->type; + __entry->cookie = func->cookie + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type=%u, cookie=%llu", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->func_type, + __entry->cookie) +); + +TRACE_EVENT(rdev_del_nan_func, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + u64 cookie), + TP_ARGS(wiphy, wdev, cookie), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(u64, cookie) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->cookie = cookie; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie=%llu", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->cookie) +); + TRACE_EVENT(rdev_set_mac_acl, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_acl_data *params), diff --git a/net/wireless/util.c b/net/wireless/util.c index b7d1592bd5b8..8edce22d1b93 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -218,7 +218,7 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr) { - if (key_idx > 5) + if (key_idx < 0 || key_idx > 5) return -EINVAL; if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) @@ -249,7 +249,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; + if (key_idx < 4) + return -EINVAL; break; + case WLAN_CIPHER_SUITE_WEP40: + case 
WLAN_CIPHER_SUITE_WEP104: + if (key_idx > 3) + return -EINVAL; default: break; } @@ -906,7 +912,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) if (!wdev->connect_keys) return; - for (i = 0; i < 6; i++) { + for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++) { if (!wdev->connect_keys->params[i].cipher) continue; if (rdev_add_key(rdev, dev, i, false, NULL, @@ -919,9 +925,6 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) netdev_err(dev, "failed to set defkey %d\n", i); continue; } - if (wdev->connect_keys->defmgmt == i) - if (rdev_set_default_mgmt_key(rdev, dev, i)) - netdev_err(dev, "failed to set mgtdef %d\n", i); } kzfree(wdev->connect_keys); @@ -1005,8 +1008,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, if (otype == NL80211_IFTYPE_AP_VLAN) return -EOPNOTSUPP; - /* cannot change into P2P device type */ - if (ntype == NL80211_IFTYPE_P2P_DEVICE) + /* cannot change into P2P device or NAN */ + if (ntype == NL80211_IFTYPE_P2P_DEVICE || + ntype == NL80211_IFTYPE_NAN) return -EOPNOTSUPP; if (!rdev->ops->change_virtual_intf || @@ -1085,6 +1089,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* not happening */ break; case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_NAN: WARN_ON(1); break; } @@ -1559,7 +1564,7 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev; int res = 0; - if (!beacon_int) + if (beacon_int < 10 || beacon_int > 10000) return -EINVAL; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { @@ -1757,6 +1762,28 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, } EXPORT_SYMBOL(cfg80211_get_station); +void cfg80211_free_nan_func(struct cfg80211_nan_func *f) +{ + int i; + + if (!f) + return; + + kfree(f->serv_spec_info); + kfree(f->srf_bf); + kfree(f->srf_macs); + for (i = 0; i < f->num_rx_filters; i++) + kfree(f->rx_filters[i].filter); + + for (i = 0; i < f->num_tx_filters; i++) + kfree(f->tx_filters[i].filter); + + kfree(f->rx_filters); + kfree(f->tx_filters); + kfree(f); +} +EXPORT_SYMBOL(cfg80211_free_nan_func); + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 9f27221c8913..a220156cf217 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -406,12 +406,16 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (pairwise && !addr) return -EINVAL; + /* + * In many cases we won't actually need this, but it's better + * to do it first in case the allocation fails. Don't use wext. 
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 9f27221c8913..a220156cf217 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -406,12 +406,16 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	if (pairwise && !addr)
 		return -EINVAL;
 
+	/*
+	 * In many cases we won't actually need this, but it's better
+	 * to do it first in case the allocation fails. Don't use wext.
+	 */
 	if (!wdev->wext.keys) {
 		wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys),
-					      GFP_KERNEL);
+					  GFP_KERNEL);
 		if (!wdev->wext.keys)
 			return -ENOMEM;
-		for (i = 0; i < 6; i++)
+		for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
 			wdev->wext.keys->params[i].key =
 				wdev->wext.keys->data[i];
 	}
@@ -460,7 +464,7 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 			if (err == -ENOENT)
 				err = 0;
 			if (!err) {
-				if (!addr) {
+				if (!addr && idx < 4) {
 					memset(wdev->wext.keys->data[idx], 0,
 					       sizeof(wdev->wext.keys->data[idx]));
 					wdev->wext.keys->params[idx].key_len = 0;
@@ -487,10 +491,19 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev,
 	err = 0;
 	if (wdev->current_bss)
 		err = rdev_add_key(rdev, dev, idx, pairwise, addr, params);
+	else if (params->cipher != WLAN_CIPHER_SUITE_WEP40 &&
+		 params->cipher != WLAN_CIPHER_SUITE_WEP104)
+		return -EINVAL;
 	if (err)
 		return err;
 
-	if (!addr) {
+	/*
+	 * We only need to store WEP keys, since they're the only keys that
+	 * can be set before a connection is established and persist after
+	 * disconnecting.
+	 */
+	if (!addr && (params->cipher == WLAN_CIPHER_SUITE_WEP40 ||
+		      params->cipher == WLAN_CIPHER_SUITE_WEP104)) {
 		wdev->wext.keys->params[idx] = *params;
 		memcpy(wdev->wext.keys->data[idx],
 		       params->key, params->key_len);
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index a4e8af3321d2..995163830a61 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -35,7 +35,6 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 
 	if (wdev->wext.keys) {
 		wdev->wext.keys->def = wdev->wext.default_key;
-		wdev->wext.keys->defmgmt = wdev->wext.default_mgmt_key;
 		if (wdev->wext.default_key != -1)
 			wdev->wext.connect.privacy = true;
 	}
@@ -43,11 +42,11 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev,
 	if (!wdev->wext.connect.ssid_len)
 		return 0;
 
-	if (wdev->wext.keys) {
+	if (wdev->wext.keys && wdev->wext.keys->def != -1) {
 		ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL);
 		if (!ck)
 			return -ENOMEM;
-		for (i = 0; i < 6; i++)
+		for (i = 0; i < CFG80211_MAX_WEP_KEYS; i++)
 			ck->params[i].key = ck->data[i];
 	}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index a750f330b8dd..f83b74d3e2ac 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1500,12 +1500,8 @@ out_fac_release:
 			goto out_dtefac_release;
 		if (dtefacs.calling_len > X25_MAX_AE_LEN)
 			goto out_dtefac_release;
-		if (dtefacs.calling_ae == NULL)
-			goto out_dtefac_release;
 		if (dtefacs.called_len > X25_MAX_AE_LEN)
 			goto out_dtefac_release;
-		if (dtefacs.called_ae == NULL)
-			goto out_dtefac_release;
 		x25->dte_facilities = dtefacs;
 		rc = 0;
 out_dtefac_release:
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index 250e567ba3d6..44ac85fe2bc9 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -17,7 +17,7 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <net/xfrm.h>
-#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE)
+#if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP)
 #include <net/esp.h>
 #endif
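The xfrm_algo.c change is a pure cleanup: IS_ENABLED() already covers both the built-in and the modular case, so the four defined() tests collapse to two. Conceptually (simplified from include/linux/kconfig.h):

	/* true if CONFIG_FOO is set to either 'y' or 'm' */
	#define IS_ENABLED(option) (IS_BUILTIN(option) || IS_MODULE(option))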
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 45f9cf97ea25..fd6986634e6f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -49,6 +49,7 @@ static struct xfrm_policy_afinfo __rcu *xfrm_policy_afinfo[NPROTO]
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
+static __read_mostly seqcount_t xfrm_policy_hash_generation;
 
 static void xfrm_init_pmtu(struct dst_entry *dst);
 static int stale_bundle(struct dst_entry *dst);
@@ -59,6 +60,11 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir);
 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 						int dir);
 
+static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy)
+{
+	return atomic_inc_not_zero(&policy->refcnt);
+}
+
 static inline bool
 __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl)
 {
@@ -385,9 +391,11 @@ static struct hlist_head *policy_hash_bysel(struct net *net,
 	__get_hash_thresh(net, family, dir, &dbits, &sbits);
 	hash = __sel_hash(sel, family, hmask, dbits, sbits);
 
-	return (hash == hmask + 1 ?
-		&net->xfrm.policy_inexact[dir] :
-		net->xfrm.policy_bydst[dir].table + hash);
+	if (hash == hmask + 1)
+		return &net->xfrm.policy_inexact[dir];
+
+	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
 }
 
 static struct hlist_head *policy_hash_direct(struct net *net,
@@ -403,7 +411,8 @@ static struct hlist_head *policy_hash_direct(struct net *net,
 	__get_hash_thresh(net, family, dir, &dbits, &sbits);
 	hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits);
 
-	return net->xfrm.policy_bydst[dir].table + hash;
+	return rcu_dereference_check(net->xfrm.policy_bydst[dir].table,
+		     lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash;
 }
 
 static void xfrm_dst_hash_transfer(struct net *net,
@@ -426,14 +435,14 @@ redo:
 		h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr,
 				pol->family, nhashmask, dbits, sbits);
 		if (!entry0) {
-			hlist_del(&pol->bydst);
-			hlist_add_head(&pol->bydst, ndsttable+h);
+			hlist_del_rcu(&pol->bydst);
+			hlist_add_head_rcu(&pol->bydst, ndsttable + h);
 			h0 = h;
 		} else {
 			if (h != h0)
 				continue;
-			hlist_del(&pol->bydst);
-			hlist_add_behind(&pol->bydst, entry0);
+			hlist_del_rcu(&pol->bydst);
+			hlist_add_behind_rcu(&pol->bydst, entry0);
 		}
 		entry0 = &pol->bydst;
 	}
@@ -468,22 +477,32 @@ static void xfrm_bydst_resize(struct net *net, int dir)
 	unsigned int hmask = net->xfrm.policy_bydst[dir].hmask;
 	unsigned int nhashmask = xfrm_new_hash_mask(hmask);
 	unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head);
-	struct hlist_head *odst = net->xfrm.policy_bydst[dir].table;
 	struct hlist_head *ndst = xfrm_hash_alloc(nsize);
+	struct hlist_head *odst;
 	int i;
 
 	if (!ndst)
 		return;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
+	write_seqcount_begin(&xfrm_policy_hash_generation);
+
+	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
+
+	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
+				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 
 	for (i = hmask; i >= 0; i--)
 		xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir);
 
-	net->xfrm.policy_bydst[dir].table = ndst;
+	rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
 	net->xfrm.policy_bydst[dir].hmask = nhashmask;
 
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	write_seqcount_end(&xfrm_policy_hash_generation);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+
+	synchronize_rcu();
 
 	xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head));
 }
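xfrm_bydst_resize() above is the writer side of an RCU replace-and-reclaim sequence, combined with a global seqcount so lockless readers can detect that a rehash happened while they walked the old table. Stripped to its skeleton (identifiers here are generic placeholders, not the actual xfrm names):

	spin_lock_bh(&lock);			/* exclude concurrent writers */
	write_seqcount_begin(&generation);	/* readers will see a new seq */

	old = rcu_dereference_protected(table, lockdep_is_held(&lock));
	/* re-bucket entries using hlist_del_rcu()/hlist_add_head_rcu() */
	rcu_assign_pointer(table, new);		/* publish the resized table */

	write_seqcount_end(&generation);
	spin_unlock_bh(&lock);

	synchronize_rcu();			/* wait out readers of 'old' */
	xfrm_hash_free(old, old_size);		/* only now is freeing safe */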
@@ -500,7 +519,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
 	if (!nidx)
 		return;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
 	for (i = hmask; i >= 0; i--)
 		xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask);
@@ -508,7 +527,7 @@ static void xfrm_byidx_resize(struct net *net, int total)
 	net->xfrm.policy_byidx = nidx;
 	net->xfrm.policy_idx_hmask = nhashmask;
 
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head));
 }
@@ -541,7 +560,6 @@ static inline int xfrm_byidx_should_resize(struct net *net, int total)
 
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
 {
-	read_lock_bh(&net->xfrm.xfrm_policy_lock);
 	si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN];
 	si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT];
 	si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD];
@@ -550,7 +568,6 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si)
 	si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX];
 	si->spdhcnt = net->xfrm.policy_idx_hmask;
 	si->spdhmcnt = xfrm_policy_hashmax;
-	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
 }
 EXPORT_SYMBOL(xfrm_spd_getinfo);
 
@@ -600,7 +617,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		rbits6 = net->xfrm.policy_hthresh.rbits6;
 	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
 	/* reset the bydst and inexact table in all directions */
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
@@ -646,7 +663,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 			hlist_add_head(&policy->bydst, chain);
 	}
 
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	mutex_unlock(&hash_resize_mutex);
 }
@@ -757,7 +774,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	struct hlist_head *chain;
 	struct hlist_node *newpos;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, &policy->selector, policy->family, dir);
 	delpol = NULL;
 	newpos = NULL;
@@ -768,7 +785,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 		    xfrm_sec_ctx_match(pol->security, policy->security) &&
 		    !WARN_ON(delpol)) {
 			if (excl) {
-				write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 				return -EEXIST;
 			}
 			delpol = pol;
@@ -804,7 +821,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
 	policy->curlft.use_time = 0;
 	if (!mod_timer(&policy->timer, jiffies + HZ))
 		xfrm_pol_hold(policy);
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (delpol)
 		xfrm_policy_kill(delpol);
@@ -824,7 +841,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	struct hlist_head *chain;
 
 	*err = 0;
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_bysel(net, sel, sel->family, dir);
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, bydst) {
@@ -837,7 +854,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 				*err = security_xfrm_policy_delete(
 								pol->security);
 				if (*err) {
-					write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+					spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 					return pol;
 				}
 				__xfrm_policy_unlink(pol, dir);
@@ -846,7 +863,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 			break;
 		}
 	}
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (ret && delete)
 		xfrm_policy_kill(ret);
@@ -865,7 +882,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 		return NULL;
 
 	*err = 0;
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = net->xfrm.policy_byidx + idx_hash(net, id);
 	ret = NULL;
 	hlist_for_each_entry(pol, chain, byidx) {
@@ -876,7 +893,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 				*err = security_xfrm_policy_delete(
 								pol->security);
 				if (*err) {
-					write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+					spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 					return pol;
 				}
 				__xfrm_policy_unlink(pol, dir);
@@ -885,7 +902,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 			break;
 		}
 	}
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (ret && delete)
 		xfrm_policy_kill(ret);
@@ -943,7 +960,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 {
 	int dir, err = 0, cnt = 0;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 
 	err = xfrm_policy_flush_secctx_check(net, type, task_valid);
 	if (err)
@@ -959,14 +976,14 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 			if (pol->type != type)
 				continue;
 			__xfrm_policy_unlink(pol, dir);
-			write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+			spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 			cnt++;
 
 			xfrm_audit_policy_delete(pol, 1, task_valid);
 
 			xfrm_policy_kill(pol);
 
-			write_lock_bh(&net->xfrm.xfrm_policy_lock);
+			spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 			goto again1;
 		}
 
@@ -978,13 +995,13 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 				if (pol->type != type)
 					continue;
 				__xfrm_policy_unlink(pol, dir);
-				write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+				spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 				cnt++;
 
 				xfrm_audit_policy_delete(pol, 1, task_valid);
 				xfrm_policy_kill(pol);
 
-				write_lock_bh(&net->xfrm.xfrm_policy_lock);
+				spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 				goto again2;
 			}
 		}
@@ -993,7 +1010,7 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid)
 	if (!cnt)
 		err = -ESRCH;
 out:
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	return err;
 }
 EXPORT_SYMBOL(xfrm_policy_flush);
@@ -1013,7 +1030,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 	if (list_empty(&walk->walk.all) && walk->seq != 0)
 		return 0;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	if (list_empty(&walk->walk.all))
 		x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all);
 	else
@@ -1041,7 +1058,7 @@ int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk,
 	}
 	list_del_init(&walk->walk.all);
 out:
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	return error;
 }
 EXPORT_SYMBOL(xfrm_policy_walk);
@@ -1060,9 +1077,9 @@ void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net)
 	if (list_empty(&walk->walk.all))
 		return;
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */
 	list_del(&walk->walk.all);
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 }
 EXPORT_SYMBOL(xfrm_policy_walk_done);
 
@@ -1100,17 +1117,24 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	struct xfrm_policy *pol, *ret;
 	const xfrm_address_t *daddr, *saddr;
 	struct hlist_head *chain;
-	u32 priority = ~0U;
+	unsigned int sequence;
+	u32 priority;
 
 	daddr = xfrm_flowi_daddr(fl, family);
 	saddr = xfrm_flowi_saddr(fl, family);
 	if (unlikely(!daddr || !saddr))
 		return NULL;
 
-	read_lock_bh(&net->xfrm.xfrm_policy_lock);
-	chain = policy_hash_direct(net, daddr, saddr, family, dir);
+	rcu_read_lock();
+retry:
+	do {
+		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		chain = policy_hash_direct(net, daddr, saddr, family, dir);
+	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+
+	priority = ~0U;
 	ret = NULL;
-	hlist_for_each_entry(pol, chain, bydst) {
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
 		err = xfrm_policy_match(pol, fl, type, family, dir);
 		if (err) {
 			if (err == -ESRCH)
@@ -1126,7 +1150,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 	chain = &net->xfrm.policy_inexact[dir];
-	hlist_for_each_entry(pol, chain, bydst) {
+	hlist_for_each_entry_rcu(pol, chain, bydst) {
 		if ((pol->priority >= priority) && ret)
 			break;
 
@@ -1144,9 +1168,13 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		}
 	}
 
-	xfrm_pol_hold(ret);
+	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+		goto retry;
+
+	if (ret && !xfrm_pol_hold_rcu(ret))
+		goto retry;
 fail:
-	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
@@ -1223,10 +1251,9 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 						 const struct flowi *fl)
 {
 	struct xfrm_policy *pol;
-	struct net *net = sock_net(sk);
 
 	rcu_read_lock();
-	read_lock_bh(&net->xfrm.xfrm_policy_lock);
+again:
 	pol = rcu_dereference(sk->sk_policy[dir]);
 	if (pol != NULL) {
 		bool match = xfrm_selector_match(&pol->selector, fl,
@@ -1241,8 +1268,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 			err = security_xfrm_policy_lookup(pol->security,
 						      fl->flowi_secid,
 						      policy_to_flow_dir(dir));
-			if (!err)
-				xfrm_pol_hold(pol);
+			if (!err && !xfrm_pol_hold_rcu(pol))
+				goto again;
 			else if (err == -ESRCH)
 				pol = NULL;
 			else
@@ -1251,7 +1278,6 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
 			pol = NULL;
 	}
 out:
-	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	rcu_read_unlock();
 	return pol;
 }
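The lookup path above is the matching reader side: it runs entirely under rcu_read_lock(), samples the hash-generation seqcount before walking the chain, and takes its reference with atomic_inc_not_zero() (via xfrm_pol_hold_rcu()) so that a policy whose refcount already dropped to zero is never resurrected. Reduced to its essentials (illustrative fragment mirroring the code above):

	rcu_read_lock();
retry:
	do {
		seq = read_seqcount_begin(&xfrm_policy_hash_generation);
		chain = policy_hash_direct(net, daddr, saddr, family, dir);
	} while (read_seqcount_retry(&xfrm_policy_hash_generation, seq));

	/* ... pick 'ret' from the chain with hlist_for_each_entry_rcu() ... */

	if (read_seqcount_retry(&xfrm_policy_hash_generation, seq))
		goto retry;		/* table was rehashed underneath us */
	if (ret && !xfrm_pol_hold_rcu(ret))
		goto retry;		/* entry is already being freed */
	rcu_read_unlock();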
@@ -1275,7 +1301,7 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 
 	/* Socket policies are not hashed. */
 	if (!hlist_unhashed(&pol->bydst)) {
-		hlist_del(&pol->bydst);
+		hlist_del_rcu(&pol->bydst);
 		hlist_del(&pol->byidx);
 	}
 
@@ -1299,9 +1325,9 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 {
 	struct net *net = xp_net(pol);
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	pol = __xfrm_policy_unlink(pol, dir);
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	if (pol) {
 		xfrm_policy_kill(pol);
 		return 0;
@@ -1320,7 +1346,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 		return -EINVAL;
 #endif
 
-	write_lock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	old_pol = rcu_dereference_protected(sk->sk_policy[dir],
 				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
 	if (pol) {
@@ -1338,7 +1364,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 		 */
 		xfrm_sk_policy_unlink(old_pol, dir);
 	}
-	write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	if (old_pol) {
 		xfrm_policy_kill(old_pol);
@@ -1368,9 +1394,9 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
 		newp->type = old->type;
 		memcpy(newp->xfrm_vec, old->xfrm_vec,
 		       newp->xfrm_nr*sizeof(struct xfrm_tmpl));
-		write_lock_bh(&net->xfrm.xfrm_policy_lock);
+		spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 		xfrm_sk_policy_link(newp, dir);
-		write_unlock_bh(&net->xfrm.xfrm_policy_lock);
+		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 		xfrm_pol_put(newp);
 	}
 	return newp;
@@ -3052,7 +3078,7 @@ static int __net_init xfrm_net_init(struct net *net)
 
 	/* Initialize the per-net locks here */
 	spin_lock_init(&net->xfrm.xfrm_state_lock);
-	rwlock_init(&net->xfrm.xfrm_policy_lock);
+	spin_lock_init(&net->xfrm.xfrm_policy_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
 
 	return 0;
@@ -3086,6 +3112,7 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
 void __init xfrm_init(void)
 {
 	register_pernet_subsys(&xfrm_net_ops);
+	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
 }
 
@@ -3183,7 +3210,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
 	struct hlist_head *chain;
 	u32 priority = ~0U;
 
-	read_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME*/
+	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
 	chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
 	hlist_for_each_entry(pol, chain, bydst) {
 		if (xfrm_migrate_selector_match(sel, &pol->selector) &&
@@ -3207,7 +3234,7 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *
 		xfrm_pol_hold(ret);
-	read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	return ret;
 }
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 9c4fbd8935f4..ba2b539879bc 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -50,12 +50,18 @@ static const struct snmp_mib xfrm_mib_list[] = {
 
 static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
 {
+	unsigned long buff[LINUX_MIB_XFRMMAX];
 	struct net *net = seq->private;
 	int i;
+
+	memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
+
+	snmp_get_cpu_field_batch(buff, xfrm_mib_list,
+				 net->mib.xfrm_statistics);
 	for (i = 0; xfrm_mib_list[i].name; i++)
 		seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
-			   snmp_fold_field(net->mib.xfrm_statistics,
-					   xfrm_mib_list[i].entry));
+			   buff[i]);
+
 	return 0;
 }
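The xfrm_proc.c rework replaces one snmp_fold_field() call per counter, each of which iterates over every possible CPU, with a single snmp_get_cpu_field_batch() pass that visits each CPU once and accumulates all counters into the on-stack buffer. The total number of additions is unchanged, but the per-CPU MIB block is now traversed once per CPU instead of once per field, which is considerably friendlier to the cache on large machines. Conceptually, the batch helper behaves like this (simplified sketch, not the exact implementation):

	/* simplified shape of snmp_get_cpu_field_batch() */
	for_each_possible_cpu(cpu)
		for (i = 0; itemlist[i].name; i++)
			buff[i] += snmp_get_cpu_field(mib, cpu, itemlist[i].entry);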
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 4fd725a0c500..cdc2e2e71bff 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -558,7 +558,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
 		x->repl->notify(x, XFRM_REPLAY_UPDATE);
 }
 
-static struct xfrm_replay xfrm_replay_legacy = {
+static const struct xfrm_replay xfrm_replay_legacy = {
 	.advance	= xfrm_replay_advance,
 	.check		= xfrm_replay_check,
 	.recheck	= xfrm_replay_check,
@@ -566,7 +566,7 @@ static struct xfrm_replay xfrm_replay_legacy = {
 	.overflow	= xfrm_replay_overflow,
 };
 
-static struct xfrm_replay xfrm_replay_bmp = {
+static const struct xfrm_replay xfrm_replay_bmp = {
 	.advance	= xfrm_replay_advance_bmp,
 	.check		= xfrm_replay_check_bmp,
 	.recheck	= xfrm_replay_check_bmp,
@@ -574,7 +574,7 @@ static struct xfrm_replay xfrm_replay_bmp = {
 	.overflow	= xfrm_replay_overflow_bmp,
 };
 
-static struct xfrm_replay xfrm_replay_esn = {
+static const struct xfrm_replay xfrm_replay_esn = {
 	.advance	= xfrm_replay_advance_esn,
 	.check		= xfrm_replay_check_esn,
 	.recheck	= xfrm_replay_recheck_esn,
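Constifying the three xfrm_replay ops tables lets the compiler place them in .rodata, so the function pointers can no longer be overwritten at runtime. This only works because every consumer reaches the table through a const-qualified pointer; assuming the companion header change in this series, the member in struct xfrm_state would read:

	/* consumer-side pointer must be const-qualified to match */
	const struct xfrm_replay *repl;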
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index a30f898dc1c5..419bf5d463bd 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -28,6 +28,11 @@
 
 #include "xfrm_hash.h"
 
+#define xfrm_state_deref_prot(table, net) \
+	rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock))
+
+static void xfrm_state_gc_task(struct work_struct *work);
+
 /* Each xfrm_state may be linked to two tables:
 
    1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
@@ -36,6 +41,15 @@
  */
 
 static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation);
+
+static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
+static HLIST_HEAD(xfrm_state_gc_list);
+
+static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
+{
+	return atomic_inc_not_zero(&x->refcnt);
+}
 
 static inline unsigned int xfrm_dst_hash(struct net *net,
 					 const xfrm_address_t *daddr,
@@ -76,18 +90,18 @@ static void xfrm_hash_transfer(struct hlist_head *list,
 		h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
 				    x->props.reqid, x->props.family,
 				    nhashmask);
-		hlist_add_head(&x->bydst, ndsttable+h);
+		hlist_add_head_rcu(&x->bydst, ndsttable + h);
 
 		h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
 				    x->props.family,
 				    nhashmask);
-		hlist_add_head(&x->bysrc, nsrctable+h);
+		hlist_add_head_rcu(&x->bysrc, nsrctable + h);
 
 		if (x->id.spi) {
 			h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
 					    x->id.proto, x->props.family,
 					    nhashmask);
-			hlist_add_head(&x->byspi, nspitable+h);
+			hlist_add_head_rcu(&x->byspi, nspitable + h);
 		}
 	}
 }
@@ -122,25 +136,29 @@ static void xfrm_hash_resize(struct work_struct *work)
 	}
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
+	write_seqcount_begin(&xfrm_state_hash_generation);
 
 	nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+	odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net);
 	for (i = net->xfrm.state_hmask; i >= 0; i--)
-		xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
-				   nhashmask);
+		xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nhashmask);
 
-	odst = net->xfrm.state_bydst;
-	osrc = net->xfrm.state_bysrc;
-	ospi = net->xfrm.state_byspi;
+	osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net);
+	ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net);
 	ohashmask = net->xfrm.state_hmask;
 
-	net->xfrm.state_bydst = ndst;
-	net->xfrm.state_bysrc = nsrc;
-	net->xfrm.state_byspi = nspi;
+	rcu_assign_pointer(net->xfrm.state_bydst, ndst);
+	rcu_assign_pointer(net->xfrm.state_bysrc, nsrc);
+	rcu_assign_pointer(net->xfrm.state_byspi, nspi);
 	net->xfrm.state_hmask = nhashmask;
 
+	write_seqcount_end(&xfrm_state_hash_generation);
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 	osize = (ohashmask + 1) * sizeof(struct hlist_head);
+
+	synchronize_rcu();
+
 	xfrm_hash_free(odst, osize);
 	xfrm_hash_free(osrc, osize);
 	xfrm_hash_free(ospi, osize);
@@ -356,15 +374,16 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 
 static void xfrm_state_gc_task(struct work_struct *work)
 {
-	struct net *net = container_of(work, struct net, xfrm.state_gc_work);
 	struct xfrm_state *x;
 	struct hlist_node *tmp;
 	struct hlist_head gc_list;
 
 	spin_lock_bh(&xfrm_state_gc_lock);
-	hlist_move_list(&net->xfrm.state_gc_list, &gc_list);
+	hlist_move_list(&xfrm_state_gc_list, &gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
 
+	synchronize_rcu();
+
 	hlist_for_each_entry_safe(x, tmp, &gc_list, gclist)
 		xfrm_state_gc_destroy(x);
 }
@@ -501,14 +520,12 @@ EXPORT_SYMBOL(xfrm_state_alloc);
 
 void __xfrm_state_destroy(struct xfrm_state *x)
 {
-	struct net *net = xs_net(x);
-
 	WARN_ON(x->km.state != XFRM_STATE_DEAD);
 
 	spin_lock_bh(&xfrm_state_gc_lock);
-	hlist_add_head(&x->gclist, &net->xfrm.state_gc_list);
+	hlist_add_head(&x->gclist, &xfrm_state_gc_list);
 	spin_unlock_bh(&xfrm_state_gc_lock);
-	schedule_work(&net->xfrm.state_gc_work);
+	schedule_work(&xfrm_state_gc_work);
 }
 EXPORT_SYMBOL(__xfrm_state_destroy);
 
@@ -521,10 +538,10 @@ int __xfrm_state_delete(struct xfrm_state *x)
 		x->km.state = XFRM_STATE_DEAD;
 		spin_lock(&net->xfrm.xfrm_state_lock);
 		list_del(&x->km.all);
-		hlist_del(&x->bydst);
-		hlist_del(&x->bysrc);
+		hlist_del_rcu(&x->bydst);
+		hlist_del_rcu(&x->bysrc);
 		if (x->id.spi)
-			hlist_del(&x->byspi);
+			hlist_del_rcu(&x->byspi);
 		net->xfrm.state_num--;
 		spin_unlock(&net->xfrm.xfrm_state_lock);
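The state GC rework replaces the per-netns gc list and work item with a single global worklist. Because the hash chains are now walked under RCU, xfrm_state_gc_task() issues one synchronize_rcu() before destroying the batch, which guarantees that no lookup still holds a pointer to an entry that was hlist_del_rcu()'d. The skeleton of that pattern, with generic placeholder names:

	static HLIST_HEAD(gc_list);
	static DEFINE_SPINLOCK(gc_lock);

	static void gc_task(struct work_struct *work)
	{
		struct hlist_head batch;

		spin_lock_bh(&gc_lock);
		hlist_move_list(&gc_list, &batch);	/* grab the current batch */
		spin_unlock_bh(&gc_lock);

		synchronize_rcu();	/* all RCU readers of these entries are done */

		/* ... hlist_for_each_entry_safe(): destroy each entry ... */
	}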
@@ -660,7 +677,7 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
 	unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family);
 	struct xfrm_state *x;
 
-	hlist_for_each_entry(x, net->xfrm.state_byspi+h, byspi) {
+	hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) {
 		if (x->props.family != family ||
 		    x->id.spi != spi ||
 		    x->id.proto != proto ||
@@ -669,7 +686,8 @@ static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark,
 		if ((mark & x->mark.m) != x->mark.v)
 			continue;
-		xfrm_state_hold(x);
+		if (!xfrm_state_hold_rcu(x))
+			continue;
 		return x;
 	}
 
@@ -684,7 +702,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
 	unsigned int h = xfrm_src_hash(net, daddr, saddr, family);
 	struct xfrm_state *x;
 
-	hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) {
+	hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) {
 		if (x->props.family != family ||
 		    x->id.proto != proto ||
 		    !xfrm_addr_equal(&x->id.daddr, daddr, family) ||
@@ -693,7 +711,8 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark,
 		if ((mark & x->mark.m) != x->mark.v)
 			continue;
-		xfrm_state_hold(x);
+		if (!xfrm_state_hold_rcu(x))
+			continue;
 		return x;
 	}
 
@@ -776,13 +795,16 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 	struct xfrm_state *best = NULL;
 	u32 mark = pol->mark.v & pol->mark.m;
 	unsigned short encap_family = tmpl->encap_family;
+	unsigned int sequence;
 	struct km_event c;
 
 	to_put = NULL;
 
-	spin_lock_bh(&net->xfrm.xfrm_state_lock);
+	sequence = read_seqcount_begin(&xfrm_state_hash_generation);
+
+	rcu_read_lock();
 	h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family);
-	hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
+	hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) {
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
@@ -798,7 +820,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
 		goto found;
 
 	h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family);
-	hlist_for_each_entry(x, net->xfrm.state_bydst+h_wildcard, bydst) {
+	hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) {
 		if (x->props.family == encap_family &&
 		    x->props.reqid == tmpl->reqid &&
 		    (mark & x->mark.m) == x->mark.v &&
@@ -851,19 +873,21 @@ found:
 		}
 
 		if (km_query(x, tmpl, pol) == 0) {
+			spin_lock_bh(&net->xfrm.xfrm_state_lock);
 			x->km.state = XFRM_STATE_ACQ;
 			list_add(&x->km.all, &net->xfrm.state_all);
-			hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+			hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
 			h = xfrm_src_hash(net, daddr, saddr, encap_family);
-			hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+			hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
 			if (x->id.spi) {
 				h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family);
-				hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+				hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
 			}
 			x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
 			tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
 			net->xfrm.state_num++;
 			xfrm_hash_grow_check(net, x->bydst.next != NULL);
+			spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 		} else {
 			x->km.state = XFRM_STATE_DEAD;
 			to_put = x;
@@ -872,13 +896,26 @@ found:
 		}
 	}
 out:
-	if (x)
-		xfrm_state_hold(x);
-	else
+	if (x) {
+		if (!xfrm_state_hold_rcu(x)) {
+			*err = -EAGAIN;
+			x = NULL;
+		}
+	} else {
 		*err = acquire_in_progress ? -EAGAIN : error;
-	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+	}
+	rcu_read_unlock();
 	if (to_put)
 		xfrm_state_put(to_put);
+
+	if (read_seqcount_retry(&xfrm_state_hash_generation, sequence)) {
+		*err = -EAGAIN;
+		if (x) {
+			xfrm_state_put(x);
+			x = NULL;
+		}
+	}
+
 	return x;
 }
 
@@ -946,16 +983,16 @@ static void __xfrm_state_insert(struct xfrm_state *x)
 
 	h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr,
 			  x->props.reqid, x->props.family);
-	hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+	hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
 
 	h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family);
-	hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+	hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
 
 	if (x->id.spi) {
 		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto,
 				  x->props.family);
 
-		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+		hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
 	}
 
 	tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL);
@@ -1064,9 +1101,9 @@ static struct xfrm_state *__find_acq_core(struct net *net,
 		xfrm_state_hold(x);
 		tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL);
 		list_add(&x->km.all, &net->xfrm.state_all);
-		hlist_add_head(&x->bydst, net->xfrm.state_bydst+h);
+		hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h);
 		h = xfrm_src_hash(net, daddr, saddr, family);
-		hlist_add_head(&x->bysrc, net->xfrm.state_bysrc+h);
+		hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h);
 
 		net->xfrm.state_num++;
 
@@ -1395,9 +1432,9 @@ xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32
 {
 	struct xfrm_state *x;
 
-	spin_lock_bh(&net->xfrm.xfrm_state_lock);
+	rcu_read_lock();
 	x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family);
-	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+	rcu_read_unlock();
 	return x;
 }
 EXPORT_SYMBOL(xfrm_state_lookup);
@@ -1582,7 +1619,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high)
 	if (x->id.spi) {
 		spin_lock_bh(&net->xfrm.xfrm_state_lock);
 		h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family);
-		hlist_add_head(&x->byspi, net->xfrm.state_byspi+h);
+		hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h);
 		spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 		err = 0;
@@ -2100,8 +2137,6 @@ int __net_init xfrm_state_init(struct net *net)
 	net->xfrm.state_num = 0;
 	INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize);
-	INIT_HLIST_HEAD(&net->xfrm.state_gc_list);
-	INIT_WORK(&net->xfrm.state_gc_work, xfrm_state_gc_task);
 	spin_lock_init(&net->xfrm.xfrm_state_lock);
 
 	return 0;
@@ -2119,7 +2154,7 @@ void xfrm_state_fini(struct net *net)
 
 	flush_work(&net->xfrm.state_hash_work);
 	xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
-	flush_work(&net->xfrm.state_gc_work);
+	flush_work(&xfrm_state_gc_work);
 
 	WARN_ON(!list_empty(&net->xfrm.state_all));
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 05a6e3d9c258..35a7e794ad04 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -17,13 +17,13 @@ static struct ctl_table xfrm_table[] = {
 		.procname	= "xfrm_aevent_etime",
 		.maxlen		= sizeof(u32),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_douintvec
 	},
 	{
 		.procname	= "xfrm_aevent_rseqth",
 		.maxlen		= sizeof(u32),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_douintvec
 	},
 	{
 		.procname	= "xfrm_larval_drop",
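The sysctl change closes a small type hole: xfrm_aevent_etime and xfrm_aevent_rseqth are u32 values, but proc_dointvec parses signed integers, so userspace could store a negative number into an unsigned field. proc_douintvec parses strictly unsigned input. The resulting shape of a u32 sysctl entry looks like this (the .data initialization below is illustrative; the real table wires it up per-netns):

	{
		.procname	= "xfrm_aevent_etime",
		.data		= &init_net.xfrm.sysctl_aevent_etime,	/* illustrative */
		.maxlen		= sizeof(u32),
		.mode		= 0644,
		.proc_handler	= proc_douintvec	/* rejects negative input */
	},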