diff options
Diffstat (limited to 'net')
289 files changed, 12475 insertions, 6212 deletions
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c index 8a3507524f7b..33f17bd8cda7 100644 --- a/net/6lowpan/nhc_udp.c +++ b/net/6lowpan/nhc_udp.c @@ -5,7 +5,7 @@ * Authors: * Alexander Aring <aar@pengutronix.de> * - * Orignal written by: + * Original written by: * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> * Jon Smirl <jonsmirl@gmail.com> */ @@ -82,7 +82,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed) if (fail) return -EINVAL; - /* UDP length needs to be infered from the lower layers + /* UDP length needs to be inferred from the lower layers * here, we obtain the hint from the remaining size of the * frame */ diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 78ec2e1b14d1..59bc13b5f14f 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -4,6 +4,7 @@ #include <linux/if_vlan.h> #include <linux/netpoll.h> #include <linux/export.h> +#include <net/gro.h> #include "vlan.h" bool vlan_do_receive(struct sk_buff **skbp) @@ -495,7 +496,10 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head, skb_gro_pull(skb, sizeof(*vhdr)); skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); - pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + + pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + head, skb); out_unlock: rcu_read_unlock(); @@ -515,7 +519,9 @@ static int vlan_gro_complete(struct sk_buff *skb, int nhoff) rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr)); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, nhoff + sizeof(*vhdr)); rcu_read_unlock(); return err; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index dc1a197792e6..4db3f0621959 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -776,6 +776,26 @@ static int vlan_dev_get_iflink(const struct net_device *dev) return real_dev->ifindex; } +static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); + + path->type = DEV_PATH_VLAN; + path->encap.id = vlan->vlan_id; + path->encap.proto = vlan->vlan_proto; + path->dev = ctx->dev; + ctx->dev = vlan->real_dev; + if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) + return -ENOSPC; + + ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; + ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto; + ctx->num_vlans++; + + return 0; +} + static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, @@ -814,6 +834,7 @@ static const struct net_device_ops vlan_netdev_ops = { #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, + .ndo_fill_forward_path = vlan_dev_fill_forward_path, }; static void vlan_dev_free(struct net_device *dev) diff --git a/net/9p/client.c b/net/9p/client.c index 0a9019da18f3..b7b958f61faf 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -122,7 +122,7 @@ static int get_protocol_version(char *s) } /** - * parse_options - parse mount options into client structure + * parse_opts - parse mount options into client structure * @opts: options string passed from mount * @clnt: existing v9fs client information * @@ -256,7 +256,7 @@ EXPORT_SYMBOL(p9_fcall_fini); static struct kmem_cache *p9_req_cache; /** - * p9_req_alloc - Allocate a new request. + * p9_tag_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. * @max_size: Maximum packet size for this request. diff --git a/net/9p/error.c b/net/9p/error.c index 231f355fa9c6..61c18daf3050 100644 --- a/net/9p/error.c +++ b/net/9p/error.c @@ -197,7 +197,7 @@ int p9_error_init(void) EXPORT_SYMBOL(p9_error_init); /** - * errstr2errno - convert error string to error number + * p9_errstr2errno - convert error string to error number * @errstr: error string * @len: length of error string * diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index fa158397bb63..f4dd0456beaf 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -872,7 +872,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket) } /** - * p9_mux_destroy - cancels all pending requests of mux + * p9_conn_destroy - cancels all pending requests of mux * @m: mux to destroy * */ diff --git a/net/Kconfig b/net/Kconfig index 8cea808ad9e8..8d955195c069 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,6 +245,14 @@ source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" source "net/ncsi/Kconfig" +config PCPU_DEV_REFCNT + bool "Use percpu variables to maintain network device refcount" + depends on SMP + default y + help + network device refcount are using per cpu variables if this option is set. + This can be forced to N to detect underflows (with a performance drop). + config RPS bool depends on SMP && SYSFS @@ -317,13 +325,9 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG help - Enabling this allows a stream parser to be used with + Enabling this allows a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. - config NET_FLOW_LIMIT bool depends on RPS @@ -425,6 +429,10 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_SELFTESTS + def_tristate PHYLIB + depends on PHYLIB + config NET_SOCK_MSG bool default n diff --git a/net/ax25/TODO b/net/ax25/TODO deleted file mode 100644 index 69fb4e368d92..000000000000 --- a/net/ax25/TODO +++ /dev/null @@ -1,20 +0,0 @@ -Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and -listen_lock have to be bh-safe? - -Do the netrom and rose locks have to be bh-safe? - -A device might be deleted after lookup in the SIOCADDRT ioctl but before it's -being used. - -Routes to a device being taken down might be deleted by ax25_rt_device_down -but added by somebody else before the device has been deleted fully. - -The ax25_rt_find_route synopsys is pervert but I somehow had to deal with -the race caused by the static variable in it's previous implementation. - -Implement proper socket locking in netrom and rose. - -Check socket locking when ax25_rcv is sending to raw sockets. In particular -ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy. - -Handle XID and TEST frames properly. diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 269ee89d2c2b..2631efc6e359 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -850,6 +850,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, case AX25_P_ROSE: if (ax25_protocol_is_registered(AX25_P_ROSE)) return -ESOCKTNOSUPPORT; + break; #endif default: break; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a5e313cd6f44..789f257be24f 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -456,7 +456,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, * if: * * - the send time is within our MAX_AGGREGATION_MS time - * - the resulting packet wont be bigger than + * - the resulting packet won't be bigger than * MAX_AGGREGATION_BYTES * otherwise aggregation is not possible */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 360bdbf44748..7dc133cfc363 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -25,7 +25,6 @@ #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> -#include <linux/preempt.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> @@ -438,10 +437,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); - if (in_interrupt()) - netif_rx(skb); - else - netif_rx_ni(skb); + netif_rx_any_context(skb); out: if (primary_if) batadv_hardif_put(primary_if); diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 28166402d30c..1d63c8cbbfe7 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -454,8 +454,9 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, return 0; } - read_lock_bh(&in6_dev->lock); - for (pmc6 = in6_dev->mc_list; pmc6; pmc6 = pmc6->next) { + for (pmc6 = rcu_dereference(in6_dev->mc_list); + pmc6; + pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; @@ -484,7 +485,6 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, hlist_add_head(&new->list, mcast_list); ret++; } - read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); return ret; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 7c0b475cc22a..2be5d4a712c5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1659,19 +1659,19 @@ struct batadv_priv { /** @tp_list: list of tp sessions */ struct hlist_head tp_list; - /** @tp_num: number of currently active tp sessions */ + /** @orig_hash: hash table containing mesh participants (orig nodes) */ struct batadv_hashtable *orig_hash; - /** @orig_hash: hash table containing mesh participants (orig nodes) */ + /** @forw_bat_list_lock: lock protecting forw_bat_list */ spinlock_t forw_bat_list_lock; - /** @forw_bat_list_lock: lock protecting forw_bat_list */ + /** @forw_bcast_list_lock: lock protecting forw_bcast_list */ spinlock_t forw_bcast_list_lock; - /** @forw_bcast_list_lock: lock protecting forw_bcast_list */ + /** @tp_list_lock: spinlock protecting @tp_list */ spinlock_t tp_list_lock; - /** @tp_list_lock: spinlock protecting @tp_list */ + /** @tp_num: number of currently active tp sessions */ atomic_t tp_num; /** @orig_work: work queue callback item for orig node purging */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index cff4944d5b66..97617d02c8f9 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -205,8 +205,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, } } - /* use the neighbour cache for matching addresses assigned by SLAAC - */ + /* use the neighbour cache for matching addresses assigned by SLAAC */ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop); if (neigh) { list_for_each_entry_rcu(peer, &dev->peers, list) { @@ -841,8 +840,6 @@ static void chan_close_cb(struct l2cap_chan *chan) } else { spin_unlock(&devices_lock); } - - return; } static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 400c5130dc0a..e0ab4cd7afc3 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -99,6 +99,13 @@ config BT_MSFTEXT This options enables support for the Microsoft defined HCI vendor extensions. +config BT_AOSPEXT + bool "Enable Android Open Source Project extensions" + depends on BT + help + This options enables support for the Android Open Source + Project defined HCI vendor extensions. + config BT_DEBUGFS bool "Export Bluetooth internals in debugfs" depends on BT && DEBUG_FS diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 1c645fba8c49..cc0995301f93 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -20,5 +20,6 @@ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o +bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c new file mode 100644 index 000000000000..a1b7762335a5 --- /dev/null +++ b/net/bluetooth/aosp.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Intel Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> + +#include "aosp.h" + +void aosp_do_open(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + if (!hdev->aosp_capable) + return; + + bt_dev_dbg(hdev, "Initialize AOSP extension"); + + /* LE Get Vendor Capabilities Command */ + skb = __hci_cmd_sync(hdev, hci_opcode_pack(0x3f, 0x153), 0, NULL, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) + return; + + kfree_skb(skb); +} + +void aosp_do_close(struct hci_dev *hdev) +{ + if (!hdev->aosp_capable) + return; + + bt_dev_dbg(hdev, "Cleanup of AOSP extension"); +} diff --git a/net/bluetooth/aosp.h b/net/bluetooth/aosp.h new file mode 100644 index 000000000000..328fc6d39f70 --- /dev/null +++ b/net/bluetooth/aosp.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Intel Corporation + */ + +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + +void aosp_do_open(struct hci_dev *hdev); +void aosp_do_close(struct hci_dev *hdev); + +#else + +static inline void aosp_do_open(struct hci_dev *hdev) {} +static inline void aosp_do_close(struct hci_dev *hdev) {} + +#endif diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h index a6f8d03d4aaf..830723971cf8 100644 --- a/net/bluetooth/ecdh_helper.h +++ b/net/bluetooth/ecdh_helper.h @@ -25,6 +25,6 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pair_public_key[64], u8 secret[32]); -int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 *private_key); +int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32]); int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]); int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64]); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 6ffa89e3ba0a..88ec08978ff4 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -772,6 +772,16 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status) hci_conn_del(conn); + /* The suspend notifier is waiting for all devices to disconnect and an + * LE connect cancel will result in an hci_le_conn_failed. Once the last + * connection is deleted, we should also wake the suspend queue to + * complete suspend operations. + */ + if (list_empty(&hdev->conn_hash.list) && + test_and_clear_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks)) { + wake_up(&hdev->suspend_wait_q); + } + /* Since we may have temporarily stopped the background scanning in * favor of connection establishment, we should restart it. */ @@ -1830,8 +1840,6 @@ u32 hci_conn_get_phy(struct hci_conn *conn) { u32 phys = 0; - hci_dev_lock(conn->hdev); - /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: * Table 6.2: Packets defined for synchronous, asynchronous, and * CSB logical transport types. @@ -1928,7 +1936,5 @@ u32 hci_conn_get_phy(struct hci_conn *conn) break; } - hci_dev_unlock(conn->hdev); - return phys; } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b0d9c36acc03..fd12f1652bdf 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "smp.h" #include "leds.h" #include "msft.h" +#include "aosp.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1586,6 +1587,7 @@ setup_failed: ret = hdev->set_diag(hdev, true); msft_do_open(hdev); + aosp_do_open(hdev); clear_bit(HCI_INIT, &hdev->flags); @@ -1782,6 +1784,7 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + aosp_do_close(hdev); msft_do_close(hdev); if (hdev->flush) @@ -3760,6 +3763,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_scan_window_suspend = 0x0012; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; + hdev->le_scan_int_adv_monitor = 0x0060; + hdev->le_scan_window_adv_monitor = 0x0030; hdev->le_scan_int_connect = 0x0060; hdev->le_scan_window_connect = 0x0060; hdev->le_conn_min_interval = 0x0018; diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 1a0ab58bfad0..47f4f21fbc1a 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -274,7 +274,7 @@ static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -292,7 +292,7 @@ static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -428,7 +428,7 @@ static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hdev->ssp_debug_mode ? 'Y': 'N'; + buf[0] = hdev->ssp_debug_mode ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); @@ -742,7 +742,7 @@ static ssize_t force_static_address_read(struct file *file, struct hci_dev *hdev = file->private_data; char buf[3]; - buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y': 'N'; + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y' : 'N'; buf[1] = '\n'; buf[2] = '\0'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 67668be3461e..016b2999f219 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -395,6 +395,29 @@ done: hci_dev_unlock(hdev); } +static void hci_cc_set_event_filter(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 *)skb->data); + struct hci_cp_set_event_filter *cp; + void *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_SET_EVENT_FLT); + if (!sent) + return; + + cp = (struct hci_cp_set_event_filter *)sent; + + if (cp->flt_type == HCI_FLT_CLEAR_ALL) + hci_dev_clear_flag(hdev, HCI_EVENT_FILTER_CONFIGURED); + else + hci_dev_set_flag(hdev, HCI_EVENT_FILTER_CONFIGURED); +} + static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_class_of_dev *rp = (void *) skb->data; @@ -1189,12 +1212,11 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, hci_dev_lock(hdev); - if (!hdev->cur_adv_instance) { + if (!cp->handle) { /* Store in hdev for instance 0 (Set adv and Directed advs) */ bacpy(&hdev->random_addr, &cp->bdaddr); } else { - adv_instance = hci_find_adv_instance(hdev, - hdev->cur_adv_instance); + adv_instance = hci_find_adv_instance(hdev, cp->handle); if (adv_instance) bacpy(&adv_instance->random_addr, &cp->bdaddr); } @@ -1755,17 +1777,16 @@ static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); hdev->adv_addr_type = cp->own_addr_type; - if (!hdev->cur_adv_instance) { + if (!cp->handle) { /* Store in hdev for instance 0 */ hdev->adv_tx_power = rp->tx_power; } else { - adv_instance = hci_find_adv_instance(hdev, - hdev->cur_adv_instance); + adv_instance = hci_find_adv_instance(hdev, cp->handle); if (adv_instance) adv_instance->tx_power = rp->tx_power; } /* Update adv data as tx power is known now */ - hci_req_update_adv_data(hdev, hdev->cur_adv_instance); + hci_req_update_adv_data(hdev, cp->handle); hci_dev_unlock(hdev); } @@ -3328,6 +3349,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_scan_enable(hdev, skb); break; + case HCI_OP_SET_EVENT_FLT: + hci_cc_set_event_filter(hdev, skb); + break; + case HCI_OP_READ_CLASS_OF_DEV: hci_cc_read_class_of_dev(hdev, skb); break; @@ -5005,6 +5030,7 @@ static void hci_loglink_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) return; hchan->handle = le16_to_cpu(ev->handle); + hchan->amp = true; BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan); @@ -5037,7 +5063,7 @@ static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan) + if (!hchan || !hchan->amp) goto unlock; amp_destroy_logical_link(hchan, ev->reason); @@ -5280,12 +5306,12 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) return; - if (!hdev->cur_adv_instance) { + if (!ev->handle) { bacpy(&conn->resp_addr, &hdev->random_addr); return; } - adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + adv_instance = hci_find_adv_instance(hdev, ev->handle); if (adv_instance) bacpy(&conn->resp_addr, &adv_instance->random_addr); } @@ -5863,7 +5889,7 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, params->conn_latency = latency; params->supervision_timeout = timeout; store_hint = 0x01; - } else{ + } else { store_hint = 0x00; } @@ -5911,7 +5937,7 @@ static void hci_le_phy_update_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); - if (!ev->status) + if (ev->status) return; hci_dev_lock(hdev); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e55976db4403..560b74d421a8 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -847,6 +847,10 @@ static u8 update_white_list(struct hci_request *req) */ bool allow_rpa = hdev->suspended; + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) + allow_rpa = true; + /* Go through the current white list programmed into the * controller one by one and check if that address is still * in the list of pending connections or list of devices to @@ -1131,14 +1135,14 @@ static void hci_req_clear_event_filter(struct hci_request *req) { struct hci_cp_set_event_filter f; - memset(&f, 0, sizeof(f)); - f.flt_type = HCI_FLT_CLEAR_ALL; - hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f); + if (!hci_dev_test_flag(req->hdev, HCI_BREDR_ENABLED)) + return; - /* Update page scan state (since we may have modified it when setting - * the event filter). - */ - __hci_req_update_scan(req); + if (hci_dev_test_flag(req->hdev, HCI_EVENT_FILTER_CONFIGURED)) { + memset(&f, 0, sizeof(f)); + f.flt_type = HCI_FLT_CLEAR_ALL; + hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f); + } } static void hci_req_set_event_filter(struct hci_request *req) @@ -1147,6 +1151,10 @@ static void hci_req_set_event_filter(struct hci_request *req) struct hci_cp_set_event_filter f; struct hci_dev *hdev = req->hdev; u8 scan = SCAN_DISABLED; + bool scanning = test_bit(HCI_PSCAN, &hdev->flags); + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + return; /* Always clear event filter when starting */ hci_req_clear_event_filter(req); @@ -1167,12 +1175,13 @@ static void hci_req_set_event_filter(struct hci_request *req) scan = SCAN_PAGE; } - if (scan) + if (scan && !scanning) { set_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks); - else + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + } else if (!scan && scanning) { set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); - - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + } } static void cancel_adv_timeout(struct hci_dev *hdev) @@ -1315,9 +1324,14 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hdev->advertising_paused = true; hdev->advertising_old_state = old_state; - /* Disable page scan */ - page_scan = SCAN_DISABLED; - hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); + + /* Disable page scan if enabled */ + if (test_bit(HCI_PSCAN, &hdev->flags)) { + page_scan = SCAN_DISABLED; + hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, + &page_scan); + set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); + } /* Disable LE passive scan if enabled */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { @@ -1328,9 +1342,6 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Disable advertisement filters */ hci_req_add_set_adv_filter_enable(&req, false); - /* Mark task needing completion */ - set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); - /* Prevent disconnects from causing scanning to be re-enabled */ hdev->scanning_paused = true; @@ -1364,7 +1375,10 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hdev->suspended = false; hdev->scanning_paused = false; + /* Clear any event filters and restore scan state */ hci_req_clear_event_filter(&req); + __hci_req_update_scan(&req); + /* Reset passive/background scanning to normal */ __hci_update_background_scan(&req); /* Enable all of the advertisement filters */ @@ -1637,9 +1651,8 @@ static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) { u8 scan_rsp_len = 0; - if (hdev->appearance) { + if (hdev->appearance) scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - } return append_local_name(hdev, ptr, scan_rsp_len); } @@ -1657,9 +1670,8 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, instance_flags = adv_instance->flags; - if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) { + if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - } memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data, adv_instance->scan_rsp_len); @@ -2035,7 +2047,8 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, /* If Controller supports LL Privacy use own address type is * 0x03 */ - if (use_ll_privacy(hdev)) + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; @@ -2170,7 +2183,8 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (adv_instance_is_scannable(hdev, instance)) { + } else if (adv_instance_is_scannable(hdev, instance) || + (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else @@ -2508,7 +2522,8 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, /* If Controller supports LL Privacy use own address type is * 0x03 */ - if (use_ll_privacy(hdev)) + if (use_ll_privacy(hdev) && + hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; @@ -2941,6 +2956,9 @@ static int bredr_inquiry(struct hci_request *req, unsigned long opt) const u8 liac[3] = { 0x00, 0x8b, 0x9e }; struct hci_cp_inquiry cp; + if (test_bit(HCI_INQUIRY, &req->hdev->flags)) + return 0; + bt_dev_dbg(req->hdev, ""); hci_dev_lock(req->hdev); @@ -3241,6 +3259,7 @@ bool hci_req_stop_discovery(struct hci_request *req) if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { cancel_delayed_work(&hdev->le_scan_disable); + cancel_delayed_work(&hdev->le_scan_restart); hci_req_add_le_scan_disable(req, false); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 72c2f5226d67..b6a88b8256c7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -451,6 +451,8 @@ struct l2cap_chan *l2cap_chan_create(void) if (!chan) return NULL; + skb_queue_head_init(&chan->tx_q); + skb_queue_head_init(&chan->srej_q); mutex_init(&chan->lock); /* Set default lock nesting level */ @@ -490,14 +492,14 @@ static void l2cap_chan_destroy(struct kref *kref) void l2cap_chan_hold(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); + BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref)); kref_get(&c->kref); } void l2cap_chan_put(struct l2cap_chan *c) { - BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref)); + BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref)); kref_put(&c->kref, l2cap_chan_destroy); } @@ -516,7 +518,9 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan) chan->flush_to = L2CAP_DEFAULT_FLUSH_TO; chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO; chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO; + chan->conf_state = 0; + set_bit(CONF_NOT_COMPLETE, &chan->conf_state); set_bit(FLAG_FORCE_ACTIVE, &chan->flags); } @@ -648,7 +652,7 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state)) return; - switch(chan->mode) { + switch (chan->mode) { case L2CAP_MODE_BASIC: break; @@ -672,8 +676,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err) skb_queue_purge(&chan->tx_q); break; } - - return; } EXPORT_SYMBOL_GPL(l2cap_chan_del); @@ -1690,7 +1692,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) smp_conn_security(hcon, hcon->pending_sec_level); /* For LE slave connections, make sure the connection interval - * is in the range of the minium and maximum interval that has + * is in the range of the minimum and maximum interval that has * been configured for this connection. If not, then trigger * the connection update procedure. */ @@ -5921,7 +5923,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, struct l2cap_ecred_conn_req *req = (void *) data; struct { struct l2cap_ecred_conn_rsp rsp; - __le16 dcid[5]; + __le16 dcid[L2CAP_ECRED_MAX_CID]; } __packed pdu; struct l2cap_chan *chan, *pchan; u16 mtu, mps; @@ -5938,6 +5940,14 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } + cmd_len -= sizeof(*req); + num_scid = cmd_len / sizeof(u16); + + if (num_scid > ARRAY_SIZE(pdu.dcid)) { + result = L2CAP_CR_LE_INVALID_PARAMS; + goto response; + } + mtu = __le16_to_cpu(req->mtu); mps = __le16_to_cpu(req->mps); @@ -5970,8 +5980,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, } result = L2CAP_CR_LE_SUCCESS; - cmd_len -= sizeof(*req); - num_scid = cmd_len / sizeof(u16); for (i = 0; i < num_scid; i++) { u16 scid = __le16_to_cpu(req->scid[i]); @@ -7253,7 +7261,7 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control, L2CAP_TXSEQ_EXPECTED) { l2cap_pass_to_tx(chan, control); - BT_DBG("buffer_seq %d->%d", chan->buffer_seq, + BT_DBG("buffer_seq %u->%u", chan->buffer_seq, __next_seq(chan, chan->buffer_seq)); chan->buffer_seq = __next_seq(chan, chan->buffer_seq); @@ -7542,7 +7550,7 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid, BT_DBG("chan %p, len %d", chan, skb->len); /* If we receive data on a fixed channel before the info req/rsp - * procdure is done simply assume that the channel is supported + * procedure is done simply assume that the channel is supported * and mark it as ready. */ if (chan->chan_type == L2CAP_CHAN_FIXED) @@ -7762,7 +7770,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) return conn; } -static bool is_valid_psm(u16 psm, u8 dst_type) { +static bool is_valid_psm(u16 psm, u8 dst_type) +{ if (!psm) return false; @@ -8356,7 +8365,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (!conn) goto drop; - BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags); + BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); switch (flags) { case ACL_START: @@ -8386,10 +8395,10 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) return; } - BT_DBG("Start: total len %d, frag len %d", len, skb->len); + BT_DBG("Start: total len %d, frag len %u", len, skb->len); if (skb->len > len) { - BT_ERR("Frame is too long (len %d, expected len %d)", + BT_ERR("Frame is too long (len %u, expected len %d)", skb->len, len); l2cap_conn_unreliable(conn, ECOMM); goto drop; @@ -8402,7 +8411,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) break; case ACL_CONT: - BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); + BT_DBG("Cont: frag len %u (expecting %u)", skb->len, conn->rx_len); if (!conn->rx_skb) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); @@ -8423,7 +8432,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) } if (skb->len > conn->rx_len) { - BT_ERR("Fragment is too long (len %d, expected %d)", + BT_ERR("Fragment is too long (len %u, expected %u)", skb->len, conn->rx_len); l2cap_recv_reset(conn); l2cap_conn_unreliable(conn, ECOMM); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index f1b1edd0b697..c99d65ef13b1 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -179,9 +179,17 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, struct l2cap_chan *chan = l2cap_pi(sk)->chan; struct sockaddr_l2 la; int len, err = 0; + bool zapped; BT_DBG("sk %p", sk); + lock_sock(sk); + zapped = sock_flag(sk, SOCK_ZAPPED); + release_sock(sk); + + if (zapped) + return -EINVAL; + if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 74971b4bd457..f9be7f9084d6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -40,7 +40,7 @@ #include "msft.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 19 +#define MGMT_REVISION 20 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -108,6 +108,8 @@ static const u16 mgmt_commands[] = { MGMT_OP_START_LIMITED_DISCOVERY, MGMT_OP_READ_EXT_INFO, MGMT_OP_SET_APPEARANCE, + MGMT_OP_GET_PHY_CONFIGURATION, + MGMT_OP_SET_PHY_CONFIGURATION, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_OP_READ_CONTROLLER_CAP, @@ -166,6 +168,8 @@ static const u16 mgmt_events[] = { MGMT_EV_PHY_CONFIGURATION_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, MGMT_EV_DEVICE_FLAGS_CHANGED, + MGMT_EV_ADV_MONITOR_ADDED, + MGMT_EV_ADV_MONITOR_REMOVED, MGMT_EV_CONTROLLER_SUSPEND, MGMT_EV_CONTROLLER_RESUME, }; @@ -196,8 +200,6 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, MGMT_EV_EXP_FEATURE_CHANGED, - MGMT_EV_ADV_MONITOR_ADDED, - MGMT_EV_ADV_MONITOR_REMOVED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -3728,8 +3730,11 @@ static int read_controller_cap(struct sock *sk, struct hci_dev *hdev, /* When the Read Simple Pairing Options command is supported, then * the remote public key validation is supported. + * + * Alternatively, when Microsoft extensions are available, they can + * indicate support for public key validation as well. */ - if (hdev->commands[41] & 0x08) + if ((hdev->commands[41] & 0x08) || msft_curve_validity(hdev)) flags |= 0x01; /* Remote public key validation (BR/EDR) */ flags |= 0x02; /* Remote public key validation (LE) */ @@ -3982,7 +3987,7 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_NOT_POWERED); + MGMT_STATUS_REJECTED); /* Parameters are limited to a single octet */ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) @@ -7432,6 +7437,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_PARAM_TIMEOUT; flags |= MGMT_ADV_PARAM_INTERVALS; flags |= MGMT_ADV_PARAM_TX_POWER; + flags |= MGMT_ADV_PARAM_SCAN_RSP; /* In extended adv TX_POWER returned from Set Adv Param * will be always valid. @@ -7475,7 +7481,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, * advertising. */ if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); @@ -7976,7 +7982,6 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev, goto unlock; } - hdev->cur_adv_instance = cp->instance; /* Submit request for advertising params if ext adv available */ if (ext_adv_capable(hdev)) { hci_req_init(&req, hdev); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 47b104f318e9..e28f15439ce4 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -142,6 +142,9 @@ static bool read_supported_features(struct hci_dev *hdev, msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); + if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) + hdev->msft_curve_validity = true; + kfree_skb(skb); return true; @@ -605,3 +608,8 @@ int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return err; } + +bool msft_curve_validity(struct hci_dev *hdev) +{ + return hdev->msft_curve_validity; +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 88ed613dfa08..6e56d94b88d8 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -22,6 +22,7 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, u16 handle); void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); int msft_set_filter_enable(struct hci_dev *hdev, bool enable); +bool msft_curve_validity(struct hci_dev *hdev); #else @@ -54,4 +55,9 @@ static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return -EOPNOTSUPP; } +static inline bool msft_curve_validity(struct hci_dev *hdev) +{ + return false; +} + #endif diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 22a110f37abc..3bd41563f118 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -51,8 +51,8 @@ struct sco_conn { unsigned int mtu; }; -#define sco_conn_lock(c) spin_lock(&c->lock); -#define sco_conn_unlock(c) spin_unlock(&c->lock); +#define sco_conn_lock(c) spin_lock(&c->lock) +#define sco_conn_unlock(c) spin_unlock(&c->lock) static void sco_sock_close(struct sock *sk); static void sco_sock_kill(struct sock *sk); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index b0c1ee110eff..5c17acfb1645 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -54,7 +54,7 @@ #define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd) /* Keys which are not distributed with Secure Connections */ -#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY); +#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY) #define SMP_TIMEOUT msecs_to_jiffies(30000) @@ -398,7 +398,7 @@ static int smp_e(const u8 *k, u8 *r) SMP_DBG("r %16phN", r); - memzero_explicit(&ctx, sizeof (ctx)); + memzero_explicit(&ctx, sizeof(ctx)); return err; } @@ -595,7 +595,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) if (!chan) return; - BT_DBG("code 0x%2.2x", code); + bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code); iv[0].iov_base = &code; iv[0].iov_len = 1; @@ -859,7 +859,8 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, memset(smp->tk, 0, sizeof(smp->tk)); clear_bit(SMP_FLAG_TK_VALID, &smp->flags); - BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io); + bt_dev_dbg(hcon->hdev, "auth:%d lcl:%d rem:%d", auth, local_io, + remote_io); /* If neither side wants MITM, either "just" confirm an incoming * request or use just-works for outgoing ones. The JUST_CFM @@ -924,7 +925,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, get_random_bytes(&passkey, sizeof(passkey)); passkey %= 1000000; put_unaligned_le32(passkey, smp->tk); - BT_DBG("PassKey: %d", passkey); + bt_dev_dbg(hcon->hdev, "PassKey: %d", passkey); set_bit(SMP_FLAG_TK_VALID, &smp->flags); } @@ -949,7 +950,7 @@ static u8 smp_confirm(struct smp_chan *smp) struct smp_cmd_pairing_confirm cp; int ret; - BT_DBG("conn %p", conn); + bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp, conn->hcon->init_addr_type, &conn->hcon->init_addr, @@ -977,7 +978,8 @@ static u8 smp_random(struct smp_chan *smp) u8 confirm[16]; int ret; - BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); + bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, + conn->hcon->out ? "master" : "slave"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, @@ -1236,7 +1238,7 @@ static void smp_distribute_keys(struct smp_chan *smp) struct hci_dev *hdev = hcon->hdev; __u8 *keydist; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); rsp = (void *) &smp->prsp[1]; @@ -1266,7 +1268,7 @@ static void smp_distribute_keys(struct smp_chan *smp) *keydist &= ~SMP_SC_NO_DIST; } - BT_DBG("keydist 0x%x", *keydist); + bt_dev_dbg(hdev, "keydist 0x%x", *keydist); if (*keydist & SMP_DIST_ENC_KEY) { struct smp_cmd_encrypt_info enc; @@ -1366,13 +1368,14 @@ static void smp_timeout(struct work_struct *work) security_timer.work); struct l2cap_conn *conn = smp->conn; - BT_DBG("conn %p", conn); + bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM); } static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) { + struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan = conn->smp; struct smp_chan *smp; @@ -1382,13 +1385,13 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(smp->tfm_cmac)) { - BT_ERR("Unable to create CMAC crypto context"); + bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context"); goto zfree_smp; } smp->tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(smp->tfm_ecdh)) { - BT_ERR("Unable to create ECDH crypto context"); + bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context"); goto free_shash; } @@ -1399,7 +1402,7 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn) INIT_DELAYED_WORK(&smp->security_timer, smp_timeout); - hci_conn_hold(conn->hcon); + hci_conn_hold(hcon); return smp; @@ -1564,8 +1567,8 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) if (!hcon->out) return 0; - BT_DBG("%s Starting passkey round %u", hdev->name, - smp->passkey_round + 1); + bt_dev_dbg(hdev, "Starting passkey round %u", + smp->passkey_round + 1); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); @@ -1625,11 +1628,11 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) u32 value; int err; - BT_DBG(""); - if (!conn) return -ENOTCONN; + bt_dev_dbg(conn->hcon->hdev, ""); + chan = conn->smp; if (!chan) return -ENOTCONN; @@ -1651,7 +1654,7 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey) case MGMT_OP_USER_PASSKEY_REPLY: value = le32_to_cpu(passkey); memset(smp->tk, 0, sizeof(smp->tk)); - BT_DBG("PassKey: %d", value); + bt_dev_dbg(conn->hcon->hdev, "PassKey: %d", value); put_unaligned_le32(value, smp->tk); fallthrough; case MGMT_OP_USER_CONFIRM_REPLY: @@ -1733,7 +1736,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) u8 key_size, auth, sec_level; int ret; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; @@ -1887,7 +1890,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) } if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { - BT_DBG("Using debug keys"); + bt_dev_dbg(hdev, "Using debug keys"); if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk)) return SMP_UNSPECIFIED; memcpy(smp->local_pk, debug_pk, 64); @@ -1924,7 +1927,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) u8 key_size, auth; int ret; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; @@ -2019,7 +2022,7 @@ static u8 sc_check_confirm(struct smp_chan *smp) { struct l2cap_conn *conn = smp->conn; - BT_DBG(""); + bt_dev_dbg(conn->hcon->hdev, ""); if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); @@ -2078,8 +2081,10 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) { struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; + struct hci_conn *hcon = conn->hcon; + struct hci_dev *hdev = hcon->hdev; - BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave"); + bt_dev_dbg(hdev, "conn %p %s", conn, hcon->out ? "master" : "slave"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; @@ -2094,7 +2099,7 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags)) return sc_check_confirm(smp); - BT_ERR("Unexpected SMP Pairing Confirm"); + bt_dev_err(hdev, "Unexpected SMP Pairing Confirm"); ret = fixup_sc_false_positive(smp); if (ret) @@ -2125,7 +2130,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) u32 passkey; int err; - BT_DBG("conn %p", conn); + bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(smp->rrnd)) return SMP_INVALID_PARAMS; @@ -2284,7 +2289,7 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_chan *smp; u8 sec_level, auth; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; @@ -2347,7 +2352,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) __u8 authreq; int ret; - BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level); + bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon, + sec_level); /* This may be NULL if there's an unexpected disconnection */ if (!conn) @@ -2483,7 +2489,7 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb) struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; - BT_DBG("conn %p", conn); + bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; @@ -2516,7 +2522,7 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_ltk *ltk; u8 authenticated; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; @@ -2548,7 +2554,7 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb) struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; - BT_DBG(""); + bt_dev_dbg(conn->hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; @@ -2580,7 +2586,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn, struct hci_conn *hcon = conn->hcon; bdaddr_t rpa; - BT_DBG(""); + bt_dev_dbg(hcon->hdev, ""); if (skb->len < sizeof(*info)) return SMP_INVALID_PARAMS; @@ -2647,7 +2653,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_chan *smp = chan->data; struct smp_csrk *csrk; - BT_DBG("conn %p", conn); + bt_dev_dbg(conn->hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*rp)) return SMP_INVALID_PARAMS; @@ -2727,11 +2733,20 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing_confirm cfm; int err; - BT_DBG("conn %p", conn); + bt_dev_dbg(hdev, "conn %p", conn); if (skb->len < sizeof(*key)) return SMP_INVALID_PARAMS; + /* Check if remote and local public keys are the same and debug key is + * not in use. + */ + if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) && + !crypto_memneq(key, smp->local_pk, 64)) { + bt_dev_err(hdev, "Remote and local public keys are identical"); + return SMP_UNSPECIFIED; + } + memcpy(smp->remote_pk, key, 64); if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) { @@ -2782,7 +2797,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) smp->method = sc_select_method(smp); - BT_DBG("%s selected method 0x%02x", hdev->name, smp->method); + bt_dev_dbg(hdev, "selected method 0x%02x", smp->method); /* JUST_WORKS and JUST_CFM result in an unauthenticated key */ if (smp->method == JUST_WORKS || smp->method == JUST_CFM) @@ -2857,7 +2872,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) u8 io_cap[3], r[16], e[16]; int err; - BT_DBG("conn %p", conn); + bt_dev_dbg(hcon->hdev, "conn %p", conn); if (skb->len < sizeof(*check)) return SMP_INVALID_PARAMS; @@ -2917,7 +2932,7 @@ static int smp_cmd_keypress_notify(struct l2cap_conn *conn, { struct smp_cmd_keypress_notify *kp = (void *) skb->data; - BT_DBG("value 0x%02x", kp->value); + bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value); return 0; } @@ -3014,7 +3029,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb) break; default: - BT_DBG("Unknown command code 0x%2.2x", code); + bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code); reason = SMP_CMD_NOTSUPP; goto done; } @@ -3039,7 +3054,7 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err) { struct l2cap_conn *conn = chan->conn; - BT_DBG("chan %p", chan); + bt_dev_dbg(conn->hcon->hdev, "chan %p", chan); if (chan->data) smp_chan_destroy(conn); @@ -3056,7 +3071,7 @@ static void bredr_pairing(struct l2cap_chan *chan) struct smp_cmd_pairing req; struct smp_chan *smp; - BT_DBG("chan %p", chan); + bt_dev_dbg(hdev, "chan %p", chan); /* Only new pairings are interesting */ if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags)) @@ -3103,7 +3118,7 @@ static void bredr_pairing(struct l2cap_chan *chan) set_bit(SMP_FLAG_SC, &smp->flags); - BT_DBG("%s starting SMP over BR/EDR", hdev->name); + bt_dev_dbg(hdev, "starting SMP over BR/EDR"); /* Prepare and send the BR/EDR SMP Pairing Request */ build_bredr_pairing_cmd(smp, &req, NULL); @@ -3121,7 +3136,7 @@ static void smp_resume_cb(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; - BT_DBG("chan %p", chan); + bt_dev_dbg(hcon->hdev, "chan %p", chan); if (hcon->type == ACL_LINK) { bredr_pairing(chan); @@ -3144,7 +3159,7 @@ static void smp_ready_cb(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; - BT_DBG("chan %p", chan); + bt_dev_dbg(hcon->hdev, "chan %p", chan); /* No need to call l2cap_chan_hold() here since we already own * the reference taken in smp_new_conn_cb(). This is just the @@ -3162,7 +3177,7 @@ static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { int err; - BT_DBG("chan %p", chan); + bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); err = smp_sig_channel(chan, skb); if (err) { @@ -3214,7 +3229,7 @@ static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) { struct l2cap_chan *chan; - BT_DBG("pchan %p", pchan); + bt_dev_dbg(pchan->conn->hcon->hdev, "pchan %p", pchan); chan = l2cap_chan_create(); if (!chan) @@ -3235,7 +3250,7 @@ static inline struct l2cap_chan *smp_new_conn_cb(struct l2cap_chan *pchan) */ atomic_set(&chan->nesting, L2CAP_NESTING_SMP); - BT_DBG("created chan %p", chan); + bt_dev_dbg(pchan->conn->hcon->hdev, "created chan %p", chan); return chan; } @@ -3276,14 +3291,14 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid) tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0); if (IS_ERR(tfm_cmac)) { - BT_ERR("Unable to create CMAC crypto context"); + bt_dev_err(hdev, "Unable to create CMAC crypto context"); kfree_sensitive(smp); return ERR_CAST(tfm_cmac); } tfm_ecdh = crypto_alloc_kpp("ecdh", 0, 0); if (IS_ERR(tfm_ecdh)) { - BT_ERR("Unable to create ECDH crypto context"); + bt_dev_err(hdev, "Unable to create ECDH crypto context"); crypto_free_shash(tfm_cmac); kfree_sensitive(smp); return ERR_CAST(tfm_ecdh); @@ -3339,7 +3354,7 @@ static void smp_del_chan(struct l2cap_chan *chan) { struct smp_dev *smp; - BT_DBG("chan %p", chan); + bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan); smp = chan->data; if (smp) { @@ -3382,7 +3397,7 @@ int smp_register(struct hci_dev *hdev) { struct l2cap_chan *chan; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, ""); /* If the controller does not support Low Energy operation, then * there is also no need to register any SMP channel. diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 58bcb8c849d5..a5d72c48fb66 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> +#include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> @@ -10,20 +11,86 @@ #include <net/bpf_sk_storage.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/net_namespace.h> #include <linux/error-injection.h> #include <linux/smp.h> +#include <linux/sock_diag.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> +struct bpf_test_timer { + enum { NO_PREEMPT, NO_MIGRATE } mode; + u32 i; + u64 time_start, time_spent; +}; + +static void bpf_test_timer_enter(struct bpf_test_timer *t) + __acquires(rcu) +{ + rcu_read_lock(); + if (t->mode == NO_PREEMPT) + preempt_disable(); + else + migrate_disable(); + + t->time_start = ktime_get_ns(); +} + +static void bpf_test_timer_leave(struct bpf_test_timer *t) + __releases(rcu) +{ + t->time_start = 0; + + if (t->mode == NO_PREEMPT) + preempt_enable(); + else + migrate_enable(); + rcu_read_unlock(); +} + +static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) + __must_hold(rcu) +{ + t->i++; + if (t->i >= repeat) { + /* We're done. */ + t->time_spent += ktime_get_ns() - t->time_start; + do_div(t->time_spent, t->i); + *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; + *err = 0; + goto reset; + } + + if (signal_pending(current)) { + /* During iteration: we've been cancelled, abort. */ + *err = -EINTR; + goto reset; + } + + if (need_resched()) { + /* During iteration: we need to reschedule between runs. */ + t->time_spent += ktime_get_ns() - t->time_start; + bpf_test_timer_leave(t); + cond_resched(); + bpf_test_timer_enter(t); + } + + /* Do another round. */ + return true; + +reset: + t->i = 0; + return false; +} + static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; - u64 time_start, time_spent = 0; - int ret = 0; - u32 i; + int ret; for_each_cgroup_storage_type(stype) { storage[stype] = bpf_cgroup_storage_alloc(prog, stype); @@ -38,40 +105,20 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, if (!repeat) repeat = 1; - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { - bpf_cgroup_storage_set(storage); + bpf_test_timer_enter(&t); + do { + ret = bpf_cgroup_storage_set(storage); + if (ret) + break; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - cond_resched(); - - rcu_read_lock(); - migrate_disable(); - time_start = ktime_get_ns(); - } - } - time_spent += ktime_get_ns() - time_start; - migrate_enable(); - rcu_read_unlock(); - - do_div(time_spent, repeat); - *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + bpf_cgroup_storage_unset(); + } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); @@ -167,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b) *b += 1; return a + *b; } + +u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +{ + return a + b + c + d; +} + +int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b) +{ + return a + b; +} + +struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) +{ + return sk; +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET_START(test_sk_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test1) +BTF_ID(func, bpf_kfunc_call_test2) +BTF_ID(func, bpf_kfunc_call_test3) +BTF_SET_END(test_sk_kfunc_ids) + +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { @@ -674,18 +748,17 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; - u64 time_start, time_spent = 0; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; - u32 i; if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; @@ -721,48 +794,127 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, ctx.data = data; ctx.data_end = (__u8 *)data + size; - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - for (i = 0; i < repeat; i++) { + bpf_test_timer_enter(&t); + do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); - if (signal_pending(current)) { - preempt_enable(); - rcu_read_unlock(); + if (ret < 0) + goto out; - ret = -EINTR; - goto out; - } + ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), + retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); - if (need_resched()) { - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); +out: + kfree(user_ctx); + kfree(data); + return ret; +} - cond_resched(); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; - rcu_read_lock(); - preempt_disable(); - time_start = ktime_get_ns(); - } + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; } - time_spent += ktime_get_ns() - time_start; - preempt_enable(); - rcu_read_unlock(); - do_div(time_spent, repeat); - duration = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; - ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), - retval, duration); + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); if (!ret) - ret = bpf_ctx_finish(kattr, uattr, user_ctx, - sizeof(struct bpf_flow_keys)); + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: + bpf_prog_array_free(progs); kfree(user_ctx); - kfree(data); return ret; } diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index dfec65eca8a6..3db1def4437b 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -160,7 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (p && (p->flags & BR_NEIGH_SUPPRESS)) return; - if (ipv4_is_zeronet(sip) || sip == tip) { + if (parp->ar_op != htons(ARPOP_RREQUEST) && + parp->ar_op != htons(ARPOP_RREPLY) && + (ipv4_is_zeronet(sip) || sip == tip)) { /* prevent flooding to neigh suppress ports */ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1; return; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 3f2f06b4dd27..e8b626cc6bfd 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -385,6 +385,54 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) return br_del_if(br, slave_dev); } +static int br_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct net_bridge_fdb_entry *f; + struct net_bridge_port *dst; + struct net_bridge *br; + + if (netif_is_bridge_port(ctx->dev)) + return -1; + + br = netdev_priv(ctx->dev); + + br_vlan_fill_forward_path_pvid(br, ctx, path); + + f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id); + if (!f || !f->dst) + return -1; + + dst = READ_ONCE(f->dst); + if (!dst) + return -1; + + if (br_vlan_fill_forward_path_mode(br, dst, path)) + return -1; + + path->type = DEV_PATH_BRIDGE; + path->dev = dst->br->dev; + ctx->dev = dst->dev; + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_TAG: + if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) + return -ENOSPC; + ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id; + ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto; + ctx->num_vlans++; + break; + case DEV_PATH_BR_VLAN_UNTAG_HW: + case DEV_PATH_BR_VLAN_UNTAG: + ctx->num_vlans--; + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + + return 0; +} + static const struct ethtool_ops br_ethtool_ops = { .get_drvinfo = br_getinfo, .get_link = ethtool_op_get_link, @@ -419,6 +467,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, .ndo_features_check = passthru_features_check, + .ndo_fill_forward_path = br_fill_forward_path, }; static struct device_type br_type = { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b7490237f3fc..698b79747d32 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -726,6 +726,56 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } +static int br_fdb_replay_one(struct notifier_block *nb, + struct net_bridge_fdb_entry *fdb, + struct net_device *dev) +{ + struct switchdev_notifier_fdb_info item; + int err; + + item.addr = fdb->key.addr.addr; + item.vid = fdb->key.vlan_id; + item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); + item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); + item.info.dev = dev; + + err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); + return notifier_to_errno(err); +} + +int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb) +{ + struct net_bridge_fdb_entry *fdb; + struct net_bridge *br; + int err = 0; + + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { + struct net_bridge_port *dst = READ_ONCE(fdb->dst); + struct net_device *dst_dev; + + dst_dev = dst ? dst->dev : br->dev; + if (dst_dev != br_dev && dst_dev != dev) + continue; + + err = br_fdb_replay_one(nb, fdb, dst_dev); + if (err) + break; + } + + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(br_fdb_replay); + static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, bool swdev_notify) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 222285d9dae2..8875e953ac53 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -144,6 +144,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); + break; default: break; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 8846c5bcd075..95fa4af0e8dd 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -506,6 +506,134 @@ err: kfree(priv); } +static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, + const struct net_bridge_mdb_entry *mp) +{ + if (mp->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); +#if IS_ENABLED(CONFIG_IPV6) + else if (mp->addr.proto == htons(ETH_P_IPV6)) + ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); +#endif + else + ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); + + mdb->vid = mp->addr.vid; +} + +static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, + struct switchdev_obj_port_mdb *mdb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .obj = &mdb->obj, + }; + int err; + + err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + return notifier_to_errno(err); +} + +static int br_mdb_queue_one(struct list_head *mdb_list, + enum switchdev_obj_id id, + const struct net_bridge_mdb_entry *mp, + struct net_device *orig_dev) +{ + struct switchdev_obj_port_mdb *mdb; + + mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); + if (!mdb) + return -ENOMEM; + + mdb->obj.id = id; + mdb->obj.orig_dev = orig_dev; + br_switchdev_mdb_populate(mdb, mp); + list_add_tail(&mdb->obj.list, mdb_list); + + return 0; +} + +int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack) +{ + struct net_bridge_mdb_entry *mp; + struct switchdev_obj *obj, *tmp; + struct net_bridge *br; + LIST_HEAD(mdb_list); + int err = 0; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + return 0; + + /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, + * because the write-side protection is br->multicast_lock. But we + * need to emulate the [ blocking ] calling context of a regular + * switchdev event, so since both br->multicast_lock and RCU read side + * critical sections are atomic, we have no choice but to pick the RCU + * read side lock, queue up all our events, leave the critical section + * and notify switchdev from blocking context. + */ + rcu_read_lock(); + + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + if (mp->host_joined) { + err = br_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_HOST_MDB, + mp, br_dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + if (p->key.port->dev != dev) + continue; + + err = br_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_PORT_MDB, + mp, dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + } + + rcu_read_unlock(); + + list_for_each_entry(obj, &mdb_list, list) { + err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), + extack); + if (err) + goto out_free_mdb; + } + +out_free_mdb: + list_for_each_entry_safe(obj, tmp, &mdb_list, list) { + list_del(&obj->list); + kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); + } + + return err; +} +EXPORT_SYMBOL_GPL(br_mdb_replay); + static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, @@ -515,18 +643,12 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, + .orig_dev = dev, }, - .vid = mp->addr.vid, }; - if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); -#if IS_ENABLED(CONFIG_IPV6) - else - ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); -#endif + br_switchdev_mdb_populate(&mdb, mp); - mdb.obj.orig_dev = dev; switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); @@ -558,21 +680,13 @@ void br_mdb_notify(struct net_device *dev, .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, - .vid = mp->addr.vid, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (pg) { - if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); -#if IS_ENABLED(CONFIG_IPV6) - else if (mp->addr.proto == htons(ETH_P_IPV6)) - ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); -#endif - else - ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr); + br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index 12487f6fe9b4..cd2b1e424e54 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -411,6 +411,13 @@ static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) cancel_delayed_work_sync(&mrp->in_test_work); br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0); + /* Disable the roles */ + br_mrp_switchdev_set_ring_role(br, mrp, BR_MRP_RING_ROLE_DISABLED); + p = rtnl_dereference(mrp->i_port); + if (p) + br_mrp_switchdev_set_in_role(br, mrp, mrp->in_id, mrp->ring_id, + BR_MRP_IN_ROLE_DISABLED); + br_mrp_switchdev_del(br, mrp); /* Reset the ports */ diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9d265447d654..2883601d5c8b 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1593,7 +1593,8 @@ out: spin_unlock(&br->multicast_lock); } -static void br_mc_disabled_update(struct net_device *dev, bool value) +static int br_mc_disabled_update(struct net_device *dev, bool value, + struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = dev, @@ -1602,11 +1603,13 @@ static void br_mc_disabled_update(struct net_device *dev, bool value) .u.mc_disabled = !value, }; - switchdev_port_attr_set(dev, &attr, NULL); + return switchdev_port_attr_set(dev, &attr, extack); } int br_multicast_add_port(struct net_bridge_port *port) { + int err; + port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; @@ -1618,8 +1621,12 @@ int br_multicast_add_port(struct net_bridge_port *port) timer_setup(&port->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif - br_mc_disabled_update(port->dev, - br_opt_get(port->br, BROPT_MULTICAST_ENABLED)); + err = br_mc_disabled_update(port->dev, + br_opt_get(port->br, + BROPT_MULTICAST_ENABLED), + NULL); + if (err && err != -EOPNOTSUPP) + return err; port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats); if (!port->mcast_stats) @@ -3560,16 +3567,23 @@ static void br_multicast_start_querier(struct net_bridge *br, rcu_read_unlock(); } -int br_multicast_toggle(struct net_bridge *br, unsigned long val) +int br_multicast_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { struct net_bridge_port *port; bool change_snoopers = false; + int err = 0; spin_lock_bh(&br->multicast_lock); if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val) goto unlock; - br_mc_disabled_update(br->dev, val); + err = br_mc_disabled_update(br->dev, val, extack); + if (err == -EOPNOTSUPP) + err = 0; + if (err) + goto unlock; + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; @@ -3607,7 +3621,7 @@ unlock: br_multicast_leave_snoopers(br); } - return 0; + return err; } bool br_multicast_enabled(const struct net_device *dev) diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index fea38b9a7268..13290a749d09 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -498,11 +498,13 @@ static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, &set_h->h_addr); } -static void __eht_allow_incl(struct net_bridge_port_group *pg, - union net_bridge_eht_addr *h_addr, - void *srcs, - u32 nsrcs, - size_t addr_size) +/* create new set entries from reports */ +static void __eht_create_set_entries(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size, + int filter_mode) { union net_bridge_eht_addr eht_src_addr; u32 src_idx; @@ -511,72 +513,17 @@ static void __eht_allow_incl(struct net_bridge_port_group *pg, for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - MCAST_INCLUDE, + filter_mode, false); } } -static bool __eht_allow_excl(struct net_bridge_port_group *pg, - union net_bridge_eht_addr *h_addr, - void *srcs, - u32 nsrcs, - size_t addr_size) -{ - bool changed = false, host_excl = false; - union net_bridge_eht_addr eht_src_addr; - struct net_bridge_group_src *src_ent; - struct br_ip src_ip; - u32 src_idx; - - host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE); - memset(&eht_src_addr, 0, sizeof(eht_src_addr)); - for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - if (!host_excl) { - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - MCAST_INCLUDE, - false); - } else { - if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, - h_addr)) - continue; - memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); - src_ent = br_multicast_find_group_src(pg, &src_ip); - if (!src_ent) - continue; - br_multicast_del_group_src(src_ent, true); - changed = true; - } - } - - return changed; -} - -static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, - union net_bridge_eht_addr *h_addr, - void *srcs, - u32 nsrcs, - size_t addr_size) -{ - bool changed = false; - - switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { - case MCAST_INCLUDE: - __eht_allow_incl(pg, h_addr, srcs, nsrcs, addr_size); - break; - case MCAST_EXCLUDE: - changed = __eht_allow_excl(pg, h_addr, srcs, nsrcs, addr_size); - break; - } - - return changed; -} - -static bool __eht_block_incl(struct net_bridge_port_group *pg, - union net_bridge_eht_addr *h_addr, - void *srcs, - u32 nsrcs, - size_t addr_size) +/* delete existing set entries and their (S,G) entries if they were the last */ +static bool __eht_del_set_entries(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) { union net_bridge_eht_addr eht_src_addr; struct net_bridge_group_src *src_ent; @@ -602,39 +549,23 @@ static bool __eht_block_incl(struct net_bridge_port_group *pg, return changed; } -static bool __eht_block_excl(struct net_bridge_port_group *pg, - union net_bridge_eht_addr *h_addr, - void *srcs, - u32 nsrcs, - size_t addr_size) +static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, + union net_bridge_eht_addr *h_addr, + void *srcs, + u32 nsrcs, + size_t addr_size) { - bool changed = false, host_excl = false; - union net_bridge_eht_addr eht_src_addr; - struct net_bridge_group_src *src_ent; - struct br_ip src_ip; - u32 src_idx; + bool changed = false; - host_excl = !!(br_multicast_eht_host_filter_mode(pg, h_addr) == MCAST_EXCLUDE); - memset(&eht_src_addr, 0, sizeof(eht_src_addr)); - memset(&src_ip, 0, sizeof(src_ip)); - src_ip.proto = pg->key.addr.proto; - for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - if (host_excl) { - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - MCAST_EXCLUDE, - false); - } else { - if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, - h_addr)) - continue; - memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size); - src_ent = br_multicast_find_group_src(pg, &src_ip); - if (!src_ent) - continue; - br_multicast_del_group_src(src_ent, true); - changed = true; - } + switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { + case MCAST_INCLUDE: + __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + MCAST_INCLUDE); + break; + case MCAST_EXCLUDE: + changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, + addr_size); + break; } return changed; @@ -650,10 +581,12 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: - changed = __eht_block_incl(pg, h_addr, srcs, nsrcs, addr_size); + changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, + addr_size); break; case MCAST_EXCLUDE: - changed = __eht_block_excl(pg, h_addr, srcs, nsrcs, addr_size); + __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + MCAST_EXCLUDE); break; } @@ -671,7 +604,6 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, { bool changed = false, flush_entries = to_report; union net_bridge_eht_addr eht_src_addr; - u32 src_idx; if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode) flush_entries = true; @@ -680,11 +612,8 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); - for (src_idx = 0; src_idx < nsrcs; src_idx++) { - memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - filter_mode, false); - } + __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { struct net_bridge *br = pg->key.port->br; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f2b1343f8332..0456593aceec 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1293,7 +1293,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_SNOOPING]) { u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]); - br_multicast_toggle(br, mcast_snooping); + err = br_multicast_toggle(br, mcast_snooping, extack); + if (err) + return err; } if (data[IFLA_BR_MCAST_QUERY_USE_IFADDR]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d7d167e10b70..7ce8a77cc6b6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -810,7 +810,8 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, bool local_rcv, bool local_orig); int br_multicast_set_router(struct net_bridge *br, unsigned long val); int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); -int br_multicast_toggle(struct net_bridge *br, unsigned long val); +int br_multicast_toggle(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); int br_multicast_set_querier(struct net_bridge *br, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); @@ -1118,6 +1119,13 @@ void br_vlan_notify(const struct net_bridge *br, bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); +void br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path); +int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path); + static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { @@ -1277,6 +1285,19 @@ static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, return 0; } +static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ +} + +static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path) +{ + return 0; +} + static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 21c6781906aa..3dafb6143cff 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -64,6 +64,20 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) } } +u8 br_port_get_stp_state(const struct net_device *dev) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(dev); + if (!p) + return BR_STATE_DISABLED; + + return p->state; +} +EXPORT_SYMBOL_GPL(br_port_get_stp_state); + /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { @@ -625,6 +639,19 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) return 0; } +clock_t br_get_ageing_time(struct net_device *br_dev) +{ + struct net_bridge *br; + + if (!netif_is_bridge_master(br_dev)) + return 0; + + br = netdev_priv(br_dev); + + return jiffies_to_clock_t(br->ageing_time); +} +EXPORT_SYMBOL_GPL(br_get_ageing_time); + /* called under bridge lock */ void __br_set_topology_change(struct net_bridge *br, unsigned char val) { diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 1e24d9a2c9a7..a5e601e41cb9 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -107,48 +107,28 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; } -static void -br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, - u16 vid, struct net_device *dev, - bool added_by_user, bool offloaded) -{ - struct switchdev_notifier_fdb_info info; - unsigned long notifier_type; - - info.addr = mac; - info.vid = vid; - info.added_by_user = added_by_user; - info.offloaded = offloaded; - notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; - call_switchdev_notifiers(notifier_type, dev, &info.info, NULL); -} - void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { + struct switchdev_notifier_fdb_info info = { + .addr = fdb->key.addr.addr, + .vid = fdb->key.vlan_id, + .added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags), + .is_local = test_bit(BR_FDB_LOCAL, &fdb->flags), + .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), + }; + if (!fdb->dst) return; - if (test_bit(BR_FDB_LOCAL, &fdb->flags)) - return; switch (type) { case RTM_DELNEIGH: - br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, - fdb->key.vlan_id, - fdb->dst->dev, - test_bit(BR_FDB_ADDED_BY_USER, - &fdb->flags), - test_bit(BR_FDB_OFFLOADED, - &fdb->flags)); + call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, + fdb->dst->dev, &info.info, NULL); break; case RTM_NEWNEIGH: - br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, - fdb->key.vlan_id, - fdb->dst->dev, - test_bit(BR_FDB_ADDED_BY_USER, - &fdb->flags), - test_bit(BR_FDB_OFFLOADED, - &fdb->flags)); + call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, + fdb->dst->dev, &info.info, NULL); break; } } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 072e29840082..381467b691d5 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -409,17 +409,11 @@ static ssize_t multicast_snooping_show(struct device *d, return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } -static int toggle_multicast(struct net_bridge *br, unsigned long val, - struct netlink_ext_ack *extack) -{ - return br_multicast_toggle(br, val); -} - static ssize_t multicast_snooping_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, toggle_multicast); + return store_bridge_parm(d, buf, len, br_multicast_toggle); } static DEVICE_ATTR_RW(multicast_snooping); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 8829f621b8ec..da3256a3eed0 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1339,6 +1339,61 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) } EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); +void br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct net_bridge_vlan_group *vg; + int idx = ctx->num_vlans - 1; + u16 vid; + + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; + + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) + return; + + vg = br_vlan_group(br); + + if (idx >= 0 && + ctx->vlan[idx].proto == br->vlan_proto) { + vid = ctx->vlan[idx].id; + } else { + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; + vid = br_get_pvid(vg); + } + + path->bridge.vlan_id = vid; + path->bridge.vlan_proto = br->vlan_proto; +} + +int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) + return 0; + + vg = nbp_vlan_group_rcu(dst); + v = br_vlan_find(vg, path->bridge.vlan_id); + if (!v || !br_vlan_should_use(v)) + return -EINVAL; + + if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) + return 0; + + if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; + else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; + else + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; + + return 0; +} + int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { @@ -1751,6 +1806,79 @@ out_kfree: kfree_skb(skb); } +static int br_vlan_replay_one(struct notifier_block *nb, + struct net_device *dev, + struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .obj = &vlan->obj, + }; + int err; + + err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + return notifier_to_errno(err); +} + +int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + struct net_bridge *br; + int err = 0; + u16 pvid; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(br_dev)) + return -EINVAL; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + p = NULL; + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group(p); + br = p->br; + } + + if (!vg) + return 0; + + pvid = br_get_pvid(vg); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct switchdev_obj_port_vlan vlan = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = br_vlan_flags(v, pvid), + .vid = v->vid, + }; + + if (!br_vlan_should_use(v)) + continue; + + err = br_vlan_replay_one(nb, dev, &vlan, extack); + if (err) + return err; + } + + return err; +} +EXPORT_SYMBOL_GPL(br_vlan_replay); + /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 169e005fbda2..0d3a8c01552e 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -35,7 +35,7 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = { }; static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, - u64 tunnel_id) + __be64 tunnel_id) { return rhashtable_lookup_fast(tbl, &tunnel_id, br_vlan_tunnel_rht_params); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index ac5372121e60..7f304a19ac1b 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -23,10 +23,6 @@ config NFT_BRIDGE_REJECT help Add support to reject packets. -config NF_LOG_BRIDGE - tristate "Bridge packet logging" - select NF_LOG_COMMON - endif # NF_TABLES_BRIDGE config NF_CONNTRACK_BRIDGE diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 8e2c5759d964..1c9ce49ab651 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -9,9 +9,6 @@ obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # connection tracking obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o -# packet logging -obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o - obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o # tables diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d481ff24a150..96d789c8d1c7 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -24,6 +24,7 @@ #include <linux/cpumask.h> #include <linux/audit.h> #include <net/sock.h> +#include <net/netns/generic.h> /* needed for logical [in,out]-dev filtering */ #include "../br_private.h" @@ -39,8 +40,11 @@ #define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \ COUNTER_OFFSET(n) * cpu)) +struct ebt_pernet { + struct list_head tables; +}; - +static unsigned int ebt_pernet_id __read_mostly; static DEFINE_MUTEX(ebt_mutex); #ifdef CONFIG_COMPAT @@ -336,7 +340,9 @@ static inline struct ebt_table * find_table_lock(struct net *net, const char *name, int *error, struct mutex *mutex) { - return find_inlist_lock(&net->xt.tables[NFPROTO_BRIDGE], name, + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + return find_inlist_lock(&ebt_net->tables, name, "ebtable_", error, mutex); } @@ -1136,6 +1142,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) int ebt_register_table(struct net *net, const struct ebt_table *input_table, const struct nf_hook_ops *ops, struct ebt_table **res) { + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table_info *newinfo; struct ebt_table *t, *table; struct ebt_replace_kernel *repl; @@ -1194,7 +1201,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, table->private = newinfo; rwlock_init(&table->lock); mutex_lock(&ebt_mutex); - list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { + list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto free_unlock; @@ -1206,7 +1213,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, ret = -ENOENT; goto free_unlock; } - list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); + list_add(&table->list, &ebt_net->tables); mutex_unlock(&ebt_mutex); WRITE_ONCE(*res, table); @@ -1234,11 +1241,12 @@ out: static struct ebt_table *__ebt_find_table(struct net *net, const char *name) { + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table *t; mutex_lock(&ebt_mutex); - list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) { + list_for_each_entry(t, &ebt_net->tables, list) { if (strcmp(t->name, name) == 0) { mutex_unlock(&ebt_mutex); return t; @@ -2436,6 +2444,20 @@ static struct nf_sockopt_ops ebt_sockopts = { .owner = THIS_MODULE, }; +static int __net_init ebt_pernet_init(struct net *net) +{ + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + + INIT_LIST_HEAD(&ebt_net->tables); + return 0; +} + +static struct pernet_operations ebt_net_ops = { + .init = ebt_pernet_init, + .id = &ebt_pernet_id, + .size = sizeof(struct ebt_pernet), +}; + static int __init ebtables_init(void) { int ret; @@ -2449,13 +2471,21 @@ static int __init ebtables_init(void) return ret; } + ret = register_pernet_subsys(&ebt_net_ops); + if (ret < 0) { + nf_unregister_sockopt(&ebt_sockopts); + xt_unregister_target(&ebt_standard_target); + return ret; + } + return 0; } -static void __exit ebtables_fini(void) +static void ebtables_fini(void) { nf_unregister_sockopt(&ebt_sockopts); xt_unregister_target(&ebt_standard_target); + unregister_pernet_subsys(&ebt_net_ops); } EXPORT_SYMBOL(ebt_register_table); diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c deleted file mode 100644 index 1ad61d1017b6..000000000000 --- a/net/bridge/netfilter/nf_log_bridge.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_bridge.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_log.h> - -static void nf_log_bridge_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb, - in, out, loginfo, prefix); -} - -static struct nf_logger nf_bridge_logger __read_mostly = { - .name = "nf_log_bridge", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_bridge_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_bridge_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); -} - -static void __net_exit nf_log_bridge_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_bridge_logger); -} - -static struct pernet_operations nf_log_bridge_net_ops = { - .init = nf_log_bridge_net_init, - .exit = nf_log_bridge_net_exit, -}; - -static int __init nf_log_bridge_init(void) -{ - int ret; - - /* Request to load the real packet loggers. */ - nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); - - ret = register_pernet_subsys(&nf_log_bridge_net_ops); - if (ret < 0) - return ret; - - nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); - return 0; -} - -static void __exit nf_log_bridge_exit(void) -{ - unregister_pernet_subsys(&nf_log_bridge_net_ops); - nf_log_unregister(&nf_bridge_logger); -} - -module_init(nf_log_bridge_init); -module_exit(nf_log_bridge_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter bridge packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 2b1dd252f231..c959320c4775 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1069,7 +1069,7 @@ again: /* * Do not return the error but go back to waiting. We - * have the inital workspace and the CRUSH computation + * have the initial workspace and the CRUSH computation * time is bounded so we will get it eventually. */ WARN_ON(atomic_read(&wsm->total_ws) < 1); diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..1a6168d8f23b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,14 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SELFTESTS) += selftests.o +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) diff --git a/net/core/dev.c b/net/core/dev.c index 15fe36332fb8..222b1d322c96 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = daddr, + }; + struct net_device_path *path; + int ret = 0; + + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -2463,16 +2509,14 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS -struct static_key xps_needed __read_mostly; -EXPORT_SYMBOL(xps_needed); -struct static_key xps_rxqs_needed __read_mostly; -EXPORT_SYMBOL(xps_rxqs_needed); +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, - int tci, u16 index) + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; @@ -2491,6 +2535,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; @@ -2503,7 +2549,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int num_tc = dev->num_tc ? : 1; + int num_tc = dev_maps->num_tc; bool active = false; int tci; @@ -2511,7 +2557,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, int i, j; for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, tci, j)) + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } @@ -2523,74 +2569,54 @@ static bool remove_xps_queue_cpu(struct net_device *dev, static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, - bool is_rxqs_map) + enum xps_map_type type) { - if (is_rxqs_map) { - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - } static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, - struct xps_dev_maps *dev_maps, unsigned int nr_ids, - u16 offset, u16 count, bool is_rxqs_map) +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) { + struct xps_dev_maps *dev_maps; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), - j < nr_ids;) - active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, - count); + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); - if (!is_rxqs_map) { - for (i = offset + (count - 1); count--; i--) { + if (type == XPS_CPUS) { + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - const unsigned long *possible_mask = NULL; - struct xps_dev_maps *dev_maps; - unsigned int nr_ids; - if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); - if (static_key_false(&xps_rxqs_needed)) { - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, - offset, count, true); - } - } - - dev_maps = xmap_dereference(dev->xps_cpus_map); - if (!dev_maps) - goto out_no_maps; + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); - if (num_possible_cpus() > 1) - possible_mask = cpumask_bits(cpu_possible_mask); - nr_ids = nr_cpu_ids; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, - false); + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2640,16 +2666,35 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) +{ + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { - const unsigned long *online_mask = NULL, *possible_mask = NULL; - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; + const unsigned long *online_mask = NULL; + bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - bool active = false; unsigned int nr_ids; if (dev->num_tc) { @@ -2667,38 +2712,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } mutex_lock(&xps_map_mutex); - if (is_rxqs_map) { + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); - dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); - if (num_possible_cpus() > 1) { + if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - possible_mask = cpumask_bits(cpu_possible_mask); - } - dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. + */ + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) + copy = true; + /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->nr_ids = nr_ids; + new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : - NULL; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, j, index, is_rxqs_map); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; @@ -2711,29 +2766,21 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) + if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + for (j = 0; j < nr_ids; j++) { + bool skip_tc = false; - /* We need to explicitly update tci as prevous loop - * could break out early if dev_maps is NULL. - */ tci = j * num_tc + tc; - if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; + skip_tc = true; + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2741,78 +2788,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (!is_rxqs_map) { + if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) numa_node_id = -1; } #endif - } else if (dev_maps) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } - /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } - if (is_rxqs_map) - rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); - else - rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) goto out_no_old_maps; - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - for (i = num_tc, tci = j * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + for (j = 0; j < dev_maps->nr_ids; j++) { + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); - if (map && map != new_map) - kfree_rcu(map, rcu); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); + kfree_rcu(map, rcu); } } - kfree_rcu(dev_maps, rcu); + old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; active = true; out_no_new_maps: - if (!is_rxqs_map) { + if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); - } if (!dev_maps) goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { - for (i = tc, tci = j * num_tc; i--; tci++) - active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, nr_ids) || - !netif_attr_test_online(j, online_mask, nr_ids)) - active |= remove_xps_queue(dev_maps, tci, index); - for (i = num_tc - tc, tci++; --i; tci++) - active |= remove_xps_queue(dev_maps, tci, index); + for (j = 0; j < dev_maps->nr_ids; j++) { + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; + + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); + } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2820,11 +2870,10 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); - map = dev_maps ? + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) @@ -2845,7 +2894,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, int ret; cpus_read_lock(); - ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; @@ -3956,13 +4005,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { + int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { @@ -3993,18 +4044,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); - if (tci >= 0 && tci < dev->num_rx_queues) + if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; @@ -4672,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4722,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4749,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } @@ -5284,6 +5339,7 @@ skip_classify: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: @@ -5876,15 +5932,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -static struct list_head *gro_list_prepare(struct napi_struct *napi, - struct sk_buff *skb) +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); - struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5910,8 +5964,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } - - return head; } static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) @@ -5975,11 +6027,11 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -5988,7 +6040,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_head = gro_list_prepare(napi, skb); + gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -6024,7 +6076,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, - gro_head, skb); + &gro_list->list, skb); break; } rcu_read_unlock(); @@ -6043,7 +6095,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[hash].count--; + gro_list->count--; } if (same_flow) @@ -6052,16 +6104,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi, gro_head); - } else { - napi->gro_hash[hash].count++; - } + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, gro_head); + list_add(&skb->list, &gro_list->list); ret = GRO_HELD; pull: @@ -6069,11 +6121,11 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[hash].count) { - if (!test_bit(hash, &napi->gro_bitmask)) - __set_bit(hash, &napi->gro_bitmask); - } else if (test_bit(hash, &napi->gro_bitmask)) { - __clear_bit(hash, &napi->gro_bitmask); + if (gro_list->count) { + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); } return ret; @@ -6790,6 +6842,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) return err; } +EXPORT_SYMBOL(dev_set_threaded); void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) @@ -10338,14 +10391,20 @@ EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** @@ -10370,7 +10429,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + while (refcnt != 1) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -10407,7 +10466,9 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt != 1 && + time_after(jiffies, warning_time + + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; @@ -10483,7 +10544,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); @@ -10700,9 +10761,14 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; + dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -10766,8 +10832,10 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: +#endif netdev_freemem(dev); return NULL; } @@ -10809,8 +10877,10 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; @@ -10998,11 +11068,13 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); /** - * dev_change_net_namespace - move device to different nethost namespace + * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -11011,10 +11083,11 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); - int err, new_nsid, new_ifindex; + int err, new_nsid; ASSERT_RTNL(); @@ -11045,6 +11118,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; } + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + /* * And now a mini version of register_netdevice unregister_netdevice. */ @@ -11072,10 +11150,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11128,7 +11208,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index fa1c37ec40c9..45ae6eeb2964 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -228,7 +228,7 @@ EXPORT_SYMBOL(__hw_addr_unsync); * @sync: function to call if address should be added * @unsync: function to call if address should be removed * - * This funciton is intended to be called from the ndo_set_rx_mode + * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without @@ -723,7 +723,7 @@ void dev_uc_flush(struct net_device *dev) EXPORT_SYMBOL(dev_uc_flush); /** - * dev_uc_flush - Init unicast address list + * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index db65ce62b625..ead2a8aa57b4 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1754,7 +1754,7 @@ static void exit_net_drop_monitor(void) /* * Because of the module_get/put we do in the trace state change path - * we are guarnateed not to have any current users when we get here + * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { diff --git a/net/core/filter.c b/net/core/filter.c index 9323d34d34cc..cae56d08a670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3412,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3448,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3466,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); @@ -3577,7 +3583,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3747,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3810,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -3919,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3944,71 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - return NULL; - } -} - -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4017,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; - - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst = fwd; + int err; - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4059,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; - - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4094,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } @@ -4113,28 +4076,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. - */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { @@ -4787,6 +4729,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; default: ret = -EINVAL; } @@ -4956,6 +4901,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_BINDTOIFINDEX: *((int *)optval) = sk->sk_bound_dev_if; break; + case SO_REUSEPORT: + *((int *)optval) = sk->sk_reuseport; + break; default: goto err_clear; } @@ -9663,22 +9611,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, @@ -9847,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10457,6 +10424,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a96a4f5de0ce..3ed7c98a98e1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen) + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type) */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen) + const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; @@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci); */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; @@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen, - int lse_index, bool *entropy_label) + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; @@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, const void *data, + int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { @@ -523,7 +524,7 @@ static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, - void *target_container, void *data, + void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { @@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, - void *data, __be16 *p_proto, int *p_nhoff, int hlen, - unsigned int flags) + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; @@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb, static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; @@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, - u8 ip_proto, int hlen) + void *target_container, const void *data, + int nhoff, u8 ip_proto, int hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; @@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct iphdr *iph) + void *target_container, const void *data, + const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb, static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct ipv6hdr *iph) + void *target_container, const void *data, + const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -828,8 +832,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -908,9 +914,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags) + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -1642,7 +1647,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_get_hash_perturb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index c714e6a9dad4..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -10,9 +10,6 @@ #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -extern struct list_head ptype_all __read_mostly; -extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; - static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 307628fdf380..f6197774048b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1361,83 +1361,94 @@ static const struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static ssize_t xps_cpus_show(struct netdev_queue *queue, - char *buf) +static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, + int tc, char *buf, enum xps_map_type type) { - int cpu, len, ret, num_tc = 1, tc = 0; - struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - cpumask_var_t mask; - unsigned long index; - - if (!netif_is_multiqueue(dev)) - return -ENOENT; + unsigned long *mask; + unsigned int nr_ids; + int j, len; - index = get_netdev_queue_index(queue); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps[type]); - if (!rtnl_trylock()) - return restart_syscall(); + /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 + * when dev_maps hasn't been allocated yet, to be backward compatible. + */ + nr_ids = dev_maps ? dev_maps->nr_ids : + (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); - if (dev->num_tc) { - /* Do not allow XPS on subordinate device directly */ - num_tc = dev->num_tc; - if (num_tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); + if (!mask) { + rcu_read_unlock(); + return -ENOMEM; + } - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + if (!dev_maps || tc >= dev_maps->num_tc) + goto out_no_maps; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } - } + for (j = 0; j < nr_ids; j++) { + int i, tci = j * dev_maps->num_tc + tc; + struct xps_map *map; - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { - ret = -ENOMEM; - goto err_rtnl_unlock; - } + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); - if (dev_maps) { - for_each_possible_cpu(cpu) { - int i, tci = cpu * num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - cpumask_set_cpu(cpu, mask); - break; - } + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; } } } +out_no_maps: rcu_read_unlock(); - rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); + bitmap_free(mask); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); - free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int len, tc; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; -err_rtnl_unlock: + index = get_netdev_queue_index(queue); + + if (!rtnl_trylock()) + return restart_syscall(); + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + rtnl_unlock(); + return -EINVAL; + } + + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); rtnl_unlock(); - return ret; + + len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); + + put_device(&dev->dev); + return len; } static ssize_t xps_cpus_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; - unsigned long index; + unsigned int index; cpumask_var_t mask; int err; @@ -1476,64 +1487,21 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - unsigned long *mask, index; + unsigned int index; + int tc; index = get_netdev_queue_index(queue); if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - num_tc = dev->num_tc; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } - } - mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) { - ret = -ENOMEM; - goto err_rtnl_unlock; - } - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); - if (!dev_maps) - goto out_no_maps; - - for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), - j < dev->num_rx_queues;) { - int i, tci = j * num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - set_bit(j, mask); - break; - } - } - } -out_no_maps: - rcu_read_unlock(); - + tc = netdev_txq_to_tc(dev, index); rtnl_unlock(); + if (tc < 0) + return -EINVAL; - len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); - bitmap_free(mask); - - return len < PAGE_SIZE ? len : -EINVAL; - -err_rtnl_unlock: - rtnl_unlock(); - return ret; + return xps_queue_show(dev, index, tc, buf, XPS_RXQS); } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, @@ -1541,7 +1509,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1565,7 +1534,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, } cpus_read_lock(); - err = __netif_set_xps_queue(dev, mask, index, true); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); diff --git a/net/core/netevent.c b/net/core/netevent.c index d76ed7739c70..5bb615e963cc 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -32,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(register_netevent_notifier); /** - * netevent_unregister_notifier - unregister a netevent notifier block + * unregister_netevent_notifier - unregister a netevent notifier block * @nb: notifier * * Unregister a notifier previously registered by diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3485b16a7ff3..714d5fa38546 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1877,6 +1877,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, + [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2603,14 +2604,22 @@ static int do_setlink(const struct sk_buff *skb, return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { - struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); + struct net *net; + int new_ifindex; + + net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - err = dev_change_net_namespace(dev, net, ifname); + if (tb[IFLA_NEW_IFINDEX]) + new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); + else + new_ifindex = 0; + + err = __dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; diff --git a/net/core/scm.c b/net/core/scm.c index 8156d4fb8a39..ae3085d9aae8 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -228,14 +228,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; - struct cmsghdr cmhdr; - - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || - copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - return -EFAULT; + + if (!user_write_access_begin(cm, cmlen)) + goto efault; + + unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); + unsafe_put_user(level, &cm->cmsg_level, efault_end); + unsafe_put_user(type, &cm->cmsg_type, efault_end); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault_end); + user_write_access_end(); } else { struct cmsghdr *cm = msg->msg_control; @@ -249,6 +251,11 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_control += cmlen; msg->msg_controllen -= cmlen; return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; } EXPORT_SYMBOL(put_cmsg); diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000000..ba7b0171974c --- /dev/null +++ b/net/core/selftests.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates. + * stmmac Selftests Support + * + * Author: Jose Abreu <joabreu@synopsys.com> + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de> + */ + +#include <linux/phy.h> +#include <net/selftests.h> +#include <net/tcp.h> +#include <net/udp.h> + +struct net_packet_attrs { + unsigned char *src; + unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +static u8 net_test_next_id; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) + +static struct sk_buff *net_test_get_skb(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = net_test_next_id; + shdr->id = net_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && attr->max_size > skb->len) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, + ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + unsigned char *src = tpriv->packet->src; + unsigned char *dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if (!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 0 : -ETIMEDOUT; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 0 : -EOPNOTSUPP; +} + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + u8 *p = data; + int i; + + for (i = 0; i < net_selftest_get_count(); i++) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, + net_selftests[i].name); + p += ETH_GSTRING_LEN; + } +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c421c8f80925..14010c0eec48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -/* Send skb data on a socket. Socket must be locked. */ -int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, - int len) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendmsg(sock, msg, vec, num, size); +} + +static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendpage(sock, page, offset, size, flags); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size); +typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, sendpage_func sendpage) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -2522,7 +2545,8 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, + sendmsg_unlocked, sk, &msg, &kv, 1, slen); if (ret <= 0) goto error; @@ -2553,9 +2577,11 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, + sendpage_unlocked, sk, + skb_frag_page(frag), + skb_frag_off(frag) + offset, + slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2587,8 +2613,23 @@ out: error: return orig_len == len ? ret : orig_len - len; } + +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, + kernel_sendpage_locked); +} EXPORT_SYMBOL_GPL(skb_send_sock_locked); +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, + sendpage_unlocked); +} + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 5def3a2e85be..43ce17a6a585 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,6 +399,104 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_wait_data); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. + */ + if (copy != sge->length) + return copied; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } + + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { @@ -410,7 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!msg)) return NULL; @@ -498,7 +596,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -512,8 +610,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret; - /* Lock sock to avoid losing sk_socket during loop. */ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -526,10 +623,11 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -553,7 +651,7 @@ start: kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -563,11 +661,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); - if (inet_csk_has_ulp(sk)) { - psock = ERR_PTR(-EINVAL); - goto out; - } - if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; @@ -591,7 +684,9 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock); INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -619,7 +714,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; @@ -630,9 +725,14 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } } -static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -646,23 +746,35 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -static void sk_psock_destroy_deferred(struct work_struct *gc) +void sk_psock_stop(struct sk_psock *psock, bool wait) { - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + if (wait) + cancel_work_sync(&psock->work); +} + +static void sk_psock_done_strp(struct sk_psock *psock); + +static void sk_psock_destroy(struct work_struct *work) +{ + struct sk_psock *psock = container_of(to_rcu_work(work), + struct sk_psock, rwork); /* No sk_callback_lock since already detached. */ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); + mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); if (psock->sk_redir) sock_put(psock->sk_redir); @@ -670,30 +782,21 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -static void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} - void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false); write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - call_rcu(&psock->rcu, sk_psock_destroy); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); @@ -744,27 +847,12 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - bpf_compute_data_end_sk_skb(skb); - return bpf_prog_run_pin_on_cpu(prog, skb); -} - -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -777,14 +865,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -806,12 +900,13 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -823,7 +918,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -835,8 +929,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to @@ -848,8 +941,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: @@ -862,6 +959,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -876,12 +991,13 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) kfree_skb(skb); goto out; } - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb->sk = sk; - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_verdict_apply(psock, skb, ret); @@ -896,15 +1012,15 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -920,16 +1036,59 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + return strp_init(&psock->strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + if (psock->saved_data_ready) + return; + + psock->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + if (!psock->saved_data_ready) + return; + + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.stream_parser) + strp_done(&psock->strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -953,12 +1112,15 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, kfree_skb(skb); goto out; } - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb->sk = sk; - tcp_skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_verdict_apply(psock, skb, ret); @@ -982,82 +1144,21 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock.c b/net/core/sock.c index 5ec90f99e102..c761c4a0b66b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3531,7 +3531,7 @@ int proto_register(struct proto *prot, int alloc_slab) return ret; out_free_timewait_sock_slab: - if (alloc_slab && prot->twsk_prot) + if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..6f1b82b8ad49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,10 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -148,9 +152,11 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) + verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -179,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -221,26 +211,38 @@ out: return psock; } -static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, - struct sock *sk) +static bool sock_map_redirect_allowed(const struct sock *sk); + +static int sock_map_link(struct bpf_map *map, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog *stream_verdict = NULL; + struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; + struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + /* Only sockets we can redirect into/from in BPF need to hold + * refs to parser/verdict progs and have their sk_data_ready + * and sk_write_space callbacks overridden. + */ + if (!sock_map_redirect_allowed(sk)) + goto no_progs; + + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,10 +251,20 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + +no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -261,8 +273,11 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,16 +298,19 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -301,35 +319,17 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: - if (msg_parser) - bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: if (skb_verdict) bpf_prog_put(skb_verdict); - return ret; -} - -static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) -{ - struct sk_psock *psock; - int ret; - - psock = sock_map_psock_get_checked(sk); - if (IS_ERR(psock)) - return PTR_ERR(psock); - - if (!psock) { - psock = sk_psock_init(sk, map->numa_node); - if (IS_ERR(psock)) - return PTR_ERR(psock); - } - - ret = sock_map_init_proto(sk, psock); - if (ret < 0) - sk_psock_put(sk, psock); +out_put_msg_parser: + if (msg_parser) + bpf_prog_put(msg_parser); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -463,8 +463,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { @@ -484,14 +482,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &stab->progs, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; @@ -544,12 +535,15 @@ static bool sk_is_udp(const struct sock *sk) static bool sock_map_redirect_allowed(const struct sock *sk) { - return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; + if (sk_is_tcp(sk)) + return sk->sk_state != TCP_LISTEN; + else + return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) { - return sk_is_tcp(sk) || sk_is_udp(sk); + return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) @@ -657,7 +651,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +660,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -998,14 +990,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &htab->progs, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; @@ -1250,7 +1235,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1244,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1448,8 +1431,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; @@ -1461,10 +1444,19 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; + pprog = &progs->stream_verdict; + break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; pprog = &progs->skb_verdict; break; default: @@ -1529,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); @@ -1539,6 +1531,8 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); + sk_psock_put(sk, psock); release_sock(sk); saved_close(sk, timeout); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4567de519603..c8496c1142c9 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ static int two = 2; static int three = 3; +static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &int_3600, + }, { } }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2455b0c0e486..ffc601a3b329 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -23,14 +23,21 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/secure_seq.h> +#include <net/netns/generic.h> #include "ackvec.h" #include "ccid.h" #include "dccp.h" #include "feat.h" +struct dccp_v4_pernet { + struct sock *v4_ctl_sk; +}; + +static unsigned int dccp_v4_pernet_id __read_mostly; + /* - * The per-net dccp.v4_ctl_sk socket is used for responding to + * The per-net v4_ctl_sk socket is used for responding to * the Out-of-the-blue (OOTB) packets. A control sock will be created * for this socket at the initialization time. */ @@ -513,7 +520,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) struct sk_buff *skb; struct dst_entry *dst; struct net *net = dev_net(skb_dst(rxskb)->dev); - struct sock *ctl_sk = net->dccp.v4_ctl_sk; + struct dccp_v4_pernet *pn; + struct sock *ctl_sk; /* Never send a reset in response to a reset. */ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) @@ -522,6 +530,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) return; + pn = net_generic(net, dccp_v4_pernet_id); + ctl_sk = pn->v4_ctl_sk; dst = dccp_v4_route_skb(net, ctl_sk, rxskb); if (dst == NULL) return; @@ -1005,16 +1015,20 @@ static struct inet_protosw dccp_v4_protosw = { static int __net_init dccp_v4_init_net(struct net *net) { + struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); + if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; - return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET, + return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v4_exit_net(struct net *net) { - inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); + struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); + + inet_ctl_sock_destroy(pn->v4_ctl_sk); } static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) @@ -1026,6 +1040,8 @@ static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, + .id = &dccp_v4_pernet_id, + .size = sizeof(struct dccp_v4_pernet), }; static int __init dccp_v4_init(void) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2be5c69824f9..6f5304db5a67 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -27,13 +27,20 @@ #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/secure_seq.h> +#include <net/netns/generic.h> #include <net/sock.h> #include "dccp.h" #include "ipv6.h" #include "feat.h" -/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */ +struct dccp_v6_pernet { + struct sock *v6_ctl_sk; +}; + +static unsigned int dccp_v6_pernet_id __read_mostly; + +/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; @@ -254,7 +261,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) struct sk_buff *skb; struct flowi6 fl6; struct net *net = dev_net(skb_dst(rxskb)->dev); - struct sock *ctl_sk = net->dccp.v6_ctl_sk; + struct dccp_v6_pernet *pn; + struct sock *ctl_sk; struct dst_entry *dst; if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) @@ -263,6 +271,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (!ipv6_unicast_destination(rxskb)) return; + pn = net_generic(net, dccp_v6_pernet_id); + ctl_sk = pn->v6_ctl_sk; skb = dccp_ctl_make_reset(ctl_sk, rxskb); if (skb == NULL) return; @@ -1089,16 +1099,20 @@ static struct inet_protosw dccp_v6_protosw = { static int __net_init dccp_v6_init_net(struct net *net) { + struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); + if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; - return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6, + return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v6_exit_net(struct net *net) { - inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); + struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); + + inet_ctl_sock_destroy(pn->v6_ctl_sk); } static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) @@ -1110,6 +1124,8 @@ static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, + .id = &dccp_v6_pernet_id, + .size = sizeof(struct dccp_v6_pernet), }; static int __init dccp_v6_init(void) diff --git a/net/decnet/TODO b/net/decnet/TODO deleted file mode 100644 index 358e9eb49016..000000000000 --- a/net/decnet/TODO +++ /dev/null @@ -1,40 +0,0 @@ -Steve's quick list of things that need finishing off: -[they are in no particular order and range from the trivial to the long winded] - - o Proper timeouts on each neighbour (in routing mode) rather than - just the 60 second On-Ethernet cache value. - - o Support for X.25 linklayer - - o Support for DDCMP link layer - - o The DDCMP device itself - - o PPP support (rfc1762) - - o Lots of testing with real applications - - o Verify errors etc. against POSIX 1003.1g (draft) - - o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) - [maybe this should be done at socket level... the control data in the - send/recvmsg() calls should simply be a vector of set/getsockopt() - calls] - - o check MSG_CTRUNC is set where it should be. - - o Find all the commonality between DECnet and IPv4 routing code and extract - it into a small library of routines. [probably a project for 2.7.xx] - - o Add perfect socket hashing - an idea suggested by Paul Koning. Currently - we have a half-way house scheme which seems to work reasonably well, but - the full scheme is still worth implementing, its not not top of my list - right now. - - o Add session control message flow control - - o Add NSP message flow control - - o DECnet sendpages() function - - o AIO for DECnet diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index c97bdca5ec30..1a12912b88d6 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -520,7 +520,7 @@ static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb) fcval = *ptr; /* - * Here we ignore erronous packets which should really + * Here we ignore erroneous packets which should really * should cause a connection abort. It is not critical * for now though. */ diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 2193ae529e75..32b1bed8ae51 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -84,8 +84,7 @@ #include <net/dn_neigh.h> #include <net/dn_fib.h> -struct dn_rt_hash_bucket -{ +struct dn_rt_hash_bucket { struct dn_route __rcu *chain; spinlock_t lock; }; @@ -93,7 +92,7 @@ struct dn_rt_hash_bucket extern struct neigh_table dn_neigh_table; -static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00}; +static unsigned char dn_hiord_addr[6] = {0xAA, 0x00, 0x04, 0x00, 0x00, 0x00}; static const int dn_rt_min_delay = 2 * HZ; static const int dn_rt_max_delay = 10 * HZ; @@ -359,10 +358,11 @@ static void dn_run_flush(struct timer_list *unused) for (i = 0; i < dn_rt_hash_mask; i++) { spin_lock_bh(&dn_rt_hash_table[i].lock); - if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL) + rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL); + if (!rt) goto nothing_to_declare; - for(; rt; rt = next) { + for (; rt; rt = next) { next = rcu_dereference_raw(rt->dn_next); RCU_INIT_POINTER(rt->dn_next, NULL); dst_dev_put(&rt->dst); @@ -425,7 +425,8 @@ static int dn_return_short(struct sk_buff *skb) /* Add back headers */ skb_push(skb, skb->data - skb_network_header(skb)); - if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) return NET_RX_DROP; cb = DN_SKB_CB(skb); @@ -461,7 +462,8 @@ static int dn_return_long(struct sk_buff *skb) /* Add back all headers */ skb_push(skb, skb->data - skb_network_header(skb)); - if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) return NET_RX_DROP; cb = DN_SKB_CB(skb); @@ -505,7 +507,8 @@ static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff * struct dn_skb_cb *cb; int err; - if ((err = dn_route_input(skb)) == 0) + err = dn_route_input(skb); + if (err == 0) return dst_input(skb); cb = DN_SKB_CB(skb); @@ -629,7 +632,8 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type if (dn == NULL) goto dump_it; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) goto out; if (!pskb_may_pull(skb, 3)) @@ -898,7 +902,7 @@ static inline int dn_match_addr(__le16 addr1, __le16 addr2) { __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2); int match = 16; - while(tmp) { + while (tmp) { tmp >>= 1; match--; } @@ -1324,7 +1328,8 @@ static int dn_route_input_slow(struct sk_buff *skb) dev_hold(in_dev); - if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL) + dn_db = rcu_dereference(in_dev->dn_ptr); + if (!dn_db) goto out; /* Zero source addresses are not allowed */ @@ -1383,7 +1388,7 @@ static int dn_route_input_slow(struct sk_buff *skb) fld.saddr = src_map; } - switch(res.type) { + switch (res.type) { case RTN_UNICAST: /* * Forwarding check here, we only check for forwarding @@ -1407,7 +1412,7 @@ static int dn_route_input_slow(struct sk_buff *skb) flags |= RTCF_DOREDIRECT; local_src = DN_FIB_RES_PREFSRC(res); - + break; case RTN_BLACKHOLE: case RTN_UNREACHABLE: break; @@ -1526,7 +1531,7 @@ static int dn_route_input(struct sk_buff *skb) return 0; rcu_read_lock(); - for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; + for (rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; rt = rcu_dereference(rt->dn_next)) { if ((rt->fld.saddr == cb->src) && (rt->fld.daddr == cb->dst) && @@ -1739,13 +1744,13 @@ int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_idx = idx = cb->args[1]; - for(h = 0; h <= dn_rt_hash_mask; h++) { + for (h = 0; h <= dn_rt_hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; rcu_read_lock_bh(); - for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; + for (rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference_bh(rt->dn_next), idx++) { if (idx < s_idx) @@ -1779,7 +1784,7 @@ static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq) struct dn_route *rt = NULL; struct dn_rt_cache_iter_state *s = seq->private; - for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { + for (s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { rcu_read_lock_bh(); rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain); if (rt) @@ -1809,7 +1814,7 @@ static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos) struct dn_route *rt = dn_rt_cache_get_first(seq); if (rt) { - while(*pos && (rt = dn_rt_cache_get_next(seq, rt))) + while (*pos && (rt = dn_rt_cache_get_next(seq, rt))) --*pos; } return *pos ? NULL : rt; @@ -1864,21 +1869,21 @@ void __init dn_route_init(void) goal = totalram_pages() >> (26 - PAGE_SHIFT); - for(order = 0; (1UL << order) < goal; order++) + for (order = 0; (1UL << order) < goal; order++) /* NOTHING */; /* * Only want 1024 entries max, since the table is very, very unlikely * to be larger than that. */ - while(order && ((((1UL << order) * PAGE_SIZE) / + while (order && ((((1UL << order) * PAGE_SIZE) / sizeof(struct dn_rt_hash_bucket)) >= 2048)) order--; do { dn_rt_hash_mask = (1UL << order) * PAGE_SIZE / sizeof(struct dn_rt_hash_bucket); - while(dn_rt_hash_mask & (dn_rt_hash_mask - 1)) + while (dn_rt_hash_mask & (dn_rt_hash_mask - 1)) dn_rt_hash_mask--; dn_rt_hash_table = (struct dn_rt_hash_bucket *) __get_free_pages(GFP_ATOMIC, order); @@ -1893,7 +1898,7 @@ void __init dn_route_init(void) (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024); dn_rt_hash_mask--; - for(i = 0; i <= dn_rt_hash_mask; i++) { + for (i = 0; i <= dn_rt_hash_mask; i++) { spin_lock_init(&dn_rt_hash_table[i].lock); dn_rt_hash_table[i].chain = NULL; } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 58b8fc82cd3c..cbc2bd643ab2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,26 +1,23 @@ # SPDX-License-Identifier: GPL-2.0-only -config HAVE_NET_DSA - def_bool y - depends on INET && NETDEVICES && !S390 - -# Drivers must select NET_DSA and the appropriate tagging format menuconfig NET_DSA tristate "Distributed Switch Architecture" - depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n depends on HSR || HSR=n + depends on INET && NETDEVICES select GRO_CELLS select NET_SWITCHDEV select PHYLINK select NET_DEVLINK + select NET_SELFTESTS help Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. if NET_DSA -# tagging formats +# Drivers must select the appropriate tagging format(s) + config NET_DSA_TAG_8021Q tristate select VLAN_8021Q @@ -48,6 +45,13 @@ config NET_DSA_TAG_BRCM Say Y if you want to enable support for tagging frames for the Broadcom switches which place the tag after the MAC source address. +config NET_DSA_TAG_BRCM_LEGACY + tristate "Tag driver for Broadcom legacy switches using in-frame headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + Broadcom legacy switches which place the tag after the MAC source + address. config NET_DSA_TAG_BRCM_PREPEND tristate "Tag driver for Broadcom switches using prepended headers" diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 3c3e56a1f34d..b71e87909f0e 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -392,7 +392,7 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_USER: - dp->mac = of_get_mac_address(dp->dn); + of_get_mac_address(dp->dn, dp->mac); err = dsa_slave_create(dp); if (err) break; @@ -668,6 +668,30 @@ static const struct devlink_ops dsa_devlink_ops = { .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, }; +static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) +{ + const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; + struct dsa_switch_tree *dst = ds->dst; + int port, err; + + if (tag_ops->proto == dst->default_proto) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_cpu_port(ds, port)) + continue; + + err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + if (err) { + dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", + tag_ops->name, ERR_PTR(err)); + return err; + } + } + + return 0; +} + static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_devlink_priv *dl_priv; @@ -718,6 +742,10 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + err = dsa_switch_setup_tag_protocol(ds); + if (err) + goto teardown; + devlink_params_publish(ds->devlink); if (!ds->slave_mii_bus && ds->ops->phy_read) { @@ -1068,34 +1096,60 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); } -static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) +static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, + const char *user_protocol) { struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; const struct dsa_device_ops *tag_ops; - enum dsa_tag_protocol tag_protocol; + enum dsa_tag_protocol default_proto; + + /* Find out which protocol the switch would prefer. */ + default_proto = dsa_get_tag_protocol(dp, master); + if (dst->default_proto) { + if (dst->default_proto != default_proto) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + return -EINVAL; + } + } else { + dst->default_proto = default_proto; + } + + /* See if the user wants to override that preference. */ + if (user_protocol) { + if (!ds->ops->change_tag_protocol) { + dev_err(ds->dev, "Tag protocol cannot be modified\n"); + return -EINVAL; + } + + tag_ops = dsa_find_tagger_by_name(user_protocol); + } else { + tag_ops = dsa_tag_driver_get(default_proto); + } + + if (IS_ERR(tag_ops)) { + if (PTR_ERR(tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; + + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(tag_ops); + } - tag_protocol = dsa_get_tag_protocol(dp, master); if (dst->tag_ops) { - if (dst->tag_ops->proto != tag_protocol) { + if (dst->tag_ops != tag_ops) { dev_err(ds->dev, "A DSA switch tree can have only one tagging protocol\n"); + + dsa_tag_driver_put(tag_ops); return -EINVAL; } + /* In the case of multiple CPU ports per switch, the tagging - * protocol is still reference-counted only per switch tree, so - * nothing to do here. + * protocol is still reference-counted only per switch tree. */ + dsa_tag_driver_put(tag_ops); } else { - tag_ops = dsa_tag_driver_get(tag_protocol); - if (IS_ERR(tag_ops)) { - if (PTR_ERR(tag_ops) == -ENOPROTOOPT) - return -EPROBE_DEFER; - dev_warn(ds->dev, "No tagger for this switch\n"); - dp->master = NULL; - return PTR_ERR(tag_ops); - } - dst->tag_ops = tag_ops; } @@ -1104,6 +1158,19 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) dsa_port_set_tag_protocol(dp, dst->tag_ops); dp->dst = dst; + /* At this point, the tree may be configured to use a different + * tagger than the one chosen by the switch driver during + * .setup, in the case when a user selects a custom protocol + * through the DT. + * + * This is resolved by syncing the driver with the tree in + * dsa_switch_setup_tag_protocol once .setup has run and the + * driver is ready to accept calls to .change_tag_protocol. If + * the driver does not support the custom protocol at that + * point, the tree is wholly rejected, thereby ensuring that the + * tree and driver are always in agreement on the protocol to + * use. + */ return 0; } @@ -1117,12 +1184,14 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) if (ethernet) { struct net_device *master; + const char *user_protocol; master = of_find_net_device_by_node(ethernet); if (!master) return -EPROBE_DEFER; - return dsa_port_parse_cpu(dp, master); + user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL); + return dsa_port_parse_cpu(dp, master, user_protocol); } if (link) @@ -1234,7 +1303,7 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name, dev_put(master); - return dsa_port_parse_cpu(dp, master); + return dsa_port_parse_cpu(dp, master, NULL); } if (!strcmp(name, "dsa")) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9d4b0e9b1aa1..92282de54230 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -181,12 +181,14 @@ int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); -int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, + struct netlink_ext_ack *extack); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_lag_change(struct dsa_port *dp, struct netdev_lag_lower_state_info *linfo); int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, - struct netdev_lag_upper_info *uinfo); + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack); @@ -233,19 +235,7 @@ extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, struct net_device *dev) { - /* Switchdev offloading can be configured on: */ - - if (dev == dp->slave) - /* DSA ports directly connected to a bridge, and event - * was emitted for the ports themselves. - */ - return true; - - if (dp->lag_dev == dev) - /* DSA ports connected to a bridge via a LAG */ - return true; - - return false; + return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, @@ -272,6 +262,9 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; +extern struct notifier_block dsa_slave_switchdev_notifier; +extern struct notifier_block dsa_slave_switchdev_blocking_notifier; + void dsa_slave_mii_bus_init(struct dsa_switch *ds); int dsa_slave_create(struct dsa_port *dp); void dsa_slave_destroy(struct net_device *slave_dev); diff --git a/net/dsa/port.c b/net/dsa/port.c index c9c6d7ab3f47..6379d66a6bb3 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -122,29 +122,132 @@ void dsa_port_disable(struct dsa_port *dp) rtnl_unlock(); } -static void dsa_port_change_brport_flags(struct dsa_port *dp, - bool bridge_offload) +static int dsa_port_inherit_brport_flags(struct dsa_port *dp, + struct netlink_ext_ack *extack) { - struct switchdev_brport_flags flags; - int flag; + const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | + BR_BCAST_FLOOD; + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + int flag, err; - flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; - if (bridge_offload) - flags.val = flags.mask; - else - flags.val = flags.mask & ~BR_LEARNING; + for_each_set_bit(flag, &mask, 32) { + struct switchdev_brport_flags flags = {0}; + + flags.mask = BIT(flag); + + if (br_port_flag_is_set(brport_dev, BIT(flag))) + flags.val = BIT(flag); - for_each_set_bit(flag, &flags.mask, 32) { - struct switchdev_brport_flags tmp; + err = dsa_port_bridge_flags(dp, flags, extack); + if (err && err != -EOPNOTSUPP) + return err; + } - tmp.val = flags.val & BIT(flag); - tmp.mask = BIT(flag); + return 0; +} - dsa_port_bridge_flags(dp, tmp, NULL); +static void dsa_port_clear_brport_flags(struct dsa_port *dp) +{ + const unsigned long val = BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | + BR_BCAST_FLOOD; + int flag, err; + + for_each_set_bit(flag, &mask, 32) { + struct switchdev_brport_flags flags = {0}; + + flags.mask = BIT(flag); + flags.val = val & BIT(flag); + + err = dsa_port_bridge_flags(dp, flags, NULL); + if (err && err != -EOPNOTSUPP) + dev_err(dp->ds->dev, + "failed to clear bridge port flag %lu: %pe\n", + flags.val, ERR_PTR(err)); } } -int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) +static int dsa_port_switchdev_sync(struct dsa_port *dp, + struct netlink_ext_ack *extack) +{ + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + struct net_device *br = dp->bridge_dev; + int err; + + err = dsa_port_inherit_brport_flags(dp, extack); + if (err) + return err; + + err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev)); + if (err && err != -EOPNOTSUPP) + return err; + + err = dsa_port_vlan_filtering(dp, br_vlan_enabled(br), extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = dsa_port_mrouter(dp->cpu_dp, br_multicast_router(br), extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = dsa_port_ageing_time(dp, br_get_ageing_time(br)); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_mdb_replay(br, brport_dev, + &dsa_slave_switchdev_blocking_notifier, + extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_fdb_replay(br, brport_dev, &dsa_slave_switchdev_notifier); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_vlan_replay(br, brport_dev, + &dsa_slave_switchdev_blocking_notifier, + extack); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static void dsa_port_switchdev_unsync(struct dsa_port *dp) +{ + /* Configure the port for standalone mode (no address learning, + * flood everything). + * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events + * when the user requests it through netlink or sysfs, but not + * automatically at port join or leave, so we need to handle resetting + * the brport flags ourselves. But we even prefer it that way, because + * otherwise, some setups might never get the notification they need, + * for example, when a port leaves a LAG that offloads the bridge, + * it becomes standalone, but as far as the bridge is concerned, no + * port ever left. + */ + dsa_port_clear_brport_flags(dp); + + /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, + * so allow it to be in BR_STATE_FORWARDING to be kept functional + */ + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + + /* VLAN filtering is handled by dsa_switch_bridge_leave */ + + /* Some drivers treat the notification for having a local multicast + * router by allowing multicast to be flooded to the CPU, so we should + * allow this in standalone mode too. + */ + dsa_port_mrouter(dp->cpu_dp, true, NULL); + + /* Ageing time may be global to the switch chip, so don't change it + * here because we have no good reason (or value) to change it to. + */ +} + +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, + struct netlink_ext_ack *extack) { struct dsa_notifier_bridge_info info = { .tree_index = dp->ds->dst->index, @@ -154,24 +257,25 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) }; int err; - /* Notify the port driver to set its configurable flags in a way that - * matches the initial settings of a bridge port. - */ - dsa_port_change_brport_flags(dp, true); - /* Here the interface is already bridged. Reflect the current * configuration so that drivers can program their chips accordingly. */ dp->bridge_dev = br; err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); + if (err) + goto out_rollback; - /* The bridging is rolled back on error */ - if (err) { - dsa_port_change_brport_flags(dp, false); - dp->bridge_dev = NULL; - } + err = dsa_port_switchdev_sync(dp, extack); + if (err) + goto out_rollback_unbridge; + + return 0; +out_rollback_unbridge: + dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); +out_rollback: + dp->bridge_dev = NULL; return err; } @@ -194,23 +298,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); - /* Configure the port for standalone mode (no address learning, - * flood everything). - * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events - * when the user requests it through netlink or sysfs, but not - * automatically at port join or leave, so we need to handle resetting - * the brport flags ourselves. But we even prefer it that way, because - * otherwise, some setups might never get the notification they need, - * for example, when a port leaves a LAG that offloads the bridge, - * it becomes standalone, but as far as the bridge is concerned, no - * port ever left. - */ - dsa_port_change_brport_flags(dp, false); - - /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, - * so allow it to be in BR_STATE_FORWARDING to be kept functional - */ - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + dsa_port_switchdev_unsync(dp); } int dsa_port_lag_change(struct dsa_port *dp, @@ -241,7 +329,8 @@ int dsa_port_lag_change(struct dsa_port *dp, } int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, - struct netdev_lag_upper_info *uinfo) + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack) { struct dsa_notifier_lag_info info = { .sw_index = dp->ds->index, @@ -249,17 +338,31 @@ int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag, .lag = lag, .info = uinfo, }; + struct net_device *bridge_dev; int err; dsa_lag_map(dp->ds->dst, lag); dp->lag_dev = lag; err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info); - if (err) { - dp->lag_dev = NULL; - dsa_lag_unmap(dp->ds->dst, lag); - } + if (err) + goto err_lag_join; + + bridge_dev = netdev_master_upper_dev_get(lag); + if (!bridge_dev || !netif_is_bridge_master(bridge_dev)) + return 0; + err = dsa_port_bridge_join(dp, bridge_dev, extack); + if (err) + goto err_bridge_join; + + return 0; + +err_bridge_join: + dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); +err_lag_join: + dp->lag_dev = NULL; + dsa_lag_unmap(dp->ds->dst, lag); return err; } @@ -447,7 +550,7 @@ int dsa_port_bridge_flags(const struct dsa_port *dp, struct dsa_switch *ds = dp->ds; if (!ds->ops->port_bridge_flags) - return -EINVAL; + return -EOPNOTSUPP; return ds->ops->port_bridge_flags(ds, dp->index, flags, extack); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 992fcab4b552..77b33bd161b8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -15,6 +15,7 @@ #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> +#include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> @@ -748,7 +749,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); + } else if (stringset == ETH_SS_TEST) { + net_selftest_get_strings(data); } + } static void dsa_slave_get_ethtool_stats(struct net_device *dev, @@ -794,11 +798,27 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) count += ds->ops->get_sset_count(ds, dp->index, sset); return count; + } else if (sset == ETH_SS_TEST) { + return net_selftest_get_count(); } return -EOPNOTSUPP; } +static void dsa_slave_net_selftest(struct net_device *ndev, + struct ethtool_test *etest, u64 *buf) +{ + struct dsa_port *dp = dsa_slave_to_port(ndev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->self_test) { + ds->ops->self_test(ds, dp->index, etest, buf); + return; + } + + net_selftest(ndev, etest, buf); +} + static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1278,14 +1298,32 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, } } +static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port, + void *type_data) +{ + struct dsa_port *cpu_dp = dsa_to_port(ds, port)->cpu_dp; + struct net_device *master = cpu_dp->master; + + if (!master->netdev_ops->ndo_setup_tc) + return -EOPNOTSUPP; + + return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT, type_data); +} + static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; - if (type == TC_SETUP_BLOCK) + switch (type) { + case TC_SETUP_BLOCK: return dsa_slave_setup_tc_block(dev, type_data); + case TC_SETUP_FT: + return dsa_slave_setup_ft_block(ds, dp->index, type_data); + default: + break; + } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; @@ -1612,6 +1650,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, + .self_test = dsa_slave_net_selftest, }; /* legacy way, bypassing the bridge *****************************************/ @@ -1654,6 +1693,21 @@ static void dsa_slave_get_stats64(struct net_device *dev, dev_get_tstats64(dev, s); } +static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct dsa_port *dp = dsa_slave_to_port(ctx->dev); + struct dsa_port *cpu_dp = dp->cpu_dp; + + path->dev = ctx->dev; + path->type = DEV_PATH_DSA; + path->dsa.proto = cpu_dp->tag_ops->proto; + path->dsa.port = dp->index; + ctx->dev = cpu_dp->master; + + return 0; +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1679,6 +1733,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, .ndo_get_devlink_port = dsa_slave_get_devlink_port, .ndo_change_mtu = dsa_slave_change_mtu, + .ndo_fill_forward_path = dsa_slave_fill_forward_path, }; static struct device_type dsa_type = { @@ -1862,7 +1917,7 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->features |= NETIF_F_LLTX; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - if (!IS_ERR_OR_NULL(port->mac)) + if (!is_zero_ether_addr(port->mac)) ether_addr_copy(slave_dev->dev_addr, port->mac); else eth_hw_addr_inherit(slave_dev, master); @@ -1976,11 +2031,14 @@ static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); + struct netlink_ext_ack *extack; int err = NOTIFY_DONE; + extack = netdev_notifier_info_to_extack(&info->info); + if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { - err = dsa_port_bridge_join(dp, info->upper_dev); + err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); err = notifier_from_errno(err); @@ -1991,7 +2049,7 @@ static int dsa_slave_changeupper(struct net_device *dev, } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, - info->upper_info); + info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_MOD(info->info.extack, "Offloading not supported"); @@ -2292,7 +2350,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, fdb_info = ptr; if (dsa_slave_dev_check(dev)) { - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) return NOTIFY_OK; dp = dsa_slave_to_port(dev); @@ -2389,11 +2447,11 @@ static struct notifier_block dsa_slave_nb __read_mostly = { .notifier_call = dsa_slave_netdevice_event, }; -static struct notifier_block dsa_slave_switchdev_notifier = { +struct notifier_block dsa_slave_switchdev_notifier = { .notifier_call = dsa_slave_switchdev_event, }; -static struct notifier_block dsa_slave_switchdev_blocking_notifier = { +struct notifier_block dsa_slave_switchdev_blocking_notifier = { .notifier_call = dsa_slave_switchdev_blocking_event, }; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 32963276452f..9bf8e20ecdf3 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -323,15 +323,6 @@ static int dsa_switch_vlan_del(struct dsa_switch *ds, return 0; } -static bool dsa_switch_tag_proto_match(struct dsa_switch *ds, int port, - struct dsa_notifier_tag_proto_info *info) -{ - if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) - return true; - - return false; -} - static int dsa_switch_change_tag_proto(struct dsa_switch *ds, struct dsa_notifier_tag_proto_info *info) { @@ -344,16 +335,14 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, ASSERT_RTNL(); for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_tag_proto_match(ds, port, info)) { - err = ds->ops->change_tag_protocol(ds, port, - tag_ops->proto); - if (err) - return err; + if (!dsa_is_cpu_port(ds, port)) + continue; - if (dsa_is_cpu_port(ds, port)) - dsa_port_set_tag_protocol(dsa_to_port(ds, port), - tag_ops); - } + err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + if (err) + return err; + + dsa_port_set_tag_protocol(dsa_to_port(ds, port), tag_ops); } /* Now that changing the tag protocol can no longer fail, let's update diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2577a7dcbca..40e9f3098c8d 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,9 +12,26 @@ #include "dsa_priv.h" -/* This tag length is 4 bytes, older ones were 6 bytes, we do not - * handle them - */ +/* Legacy Broadcom tag (6 bytes) */ +#define BRCM_LEG_TAG_LEN 6 + +/* Type fields */ +/* 1st byte in the tag */ +#define BRCM_LEG_TYPE_HI 0x88 +/* 2nd byte in the tag */ +#define BRCM_LEG_TYPE_LO 0x74 + +/* Tag fields */ +/* 3rd byte in the tag */ +#define BRCM_LEG_UNICAST (0 << 5) +#define BRCM_LEG_MULTICAST (1 << 5) +#define BRCM_LEG_EGRESS (2 << 5) +#define BRCM_LEG_INGRESS (3 << 5) + +/* 6th byte in the tag */ +#define BRCM_LEG_PORT_ID (0xf) + +/* Newer Broadcom tag (4 bytes) */ #define BRCM_TAG_LEN 4 /* Tag is constructed and desconstructed using byte by byte access @@ -195,6 +212,87 @@ DSA_TAG_DRIVER(brcm_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM); #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) +static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *brcm_tag; + + /* The Ethernet switch we are interfaced with needs packets to be at + * least 64 bytes (including FCS) otherwise they will be discarded when + * they enter the switch port logic. When Broadcom tags are enabled, we + * need to make sure that packets are at least 70 bytes + * (including FCS and tag) because the length verification is done after + * the Broadcom tag is stripped off the ingress packet. + * + * Let dsa_slave_xmit() free the SKB + */ + if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false)) + return NULL; + + skb_push(skb, BRCM_LEG_TAG_LEN); + + memmove(skb->data, skb->data + BRCM_LEG_TAG_LEN, 2 * ETH_ALEN); + + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Broadcom tag type */ + brcm_tag[0] = BRCM_LEG_TYPE_HI; + brcm_tag[1] = BRCM_LEG_TYPE_LO; + + /* Broadcom tag value */ + brcm_tag[2] = BRCM_LEG_EGRESS; + brcm_tag[3] = 0; + brcm_tag[4] = 0; + brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID; + + return skb; +} + +static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + int source_port; + u8 *brcm_tag; + + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + return NULL; + + brcm_tag = skb->data - 2; + + source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) + return NULL; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN); + + skb->offload_fwd_mark = 1; + + /* Move the Ethernet DA and SA */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - BRCM_LEG_TAG_LEN, + 2 * ETH_ALEN); + + return skb; +} + +static const struct dsa_device_ops brcm_legacy_netdev_ops = { + .name = "brcm-legacy", + .proto = DSA_TAG_PROTO_BRCM_LEGACY, + .xmit = brcm_leg_tag_xmit, + .rcv = brcm_leg_tag_rcv, + .overhead = BRCM_LEG_TAG_LEN, +}; + +DSA_TAG_DRIVER(brcm_legacy_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY); +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ + #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, struct net_device *dev) @@ -227,6 +325,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) &DSA_TAG_DRIVER_NAME(brcm_netdev_ops), #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) + &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops), +#endif #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops), #endif diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 59748487664f..f9b2966d1936 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -24,9 +24,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(dev); u8 xmit_tpid; u8 *mtk_tag; - unsigned char *dest = eth_hdr(skb)->h_dest; - bool is_multicast_skb = is_multicast_ether_addr(dest) && - !is_broadcast_ether_addr(dest); /* Build the special tag after the MAC Source Address. If VLAN header * is present, it's required that VLAN header and special tag is @@ -55,10 +52,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, mtk_tag[0] = xmit_tpid; mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; - /* Disable SA learning for multicast frames */ - if (unlikely(is_multicast_skb)) - mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS; - /* Tag control information is kept for 802.1Q */ if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) { mtk_tag[2] = 0; @@ -74,9 +67,6 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, u16 hdr; int port; __be16 *phdr; - unsigned char *dest = eth_hdr(skb)->h_dest; - bool is_multicast_skb = is_multicast_ether_addr(dest) && - !is_broadcast_ether_addr(dest); if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) return NULL; @@ -102,9 +92,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - /* Only unicast or broadcast frames are offloaded */ - if (likely(!is_multicast_skb)) - skb->offload_fwd_mark = 1; + skb->offload_fwd_mark = 1; return skb; } diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 743809b5806b..f9df9cac81c5 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -83,7 +83,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; - u64 cpuq; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -113,7 +112,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); - ocelot_xfh_get_cpuq(extraction, &cpuq); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) @@ -128,12 +126,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, skb->offload_fwd_mark = 1; skb->priority = qos_class; -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - if (eth_hdr(skb)->h_proto == cpu_to_be16(ETH_P_MRP) && - cpuq & BIT(OCELOT_MRP_CPUQ)) - skb->offload_fwd_mark = 0; -#endif - /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index e9176475bac8..cf8ac316f4c7 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -79,7 +79,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, /* The RTL4 header has its own custom Ethertype 0x8899 and that * starts right at the beginning of the packet, after the src - * ethernet addr. Apparantly skb->data always points 2 bytes in, + * ethernet addr. Apparently skb->data always points 2 bytes in, * behind the Ethertype. */ tag = skb->data - 2; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4106373180c6..9cce612e8976 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -58,6 +58,7 @@ #include <net/ip.h> #include <net/dsa.h> #include <net/flow_dissector.h> +#include <net/gro.h> #include <linux/uaccess.h> #include <net/pkt_sched.h> @@ -122,7 +123,7 @@ EXPORT_SYMBOL(eth_header); * Make a best effort attempt to pull the length for all of the headers for * a given frame in a linear buffer. */ -u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len) +u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; @@ -449,7 +450,10 @@ struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); + + pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + head, skb); out_unlock: rcu_read_unlock(); @@ -473,8 +477,9 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff) rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + - sizeof(struct ethhdr)); + err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, + ipv6_gro_complete, inet_gro_complete, + skb, nhoff + sizeof(*eh)); rcu_read_unlock(); return err; @@ -506,13 +511,14 @@ unsigned char * __weak arch_get_platform_mac_address(void) int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { - const unsigned char *addr = NULL; + unsigned char *addr; + int ret; - if (dev->of_node) - addr = of_get_mac_address(dev->of_node); - if (IS_ERR_OR_NULL(addr)) - addr = arch_get_platform_mac_address(); + ret = of_get_mac_address(dev->of_node, mac_addr); + if (!ret) + return 0; + addr = arch_get_platform_mac_address(); if (!addr) return -ENODEV; diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 7a849ff22dad..723c9a8a8cdf 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o + tunnels.o fec.o eeprom.o stats.o diff --git a/net/ethtool/common.h b/net/ethtool/common.h index a9d071248698..2dc2b80aea5f 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -47,4 +47,9 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; +int ethtool_get_module_info_call(struct net_device *dev, + struct ethtool_modinfo *modinfo); +int ethtool_get_module_eeprom_call(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data); + #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c new file mode 100644 index 000000000000..2a6733a6449a --- /dev/null +++ b/net/ethtool/eeprom.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/sfp.h> +#include "netlink.h" +#include "common.h" + +struct eeprom_req_info { + struct ethnl_req_info base; + u32 offset; + u32 length; + u8 page; + u8 bank; + u8 i2c_address; +}; + +struct eeprom_reply_data { + struct ethnl_reply_data base; + u32 length; + u8 *data; +}; + +#define MODULE_EEPROM_REQINFO(__req_base) \ + container_of(__req_base, struct eeprom_req_info, base) + +#define MODULE_EEPROM_REPDATA(__reply_base) \ + container_of(__reply_base, struct eeprom_reply_data, base) + +static int fallback_set_params(struct eeprom_req_info *request, + struct ethtool_modinfo *modinfo, + struct ethtool_eeprom *eeprom) +{ + u32 offset = request->offset; + u32 length = request->length; + + if (request->page) + offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; + + if (modinfo->type == ETH_MODULE_SFF_8079 && + request->i2c_address == 0x51) + offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; + + if (offset >= modinfo->eeprom_len) + return -EINVAL; + + eeprom->cmd = ETHTOOL_GMODULEEEPROM; + eeprom->len = length; + eeprom->offset = offset; + + return 0; +} + +static int eeprom_fallback(struct eeprom_req_info *request, + struct eeprom_reply_data *reply, + struct genl_info *info) +{ + struct net_device *dev = reply->base.dev; + struct ethtool_modinfo modinfo = {0}; + struct ethtool_eeprom eeprom = {0}; + u8 *data; + int err; + + modinfo.cmd = ETHTOOL_GMODULEINFO; + err = ethtool_get_module_info_call(dev, &modinfo); + if (err < 0) + return err; + + err = fallback_set_params(request, &modinfo, &eeprom); + if (err < 0) + return err; + + data = kmalloc(eeprom.len, GFP_KERNEL); + if (!data) + return -ENOMEM; + err = ethtool_get_module_eeprom_call(dev, &eeprom, data); + if (err < 0) + goto err_out; + + reply->data = data; + reply->length = eeprom.len; + + return 0; + +err_out: + kfree(data); + return err; +} + +static int get_module_eeprom_by_page(struct net_device *dev, + struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (dev->sfp_bus) + return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack); + + if (ops->get_module_info) + return ops->get_module_eeprom_by_page(dev, page_data, extack); + + return -EOPNOTSUPP; +} + +static int eeprom_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + struct ethtool_module_eeprom page_data = {0}; + struct net_device *dev = reply_base->dev; + int ret; + + page_data.offset = request->offset; + page_data.length = request->length; + page_data.i2c_address = request->i2c_address; + page_data.page = request->page; + page_data.bank = request->bank; + page_data.data = kmalloc(page_data.length, GFP_KERNEL); + if (!page_data.data) + return -ENOMEM; + + ret = ethnl_ops_begin(dev); + if (ret) + goto err_free; + + ret = get_module_eeprom_by_page(dev, &page_data, info->extack); + if (ret < 0) + goto err_ops; + + reply->length = ret; + reply->data = page_data.data; + + ethnl_ops_complete(dev); + return 0; + +err_ops: + ethnl_ops_complete(dev); +err_free: + kfree(page_data.data); + + if (ret == -EOPNOTSUPP) + return eeprom_fallback(request, reply, info); + return ret; +} + +static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); + + if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || + !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || + !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || + !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) + return -EINVAL; + + request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); + request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); + request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); + + if (!request->length) + return -EINVAL; + + /* The following set of conditions limit the API to only dump 1/2 + * EEPROM page without crossing low page boundary located at offset 128. + * This means user may only request dumps of length limited to 128 from + * either low 128 bytes or high 128 bytes. + * For pages higher than 0 only high 128 bytes are accessible. + */ + request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); + if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], + "reading from lower half page is allowed for page 0 only"); + return -EINVAL; + } + + if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && + request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross half page boundary is illegal"); + return -EINVAL; + } else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET], + "offset is out of bounds"); + return -EINVAL; + } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross page boundary is illegal"); + return -EINVAL; + } + + if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) + request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); + + return 0; +} + +static int eeprom_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + + return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ +} + +static int eeprom_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); +} + +static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + kfree(reply->data); +} + +const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER, + .req_info_size = sizeof(struct eeprom_req_info), + .reply_data_size = sizeof(struct eeprom_reply_data), + + .parse_request = eeprom_parse_request, + .prepare_data = eeprom_prepare_data, + .reply_size = eeprom_reply_size, + .fill_reply = eeprom_fill_reply, + .cleanup_data = eeprom_cleanup_data, +}; + +const struct nla_policy ethnl_module_eeprom_get_policy[] = { + [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_LENGTH] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = + NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), +}; + diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c new file mode 100644 index 000000000000..8738dafd5417 --- /dev/null +++ b/net/ethtool/fec.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct fec_req_info { + struct ethnl_req_info base; +}; + +struct fec_reply_data { + struct ethnl_reply_data base; + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); + u32 active_fec; + u8 fec_auto; + struct fec_stat_grp { + u64 stats[1 + ETHTOOL_MAX_LANES]; + u8 cnt; + } corr, uncorr, corr_bits; +}; + +#define FEC_REPDATA(__reply_base) \ + container_of(__reply_base, struct fec_reply_data, base) + +#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) + +const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), +}; + +static void +ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto) +{ + if (fec_auto) + *fec_auto = !!(fec & ETHTOOL_FEC_AUTO); + + if (fec & ETHTOOL_FEC_OFF) + __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes); + if (fec & ETHTOOL_FEC_RS) + __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes); + if (fec & ETHTOOL_FEC_BASER) + __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes); + if (fec & ETHTOOL_FEC_LLRS) + __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes); +} + +static int +ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, + unsigned long *link_modes, u8 fec_auto) +{ + memset(fec, 0, sizeof(*fec)); + + if (fec_auto) + fec->fec |= ETHTOOL_FEC_AUTO; + + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_OFF; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_RS; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_BASER; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_LLRS; + + if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS)) + return -EINVAL; + + return 0; +} + +static void +fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) +{ + int i; + + if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) { + grp->stats[0] = stats->total; + grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET; + return; + } + + grp->cnt = 1; + grp->stats[0] = 0; + for (i = 0; i < ETHTOOL_MAX_LANES; i++) { + if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET) + break; + + grp->stats[0] += stats->lanes[i]; + grp->stats[grp->cnt++] = stats->lanes[i]; + } +} + +static int fec_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; + struct fec_reply_data *data = FEC_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + struct ethtool_fecparam fec = {}; + int ret; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_fecparam(dev, &fec); + if (ret) + goto out_complete; + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_fec_stats) { + struct ethtool_fec_stats stats; + + ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats); + + fec_stats_recalc(&data->corr, &stats.corrected_blocks); + fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); + fec_stats_recalc(&data->corr_bits, &stats.corrected_bits); + } + + WARN_ON_ONCE(fec.reserved); + + ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes, + &data->fec_auto); + + ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL); + data->active_fec = find_first_bit(active_fec_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS); + /* Don't report attr if no FEC mode set. Note that + * ethtool_fecparam_to_link_modes() ignores NONE and AUTO. + */ + if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS) + data->active_fec = 0; + +out_complete: + ethnl_ops_complete(dev); + return ret; +} + +static int fec_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int len = 0; + int ret; + + ret = ethnl_bitset_size(data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + + len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ + nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += 3 * nla_total_size_64bit(sizeof(u64) * + (1 + ETHTOOL_MAX_LANES)); + + return len; +} + +static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS); + if (!nest) + return -EMSGSIZE; + + if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED, + sizeof(u64) * data->corr.cnt, + data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) || + nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR, + sizeof(u64) * data->uncorr.cnt, + data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) || + nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS, + sizeof(u64) * data->corr_bits.cnt, + data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fec_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, + data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + + if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) || + (data->active_fec && + nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) + return -EMSGSIZE; + + if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, +}; + +/* FEC_SET */ + +const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct ethtool_fecparam fec = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + u8 fec_auto; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_fecparam || !ops->set_fecparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + + ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); + + ret = ethnl_update_bitset(fec_link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_FEC_MODES], + link_mode_names, info->extack, &mod); + if (ret < 0) + goto out_ops; + ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); + + ret = 0; + if (!mod) + goto out_ops; + + ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "invalid FEC modes requested"); + goto out_ops; + } + if (!fec.fec) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "no FEC modes set"); + goto out_ops; + } + + ret = dev->ethtool_ops->set_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 771688e1b0da..3fa7a394eabf 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -489,7 +489,7 @@ store_link_ksettings_for_user(void __user *to, { struct ethtool_link_usettings link_usettings; - memcpy(&link_usettings.base, &from->base, sizeof(link_usettings)); + memcpy(&link_usettings, from, sizeof(link_usettings)); bitmap_to_arr32(link_usettings.link_modes.supported, from->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -1828,6 +1828,18 @@ out: return ret; } +__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(*data, ETH_GSTRING_LEN, fmt, args); + va_end(args); + + *data += ETH_GSTRING_LEN; +} +EXPORT_SYMBOL(ethtool_sprintf); + static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; @@ -2176,8 +2188,8 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) return 0; } -static int __ethtool_get_module_info(struct net_device *dev, - struct ethtool_modinfo *modinfo) +int ethtool_get_module_info_call(struct net_device *dev, + struct ethtool_modinfo *modinfo) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; @@ -2203,7 +2215,7 @@ static int ethtool_get_module_info(struct net_device *dev, if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; - ret = __ethtool_get_module_info(dev, &modinfo); + ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; @@ -2213,8 +2225,8 @@ static int ethtool_get_module_info(struct net_device *dev, return 0; } -static int __ethtool_get_module_eeprom(struct net_device *dev, - struct ethtool_eeprom *ee, u8 *data) +int ethtool_get_module_eeprom_call(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; @@ -2237,12 +2249,12 @@ static int ethtool_get_module_eeprom(struct net_device *dev, int ret; struct ethtool_modinfo modinfo; - ret = __ethtool_get_module_info(dev, &modinfo); + ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; return ethtool_get_any_eeprom(dev, useraddr, - __ethtool_get_module_eeprom, + ethtool_get_module_eeprom_call, modinfo.eeprom_len); } @@ -2540,6 +2552,9 @@ static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) if (rc) return rc; + if (WARN_ON_ONCE(fecparam.reserved)) + fecparam.reserved = 0; + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; @@ -2555,6 +2570,12 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; + if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE) + return -EINVAL; + + fecparam.active_fec = 0; + fecparam.reserved = 0; + return dev->ethtool_ops->set_fecparam(dev, &fecparam); } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 50d3c8896f91..290012d0d11d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -244,7 +244,10 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, + [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, + [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -551,6 +554,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_COALESCE_NTF] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, }; /* default notification handler */ @@ -643,6 +647,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -912,6 +917,41 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_FEC_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_fec_get_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_FEC_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_fec, + .policy = ethnl_fec_set_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_module_eeprom_get_policy, + .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_STATS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_stats_get_policy, + .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index cde9f3169ae5..8abcbc10796c 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -344,6 +344,9 @@ extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; +extern const struct ethnl_request_ops ethnl_fec_request_ops; +extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; +extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -375,6 +378,10 @@ extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; +extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; +extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; +extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; +extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -392,5 +399,12 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); + +extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; +extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index d4ac02718b72..9009f412151e 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -21,12 +21,6 @@ const struct nla_policy ethnl_pause_get_policy[] = { NLA_POLICY_NESTED(ethnl_header_policy_stats), }; -static void ethtool_stats_init(u64 *stats, unsigned int n) -{ - while (n--) - stats[n] = ETHTOOL_STAT_NOT_SET; -} - static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c new file mode 100644 index 000000000000..b7642dc96d50 --- /dev/null +++ b/net/ethtool/stats.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct stats_req_info { + struct ethnl_req_info base; + DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); +}; + +#define STATS_REQINFO(__req_base) \ + container_of(__req_base, struct stats_req_info, base) + +struct stats_reply_data { + struct ethnl_reply_data base; + struct ethtool_eth_phy_stats phy_stats; + struct ethtool_eth_mac_stats mac_stats; + struct ethtool_eth_ctrl_stats ctrl_stats; + struct ethtool_rmon_stats rmon_stats; + const struct ethtool_rmon_hist_range *rmon_ranges; +}; + +#define STATS_REPDATA(__reply_base) \ + container_of(__reply_base, struct stats_reply_data, base) + +const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_STATS_ETH_PHY] = "eth-phy", + [ETHTOOL_STATS_ETH_MAC] = "eth-mac", + [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", + [ETHTOOL_STATS_RMON] = "rmon", +}; + +const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", +}; + +const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors", + [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors", + [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions", + [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions", + [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls", + [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError", + [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors", + [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError", + [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral", + [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors", + [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField", + [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", +}; + +const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted", + [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived", + [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", +}; + +const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts", + [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts", + [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments", + [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", +}; + +const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { + [ETHTOOL_A_STATS_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, +}; + +static int stats_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct stats_req_info *req_info = STATS_REQINFO(req_base); + bool mod = false; + int err; + + err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT, + tb[ETHTOOL_A_STATS_GROUPS], stats_std_names, + extack, &mod); + if (err) + return err; + + if (!mod) { + NL_SET_ERR_MSG(extack, "no stats requested"); + return -EINVAL; + } + + return 0; +} + +static int stats_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + struct stats_reply_data *data = STATS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them + * from being reported to user space in case driver did not set them. + */ + memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); + memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->rmon_stats, 0xff, sizeof(data->rmon_stats)); + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && + dev->ethtool_ops->get_eth_phy_stats) + dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && + dev->ethtool_ops->get_eth_mac_stats) + dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && + dev->ethtool_ops->get_eth_ctrl_stats) + dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) && + dev->ethtool_ops->get_rmon_stats) + dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats, + &data->rmon_ranges); + + ethnl_ops_complete(dev); + return 0; +} + +static int stats_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + unsigned int n_grps = 0, n_stats = 0; + int len = 0; + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); + n_grps++; + } + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); + n_grps++; + } + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); + n_grps++; + } + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64); + n_grps++; + /* Above includes the space for _A_STATS_GRP_HIST_VALs */ + + len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */ + nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */ + nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ + ETHTOOL_RMON_HIST_MAX * 2; + } + + len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ + nla_total_size(4) + /* _A_STATS_GRP_ID */ + nla_total_size(4)); /* _A_STATS_GRP_SS_ID */ + len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */ + nla_total_size_64bit(sizeof(u64))); + + return len; +} + +static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val) +{ + struct nlattr *nest; + int ret; + + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + + /* We want to start stats attr types from 0, so we don't have a type + * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside + * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the + * actual attr we're 4B off - nla_need_padding_for_64bit() & co. + * can't be used. + */ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) + if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0)) + return -EMSGSIZE; +#endif + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */); + if (ret) { + nla_nest_cancel(skb, nest); + return ret; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int stats_put_phy_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, + data->phy_stats.SymbolErrorDuringCarrier)) + return -EMSGSIZE; + return 0; +} + +static int stats_put_mac_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, + data->mac_stats.FramesTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, + data->mac_stats.SingleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, + data->mac_stats.MultipleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, + data->mac_stats.FramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, + data->mac_stats.FrameCheckSequenceErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, + data->mac_stats.AlignmentErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, + data->mac_stats.OctetsTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, + data->mac_stats.FramesWithDeferredXmissions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, + data->mac_stats.LateCollisions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, + data->mac_stats.FramesAbortedDueToXSColls) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACXmitError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, + data->mac_stats.CarrierSenseErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, + data->mac_stats.OctetsReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACRcvError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, + data->mac_stats.MulticastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, + data->mac_stats.BroadcastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, + data->mac_stats.FramesWithExcessiveDeferral) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, + data->mac_stats.MulticastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, + data->mac_stats.BroadcastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, + data->mac_stats.InRangeLengthErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, + data->mac_stats.OutOfRangeLengthField) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, + data->mac_stats.FrameTooLongErrors)) + return -EMSGSIZE; + return 0; +} + +static int stats_put_ctrl_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX, + data->ctrl_stats.MACControlFramesTransmitted) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX, + data->ctrl_stats.MACControlFramesReceived) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, + data->ctrl_stats.UnsupportedOpcodesReceived)) + return -EMSGSIZE; + return 0; +} + +static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist, + const struct ethtool_rmon_hist_range *ranges) +{ + struct nlattr *nest; + int i; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) { + if (!ranges[i].low && !ranges[i].high) + break; + if (hist[i] == ETHTOOL_STAT_NOT_SET) + continue; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ranges[i].high) || + nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL, + hist[i], ETHTOOL_A_STATS_GRP_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_put_rmon_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX, + data->rmon_stats.hist, data->rmon_ranges) || + stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX, + data->rmon_stats.hist_tx, data->rmon_ranges)) + return -EMSGSIZE; + + if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE, + data->rmon_stats.undersize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE, + data->rmon_stats.oversize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG, + data->rmon_stats.fragments) || + stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER, + data->rmon_stats.jabbers)) + return -EMSGSIZE; + + return 0; +} + +static int stats_put_stats(struct sk_buff *skb, + const struct stats_reply_data *data, + u32 id, u32 ss_id, + int (*cb)(struct sk_buff *skb, + const struct stats_reply_data *data)) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id)) + goto err_cancel; + + if (cb(skb, data)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + const struct stats_reply_data *data = STATS_REPDATA(reply_base); + int ret = 0; + + if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, + ETH_SS_STATS_ETH_PHY, + stats_put_phy_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, + ETH_SS_STATS_ETH_MAC, + stats_put_mac_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, + ETH_SS_STATS_ETH_CTRL, + stats_put_ctrl_stats); + if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, + ETH_SS_STATS_RMON, stats_put_rmon_stats); + + return ret; +} + +const struct ethnl_request_ops ethnl_stats_request_ops = { + .request_cmd = ETHTOOL_MSG_STATS_GET, + .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY, + .hdr_attr = ETHTOOL_A_STATS_HEADER, + .req_info_size = sizeof(struct stats_req_info), + .reply_data_size = sizeof(struct stats_reply_data), + + .parse_request = stats_parse_request, + .prepare_data = stats_prepare_data, + .reply_size = stats_reply_size, + .fill_reply = stats_fill_reply, +}; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index c3a5489964cd..b3029fff715d 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -80,6 +80,31 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, .strings = udp_tunnel_type_names, }, + [ETH_SS_STATS_STD] = { + .per_dev = false, + .count = __ETHTOOL_STATS_CNT, + .strings = stats_std_names, + }, + [ETH_SS_STATS_ETH_PHY] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, + .strings = stats_eth_phy_names, + }, + [ETH_SS_STATS_ETH_MAC] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, + .strings = stats_eth_mac_names, + }, + [ETH_SS_STATS_ETH_CTRL] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, + .strings = stats_eth_ctrl_names, + }, + [ETH_SS_STATS_RMON] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_RMON_CNT, + .strings = stats_rmon_names, + }, }; struct strset_req_info { diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 4cfd9e829c7b..99f3af1a9d4d 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -108,7 +108,7 @@ void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev) /* hsr_debugfs_term - Tear down debugfs intrastructure * * Description: - * When Debufs is configured this routine removes debugfs file system + * When Debugfs is configured this routine removes debugfs file system * elements that are specific to hsr */ void diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d520e61649c8..dff4f0eb96b0 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -5,6 +5,7 @@ #include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/filter.h> #include <net/tcp.h> #include <net/bpf_sk_storage.h> @@ -178,10 +179,52 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } +BTF_SET_START(bpf_tcp_ca_kfunc_ids) +BTF_ID(func, tcp_reno_ssthresh) +BTF_ID(func, tcp_reno_cong_avoid) +BTF_ID(func, tcp_reno_undo_cwnd) +BTF_ID(func, tcp_slow_start) +BTF_ID(func, tcp_cong_avoid_ai) +#ifdef CONFIG_DYNAMIC_FTRACE +#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) +BTF_ID(func, cubictcp_init) +BTF_ID(func, cubictcp_recalc_ssthresh) +BTF_ID(func, cubictcp_cong_avoid) +BTF_ID(func, cubictcp_state) +BTF_ID(func, cubictcp_cwnd_event) +BTF_ID(func, cubictcp_acked) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) +BTF_ID(func, dctcp_init) +BTF_ID(func, dctcp_update_alpha) +BTF_ID(func, dctcp_cwnd_event) +BTF_ID(func, dctcp_ssthresh) +BTF_ID(func, dctcp_cwnd_undo) +BTF_ID(func, dctcp_state) +#endif +#if IS_BUILTIN(CONFIG_TCP_CONG_BBR) +BTF_ID(func, bbr_init) +BTF_ID(func, bbr_main) +BTF_ID(func, bbr_sndbuf_expand) +BTF_ID(func, bbr_undo_cwnd) +BTF_ID(func, bbr_cwnd_event) +BTF_ID(func, bbr_ssthresh) +BTF_ID(func, bbr_min_tso_segs) +BTF_ID(func, bbr_set_state) +#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +BTF_SET_END(bpf_tcp_ca_kfunc_ids) + +static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) +{ + return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); +} + static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { .get_func_proto = bpf_tcp_ca_get_func_proto, .is_valid_access = bpf_tcp_ca_is_valid_access, .btf_struct_access = bpf_tcp_ca_btf_struct_access, + .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, }; static int bpf_tcp_ca_init_member(const struct btf_type *t, diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4b834bbf95e0..35803ab7ac80 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -309,7 +309,7 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after + * accommodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { @@ -754,7 +754,7 @@ int esp_input_done2(struct sk_buff *skb, int err) int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); int ihl; - if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + if (!xo || !(xo->flags & CRYPTO_DONE)) kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) @@ -854,7 +854,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) struct ip_esp_hdr *esph; /* For ESN we move the header forward by 4 bytes to - * accomodate the high bits. We will move it back after + * accommodate the high bits. We will move it back after * decryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 616e2dc1c8fa..8bd988fbcb31 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -971,7 +971,7 @@ static bool icmp_redirect(struct sk_buff *skb) } /* - * Handle ICMP_ECHO ("ping") requests. + * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo * requests. @@ -979,27 +979,125 @@ static bool icmp_redirect(struct sk_buff *skb) * included in the reply. * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring * echo requests, MUST have default=NOT. + * RFC 8335: 8 MUST have a config option to enable/disable ICMP + * Extended Echo Functionality, MUST be disabled by default * See also WRT handling of options once they are done and working. */ static bool icmp_echo(struct sk_buff *skb) { + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct icmp_bxm icmp_param; + struct net_device *dev; + char buff[IFNAMSIZ]; struct net *net; + u16 ident_len; + u8 status; net = dev_net(skb_dst(skb)->dev); - if (!net->ipv4.sysctl_icmp_echo_ignore_all) { - struct icmp_bxm icmp_param; + /* should there be an ICMP stat for ignored echos? */ + if (net->ipv4.sysctl_icmp_echo_ignore_all) + return true; - icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); + + if (icmp_param.data.icmph.type == ICMP_ECHO) { icmp_param.data.icmph.type = ICMP_ECHOREPLY; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); - icmp_reply(&icmp_param, skb); + goto send_reply; } - /* should there be an ICMP stat for ignored echos? */ - return true; + if (!net->ipv4.sysctl_icmp_echo_enable_probe) + return true; + /* We currently only support probing interfaces on the proxy node + * Check to ensure L-bit is set + */ + if (!(ntohs(icmp_param.data.icmph.un.echo.sequence) & 1)) + return true; + /* Clear status bits in reply message */ + icmp_param.data.icmph.un.echo.sequence &= htons(0xFF00); + icmp_param.data.icmph.type = ICMP_EXT_ECHOREPLY; + ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); + /* Size of iio is class_type dependent. + * Only check header here and assign length based on ctype in the switch statement + */ + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); + if (!ext_hdr || !iio) + goto send_mal_query; + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + goto send_mal_query; + ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + status = 0; + dev = NULL; + switch (iio->extobj_hdr.class_type) { + case EXT_ECHO_CTYPE_NAME: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); + if (ident_len >= IFNAMSIZ) + goto send_mal_query; + memset(buff, 0, sizeof(buff)); + memcpy(buff, &iio->ident.name, ident_len); + dev = dev_get_by_name(net, buff); + break; + case EXT_ECHO_CTYPE_INDEX: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + + sizeof(iio->ident.ifindex), &_iio); + if (ident_len != sizeof(iio->ident.ifindex)) + goto send_mal_query; + dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); + break; + case EXT_ECHO_CTYPE_ADDR: + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + iio->ident.addr.ctype3_hdr.addrlen) + goto send_mal_query; + switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { + case ICMP_AFI_IP: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + + sizeof(struct in_addr), &_iio); + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + sizeof(struct in_addr)) + goto send_mal_query; + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr.s_addr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case ICMP_AFI_IP6: + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); + if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + sizeof(struct in6_addr)) + goto send_mal_query; + dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); + if (dev) + dev_hold(dev); + break; +#endif + default: + goto send_mal_query; + } + break; + default: + goto send_mal_query; + } + if (!dev) { + icmp_param.data.icmph.code = ICMP_EXT_NO_IF; + goto send_reply; + } + /* Fill bits in reply message */ + if (dev->flags & IFF_UP) + status |= EXT_ECHOREPLY_ACTIVE; + if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list) + status |= EXT_ECHOREPLY_IPV4; + if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) + status |= EXT_ECHOREPLY_IPV6; + dev_put(dev); + icmp_param.data.icmph.un.echo.sequence |= htons(status); +send_reply: + icmp_reply(&icmp_param, skb); + return true; +send_mal_query: + icmp_param.data.icmph.code = ICMP_EXT_MAL_QUERY; + goto send_reply; } /* @@ -1088,6 +1186,21 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); ICMPMSGIN_INC_STATS(net, icmph->type); + + /* Check for ICMP Extended Echo (PROBE) messages */ + if (icmph->type == ICMP_EXT_ECHO) { + /* We can't use icmp_pointers[].handler() because it is an array of + * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42. + */ + success = icmp_echo(skb); + goto success_check; + } + + if (icmph->type == ICMP_EXT_ECHOREPLY) { + success = ping_rcv(skb); + goto success_check; + } + /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1097,7 +1210,6 @@ int icmp_rcv(struct sk_buff *skb) if (icmph->type > NR_ICMP_TYPES) goto error; - /* * Parse the ICMP message */ @@ -1123,7 +1235,7 @@ int icmp_rcv(struct sk_buff *skb) } success = icmp_pointers[icmph->type].handler(skb); - +success_check: if (success) { consume_skb(skb); return NET_RX_SUCCESS; @@ -1340,6 +1452,7 @@ static int __net_init icmp_sk_init(struct net *net) /* Control parameters for ECHO replies. */ net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_enable_probe = 0; net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; /* Control parameter - ignore bogus broadcast responses? */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 3aab53beb4ea..c3efc7d658f6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -34,7 +34,7 @@ * Andi Kleen : Replace ip_reply with ip_send_reply. * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 - * and more readibility. + * and more readability. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. @@ -262,7 +262,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an - * insufficent MTU. + * insufficient MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 7c841037c533..aff707988e23 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -25,6 +25,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un __be32 saddr = iph->saddr; __u8 flags; struct net_device *dev = skb_dst(skb)->dev; + struct flow_keys flkeys; unsigned int hh_len; sk = sk_to_full_sk(sk); @@ -48,6 +49,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; + fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) return PTR_ERR(rt); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a2f4f894be2b..63cb953bd019 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -76,12 +76,18 @@ config NF_DUP_IPV4 config NF_LOG_ARP tristate "ARP packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON + select NF_LOG_SYSLOG + help + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG. config NF_LOG_IPV4 tristate "IPv4 packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON + select NF_LOG_SYSLOG + help + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG. config NF_REJECT_IPV4 tristate "IPv4 packet rejection" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7c497c78105f..f38fb1368ddb 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,10 +9,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o -# logging -obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o -obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o - # reject obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 8115611aa47d..ffdcc2b9360f 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,8 +20,13 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> +static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); +struct defrag4_pernet { + unsigned int users; +}; + static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -106,15 +111,19 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - if (net->nf.defrag_ipv4) { + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); + + if (nf_defrag->users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - net->nf.defrag_ipv4 = false; + nf_defrag->users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, + .id = &defrag4_pernet_id, + .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -129,21 +138,22 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; might_sleep(); - if (net->nf.defrag_ipv4) + if (nf_defrag->users) return 0; mutex_lock(&defrag4_mutex); - if (net->nf.defrag_ipv4) + if (nf_defrag->users) goto out_unlock; err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - net->nf.defrag_ipv4 = true; + nf_defrag->users = 1; out_unlock: mutex_unlock(&defrag4_mutex); diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c deleted file mode 100644 index 136030ad2e54..000000000000 --- a/net/ipv4/netfilter/nf_log_arp.c +++ /dev/null @@ -1,172 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org> - * - * Based on code from ebt_log from: - * - * Bart De Schuymer <bdschuym@pandora.be> - * Harald Welte <laforge@netfilter.org> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -static const struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - -struct arppayload { - unsigned char mac_src[ETH_ALEN]; - unsigned char ip_src[4]; - unsigned char mac_dst[ETH_ALEN]; - unsigned char ip_dst[4]; -}; - -static void dump_arp_packet(struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int nhoff) -{ - const struct arppayload *ap; - struct arppayload _arpp; - const struct arphdr *ah; - unsigned int logflags; - struct arphdr _arph; - - ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); - if (ah == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_DEFAULT_MASK; - - if (logflags & NF_LOG_MACDECODE) { - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); - nf_log_dump_vlan(m, skb); - nf_log_buf_add(m, "MACPROTO=%04x ", - ntohs(eth_hdr(skb)->h_proto)); - } - - nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", - ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); - - /* If it's for Ethernet and the lengths are OK, then log the ARP - * payload. - */ - if (ah->ar_hrd != htons(ARPHRD_ETHER) || - ah->ar_hln != ETH_ALEN || - ah->ar_pln != sizeof(__be32)) - return; - - ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); - if (ap == NULL) { - nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", - skb->len - sizeof(_arph)); - return; - } - nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", - ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); -} - -static void nf_log_arp_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct nf_log_buf *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) - return; - - m = nf_log_buf_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, - prefix); - dump_arp_packet(m, loginfo, skb, 0); - - nf_log_buf_close(m); -} - -static struct nf_logger nf_arp_logger __read_mostly = { - .name = "nf_log_arp", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_arp_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_arp_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); -} - -static void __net_exit nf_log_arp_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_arp_logger); -} - -static struct pernet_operations nf_log_arp_net_ops = { - .init = nf_log_arp_net_init, - .exit = nf_log_arp_net_exit, -}; - -static int __init nf_log_arp_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_log_arp_net_ops); - if (ret < 0) - return ret; - - ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); - if (ret < 0) { - pr_err("failed to register logger\n"); - goto err1; - } - - return 0; - -err1: - unregister_pernet_subsys(&nf_log_arp_net_ops); - return ret; -} - -static void __exit nf_log_arp_exit(void) -{ - unregister_pernet_subsys(&nf_log_arp_net_ops); - nf_log_unregister(&nf_arp_logger); -} - -module_init(nf_log_arp_init); -module_exit(nf_log_arp_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter ARP packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(3, 0); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c deleted file mode 100644 index d07583fac8f8..000000000000 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ /dev/null @@ -1,395 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/ipv6.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -static const struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - -/* One level of recursion won't kill us */ -static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int iphoff) -{ - struct iphdr _iph; - const struct iphdr *ih; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_DEFAULT_MASK; - - ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); - if (ih == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Important fields: - * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ - /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); - - /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", - ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, - ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); - - /* Max length: 6 "CE DF MF " */ - if (ntohs(ih->frag_off) & IP_CE) - nf_log_buf_add(m, "CE "); - if (ntohs(ih->frag_off) & IP_DF) - nf_log_buf_add(m, "DF "); - if (ntohs(ih->frag_off) & IP_MF) - nf_log_buf_add(m, "MF "); - - /* Max length: 11 "FRAG:65535 " */ - if (ntohs(ih->frag_off) & IP_OFFSET) - nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - - if ((logflags & NF_LOG_IPOPT) && - ih->ihl * 4 > sizeof(struct iphdr)) { - const unsigned char *op; - unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; - unsigned int i, optsize; - - optsize = ih->ihl * 4 - sizeof(struct iphdr); - op = skb_header_pointer(skb, iphoff+sizeof(_iph), - optsize, _opt); - if (op == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - nf_log_buf_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - nf_log_buf_add(m, "%02X", op[i]); - nf_log_buf_add(m, ") "); - } - - switch (ih->protocol) { - case IPPROTO_TCP: - if (nf_log_dump_tcp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4, logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (nf_log_dump_udp_header(m, skb, ih->protocol, - ntohs(ih->frag_off) & IP_OFFSET, - iphoff+ih->ihl*4)) - return; - break; - case IPPROTO_ICMP: { - struct icmphdr _icmph; - const struct icmphdr *ich; - static const size_t required_len[NR_ICMP_TYPES+1] - = { [ICMP_ECHOREPLY] = 4, - [ICMP_DEST_UNREACH] - = 8 + sizeof(struct iphdr), - [ICMP_SOURCE_QUENCH] - = 8 + sizeof(struct iphdr), - [ICMP_REDIRECT] - = 8 + sizeof(struct iphdr), - [ICMP_ECHO] = 4, - [ICMP_TIME_EXCEEDED] - = 8 + sizeof(struct iphdr), - [ICMP_PARAMETERPROB] - = 8 + sizeof(struct iphdr), - [ICMP_TIMESTAMP] = 20, - [ICMP_TIMESTAMPREPLY] = 20, - [ICMP_ADDRESS] = 12, - [ICMP_ADDRESSREPLY] = 12 }; - - /* Max length: 11 "PROTO=ICMP " */ - nf_log_buf_add(m, "PROTO=ICMP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, - sizeof(_icmph), &_icmph); - if (ich == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES && - required_len[ich->type] && - skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - switch (ich->type) { - case ICMP_ECHOREPLY: - case ICMP_ECHO: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - nf_log_buf_add(m, "ID=%u SEQ=%u ", - ntohs(ich->un.echo.id), - ntohs(ich->un.echo.sequence)); - break; - - case ICMP_PARAMETERPROB: - /* Max length: 14 "PARAMETER=255 " */ - nf_log_buf_add(m, "PARAMETER=%u ", - ntohl(ich->un.gateway) >> 24); - break; - case ICMP_REDIRECT: - /* Max length: 24 "GATEWAY=255.255.255.255 " */ - nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); - fallthrough; - case ICMP_DEST_UNREACH: - case ICMP_SOURCE_QUENCH: - case ICMP_TIME_EXCEEDED: - /* Max length: 3+maxlen */ - if (!iphoff) { /* Only recurse once. */ - nf_log_buf_add(m, "["); - dump_ipv4_packet(net, m, info, skb, - iphoff + ih->ihl*4+sizeof(_icmph)); - nf_log_buf_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH && - ich->code == ICMP_FRAG_NEEDED) { - nf_log_buf_add(m, "MTU=%u ", - ntohs(ich->un.frag.mtu)); - } - } - break; - } - /* Max Length */ - case IPPROTO_AH: { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 9 "PROTO=AH " */ - nf_log_buf_add(m, "PROTO=AH "); - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ah = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_ahdr), &_ahdr); - if (ah == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); - break; - } - case IPPROTO_ESP: { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 10 "PROTO=ESP " */ - nf_log_buf_add(m, "PROTO=ESP "); - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - eh = skb_header_pointer(skb, iphoff+ih->ihl*4, - sizeof(_esph), &_esph); - if (eh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - iphoff - ih->ihl*4); - break; - } - - /* Length: 15 "SPI=0xF1234567 " */ - nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); - break; - } - /* Max length: 10 "PROTO 255 " */ - default: - nf_log_buf_add(m, "PROTO=%u ", ih->protocol); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & NF_LOG_UID) && !iphoff) - nf_log_dump_sk_uid_gid(net, m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (!iphoff && skb->mark) - nf_log_buf_add(m, "MARK=0x%x ", skb->mark); - - /* Proto Max log string length */ - /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ - /* UDP: 10+max(25,20) = 35 */ - /* UDPLITE: 14+max(25,20) = 39 */ - /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ - /* ESP: 10+max(25)+15 = 50 */ - /* AH: 9+max(25)+15 = 49 */ - /* unknown: 10 */ - - /* (ICMP allows recursion one level deep) */ - /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ -} - -static void dump_ipv4_mac_header(struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & NF_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); - nf_log_dump_vlan(m, skb); - nf_log_buf_add(m, "MACPROTO=%04x ", - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - nf_log_buf_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int i; - - nf_log_buf_add(m, "%02x", *p++); - for (i = 1; i < dev->hard_header_len; i++, p++) - nf_log_buf_add(m, ":%02x", *p); - } - nf_log_buf_add(m, " "); -} - -static void nf_log_ip_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct nf_log_buf *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) - return; - - m = nf_log_buf_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - nf_log_dump_packet_common(m, pf, hooknum, skb, in, - out, loginfo, prefix); - - if (in != NULL) - dump_ipv4_mac_header(m, loginfo, skb); - - dump_ipv4_packet(net, m, loginfo, skb, 0); - - nf_log_buf_close(m); -} - -static struct nf_logger nf_ip_logger __read_mostly = { - .name = "nf_log_ipv4", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_ip_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_ipv4_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); -} - -static void __net_exit nf_log_ipv4_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_ip_logger); -} - -static struct pernet_operations nf_log_ipv4_net_ops = { - .init = nf_log_ipv4_net_init, - .exit = nf_log_ipv4_net_exit, -}; - -static int __init nf_log_ipv4_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_log_ipv4_net_ops); - if (ret < 0) - return ret; - - ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); - if (ret < 0) { - pr_err("failed to register logger\n"); - goto err1; - } - - return 0; - -err1: - unregister_pernet_subsys(&nf_log_ipv4_net_ops); - return ret; -} - -static void __exit nf_log_ipv4_exit(void) -{ - unregister_pernet_subsys(&nf_log_ipv4_net_ops); - nf_log_unregister(&nf_ip_logger); -} - -module_init(nf_log_ipv4_init); -module_exit(nf_log_ipv4_exit); - -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter IPv4 packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(AF_INET, 0); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 743777bce179..4075230b14c6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -16,6 +16,9 @@ #include <net/route.h> #include <net/sock.h> +#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ) +#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */ + static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo); @@ -32,6 +35,7 @@ static const struct nla_policy rtm_nh_policy_new[] = { [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_FDB] = { .type = NLA_FLAG }, + [NHA_RES_GROUP] = { .type = NLA_NESTED }, }; static const struct nla_policy rtm_nh_policy_get[] = { @@ -45,6 +49,32 @@ static const struct nla_policy rtm_nh_policy_dump[] = { [NHA_FDB] = { .type = NLA_FLAG }, }; +static const struct nla_policy rtm_nh_res_policy_new[] = { + [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 }, + [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 }, + [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_dump_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_OIF] = { .type = NLA_U32 }, + [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = { + [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy rtm_nh_policy_get_bucket[] = { + [NHA_ID] = { .type = NLA_U32 }, + [NHA_RES_BUCKET] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy rtm_nh_res_bucket_policy_get[] = { + [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 }, +}; + static bool nexthop_notifiers_is_empty(struct net *net) { return !net->nexthop.notifier_chain.head; @@ -52,10 +82,8 @@ static bool nexthop_notifiers_is_empty(struct net *net) static void __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, - const struct nexthop *nh) + const struct nh_info *nhi) { - struct nh_info *nhi = rtnl_dereference(nh->nh_info); - nh_info->dev = nhi->fib_nhc.nhc_dev; nh_info->gw_family = nhi->fib_nhc.nhc_gw_family; if (nh_info->gw_family == AF_INET) @@ -71,12 +99,14 @@ __nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info, static int nh_notifier_single_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { + struct nh_info *nhi = rtnl_dereference(nh->nh_info); + info->type = NH_NOTIFIER_INFO_TYPE_SINGLE; info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL); if (!info->nh) return -ENOMEM; - __nh_notifier_single_info_init(info->nh, nh); + __nh_notifier_single_info_init(info->nh, nhi); return 0; } @@ -86,8 +116,8 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info) kfree(info->nh); } -static int nh_notifier_mp_info_init(struct nh_notifier_info *info, - struct nh_group *nhg) +static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; @@ -103,11 +133,44 @@ static int nh_notifier_mp_info_init(struct nh_notifier_info *info, for (i = 0; i < num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + struct nh_info *nhi; + nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].id = nhge->nh->id; info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, - nhge->nh); + nhi); + } + + return 0; +} + +static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + u16 num_nh_buckets = res_table->num_nh_buckets; + unsigned long size; + u16 i; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; + size = struct_size(info->nh_res_table, nhs, num_nh_buckets); + info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | + __GFP_NOWARN); + if (!info->nh_res_table) + return -ENOMEM; + + info->nh_res_table->num_nh_buckets = num_nh_buckets; + + for (i = 0; i < num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + struct nh_info *nhi; + + nhge = rtnl_dereference(bucket->nh_entry); + nhi = rtnl_dereference(nhge->nh->nh_info); + __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], + nhi); } return 0; @@ -118,8 +181,10 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) - return nh_notifier_mp_info_init(info, nhg); + if (nhg->hash_threshold) + return nh_notifier_mpath_info_init(info, nhg); + else if (nhg->resilient) + return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } @@ -128,8 +193,10 @@ static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) + if (nhg->hash_threshold) kfree(info->nh_grp); + else if (nhg->resilient) + vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, @@ -181,6 +248,178 @@ static int call_nexthop_notifiers(struct net *net, return notifier_to_errno(err); } +static int +nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, + bool force, unsigned int *p_idle_timer_ms) +{ + struct nh_res_table *res_table; + struct nh_group *nhg; + struct nexthop *nh; + int err = 0; + + /* When 'force' is false, nexthop bucket replacement is performed + * because the bucket was deemed to be idle. In this case, capable + * listeners can choose to perform an atomic replacement: The bucket is + * only replaced if it is inactive. However, if the idle timer interval + * is smaller than the interval in which a listener is querying + * buckets' activity from the device, then atomic replacement should + * not be tried. Pass the idle timer value to listeners, so that they + * could determine which type of replacement to perform. + */ + if (force) { + *p_idle_timer_ms = 0; + return 0; + } + + rcu_read_lock(); + + nh = nexthop_find_by_id(info->net, info->id); + if (!nh) { + err = -EINVAL; + goto out; + } + + nhg = rcu_dereference(nh->nh_grp); + res_table = rcu_dereference(nhg->res_table); + *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); + +out: + rcu_read_unlock(); + + return err; +} + +static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi) +{ + unsigned int idle_timer_ms; + int err; + + err = nh_notifier_res_bucket_idle_timer_get(info, force, + &idle_timer_ms); + if (err) + return err; + + info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; + info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), + GFP_KERNEL); + if (!info->nh_res_bucket) + return -ENOMEM; + + info->nh_res_bucket->bucket_index = bucket_index; + info->nh_res_bucket->idle_timer_ms = idle_timer_ms; + info->nh_res_bucket->force = force; + __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); + __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); + return 0; +} + +static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) +{ + kfree(info->nh_res_bucket); +} + +static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + .id = nhg_id, + }; + int err; + + if (nexthop_notifiers_is_empty(net)) + return 0; + + err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, + oldi, newi); + if (err) + return err; + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_BUCKET_REPLACE, &info); + nh_notifier_res_bucket_info_fini(&info); + + return notifier_to_errno(err); +} + +/* There are three users of RES_TABLE, and NHs etc. referenced from there: + * + * 1) a collection of callbacks for NH maintenance. This operates under + * RTNL, + * 2) the delayed work that gradually balances the resilient table, + * 3) and nexthop_select_path(), operating under RCU. + * + * Both the delayed work and the RTNL block are writers, and need to + * maintain mutual exclusion. Since there are only two and well-known + * writers for each table, the RTNL code can make sure it has exclusive + * access thus: + * + * - Have the DW operate without locking; + * - synchronously cancel the DW; + * - do the writing; + * - if the write was not actually a delete, call upkeep, which schedules + * DW again if necessary. + * + * The functions that are always called from the RTNL context use + * rtnl_dereference(). The functions that can also be called from the DW do + * a raw dereference and rely on the above mutual exclusion scheme. + */ +#define nh_res_dereference(p) (rcu_dereference_raw(p)) + +static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, + u16 bucket_index, bool force, + struct nexthop *old_nh, + struct nexthop *new_nh, + struct netlink_ext_ack *extack) +{ + struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); + struct nh_info *newi = nh_res_dereference(new_nh->nh_info); + + return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, + force, oldi, newi, extack); +} + +static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, + struct netlink_ext_ack *extack) +{ + struct nh_notifier_info info = { + .net = net, + .extack = extack, + }; + struct nh_group *nhg; + int err; + + ASSERT_RTNL(); + + if (nexthop_notifiers_is_empty(net)) + return 0; + + /* At this point, the nexthop buckets are still not populated. Only + * emit a notification with the logical nexthops, so that a listener + * could potentially veto it in case of unsupported configuration. + */ + nhg = rtnl_dereference(nh->nh_grp); + err = nh_notifier_mpath_info_init(&info, nhg); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); + return err; + } + + err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, + NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, + &info); + kfree(info.nh_grp); + + return notifier_to_errno(err); +} + static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, @@ -239,6 +478,9 @@ static void nexthop_free_group(struct nexthop *nh) WARN_ON(nhg->spare == nhg); + if (nhg->resilient) + vfree(rcu_dereference_raw(nhg->res_table)); + kfree(nhg->spare); kfree(nhg); } @@ -297,6 +539,30 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh) return nhg; } +static void nh_res_table_upkeep_dw(struct work_struct *work); + +static struct nh_res_table * +nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) +{ + const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; + struct nh_res_table *res_table; + unsigned long size; + + size = struct_size(res_table, nh_buckets, num_nh_buckets); + res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!res_table) + return NULL; + + res_table->net = net; + res_table->nhg_id = nhg_id; + INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); + INIT_LIST_HEAD(&res_table->uw_nh_entries); + res_table->idle_timer = cfg->nh_grp_res_idle_timer; + res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; + res_table->num_nh_buckets = num_nh_buckets; + return res_table; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -345,6 +611,48 @@ static u32 nh_find_unused_id(struct net *net) return 0; } +static void nh_res_time_set_deadline(unsigned long next_time, + unsigned long *deadline) +{ + if (time_before(next_time, *deadline)) + *deadline = next_time; +} + +static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) +{ + if (list_empty(&res_table->uw_nh_entries)) + return 0; + return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); +} + +static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) +{ + struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); + struct nlattr *nest; + + nest = nla_nest_start(skb, NHA_RES_GROUP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, + res_table->num_nh_buckets) || + nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, + jiffies_to_clock_t(res_table->idle_timer)) || + nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, + jiffies_to_clock_t(res_table->unbalanced_timer)) || + nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, + nh_res_table_unbalanced_time(res_table), + NHA_RES_GROUP_PAD)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; @@ -353,8 +661,10 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) u16 group_type = 0; int i; - if (nhg->mpath) + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; + else if (nhg->resilient) + group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; @@ -370,6 +680,9 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) p += 1; } + if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -457,13 +770,26 @@ nla_put_failure: return -EMSGSIZE; } +static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) +{ + return nla_total_size(0) + /* NHA_RES_GROUP */ + nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ + nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ + nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ + nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ +} + static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; + size_t tot = nla_total_size(sz) + + nla_total_size(2); /* NHA_GROUP_TYPE */ - return nla_total_size(sz) + - nla_total_size(2); /* NHA_GROUP_TYPE */ + if (nhg->resilient) + tot += nh_nlmsg_size_grp_res(nhg); + + return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) @@ -538,18 +864,142 @@ errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } +static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) +{ + return (unsigned long)atomic_long_read(&bucket->used_time); +} + +static unsigned long +nh_res_bucket_idle_point(const struct nh_res_table *res_table, + const struct nh_res_bucket *bucket, + unsigned long now) +{ + unsigned long time = nh_res_bucket_used_time(bucket); + + /* Bucket was not used since it was migrated. The idle time is now. */ + if (time == bucket->migrated_time) + return now; + + return time + res_table->idle_timer; +} + +static unsigned long +nh_res_table_unb_point(const struct nh_res_table *res_table) +{ + return res_table->unbalanced_since + res_table->unbalanced_timer; +} + +static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, + struct nh_res_bucket *bucket) +{ + unsigned long now = jiffies; + + atomic_long_set(&bucket->used_time, (long)now); + bucket->migrated_time = now; +} + +static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) +{ + atomic_long_set(&bucket->used_time, (long)jiffies); +} + +static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) +{ + unsigned long used_time = nh_res_bucket_used_time(bucket); + + return jiffies_delta_to_clock_t(jiffies - used_time); +} + +static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, + struct nh_res_bucket *bucket, u16 bucket_index, + int event, u32 portid, u32 seq, + unsigned int nlflags, + struct netlink_ext_ack *extack) +{ + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nlmsghdr *nlh; + struct nlattr *nest; + struct nhmsg *nhm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); + if (!nlh) + return -EMSGSIZE; + + nhm = nlmsg_data(nlh); + nhm->nh_family = AF_UNSPEC; + nhm->nh_flags = bucket->nh_flags; + nhm->nh_protocol = nh->protocol; + nhm->nh_scope = 0; + nhm->resvd = 0; + + if (nla_put_u32(skb, NHA_ID, nh->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, NHA_RES_BUCKET); + if (!nest) + goto nla_put_failure; + + if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || + nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || + nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, + nh_res_bucket_idle_time(bucket), + NHA_RES_BUCKET_PAD)) + goto nla_put_failure_nest; + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure_nest: + nla_nest_cancel(skb, nest); +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void nexthop_bucket_notify(struct nh_res_table *res_table, + u16 bucket_index) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); + struct nexthop *nh = nhge->nh_parent; + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto errout; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, + NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); + return; +errout: + if (err < 0) + rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); +} + static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - /* nested multipath (group within a group) is not - * supported - */ - if (nhg->mpath) { + /* Nesting groups within groups is not supported. */ + if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, - "Multipath group can not be a nexthop within a group"); + "Hash-threshold group can not be a nexthop within a group"); + return false; + } + if (nhg->resilient) { + NL_SET_ERR_MSG(extack, + "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; @@ -591,7 +1041,7 @@ static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, - struct netlink_ext_ack *extack) + u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; @@ -652,8 +1102,14 @@ static int nh_check_attr_group(struct net *net, for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; - if (i == NHA_FDB) + switch (i) { + case NHA_FDB: continue; + case NHA_RES_GROUP: + if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) + continue; + break; + } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -695,7 +1151,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; @@ -704,7 +1160,7 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->mpath.upper_bound)) + if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -732,6 +1188,22 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) return rc; } +static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) +{ + struct nh_res_table *res_table = rcu_dereference(nhg->res_table); + u16 bucket_index = hash % res_table->num_nh_buckets; + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + /* nexthop_select_path() is expected to return a non-NULL value, so + * skip protocol validation and just hand out whatever there is. + */ + bucket = &res_table->nh_buckets[bucket_index]; + nh_res_bucket_set_busy(bucket); + nhge = rcu_dereference(bucket->nh_entry); + return nhge->nh; +} + struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; @@ -740,8 +1212,10 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return nh; nhg = rcu_dereference(nh->nh_grp); - if (nhg->mpath) - return nexthop_select_path_mp(nhg, hash); + if (nhg->hash_threshold) + return nexthop_select_path_hthr(nhg, hash); + else if (nhg->resilient) + return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; @@ -924,7 +1398,319 @@ static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, return 0; } -static void nh_group_rebalance(struct nh_group *nhg) +static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets == nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets > nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets < nhge->res.wants_buckets; +} + +static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) +{ + return list_empty(&res_table->uw_nh_entries); +} + +static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) +{ + struct nh_grp_entry *nhge; + + if (bucket->occupied) { + nhge = nh_res_dereference(bucket->nh_entry); + nhge->res.count_buckets--; + bucket->occupied = false; + } +} + +static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, + struct nh_grp_entry *nhge) +{ + nh_res_bucket_unset_nh(bucket); + + bucket->occupied = true; + rcu_assign_pointer(bucket->nh_entry, nhge); + nhge->res.count_buckets++; +} + +static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, + struct nh_res_bucket *bucket, + unsigned long *deadline, bool *force) +{ + unsigned long now = jiffies; + struct nh_grp_entry *nhge; + unsigned long idle_point; + + if (!bucket->occupied) { + /* The bucket is not occupied, its NHGE pointer is either + * NULL or obsolete. We _have to_ migrate: set force. + */ + *force = true; + return true; + } + + nhge = nh_res_dereference(bucket->nh_entry); + + /* If the bucket is populated by an underweight or balanced + * nexthop, do not migrate. + */ + if (!nh_res_nhge_is_ow(nhge)) + return false; + + /* At this point we know that the bucket is populated with an + * overweight nexthop. It needs to be migrated to a new nexthop if + * the idle timer of unbalanced timer expired. + */ + + idle_point = nh_res_bucket_idle_point(res_table, bucket, now); + if (time_after_eq(now, idle_point)) { + /* The bucket is idle. We _can_ migrate: unset force. */ + *force = false; + return true; + } + + /* Unbalanced timer of 0 means "never force". */ + if (res_table->unbalanced_timer) { + unsigned long unb_point; + + unb_point = nh_res_table_unb_point(res_table); + if (time_after(now, unb_point)) { + /* The bucket is not idle, but the unbalanced timer + * expired. We _can_ migrate, but set force anyway, + * so that drivers know to ignore activity reports + * from the HW. + */ + *force = true; + return true; + } + + nh_res_time_set_deadline(unb_point, deadline); + } + + nh_res_time_set_deadline(idle_point, deadline); + return false; +} + +static bool nh_res_bucket_migrate(struct nh_res_table *res_table, + u16 bucket_index, bool notify, + bool notify_nl, bool force) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *new_nhge; + struct netlink_ext_ack extack; + int err; + + new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, + struct nh_grp_entry, + res.uw_nh_entry); + if (WARN_ON_ONCE(!new_nhge)) + /* If this function is called, "bucket" is either not + * occupied, or it belongs to a next hop that is + * overweight. In either case, there ought to be a + * corresponding underweight next hop. + */ + return false; + + if (notify) { + struct nh_grp_entry *old_nhge; + + old_nhge = nh_res_dereference(bucket->nh_entry); + err = call_nexthop_res_bucket_notifiers(res_table->net, + res_table->nhg_id, + bucket_index, force, + old_nhge->nh, + new_nhge->nh, &extack); + if (err) { + pr_err_ratelimited("%s\n", extack._msg); + if (!force) + return false; + /* It is not possible to veto a forced replacement, so + * just clear the hardware flags from the nexthop + * bucket to indicate to user space that this bucket is + * not correctly populated in hardware. + */ + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + } + } + + nh_res_bucket_set_nh(bucket, new_nhge); + nh_res_bucket_set_idle(res_table, bucket); + + if (notify_nl) + nexthop_bucket_notify(res_table, bucket_index); + + if (nh_res_nhge_is_balanced(new_nhge)) + list_del(&new_nhge->res.uw_nh_entry); + return true; +} + +#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) + +static void nh_res_table_upkeep(struct nh_res_table *res_table, + bool notify, bool notify_nl) +{ + unsigned long now = jiffies; + unsigned long deadline; + u16 i; + + /* Deadline is the next time that upkeep should be run. It is the + * earliest time at which one of the buckets might be migrated. + * Start at the most pessimistic estimate: either unbalanced_timer + * from now, or if there is none, idle_timer from now. For each + * encountered time point, call nh_res_time_set_deadline() to + * refine the estimate. + */ + if (res_table->unbalanced_timer) + deadline = now + res_table->unbalanced_timer; + else + deadline = now + res_table->idle_timer; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + bool force; + + if (nh_res_bucket_should_migrate(res_table, bucket, + &deadline, &force)) { + if (!nh_res_bucket_migrate(res_table, i, notify, + notify_nl, force)) { + unsigned long idle_point; + + /* A driver can override the migration + * decision if the HW reports that the + * bucket is actually not idle. Therefore + * remark the bucket as busy again and + * update the deadline. + */ + nh_res_bucket_set_busy(bucket); + idle_point = nh_res_bucket_idle_point(res_table, + bucket, + now); + nh_res_time_set_deadline(idle_point, &deadline); + } + } + } + + /* If the group is still unbalanced, schedule the next upkeep to + * either the deadline computed above, or the minimum deadline, + * whichever comes later. + */ + if (!nh_res_table_is_balanced(res_table)) { + unsigned long now = jiffies; + unsigned long min_deadline; + + min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; + if (time_before(deadline, min_deadline)) + deadline = min_deadline; + + queue_delayed_work(system_power_efficient_wq, + &res_table->upkeep_dw, deadline - now); + } +} + +static void nh_res_table_upkeep_dw(struct work_struct *work) +{ + struct delayed_work *dw = to_delayed_work(work); + struct nh_res_table *res_table; + + res_table = container_of(dw, struct nh_res_table, upkeep_dw); + nh_res_table_upkeep(res_table, true, true); +} + +static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) +{ + cancel_delayed_work_sync(&res_table->upkeep_dw); +} + +static void nh_res_group_rebalance(struct nh_group *nhg, + struct nh_res_table *res_table) +{ + int prev_upper_bound = 0; + int total = 0; + int w = 0; + int i; + + INIT_LIST_HEAD(&res_table->uw_nh_entries); + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, + total); + nhge->res.wants_buckets = upper_bound - prev_upper_bound; + prev_upper_bound = upper_bound; + + if (nh_res_nhge_is_uw(nhge)) { + if (list_empty(&res_table->uw_nh_entries)) + res_table->unbalanced_since = jiffies; + list_add(&nhge->res.uw_nh_entry, + &res_table->uw_nh_entries); + } + } +} + +/* Migrate buckets in res_table so that they reference NHGE's from NHG with + * the right NH ID. Set those buckets that do not have a corresponding NHGE + * entry in NHG as not occupied. + */ +static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, + struct nh_group *nhg) +{ + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; + bool found = false; + int j; + + for (j = 0; j < nhg->num_nh; j++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[j]; + + if (nhge->nh->id == id) { + nh_res_bucket_set_nh(bucket, nhge); + found = true; + break; + } + } + + if (!found) + nh_res_bucket_unset_nh(bucket); + } +} + +static void replace_nexthop_grp_res(struct nh_group *oldg, + struct nh_group *newg) +{ + /* For NH group replacement, the new NHG might only have a stub + * hash table with 0 buckets, because the number of buckets was not + * specified. For NH removal, oldg and newg both reference the same + * res_table. So in any case, in the following, we want to work + * with oldg->res_table. + */ + struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); + unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; + bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); + + nh_res_table_cancel_upkeep(old_res_table); + nh_res_table_migrate_buckets(old_res_table, newg); + nh_res_group_rebalance(newg, old_res_table); + if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) + old_res_table->unbalanced_since = prev_unbalanced_since; + nh_res_table_upkeep(old_res_table, true, false); +} + +static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; @@ -939,7 +1725,7 @@ static void nh_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->mpath.upper_bound, upper_bound); + atomic_set(&nhge->hthr.upper_bound, upper_bound); } } @@ -965,7 +1751,9 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, } newg->has_v4 = false; - newg->mpath = nhg->mpath; + newg->is_multipath = nhg->is_multipath; + newg->hash_threshold = nhg->hash_threshold; + newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -993,15 +1781,25 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, j++; } - nh_group_rebalance(newg); + if (newg->hash_threshold) + nh_hthr_group_rebalance(newg); + else if (newg->resilient) + replace_nexthop_grp_res(nhg, newg); + rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); nexthop_put(nhge->nh); - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); - if (err) - pr_err("%s\n", extack._msg); + /* Removal of a NH from a resilient group is notified through + * bucket notifications. + */ + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, + &extack); + if (err) + pr_err("%s\n", extack._msg); + } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); @@ -1022,6 +1820,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { @@ -1032,6 +1831,11 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) list_del_init(&nhge->nh_list); } + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + nh_res_table_cancel_upkeep(res_table); + } } /* not called for nexthop replace */ @@ -1107,9 +1911,12 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) } static int replace_nexthop_grp(struct net *net, struct nexthop *old, - struct nexthop *new, + struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { + struct nh_res_table *tmp_table = NULL; + struct nh_res_table *new_res_table; + struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; @@ -1118,19 +1925,67 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return -EINVAL; } - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); - if (err) - return err; - oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); + if (newg->hash_threshold != oldg->hash_threshold) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); + return -EINVAL; + } + + if (newg->hash_threshold) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, + extack); + if (err) + return err; + } else if (newg->resilient) { + new_res_table = rtnl_dereference(newg->res_table); + old_res_table = rtnl_dereference(oldg->res_table); + + /* Accept if num_nh_buckets was not given, but if it was + * given, demand that the value be correct. + */ + if (cfg->nh_grp_res_has_num_buckets && + cfg->nh_grp_res_num_buckets != + old_res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); + return -EINVAL; + } + + /* Emit a pre-replace notification so that listeners could veto + * a potentially unsupported configuration. Otherwise, + * individual bucket replacement notifications would need to be + * vetoed, which is something that should only happen if the + * bucket is currently active. + */ + err = call_nexthop_res_table_notifiers(net, new, extack); + if (err) + return err; + + if (cfg->nh_grp_res_has_idle_timer) + old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; + if (cfg->nh_grp_res_has_unbalanced_timer) + old_res_table->unbalanced_timer = + cfg->nh_grp_res_unbalanced_timer; + + replace_nexthop_grp_res(oldg, newg); + + tmp_table = new_res_table; + rcu_assign_pointer(newg->res_table, old_res_table); + rcu_assign_pointer(newg->spare->res_table, old_res_table); + } + /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); + if (newg->resilient) { + rcu_assign_pointer(oldg->res_table, tmp_table); + rcu_assign_pointer(oldg->spare->res_table, tmp_table); + } + for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; @@ -1156,6 +2011,71 @@ static void nh_group_v4_update(struct nh_group *nhg) nhg->has_v4 = has_v4; } +static int replace_nexthop_single_notify_res(struct net *net, + struct nh_res_table *res_table, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + u32 nhg_id = res_table->nhg_id; + int err; + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) { + err = __call_nexthop_res_bucket_notifiers(net, nhg_id, + i, true, + oldi, newi, + extack); + if (err) + goto err_notify; + } + } + + return 0; + +err_notify: + while (i-- > 0) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + struct nh_grp_entry *nhge; + + nhge = rtnl_dereference(bucket->nh_entry); + if (nhge->nh == old) + __call_nexthop_res_bucket_notifiers(net, nhg_id, i, + true, newi, oldi, + extack); + } + return err; +} + +static int replace_nexthop_single_notify(struct net *net, + struct nexthop *group_nh, + struct nexthop *old, + struct nh_info *oldi, + struct nh_info *newi, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->hash_threshold) { + return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, + group_nh, extack); + } else if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + return replace_nexthop_single_notify_res(net, res_table, + old, oldi, newi, + extack); + } + + return -EINVAL; +} + static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) @@ -1198,8 +2118,8 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, - extack); + err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, + extack); if (err) goto err_notify; } @@ -1229,7 +2149,7 @@ err_notify: list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; - call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, extack); + replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; @@ -1276,7 +2196,8 @@ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, } static int replace_nexthop(struct net *net, struct nexthop *old, - struct nexthop *new, struct netlink_ext_ack *extack) + struct nexthop *new, const struct nh_config *cfg, + struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; @@ -1319,7 +2240,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old, } if (old->is_group) - err = replace_nexthop_grp(net, old, new, extack); + err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); @@ -1361,7 +2282,7 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { - rc = replace_nexthop(net, nh, new_nh, extack); + rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; @@ -1379,9 +2300,37 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, goto out; } + if (new_nh->is_group) { + struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + + /* Not passing the number of buckets is OK when + * replacing, but not when creating a new group. + */ + if (!cfg->nh_grp_res_has_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); + rc = -EINVAL; + goto out; + } + + nh_res_group_rebalance(nhg, res_table); + + /* Do not send bucket notifications, we do full + * notification below. + */ + nh_res_table_upkeep(res_table, false, false); + } + } + rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); + /* The initial insertion is a full notification for hash-threshold as + * well as resilient groups. + */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); @@ -1441,6 +2390,7 @@ static struct nexthop *nexthop_create_group(struct net *net, u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; + int err; int i; if (WARN_ON(!num_nh)) @@ -1472,8 +2422,10 @@ static struct nexthop *nexthop_create_group(struct net *net, struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); - if (!nexthop_get(nhe)) + if (!nexthop_get(nhe)) { + err = -ENOENT; goto out_no_nh; + } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) @@ -1485,13 +2437,28 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) - nhg->mpath = 1; + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { + nhg->hash_threshold = 1; + nhg->is_multipath = true; + } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + struct nh_res_table *res_table; + + res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); + if (!res_table) { + err = -ENOMEM; + goto out_no_nh; + } + + rcu_assign_pointer(nhg->spare->res_table, res_table); + rcu_assign_pointer(nhg->res_table, res_table); + nhg->resilient = true; + nhg->is_multipath = true; + } - WARN_ON_ONCE(nhg->mpath != 1); + WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); - if (nhg->mpath) - nh_group_rebalance(nhg); + if (nhg->hash_threshold) + nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1510,7 +2477,7 @@ out_no_nh: kfree(nhg); kfree(nh); - return ERR_PTR(-ENOENT); + return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, @@ -1680,6 +2647,70 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, return nh; } +static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, + unsigned long *timer_p, bool *has_p, + struct netlink_ext_ack *extack) +{ + unsigned long timer; + u32 value; + + if (!attr) { + *timer_p = fallback; + *has_p = false; + return 0; + } + + value = nla_get_u32(attr); + timer = clock_t_to_jiffies(value); + if (timer == ~0UL) { + NL_SET_ERR_MSG(extack, "Timer value too large"); + return -EINVAL; + } + + *timer_p = timer; + *has_p = true; + return 0; +} + +static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; + int err; + + if (res) { + err = nla_parse_nested(tb, + ARRAY_SIZE(rtm_nh_res_policy_new) - 1, + res, rtm_nh_res_policy_new, extack); + if (err < 0) + return err; + } + + if (tb[NHA_RES_GROUP_BUCKETS]) { + cfg->nh_grp_res_num_buckets = + nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); + cfg->nh_grp_res_has_num_buckets = true; + if (!cfg->nh_grp_res_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); + return -EINVAL; + } + } + + err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], + NH_RES_DEFAULT_IDLE_TIMER, + &cfg->nh_grp_res_idle_timer, + &cfg->nh_grp_res_has_idle_timer, + extack); + if (err) + return err; + + return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], + NH_RES_DEFAULT_UNBALANCED_TIMER, + &cfg->nh_grp_res_unbalanced_timer, + &cfg->nh_grp_res_has_unbalanced_timer, + extack); +} + static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) @@ -1758,7 +2789,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), extack); + err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), + cfg->nh_grp_type, extack); + if (err) + goto out; + + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) + err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], + cfg, extack); /* no other attributes should be set */ goto out; @@ -1983,10 +3021,12 @@ errout_free: } struct nh_dump_filter { + u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; + u32 res_bucket_nh_id; }; static bool nh_dump_filtered(struct nexthop *nh, @@ -2100,26 +3140,24 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, void *data) { struct rb_node *node; - int idx = 0, s_idx; + int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; - if (idx < s_idx) - goto cont; - nh = rb_entry(node, struct nexthop, rb_node); - ctx->idx = idx; + if (nh->id < s_idx) + continue; + + ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; -cont: - idx++; } - ctx->idx = idx; + ctx->idx++; return 0; } @@ -2166,6 +3204,318 @@ out_err: return err; } +static struct nexthop * +nexthop_find_group_resilient(struct net *net, u32 id, + struct netlink_ext_ack *extack) +{ + struct nh_group *nhg; + struct nexthop *nh; + + nh = nexthop_find_by_id(net, id); + if (!nh) + return ERR_PTR(-ENOENT); + + if (!nh->is_group) { + NL_SET_ERR_MSG(extack, "Not a nexthop group"); + return ERR_PTR(-EINVAL); + } + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) { + NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); + return ERR_PTR(-EINVAL); + } + + return nh; +} + +static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, + struct netlink_ext_ack *extack) +{ + u32 idx; + + if (attr) { + idx = nla_get_u32(attr); + if (!idx) { + NL_SET_ERR_MSG(extack, "Invalid nexthop id"); + return -EINVAL; + } + *nh_id_p = idx; + } else { + *nh_id_p = 0; + } + + return 0; +} + +static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, + struct nh_dump_filter *filter, + struct netlink_callback *cb) +{ + struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, + rtm_nh_policy_dump_bucket, NULL); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); + if (err) + return err; + + if (tb[NHA_RES_BUCKET]) { + size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; + + err = nla_parse_nested(res_tb, max, + tb[NHA_RES_BUCKET], + rtm_nh_res_bucket_policy_dump, + cb->extack); + if (err < 0) + return err; + + err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], + &filter->res_bucket_nh_id, + cb->extack); + if (err) + return err; + } + + return __nh_valid_dump_req(nlh, tb, filter, cb->extack); +} + +struct rtm_dump_res_bucket_ctx { + struct rtm_dump_nh_ctx nh; + u16 bucket_index; + u32 done_nh_idx; /* 1 + the index of the last fully processed NH. */ +}; + +static struct rtm_dump_res_bucket_ctx * +rtm_dump_res_bucket_ctx(struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + return ctx; +} + +struct rtm_dump_nexthop_bucket_data { + struct rtm_dump_res_bucket_ctx *ctx; + struct nh_dump_filter filter; +}; + +static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, + struct rtm_dump_nexthop_bucket_data *dd) +{ + u32 portid = NETLINK_CB(cb->skb).portid; + struct nhmsg *nhm = nlmsg_data(cb->nlh); + struct nh_res_table *res_table; + struct nh_group *nhg; + u16 bucket_index; + int err; + + if (dd->ctx->nh.idx < dd->ctx->done_nh_idx) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + for (bucket_index = dd->ctx->bucket_index; + bucket_index < res_table->num_nh_buckets; + bucket_index++) { + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + bucket = &res_table->nh_buckets[bucket_index]; + nhge = rtnl_dereference(bucket->nh_entry); + if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) + continue; + + if (dd->filter.res_bucket_nh_id && + dd->filter.res_bucket_nh_id != nhge->nh->id) + continue; + + err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, + RTM_NEWNEXTHOPBUCKET, portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + } + + dd->ctx->done_nh_idx = dd->ctx->nh.idx + 1; + bucket_index = 0; + +out: + err = skb->len; +out_err: + dd->ctx->bucket_index = bucket_index; + return err; +} + +static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, + struct netlink_callback *cb, + struct nexthop *nh, void *data) +{ + struct rtm_dump_nexthop_bucket_data *dd = data; + struct nh_group *nhg; + + if (!nh->is_group) + return 0; + + nhg = rtnl_dereference(nh->nh_grp); + if (!nhg->resilient) + return 0; + + return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); +} + +/* rtnl */ +static int rtm_dump_nexthop_bucket(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); + struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; + struct net *net = sock_net(skb->sk); + struct nexthop *nh; + int err; + + err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); + if (err) + return err; + + if (dd.filter.nh_id) { + nh = nexthop_find_group_resilient(net, dd.filter.nh_id, + cb->extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); + } else { + struct rb_root *root = &net->nexthop.rb_root; + + err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, + &rtm_dump_nexthop_bucket_cb, &dd); + } + + if (err < 0) { + if (likely(skb->len)) + goto out; + goto out_err; + } + +out: + err = skb->len; +out_err: + cb->seq = net->nexthop.seq; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + return err; +} + +static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, + u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, + res, rtm_nh_res_bucket_policy_get, extack); + if (err < 0) + return err; + + if (!tb[NHA_RES_BUCKET_INDEX]) { + NL_SET_ERR_MSG(extack, "Bucket index is missing"); + return -EINVAL; + } + + *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); + return 0; +} + +static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, + u32 *id, u16 *bucket_index, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; + int err; + + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, + rtm_nh_policy_get_bucket, extack); + if (err < 0) + return err; + + err = __nh_valid_get_del_req(nlh, tb, id, extack); + if (err) + return err; + + if (!tb[NHA_RES_BUCKET]) { + NL_SET_ERR_MSG(extack, "Bucket information is missing"); + return -EINVAL; + } + + err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], + bucket_index, extack); + if (err) + return err; + + return 0; +} + +/* rtnl */ +static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(in_skb->sk); + struct nh_res_table *res_table; + struct sk_buff *skb = NULL; + struct nh_group *nhg; + struct nexthop *nh; + u16 bucket_index; + int err; + u32 id; + + err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); + if (err) + return err; + + nh = nexthop_find_group_resilient(net, id, extack); + if (IS_ERR(nh)) + return PTR_ERR(nh); + + nhg = rtnl_dereference(nh->nh_grp); + res_table = rtnl_dereference(nhg->res_table); + if (bucket_index >= res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); + return -ENOENT; + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], + bucket_index, RTM_NEWNEXTHOPBUCKET, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0, extack); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + goto errout_free; + } + + return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + +errout_free: + kfree_skb(skb); + return err; +} + static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); @@ -2277,6 +3627,75 @@ out: } EXPORT_SYMBOL(nexthop_set_hw_flags); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap) +{ + struct nh_res_table *res_table; + struct nh_res_bucket *bucket; + struct nexthop *nexthop; + struct nh_group *nhg; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + if (bucket_index >= nhg->res_table->num_nh_buckets) + goto out; + + res_table = rcu_dereference(nhg->res_table); + bucket = &res_table->nh_buckets[bucket_index]; + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + bucket->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + bucket->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); + +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity) +{ + struct nh_res_table *res_table; + struct nexthop *nexthop; + struct nh_group *nhg; + u16 i; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + /* Instead of silently ignoring some buckets, demand that the sizes + * be the same. + */ + res_table = rcu_dereference(nhg->res_table); + if (num_buckets != res_table->num_nh_buckets) + goto out; + + for (i = 0; i < num_buckets; i++) { + if (test_bit(i, activity)) + nh_res_bucket_set_busy(&res_table->nh_buckets[i]); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_res_grp_activity_update); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -2320,6 +3739,9 @@ static int __init nexthop_init(void) rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); + rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, + rtm_dump_nexthop_bucket, 0); + return 0; } subsys_initcall(nexthop_init); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 8b943f85fff9..1c9f71a37258 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -453,7 +453,9 @@ EXPORT_SYMBOL_GPL(ping_bind); static inline int ping_supported(int family, int type, int code) { return (family == AF_INET && type == ICMP_ECHO && code == 0) || - (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0); + (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) || + (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) || + (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0); } /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bba150fdd265..f6787c55f6ab 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -21,7 +21,7 @@ * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table - * Alan Cox : MSS actually. Also added the window + * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. @@ -41,7 +41,7 @@ * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) - * Pavel Krauz : Limited broadcast fixed + * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. @@ -54,8 +54,8 @@ * Robert Olsson : Added rt_cache statistics * Arnaldo C. Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. - * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect - * Ilia Sotnikov : Removed TOS from hash calculations + * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect + * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt @@ -66,6 +66,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -234,19 +235,6 @@ static const struct seq_operations rt_cache_seq_ops = { .show = rt_cache_seq_show, }; -static int rt_cache_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rt_cache_seq_ops); -} - -static const struct proc_ops rt_cache_proc_ops = { - .proc_open = rt_cache_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - - static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; @@ -324,19 +312,6 @@ static const struct seq_operations rt_cpu_seq_ops = { .show = rt_cpu_seq_show, }; - -static int rt_cpu_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &rt_cpu_seq_ops); -} - -static const struct proc_ops rt_cpu_proc_ops = { - .proc_open = rt_cpu_seq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -}; - #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { @@ -367,13 +342,13 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_create("rt_cache", 0444, net->proc_net, - &rt_cache_proc_ops); + pde = proc_create_seq("rt_cache", 0444, net->proc_net, + &rt_cache_seq_ops); if (!pde) goto err1; - pde = proc_create("rt_cache", 0444, - net->proc_net_stat, &rt_cpu_proc_ops); + pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, + &rt_cpu_seq_ops); if (!pde) goto err2; @@ -478,8 +453,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } -#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; @@ -489,12 +466,16 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = READ_ONCE(*p_tstamp); - u32 now = (u32)jiffies; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; u32 delta = 0; + bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = READ_ONCE(*p_tstamp); + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old); @@ -722,6 +703,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, for_each_possible_cpu(i) { struct rtable __rcu **prt; + prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) @@ -1258,12 +1240,12 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) } /* - We do not cache source address of outgoing interface, - because it is used only by IP RR, TS and SRR options, - so that it out of fast path. - - BTW remember: "addr" is allowed to be not aligned - in IP options! + * We do not cache source address of outgoing interface, + * because it is used only by IP RR, TS and SRR options, + * so that it out of fast path. + * + * BTW remember: "addr" is allowed to be not aligned + * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) @@ -2108,7 +2090,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto out; /* Check for the most weird martians, which can be not detected - by fib_lookup. + * by fib_lookup. */ tun_info = skb_tunnel_info(skb); @@ -2246,7 +2228,7 @@ local_input: if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; - rth->rt_flags &= ~RTCF_LOCAL; + rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { @@ -2317,15 +2299,15 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. - The problem was that too many Ethernet cards have broken/missing - hardware multicast filters :-( As result the host on multicasting - network acquires a lot of useless route cache entries, sort of - SDR messages from all the world. Now we try to get rid of them. - Really, provided software IP multicast filter is organized - reasonably (at least, hashed), it does not result in a slowdown - comparing with route cache reject entries. - Note, that multicast routers are not affected, because - route cache entry is created eventually. + * The problem was that too many Ethernet cards have broken/missing + * hardware multicast filters :-( As result the host on multicasting + * network acquires a lot of useless route cache entries, sort of + * SDR messages from all the world. Now we try to get rid of them. + * Really, provided software IP multicast filter is organized + * reasonably (at least, hashed), it does not result in a slowdown + * comparing with route cache reject entries. + * Note, that multicast routers are not affected, because + * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -2537,11 +2519,11 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. - It was wrong for two reasons: - 1. ip_dev_find(net, saddr) can return wrong iface, if saddr - is assigned to multiple interfaces. - 2. Moreover, we are allowed to send packets with saddr - of another iface. --ANK + * It was wrong for two reasons: + * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr + * is assigned to multiple interfaces. + * 2. Moreover, we are allowed to send packets with saddr + * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && @@ -2553,18 +2535,18 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, goto out; /* Special hack: user can direct multicasts - and limited broadcast via necessary interface - without fiddling with IP_MULTICAST_IF or IP_PKTINFO. - This hack is not just for fun, it allows - vic,vat and friends to work. - They bind socket to loopback, set ttl to zero - and expect that it will work. - From the viewpoint of routing cache they are broken, - because we are not allowed to build multicast path - with loopback source addr (look, routing cache - cannot know, that ttl is zero, so that packet - will not leave this host and route is valid). - Luckily, this hack is good workaround. + * and limited broadcast via necessary interface + * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + * This hack is not just for fun, it allows + * vic,vat and friends to work. + * They bind socket to loopback, set ttl to zero + * and expect that it will work. + * From the viewpoint of routing cache they are broken, + * because we are not allowed to build multicast path + * with loopback source addr (look, routing cache + * cannot know, that ttl is zero, so that packet + * will not leave this host and route is valid). + * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; @@ -2627,21 +2609,21 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, (ipv4_is_multicast(fl4->daddr) || !netif_index_is_l3_master(net, fl4->flowi4_oif))) { /* Apparently, routing tables are wrong. Assume, - that the destination is on link. - - WHY? DW. - Because we are allowed to send to iface - even if it has NO routes and NO assigned - addresses. When oif is specified, routing - tables are looked up with only one purpose: - to catch if destination is gatewayed, rather than - direct. Moreover, if MSG_DONTROUTE is set, - we send packet, ignoring both routing tables - and ifaddr state. --ANK - - - We could make it even if oif is unknown, - likely IPv6, but we do not. + * that the destination is on link. + * + * WHY? DW. + * Because we are allowed to send to iface + * even if it has NO routes and NO assigned + * addresses. When oif is specified, routing + * tables are looked up with only one purpose: + * to catch if destination is gatewayed, rather than + * direct. Moreover, if MSG_DONTROUTE is set, + * we send packet, ignoring both routing tables + * and ifaddr state. --ANK + * + * + * We could make it even if oif is unknown, + * likely IPv6, but we do not. */ if (fl4->saddr == 0) @@ -3553,18 +3535,25 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { + void *idents_hash; int cpu; - ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), - GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + HASH_ZERO, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash; - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 60465f077497..a62934b9f15a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -31,7 +31,6 @@ static int two = 2; static int four = 4; static int thousand = 1000; -static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -47,7 +46,6 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; -static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; @@ -209,7 +207,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); @@ -389,7 +387,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_tcp_early_demux; @@ -405,7 +403,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_udp_early_demux; @@ -457,7 +455,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ipv4.sysctl_fib_multipath_hash_policy); int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); @@ -595,30 +593,39 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "icmp_echo_enable_probe", + .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ratelimit", @@ -645,9 +652,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -655,60 +662,60 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_udp_early_demux }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_tcp_early_demux }, { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, @@ -729,21 +736,21 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, @@ -752,40 +759,40 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -793,9 +800,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", @@ -840,9 +847,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", @@ -897,9 +904,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", @@ -911,26 +918,26 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #endif { @@ -943,24 +950,24 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", @@ -979,9 +986,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, @@ -1030,16 +1037,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, @@ -1057,9 +1064,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1067,88 +1074,88 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &four, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", @@ -1160,16 +1167,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_adv_win_scale", @@ -1183,46 +1190,46 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", @@ -1241,11 +1248,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, - .extra2 = &gso_max_segs, }, { .procname = "tcp_min_rtt_wlen", @@ -1259,9 +1265,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1323,18 +1329,17 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &comp_sack_nr_max, }, { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1357,9 +1362,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de7cc8445ac0..e14fd0c50c10 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -267,6 +267,7 @@ #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> +#include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> @@ -2587,6 +2588,17 @@ void tcp_set_state(struct sock *sk, int state) BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + /* bpf uapi header bpf.h defines an anonymous enum with values + * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux + * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. + * But clang built vmlinux does not have this enum in DWARF + * since clang removes the above code before generating IR/debuginfo. + * Let us explicitly emit the type debuginfo to ensure the + * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF + * regardless of which compiler is used. + */ + BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index bc7d2a586e18..ad9d17923fc5 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,86 +10,6 @@ #include <net/inet_common.h> #include <net/tls.h> -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. - */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) - break; - msg_rx = list_next_entry(msg_rx, list); - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); - } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -229,7 +149,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -243,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -284,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; @@ -601,20 +499,43 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + /* TLS does not have an unhash proto in SW cases, + * but we need to ensure we stop using the sock_map + * unhash routine because the associated psock is being + * removed. So use the original unhash handler. + */ + WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to @@ -629,4 +550,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ffcbe46dacdb..4a30deaa9a37 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -124,7 +124,7 @@ static inline void bictcp_hystart_reset(struct sock *sk) ca->sample_cnt = 0; } -static void bictcp_init(struct sock *sk) +static void cubictcp_init(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -137,7 +137,7 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) +static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -319,7 +319,7 @@ tcp_friendliness: ca->cnt = max(ca->cnt, 2U); } -static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) +static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -338,7 +338,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -static u32 bictcp_recalc_ssthresh(struct sock *sk) +static u32 cubictcp_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -355,7 +355,7 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static void bictcp_state(struct sock *sk, u8 new_state) +static void cubictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -442,7 +442,7 @@ static void hystart_update(struct sock *sk, u32 delay) } } -static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) +static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -471,13 +471,13 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) } static struct tcp_congestion_ops cubictcp __read_mostly = { - .init = bictcp_init, - .ssthresh = bictcp_recalc_ssthresh, - .cong_avoid = bictcp_cong_avoid, - .set_state = bictcp_state, + .init = cubictcp_init, + .ssthresh = cubictcp_recalc_ssthresh, + .cong_avoid = cubictcp_cong_avoid, + .set_state = cubictcp_state, .undo_cwnd = tcp_reno_undo_cwnd, - .cwnd_event = bictcp_cwnd_event, - .pkts_acked = bictcp_acked, + .cwnd_event = cubictcp_cwnd_event, + .pkts_acked = cubictcp_acked, .owner = THIS_MODULE, .name = "cubic", }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69a545db80d2..4cf4dd532d1c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2914,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - WARN_ON(tp->retrans_out != 0); + WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -5994,11 +5994,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; - skb_rbtree_walk_from(data) { - if (__tcp_retransmit_skb(sk, data, 1)) - break; - } - tcp_rearm_rto(sk); + skb_rbtree_walk_from(data) + tcp_mark_skb_lost(sk, data); + tcp_xmit_retransmit_queue(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..312184cead57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ +#ifdef CONFIG_TCP_MD5SIG +#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED +#else +#define OPTION_BYTES sizeof(__be32) +#endif + static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; -#ifdef CONFIG_TCP_MD5SIG - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; -#endif + __be32 opt[OPTION_BYTES / sizeof(__be32)]; } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG @@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->daddr, &rep.th); } #endif + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ + if (rep.opt[0] == 0) { + __be32 mrst = mptcp_reset_option(skb); + + if (mrst) { + rep.opt[0] = mrst; + arg.iov[0].iov_len += sizeof(mrst); + rep.th.doff = arg.iov[0].iov_len / 4; + } + } + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); @@ -2806,6 +2821,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index e6459537d4d2..82b36ec3f2f8 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -63,7 +63,7 @@ enum tcp_lp_state { * @sowd: smoothed OWD << 3 * @owd_min: min OWD * @owd_max: max OWD - * @owd_max_rsv: resrved max owd + * @owd_max_rsv: reserved max owd * @remote_hz: estimated remote HZ * @remote_ref_time: remote reference time * @local_ref_time: local reference time @@ -305,7 +305,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) /* FIXME: try to reset owd_min and owd_max here * so decrease the chance the min/max is no longer suitable - * and will usually within threshold when whithin inference */ + * and will usually within threshold when within inference */ lp->owd_min = lp->sowd >> 3; lp->owd_max = lp->sowd >> 2; lp->owd_max_rsv = lp->sowd >> 2; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbf140a770d8..bde781f46b41 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ -static bool skb_still_in_host_queue(const struct sock *sk, +static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - return true; + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + if (skb_fclone_busy(sk, skb)) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } } return false; } @@ -3147,14 +3151,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: fragmentation, tunneling, mangling etc. - */ - if (refcount_read(&sk->sk_wmem_alloc) > - min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), - sk->sk_sndbuf)) - return -EAGAIN; - if (skb_still_in_host_queue(sk, skb)) return -EBUSY; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 99d743eb9dc4..15f5504adf5b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,35 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct sk_buff *skb; + int err, used; + + skb = skb_recv_udp(sk, 0, 1, &err); + if (!skb) + return err; + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= skb->len) { + copied += used; + } + + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -2178,6 +2207,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); + + udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); @@ -2664,9 +2695,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_GRO: lock_sock(sk); + + /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk->sk_socket); up->gro_enabled = valbool; + up->accept_udp_l4 = valbool; release_sock(sk); break; @@ -2853,6 +2887,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -4,6 +4,68 @@ #include <linux/skmsg.h> #include <net/sock.h> #include <net/udp.h> +#include <net/inet_common.h> + +#include "udp_impl.h" + +static struct proto *udpv6_prot_saved __read_mostly; + +static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + return udpv6_prot_saved->recvmsg(sk, msg, len, noblock, flags, + addr_len); +#endif + return udp_prot.recvmsg(sk, msg, len, noblock, flags, addr_len); +} + +static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct sk_psock *psock; + int copied, ret; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + if (sk_psock_queue_empty(psock)) { + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + int data, err = 0; + long timeo; + + timeo = sock_rcvtimeo(sk, nonblock); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + ret = sk_udp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + goto out; + } + if (err) { + ret = err; + goto out; + } + copied = -EAGAIN; + } + ret = copied; +out: + release_sock(sk); + sk_psock_put(sk, psock); + return ret; +} enum { UDP_BPF_IPV4, @@ -11,7 +73,6 @@ enum { UDP_BPF_NUM_PROTS, }; -static struct proto *udpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(udpv6_prot_lock); static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; @@ -20,6 +81,7 @@ static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) *prot = *base; prot->unhash = sock_map_unhash; prot->close = sock_map_close; + prot->recvmsg = udp_bpf_recvmsg; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) @@ -41,12 +103,20 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index c5b4b586570f..54e06b88af69 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -515,21 +515,24 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; + /* we can do L4 aggregation only if the packet can't land in a tunnel + * otherwise we could corrupt the inner stream + */ NAPI_GRO_CB(skb)->is_flist = 0; - if (skb->dev->features & NETIF_F_GRO_FRAGLIST) - NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; + if (!sk || !udp_sk(sk)->gro_receive) { + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1; - if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || - (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { - pp = call_gro_receive(udp_gro_receive_segment, head, skb); + if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || + (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) + pp = call_gro_receive(udp_gro_receive_segment, head, skb); return pp; } - if (!sk || NAPI_GRO_CB(skb)->encap_mark || + if (NAPI_GRO_CB(skb)->encap_mark || (uh->check && skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && - !NAPI_GRO_CB(skb)->csum_valid) || - !udp_sk(sk)->gro_receive) + !NAPI_GRO_CB(skb)->csum_valid)) goto out; /* mark that this skb passed once through the tunnel gro layer */ @@ -639,6 +642,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + /* clear the encap mark, so that inner frag_list gro_complete + * can take place + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + /* Set encapsulation before calling into inner gro_complete() * functions to make them set up the inner offsets. */ @@ -662,7 +670,8 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (NAPI_GRO_CB(skb)->is_flist) { + /* do fraglist only if there is no outer UDP encap (or we already processed it) */ + if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) { uh->len = htons(skb->len - nhoff); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a9e53f5942fa..b0ef65eb9bd2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2358,7 +2358,7 @@ regen: /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1: * check if generated address is not inappropriate: * - * - Reserved IPv6 Interface Identifers + * - Reserved IPv6 Interface Identifiers * - XXX: already assigned to an address on the device */ @@ -4485,7 +4485,9 @@ restart: age >= ifp->valid_lft) { spin_unlock(&ifp->lock); in6_ifa_hold(ifp); + rcu_read_unlock_bh(); ipv6_del_addr(ifp); + rcu_read_lock_bh(); goto restart; } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { spin_unlock(&ifp->lock); @@ -5107,17 +5109,20 @@ next: break; } case MULTICAST_ADDR: + read_unlock_bh(&idev->lock); fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = idev->mc_list; ifmca; - ifmca = ifmca->next, ip_idx++) { + for (ifmca = rcu_dereference(idev->mc_list); + ifmca; + ifmca = rcu_dereference(ifmca->next), ip_idx++) { if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } + read_lock_bh(&idev->lock); break; case ANYCAST_ADDR: fillargs->event = RTM_GETANYCAST; @@ -6113,10 +6118,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { - rcu_read_lock_bh(); if (likely(ifp->idev->dead == 0)) __ipv6_ifa_notify(event, ifp); - rcu_read_unlock_bh(); } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index c70c192bc91b..1d4054bb345b 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -198,6 +198,12 @@ static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct s return -EAFNOSUPPORT; } +static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr, + struct net_device *dev) +{ + return ERR_PTR(-EAFNOSUPPORT); +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -209,6 +215,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_nh_init = eafnosupport_fib6_nh_init, .ip6_del_rt = eafnosupport_ip6_del_rt, .ipv6_fragment = eafnosupport_ipv6_fragment, + .ipv6_dev_find = eafnosupport_ipv6_dev_find, }; EXPORT_SYMBOL_GPL(ipv6_stub); @@ -250,7 +257,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); - WARN_ON(idev->mc_list); + WARN_ON(rcu_access_pointer(idev->mc_list)); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 802f5111805a..2389ff702f51 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,7 +222,7 @@ lookup_protocol: inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_index = 0; - inet->mc_list = NULL; + RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; if (net->ipv4.sysctl_ip_no_pmtu_disc) @@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ + .read_sock = udp_read_sock, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, @@ -1032,6 +1033,7 @@ static const struct ipv6_stub ipv6_stub_impl = { #endif .nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, + .ipv6_dev_find = ipv6_dev_find, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 080ee7f44c64..20d492da725a 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -705,7 +705,7 @@ static int ah6_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - pr_info("AH: %s digestsize %u != %hu\n", + pr_info("AH: %s digestsize %u != %u\n", x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits/8); goto error; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 727d791ed5e6..393ae2b78e7d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -1147,7 +1147,7 @@ static int esp_init_authenc(struct xfrm_state *x) err = -EINVAL; if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - pr_info("ESP: %s digestsize %u != %hu\n", + pr_info("ESP: %s digestsize %u != %u\n", x->aalg->alg_name, crypto_aead_authsize(aead), aalg_desc->uinfo.auth.icv_fullbits / 8); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 4af56affaafd..40ed4fcf1cf4 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -318,7 +318,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; - if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + if (!hw_offload || !skb_is_gso(skb)) { esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6126f8bf94b3..56e479d158b7 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -381,7 +381,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb) looped_back: if (hdr->segments_left == 0) { - if (hdr->nexthdr == NEXTHDR_IPV6) { + if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), @@ -397,7 +397,8 @@ looped_back: skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; - + if (hdr->nexthdr == NEXTHDR_IPV4) + skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index fd1f896115c1..e8398ffb5e35 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -916,6 +916,10 @@ static int icmpv6_rcv(struct sk_buff *skb) success = ping_rcv(skb); break; + case ICMPV6_EXT_ECHO_REPLY: + success = ping_rcv(skb); + break; + case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update standard destination cache. Seems, only "advanced" @@ -944,11 +948,11 @@ static int icmpv6_rcv(struct sk_buff *skb) case ICMPV6_MGM_QUERY: igmp6_event_query(skb); - break; + return 0; case ICMPV6_MGM_REPORT: igmp6_event_report(skb); - break; + return 0; case ICMPV6_MGM_REDUCTION: case ICMPV6_NI_QUERY: @@ -1169,23 +1173,23 @@ static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "echo_ignore_all", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_multicast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_anycast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ratemask", diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 1baf43aacb2e..bc224f917bbd 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -387,7 +387,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; - dev_hold(dev); ip6gre_tunnel_link(ign, nt); return nt; @@ -1496,6 +1495,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) } ip6gre_tnl_init_features(dev); + dev_hold(dev); return 0; cleanup_dst_cache_init: @@ -1538,8 +1538,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; - - dev_hold(dev); } static struct inet6_protocol ip6gre_protocol __read_mostly = { @@ -1889,6 +1887,7 @@ static int ip6erspan_tap_init(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ip6erspan_tnl_link_config(tunnel, 1); + dev_hold(dev); return 0; cleanup_dst_cache_init: @@ -1988,8 +1987,6 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, if (tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); - dev_hold(dev); - out: return err; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 42fe7db6bbb3..288bafded998 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -266,7 +266,6 @@ static int ip6_tnl_create2(struct net_device *dev) strcpy(t->parms.name, dev->name); - dev_hold(dev); ip6_tnl_link(ip6n, t); return 0; @@ -388,7 +387,7 @@ ip6_tnl_dev_uninit(struct net_device *dev) } /** - * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * @@ -1882,6 +1881,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev_hold(dev); return 0; destroy_dst: @@ -1925,7 +1925,6 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index e0cc32e45880..2d048e21abbb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -193,7 +193,6 @@ static int vti6_tnl_create2(struct net_device *dev) strcpy(t->parms.name, dev->name); - dev_hold(dev); vti6_tnl_link(ip6n, t); return 0; @@ -934,6 +933,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; + dev_hold(dev); return 0; } @@ -965,7 +965,6 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 6c8604390266..0d59efb6b49e 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -29,7 +29,6 @@ #include <linux/socket.h> #include <linux/sockios.h> #include <linux/jiffies.h> -#include <linux/times.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> @@ -42,6 +41,7 @@ #include <linux/slab.h> #include <linux/pkt_sched.h> #include <net/mld.h> +#include <linux/workqueue.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> @@ -67,18 +67,14 @@ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; +static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); -static void igmp6_timer_handler(struct timer_list *t); +static void mld_mca_work(struct work_struct *work); -static void mld_gq_timer_expire(struct timer_list *t); -static void mld_ifc_timer_expire(struct timer_list *t); static void mld_ifc_event(struct inet6_dev *idev); -static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); -static void mld_clear_delrec(struct inet6_dev *idev); static bool mld_in_v1_mode(const struct inet6_dev *idev); static int sf_setstate(struct ifmcaddr6 *pmc); static void sf_markstate(struct ifmcaddr6 *pmc); @@ -112,12 +108,52 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group */ +#define mc_dereference(e, idev) \ + rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) + +#define sock_dereference(e, sk) \ + rcu_dereference_protected(e, lockdep_sock_is_held(sk)) + +#define for_each_pmc_socklock(np, sk, pmc) \ + for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \ + pmc; \ + pmc = sock_dereference(pmc->next, sk)) #define for_each_pmc_rcu(np, pmc) \ - for (pmc = rcu_dereference(np->ipv6_mc_list); \ - pmc != NULL; \ + for (pmc = rcu_dereference((np)->ipv6_mc_list); \ + pmc; \ pmc = rcu_dereference(pmc->next)) +#define for_each_psf_mclock(mc, psf) \ + for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ + psf; \ + psf = mc_dereference(psf->sf_next, mc->idev)) + +#define for_each_psf_rcu(mc, psf) \ + for (psf = rcu_dereference((mc)->mca_sources); \ + psf; \ + psf = rcu_dereference(psf->sf_next)) + +#define for_each_psf_tomb(mc, psf) \ + for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ + psf; \ + psf = mc_dereference(psf->sf_next, mc->idev)) + +#define for_each_mc_mclock(idev, mc) \ + for (mc = mc_dereference((idev)->mc_list, idev); \ + mc; \ + mc = mc_dereference(mc->next, idev)) + +#define for_each_mc_rcu(idev, mc) \ + for (mc = rcu_dereference((idev)->mc_list); \ + mc; \ + mc = rcu_dereference(mc->next)) + +#define for_each_mc_tomb(idev, mc) \ + for (mc = mc_dereference((idev)->mc_tomb, idev); \ + mc; \ + mc = mc_dereference(mc->next, idev)) + static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; @@ -144,15 +180,11 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - rcu_read_lock(); - for_each_pmc_rcu(np, mc_lst) { + for_each_pmc_socklock(np, sk, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && - ipv6_addr_equal(&mc_lst->addr, addr)) { - rcu_read_unlock(); + ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; - } } - rcu_read_unlock(); mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); @@ -179,8 +211,7 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; - rwlock_init(&mc_lst->sflock); - mc_lst->sflist = NULL; + RCU_INIT_POINTER(mc_lst->sflist, NULL); /* * now add/increase the group membership on the device @@ -227,7 +258,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EINVAL; for (lnk = &np->ipv6_mc_list; - (mc_lst = rtnl_dereference(*lnk)) != NULL; + (mc_lst = sock_dereference(*lnk, sk)) != NULL; lnk = &mc_lst->next) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && ipv6_addr_equal(&mc_lst->addr, addr)) { @@ -239,11 +270,12 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); - (void) ip6_mc_leave_src(sk, mc_lst, idev); + ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else - (void) ip6_mc_leave_src(sk, mc_lst, NULL); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); @@ -255,10 +287,9 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -/* called with rcu_read_lock() */ -static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, + const struct in6_addr *group, + int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; @@ -270,19 +301,17 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, dev = rt->dst.dev; ip6_rt_put(rt); } - } else - dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = __dev_get_by_index(net, ifindex); + } if (!dev) return NULL; idev = __in6_dev_get(dev); if (!idev) return NULL; - read_lock_bh(&idev->lock); - if (idev->dead) { - read_unlock_bh(&idev->lock); + if (idev->dead) return NULL; - } return idev; } @@ -294,7 +323,7 @@ void __ipv6_sock_mc_close(struct sock *sk) ASSERT_RTNL(); - while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) { + while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) { struct net_device *dev; np->ipv6_mc_list = mc_lst->next; @@ -303,11 +332,12 @@ void __ipv6_sock_mc_close(struct sock *sk) if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); - (void) ip6_mc_leave_src(sk, mc_lst, idev); + ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else - (void) ip6_mc_leave_src(sk, mc_lst, NULL); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); @@ -320,8 +350,11 @@ void ipv6_sock_mc_close(struct sock *sk) if (!rcu_access_pointer(np->ipv6_mc_list)) return; + rtnl_lock(); + lock_sock(sk); __ipv6_sock_mc_close(sk); + release_sock(sk); rtnl_unlock(); } @@ -336,7 +369,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; - int pmclocked = 0; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; @@ -345,16 +377,14 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); - if (!idev) { - rcu_read_unlock(); + idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); + if (!idev) return -ENODEV; - } err = -EADDRNOTAVAIL; - for_each_pmc_rcu(inet6, pmc) { + mutex_lock(&idev->mc_lock); + for_each_pmc_socklock(inet6, sk, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -365,7 +395,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, goto done; } /* if a source filter was set, must be the same mode as before */ - if (pmc->sflist) { + if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; @@ -377,10 +407,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } - write_lock(&pmc->sflock); - pmclocked = 1; - - psl = pmc->sflist; + psl = sock_dereference(pmc->sflist, sk); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ @@ -420,7 +447,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC); + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -430,9 +457,11 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); } - pmc->sflist = psl = newpsl; + psl = newpsl; + rcu_assign_pointer(pmc->sflist, psl); } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { @@ -448,10 +477,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: - if (pmclocked) - write_unlock(&pmc->sflock); - read_unlock_bh(&idev->lock); - rcu_read_unlock(); + mutex_unlock(&idev->mc_lock); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -477,13 +503,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - - if (!idev) { - rcu_read_unlock(); + idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); + if (!idev) return -ENODEV; - } err = 0; @@ -492,7 +514,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, goto done; } - for_each_pmc_rcu(inet6, pmc) { + for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -504,7 +526,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), - GFP_ATOMIC); + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -516,32 +538,37 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } + mutex_lock(&idev->mc_lock); err = ip6_mc_add_src(idev, group, gsf->gf_fmode, - newpsl->sl_count, newpsl->sl_addr, 0); + newpsl->sl_count, newpsl->sl_addr, 0); if (err) { + mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); goto done; } + mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; - (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + mutex_lock(&idev->mc_lock); + ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + mutex_unlock(&idev->mc_lock); } - write_lock(&pmc->sflock); - psl = pmc->sflist; + mutex_lock(&idev->mc_lock); + psl = sock_dereference(pmc->sflist, sk); if (psl) { - (void) ip6_mc_del_src(idev, group, pmc->sfmode, - psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); - } else - (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); + } else { + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + } + mutex_unlock(&idev->mc_lock); + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = gsf->gf_fmode; - write_unlock(&pmc->sflock); err = 0; done: - read_unlock_bh(&idev->lock); - rcu_read_unlock(); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -550,52 +577,37 @@ done: int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p) { - int err, i, count, copycount; + struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; struct ipv6_mc_socklist *pmc; - struct inet6_dev *idev; - struct ipv6_pinfo *inet6 = inet6_sk(sk); struct ip6_sf_socklist *psl; - struct net *net = sock_net(sk); + int i, count, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; if (!ipv6_addr_is_multicast(group)) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - - if (!idev) { - rcu_read_unlock(); - return -ENODEV; - } - - err = -EADDRNOTAVAIL; /* changes to the ipv6_mc_list require the socket lock and - * rtnl lock. We have the socket lock and rcu read lock, - * so reading the list is safe. + * rtnl lock. We have the socket lock, so reading the list is safe. */ - for_each_pmc_rcu(inet6, pmc) { + for_each_pmc_socklock(inet6, sk, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ - goto done; + return -EADDRNOTAVAIL; + gsf->gf_fmode = pmc->sfmode; - psl = pmc->sflist; + psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; - read_unlock_bh(&idev->lock); - rcu_read_unlock(); copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - /* changes to psl require the socket lock, and a write lock - * on pmc->sflock. We have the socket lock so reading here is safe. - */ + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -608,10 +620,6 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, return -EFAULT; } return 0; -done: - read_unlock_bh(&idev->lock); - rcu_read_unlock(); - return err; } bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, @@ -631,8 +639,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, rcu_read_unlock(); return np->mc_all; } - read_lock(&mc->sflock); - psl = mc->sflist; + psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { @@ -647,12 +654,12 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } - read_unlock(&mc->sflock); rcu_read_unlock(); return rv; } +/* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; @@ -662,13 +669,11 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) IPV6_ADDR_SCOPE_LINKLOCAL) return; - spin_lock_bh(&mc->mca_lock); if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } - spin_unlock_bh(&mc->mca_lock); if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; @@ -689,6 +694,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) mld_ifc_event(mc->idev); } +/* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; @@ -698,28 +704,25 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) IPV6_ADDR_SCOPE_LINKLOCAL) return; - spin_lock_bh(&mc->mca_lock); if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } - spin_unlock_bh(&mc->mca_lock); if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); - spin_lock_bh(&mc->mca_lock); - if (del_timer(&mc->mca_timer)) + if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); - spin_unlock_bh(&mc->mca_lock); } /* * deleted ifmcaddr6 manipulation + * called with mc_lock */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { @@ -731,12 +734,10 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC); + pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; - spin_lock_bh(&im->mca_lock); - spin_lock_init(&pmc->mca_lock); pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; @@ -745,90 +746,110 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; - pmc->mca_tomb = im->mca_tomb; - pmc->mca_sources = im->mca_sources; - im->mca_tomb = im->mca_sources = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + rcu_assign_pointer(pmc->mca_tomb, + mc_dereference(im->mca_tomb, idev)); + rcu_assign_pointer(pmc->mca_sources, + mc_dereference(im->mca_sources, idev)); + RCU_INIT_POINTER(im->mca_tomb, NULL); + RCU_INIT_POINTER(im->mca_sources, NULL); + + for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } - spin_unlock_bh(&im->mca_lock); - spin_lock_bh(&idev->mc_lock); - pmc->next = idev->mc_tomb; - idev->mc_tomb = pmc; - spin_unlock_bh(&idev->mc_lock); + rcu_assign_pointer(pmc->next, idev->mc_tomb); + rcu_assign_pointer(idev->mc_tomb, pmc); } +/* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { - struct ifmcaddr6 *pmc, *pmc_prev; - struct ip6_sf_list *psf; + struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; + struct ifmcaddr6 *pmc, *pmc_prev; - spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; - for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { + for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) - pmc_prev->next = pmc->next; + rcu_assign_pointer(pmc_prev->next, pmc->next); else - idev->mc_tomb = pmc->next; + rcu_assign_pointer(idev->mc_tomb, pmc->next); } - spin_unlock_bh(&idev->mc_lock); - spin_lock_bh(&im->mca_lock); if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { - swap(im->mca_tomb, pmc->mca_tomb); - swap(im->mca_sources, pmc->mca_sources); - for (psf = im->mca_sources; psf; psf = psf->sf_next) + tomb = rcu_replace_pointer(im->mca_tomb, + mc_dereference(pmc->mca_tomb, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_tomb, tomb); + + sources = rcu_replace_pointer(im->mca_sources, + mc_dereference(pmc->mca_sources, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); + rcu_assign_pointer(pmc->mca_sources, sources); + for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); ip6_mc_clear_src(pmc); - kfree(pmc); + kfree_rcu(pmc, rcu); } - spin_unlock_bh(&im->mca_lock); } +/* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - spin_lock_bh(&idev->mc_lock); - pmc = idev->mc_tomb; - idev->mc_tomb = NULL; - spin_unlock_bh(&idev->mc_lock); + pmc = mc_dereference(idev->mc_tomb, idev); + RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { - nextpmc = pmc->next; + nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); - kfree(pmc); + kfree_rcu(pmc, rcu); } /* clear dead sources, too */ - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; - spin_lock_bh(&pmc->mca_lock); - psf = pmc->mca_tomb; - pmc->mca_tomb = NULL; - spin_unlock_bh(&pmc->mca_lock); + psf = mc_dereference(pmc->mca_tomb, idev); + RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + psf_next = mc_dereference(psf->sf_next, idev); + kfree_rcu(psf, rcu); } } - read_unlock_bh(&idev->lock); +} + +static void mld_clear_query(struct inet6_dev *idev) +{ + struct sk_buff *skb; + + spin_lock_bh(&idev->mc_query_lock); + while ((skb = __skb_dequeue(&idev->mc_query_queue))) + kfree_skb(skb); + spin_unlock_bh(&idev->mc_query_lock); +} + +static void mld_clear_report(struct inet6_dev *idev) +{ + struct sk_buff *skb; + + spin_lock_bh(&idev->mc_report_lock); + while ((skb = __skb_dequeue(&idev->mc_report_queue))) + kfree_skb(skb); + spin_unlock_bh(&idev->mc_report_lock); } static void mca_get(struct ifmcaddr6 *mc) @@ -840,21 +861,22 @@ static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); - kfree(mc); + kfree_rcu(mc, rcu); } } +/* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; - mc = kzalloc(sizeof(*mc), GFP_ATOMIC); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; - timer_setup(&mc->mca_timer, igmp6_timer_handler, 0); + INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ @@ -862,7 +884,6 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); - spin_lock_init(&mc->mca_lock); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; @@ -891,18 +912,17 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, if (!idev) return -EINVAL; - write_lock_bh(&idev->lock); if (idev->dead) { - write_unlock_bh(&idev->lock); in6_dev_put(idev); return -ENODEV; } - for (mc = idev->mc_list; mc; mc = mc->next) { + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; - write_unlock_bh(&idev->lock); ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } @@ -910,22 +930,19 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mc = mca_alloc(idev, addr, mode); if (!mc) { - write_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } - mc->next = idev->mc_list; - idev->mc_list = mc; + rcu_assign_pointer(mc->next, idev->mc_list); + rcu_assign_pointer(idev->mc_list, mc); - /* Hold this for the code below before we unlock, - * it is already exposed via idev->mc_list. - */ mca_get(mc); - write_unlock_bh(&idev->lock); mld_del_delrec(idev, mc); igmp6_group_added(mc); + mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; } @@ -937,33 +954,35 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) EXPORT_SYMBOL(ipv6_dev_mc_inc); /* - * device multicast group del + * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { - struct ifmcaddr6 *ma, **map; + struct ifmcaddr6 *ma, __rcu **map; ASSERT_RTNL(); - write_lock_bh(&idev->lock); - for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) { + mutex_lock(&idev->mc_lock); + for (map = &idev->mc_list; + (ma = mc_dereference(*map, idev)); + map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; - write_unlock_bh(&idev->lock); igmp6_group_dropped(ma); ip6_mc_clear_src(ma); + mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } - write_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); return 0; } } - write_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); return -ENOENT; } @@ -997,8 +1016,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - for (mc = idev->mc_list; mc; mc = mc->next) { + for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } @@ -1006,8 +1024,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; - spin_lock_bh(&mc->mca_lock); - for (psf = mc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } @@ -1017,89 +1034,107 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, mc->mca_sfcount[MCAST_EXCLUDE]; else rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - spin_unlock_bh(&mc->mca_lock); } else rv = true; /* don't filter unspecified source */ } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return rv; } -static void mld_gq_start_timer(struct inet6_dev *idev) +/* called with mc_lock */ +static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = prandom_u32() % idev->mc_maxdelay; idev->mc_gq_running = 1; - if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } -static void mld_gq_stop_timer(struct inet6_dev *idev) +/* called with mc_lock */ +static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; - if (del_timer(&idev->mc_gq_timer)) + if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } -static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) +/* called with mc_lock */ +static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; - if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } -static void mld_ifc_stop_timer(struct inet6_dev *idev) +/* called with mc_lock */ +static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) + if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } -static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) +/* called with mc_lock */ +static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; - if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } -static void mld_dad_stop_timer(struct inet6_dev *idev) +static void mld_dad_stop_work(struct inet6_dev *idev) +{ + if (cancel_delayed_work(&idev->mc_dad_work)) + __in6_dev_put(idev); +} + +static void mld_query_stop_work(struct inet6_dev *idev) { - if (del_timer(&idev->mc_dad_timer)) + spin_lock_bh(&idev->mc_query_lock); + if (cancel_delayed_work(&idev->mc_query_work)) + __in6_dev_put(idev); + spin_unlock_bh(&idev->mc_query_lock); +} + +static void mld_report_stop_work(struct inet6_dev *idev) +{ + if (cancel_delayed_work_sync(&idev->mc_report_work)) __in6_dev_put(idev); } /* - * IGMP handling (alias multicast ICMPv6 messages) + * IGMP handling (alias multicast ICMPv6 messages) + * called with mc_lock */ - static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; - /* Do not start timer for these addresses */ + /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; - if (del_timer(&ma->mca_timer)) { + if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); - delay = ma->mca_timer.expires - jiffies; + delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = prandom_u32() % resptime; - ma->mca_timer.expires = jiffies + delay; - if (!mod_timer(&ma->mca_timer, jiffies + delay)) + if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } -/* mark EXCLUDE-mode sources */ +/* mark EXCLUDE-mode sources + * called with mc_lock + */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { @@ -1107,7 +1142,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1128,6 +1163,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } +/* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { @@ -1140,7 +1176,7 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1305,10 +1341,10 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, if (v1_query) mld_set_v1_mode(idev); - /* cancel MLDv2 report timer */ - mld_gq_stop_timer(idev); - /* cancel the interface change timer */ - mld_ifc_stop_timer(idev); + /* cancel MLDv2 report work */ + mld_gq_stop_work(idev); + /* cancel the interface change work */ + mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); @@ -1332,18 +1368,41 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, /* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return -EINVAL; + + if (idev->dead) { + kfree_skb(skb); + return -ENODEV; + } + + spin_lock_bh(&idev->mc_query_lock); + if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { + __skb_queue_tail(&idev->mc_query_queue, skb); + if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) + in6_dev_hold(idev); + } + spin_unlock_bh(&idev->mc_query_lock); + + return 0; +} + +static void __mld_query_work(struct sk_buff *skb) +{ struct mld2_query *mlh2 = NULL; - struct ifmcaddr6 *ma; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; + struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) - return -EINVAL; + goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); @@ -1360,11 +1419,11 @@ int igmp6_event_query(struct sk_buff *skb) ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) - return -EINVAL; + goto kfree_skb; - idev = __in6_dev_get(skb->dev); + idev = in6_dev_get(skb->dev); if (!idev) - return 0; + goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; @@ -1372,60 +1431,56 @@ int igmp6_event_query(struct sk_buff *skb) if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) - return -EINVAL; + goto out; if (len < MLD_V1_QUERY_LEN) { - return -EINVAL; + goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) - return err; + goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) - return -EINVAL; + goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); err = mld_process_v2(idev, mlh2, &max_delay); if (err < 0) - return err; + goto out; if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) - return -EINVAL; /* no sources allowed */ + goto out; /* no sources allowed */ - mld_gq_start_timer(idev); - return 0; + mld_gq_start_work(idev); + goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) - return -EINVAL; + goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { - return -EINVAL; + goto out; } - read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { - for (ma = idev->mc_list; ma; ma = ma->next) { - spin_lock_bh(&ma->mca_lock); + for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); - spin_unlock_bh(&ma->mca_lock); } } else { - for (ma = idev->mc_list; ma; ma = ma->next) { + for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; - spin_lock_bh(&ma->mca_lock); if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) @@ -1440,34 +1495,91 @@ int igmp6_event_query(struct sk_buff *skb) if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); - spin_unlock_bh(&ma->mca_lock); break; } } - read_unlock_bh(&idev->lock); - return 0; +out: + in6_dev_put(idev); +kfree_skb: + consume_skb(skb); +} + +static void mld_query_work(struct work_struct *work) +{ + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_query_work); + struct sk_buff_head q; + struct sk_buff *skb; + bool rework = false; + int cnt = 0; + + skb_queue_head_init(&q); + + spin_lock_bh(&idev->mc_query_lock); + while ((skb = __skb_dequeue(&idev->mc_query_queue))) { + __skb_queue_tail(&q, skb); + + if (++cnt >= MLD_MAX_QUEUE) { + rework = true; + schedule_delayed_work(&idev->mc_query_work, 0); + break; + } + } + spin_unlock_bh(&idev->mc_query_lock); + + mutex_lock(&idev->mc_lock); + while ((skb = __skb_dequeue(&q))) + __mld_query_work(skb); + mutex_unlock(&idev->mc_lock); + + if (!rework) + in6_dev_put(idev); } /* called with rcu_read_lock() */ int igmp6_event_report(struct sk_buff *skb) { - struct ifmcaddr6 *ma; + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return -EINVAL; + + if (idev->dead) { + kfree_skb(skb); + return -ENODEV; + } + + spin_lock_bh(&idev->mc_report_lock); + if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { + __skb_queue_tail(&idev->mc_report_queue, skb); + if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) + in6_dev_hold(idev); + } + spin_unlock_bh(&idev->mc_report_lock); + + return 0; +} + +static void __mld_report_work(struct sk_buff *skb) +{ struct inet6_dev *idev; + struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. */ if (skb->pkt_type == PACKET_LOOPBACK) - return 0; + goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) - return 0; + goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) - return -EINVAL; + goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); @@ -1475,29 +1587,61 @@ int igmp6_event_report(struct sk_buff *skb) addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) - return -EINVAL; + goto kfree_skb; - idev = __in6_dev_get(skb->dev); + idev = in6_dev_get(skb->dev); if (!idev) - return -ENODEV; + goto kfree_skb; /* - * Cancel the timer for this group + * Cancel the work for this group */ - read_lock_bh(&idev->lock); - for (ma = idev->mc_list; ma; ma = ma->next) { + for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { - spin_lock(&ma->mca_lock); - if (del_timer(&ma->mca_timer)) + if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); - ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); - spin_unlock(&ma->mca_lock); + ma->mca_flags &= ~(MAF_LAST_REPORTER | + MAF_TIMER_RUNNING); break; } } - read_unlock_bh(&idev->lock); - return 0; + + in6_dev_put(idev); +kfree_skb: + consume_skb(skb); +} + +static void mld_report_work(struct work_struct *work) +{ + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_report_work); + struct sk_buff_head q; + struct sk_buff *skb; + bool rework = false; + int cnt = 0; + + skb_queue_head_init(&q); + spin_lock_bh(&idev->mc_report_lock); + while ((skb = __skb_dequeue(&idev->mc_report_queue))) { + __skb_queue_tail(&q, skb); + + if (++cnt >= MLD_MAX_QUEUE) { + rework = true; + schedule_delayed_work(&idev->mc_report_work, 0); + break; + } + } + spin_unlock_bh(&idev->mc_report_lock); + + mutex_lock(&idev->mc_lock); + while ((skb = __skb_dequeue(&q))) + __mld_report_work(skb); + mutex_unlock(&idev->mc_lock); + + if (!rework) + in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, @@ -1550,7 +1694,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) struct ip6_sf_list *psf; int scount = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -1724,15 +1868,18 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) +/* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, int gdeleted, int sdeleted, int crsend) + int type, int gdeleted, int sdeleted, + int crsend) { + struct ip6_sf_list *psf, *psf_prev, *psf_next; + int scount, stotal, first, isquery, truncate; + struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; - struct mld2_report *pmr; struct mld2_grec *pgr = NULL; - struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; - int scount, stotal, first, isquery, truncate; + struct mld2_report *pmr; unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) @@ -1751,7 +1898,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; - if (!*psf_list) + if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL; @@ -1767,10 +1914,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } first = 1; psf_prev = NULL; - for (psf = *psf_list; psf; psf = psf_next) { + for (psf = mc_dereference(*psf_list, idev); + psf; + psf = psf_next) { struct in6_addr *psrc; - psf_next = psf->sf_next; + psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; @@ -1817,10 +1966,12 @@ decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + mc_dereference(psf->sf_next, idev)); else - *psf_list = psf->sf_next; - kfree(psf); + rcu_assign_pointer(*psf_list, + mc_dereference(psf->sf_next, idev)); + kfree_rcu(psf, rcu); continue; } } @@ -1849,72 +2000,73 @@ empty_source: return skb; } +/* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; - read_lock_bh(&idev->lock); if (!pmc) { - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); - spin_unlock_bh(&pmc->mca_lock); } } else { - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); - spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } /* * remove zero-count source records from a source filter list + * called with mc_lock */ -static void mld_clear_zeros(struct ip6_sf_list **ppsf) +static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf = *ppsf; psf; psf = psf_next) { - psf_next = psf->sf_next; + for (psf = mc_dereference(*ppsf, idev); + psf; + psf = psf_next) { + psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + mc_dereference(psf->sf_next, idev)); else - *ppsf = psf->sf_next; - kfree(psf); - } else + rcu_assign_pointer(*ppsf, + mc_dereference(psf->sf_next, idev)); + kfree_rcu(psf, rcu); + } else { psf_prev = psf; + } } } +/* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; - read_lock_bh(&idev->lock); - spin_lock(&idev->mc_lock); - /* deleted MCA's */ pmc_prev = NULL; - for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) { - pmc_next = pmc->next; + for (pmc = mc_dereference(idev->mc_tomb, idev); + pmc; + pmc = pmc_next) { + pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; @@ -1928,26 +2080,25 @@ static void mld_send_cr(struct inet6_dev *idev) } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { - mld_clear_zeros(&pmc->mca_tomb); - mld_clear_zeros(&pmc->mca_sources); + mld_clear_zeros(&pmc->mca_tomb, idev); + mld_clear_zeros(&pmc->mca_sources, idev); } } - if (pmc->mca_crcount == 0 && !pmc->mca_tomb && - !pmc->mca_sources) { + if (pmc->mca_crcount == 0 && + !rcu_access_pointer(pmc->mca_tomb) && + !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) - pmc_prev->next = pmc_next; + rcu_assign_pointer(pmc_prev->next, pmc_next); else - idev->mc_tomb = pmc_next; + rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); - kfree(pmc); + kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } - spin_unlock(&idev->mc_lock); /* change recs */ - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { - spin_lock_bh(&pmc->mca_lock); + for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; @@ -1967,9 +2118,7 @@ static void mld_send_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } - spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (!skb) return; (void) mld_sendpack(skb); @@ -2071,6 +2220,7 @@ err_out: goto out; } +/* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { struct sk_buff *skb; @@ -2081,47 +2231,49 @@ static void mld_send_initial_cr(struct inet6_dev *idev) return; skb = NULL; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { - spin_lock_bh(&pmc->mca_lock); + for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); - spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } void ipv6_mc_dad_complete(struct inet6_dev *idev) { + mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, - unsolicited_report_interval(idev)); + mld_dad_start_work(idev, + unsolicited_report_interval(idev)); } + mutex_unlock(&idev->mc_lock); } -static void mld_dad_timer_expire(struct timer_list *t) +static void mld_dad_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer); - + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_dad_work); + mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, - unsolicited_report_interval(idev)); + mld_dad_start_work(idev, + unsolicited_report_interval(idev)); } + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } +/* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { @@ -2129,7 +2281,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2144,21 +2296,27 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* no more filters for this source */ if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + mc_dereference(psf->sf_next, idev)); else - pmc->mca_sources = psf->sf_next; + rcu_assign_pointer(pmc->mca_sources, + mc_dereference(psf->sf_next, idev)); + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; - psf->sf_next = pmc->mca_tomb; - pmc->mca_tomb = psf; + rcu_assign_pointer(psf->sf_next, + mc_dereference(pmc->mca_tomb, idev)); + rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; - } else - kfree(psf); + } else { + kfree_rcu(psf, rcu); + } } return rv; } +/* called with mc_lock */ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2169,24 +2327,19 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } - if (!pmc) { - /* MCA not found?? bug */ - read_unlock_bh(&idev->lock); + if (!pmc) return -ESRCH; - } - spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); if (!delta) { - if (!pmc->mca_sfcount[sfmode]) { - spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); + if (!pmc->mca_sfcount[sfmode]) return -EINVAL; - } + pmc->mca_sfcount[sfmode]--; } err = 0; @@ -2206,18 +2359,19 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); - } else if (sf_setstate(pmc) || changerec) + } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); - spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); + } + return err; } /* * Add multicast single-source filter to the interface list + * called with mc_lock */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) @@ -2225,40 +2379,45 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; } if (!psf) { - psf = kzalloc(sizeof(*psf), GFP_ATOMIC); + psf = kzalloc(sizeof(*psf), GFP_KERNEL); if (!psf) return -ENOBUFS; psf->sf_addr = *psfsrc; if (psf_prev) { - psf_prev->sf_next = psf; - } else - pmc->mca_sources = psf; + rcu_assign_pointer(psf_prev->sf_next, psf); + } else { + rcu_assign_pointer(pmc->mca_sources, psf); + } } psf->sf_count[sfmode]++; return 0; } +/* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; - } else + } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; + } + } } +/* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *dpsf; @@ -2267,7 +2426,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) int new_in, rv; rv = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -2277,8 +2436,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; - for (dpsf = pmc->mca_tomb; dpsf; - dpsf = dpsf->sf_next) { + for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2286,10 +2444,14 @@ static int sf_setstate(struct ifmcaddr6 *pmc) } if (dpsf) { if (prev) - prev->sf_next = dpsf->sf_next; + rcu_assign_pointer(prev->sf_next, + mc_dereference(dpsf->sf_next, + pmc->idev)); else - pmc->mca_tomb = dpsf->sf_next; - kfree(dpsf); + rcu_assign_pointer(pmc->mca_tomb, + mc_dereference(dpsf->sf_next, + pmc->idev)); + kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; @@ -2300,18 +2462,19 @@ static int sf_setstate(struct ifmcaddr6 *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next) + + for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; if (!dpsf) { - dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); if (!dpsf) continue; *dpsf = *psf; - /* pmc->mca_lock held by callers */ - dpsf->sf_next = pmc->mca_tomb; - pmc->mca_tomb = dpsf; + rcu_assign_pointer(dpsf->sf_next, + mc_dereference(pmc->mca_tomb, pmc->idev)); + rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; @@ -2322,6 +2485,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) /* * Add multicast source filter list to the interface list + * called with mc_lock */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, @@ -2333,17 +2497,13 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } - if (!pmc) { - /* MCA not found?? bug */ - read_unlock_bh(&idev->lock); + if (!pmc) return -ESRCH; - } - spin_lock_bh(&pmc->mca_lock); sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; @@ -2374,36 +2534,40 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); - } else if (sf_setstate(pmc)) + } else if (sf_setstate(pmc)) { mld_ifc_event(idev); - spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); + } return err; } +/* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; - for (psf = pmc->mca_tomb; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); + for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); + psf; + psf = nextpsf) { + nextpsf = mc_dereference(psf->sf_next, pmc->idev); + kfree_rcu(psf, rcu); } - pmc->mca_tomb = NULL; - for (psf = pmc->mca_sources; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); + RCU_INIT_POINTER(pmc->mca_tomb, NULL); + for (psf = mc_dereference(pmc->mca_sources, pmc->idev); + psf; + psf = nextpsf) { + nextpsf = mc_dereference(psf->sf_next, pmc->idev); + kfree_rcu(psf, rcu); } - pmc->mca_sources = NULL; + RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; pmc->mca_sfcount[MCAST_EXCLUDE] = 1; } - +/* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; @@ -2415,93 +2579,115 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) delay = prandom_u32() % unsolicited_report_interval(ma->idev); - spin_lock_bh(&ma->mca_lock); - if (del_timer(&ma->mca_timer)) { + if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); - delay = ma->mca_timer.expires - jiffies; + delay = ma->mca_work.timer.expires - jiffies; } - if (!mod_timer(&ma->mca_timer, jiffies + delay)) + if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; - spin_unlock_bh(&ma->mca_lock); } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { + struct ip6_sf_socklist *psl; int err; - write_lock_bh(&iml->sflock); - if (!iml->sflist) { + psl = sock_dereference(iml->sflist, sk); + + if (idev) + mutex_lock(&idev->mc_lock); + + if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - iml->sflist->sl_count, iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + psl->sl_count, psl->sl_addr, 0); + RCU_INIT_POINTER(iml->sflist, NULL); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); } - write_unlock_bh(&iml->sflock); + + if (idev) + mutex_unlock(&idev->mc_lock); + return err; } +/* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { - if (ma->mca_flags & MAF_LAST_REPORTER) + if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); + } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); } } -static void mld_gq_timer_expire(struct timer_list *t) +static void mld_gq_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer); + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_gq_work); - idev->mc_gq_running = 0; + mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); + idev->mc_gq_running = 0; + mutex_unlock(&idev->mc_lock); + in6_dev_put(idev); } -static void mld_ifc_timer_expire(struct timer_list *t) +static void mld_ifc_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer); + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_ifc_work); + mutex_lock(&idev->mc_lock); mld_send_cr(idev); + if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) - mld_ifc_start_timer(idev, - unsolicited_report_interval(idev)); + mld_ifc_start_work(idev, + unsolicited_report_interval(idev)); } + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } +/* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) return; + idev->mc_ifc_count = idev->mc_qrv; - mld_ifc_start_timer(idev, 1); + mld_ifc_start_work(idev, 1); } -static void igmp6_timer_handler(struct timer_list *t) +static void mld_mca_work(struct work_struct *work) { - struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer); + struct ifmcaddr6 *ma = container_of(to_delayed_work(work), + struct ifmcaddr6, mca_work); + mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); - - spin_lock(&ma->mca_lock); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; - spin_unlock(&ma->mca_lock); + mutex_unlock(&ma->idev->mc_lock); + ma_put(ma); } @@ -2513,10 +2699,10 @@ void ipv6_mc_unmap(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ - read_lock_bh(&idev->lock); - for (i = idev->mc_list; i; i = i->next) + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, i) igmp6_group_dropped(i); - read_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) @@ -2525,25 +2711,25 @@ void ipv6_mc_remap(struct inet6_dev *idev) } /* Device going down */ - void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; + mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ - - read_lock_bh(&idev->lock); - - for (i = idev->mc_list; i; i = i->next) + for_each_mc_mclock(idev, i) igmp6_group_dropped(i); + mutex_unlock(&idev->mc_lock); - /* Should stop timer after group drop. or we will - * start timer again in mld_ifc_event() + /* Should stop work after group drop. or we will + * start work again in mld_ifc_event() */ - mld_ifc_stop_timer(idev); - mld_gq_stop_timer(idev); - mld_dad_stop_timer(idev); - read_unlock_bh(&idev->lock); + synchronize_net(); + mld_query_stop_work(idev); + mld_report_stop_work(idev); + mld_ifc_stop_work(idev); + mld_gq_stop_work(idev); + mld_dad_stop_work(idev); } static void ipv6_mc_reset(struct inet6_dev *idev) @@ -2563,29 +2749,33 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ - read_lock_bh(&idev->lock); ipv6_mc_reset(idev); - for (i = idev->mc_list; i; i = i->next) { + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } - read_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ void ipv6_mc_init_dev(struct inet6_dev *idev) { - write_lock_bh(&idev->lock); - spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; - timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0); - idev->mc_tomb = NULL; + INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); + RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; - timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0); - timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0); + INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); + INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); + INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); + INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); + skb_queue_head_init(&idev->mc_query_queue); + skb_queue_head_init(&idev->mc_report_queue); + spin_lock_init(&idev->mc_query_lock); + spin_lock_init(&idev->mc_report_lock); + mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); - write_unlock_bh(&idev->lock); } /* @@ -2596,9 +2786,13 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; - /* Deactivate timers */ + /* Deactivate works */ ipv6_mc_down(idev); + mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); + mutex_unlock(&idev->mc_lock); + mld_clear_query(idev); + mld_clear_report(idev); /* Delete all-nodes address. */ /* We cannot call ipv6_dev_mc_dec() directly, our caller in @@ -2610,16 +2804,14 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); - write_lock_bh(&idev->lock); - while ((i = idev->mc_list) != NULL) { - idev->mc_list = i->next; + mutex_lock(&idev->mc_lock); + while ((i = mc_dereference(idev->mc_list, idev))) { + rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); - write_unlock_bh(&idev->lock); ip6_mc_clear_src(i); ma_put(i); - write_lock_bh(&idev->lock); } - write_unlock_bh(&idev->lock); + mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) @@ -2628,13 +2820,14 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) ASSERT_RTNL(); + mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) + for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); - read_unlock_bh(&idev->lock); - } else + } else { mld_send_report(idev, NULL); + } + mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, @@ -2681,13 +2874,12 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) idev = __in6_dev_get(state->dev); if (!idev) continue; - read_lock_bh(&idev->lock); - im = idev->mc_list; + + im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } - read_unlock_bh(&idev->lock); } return im; } @@ -2696,11 +2888,8 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - im = im->next; + im = rcu_dereference(im->next); while (!im) { - if (likely(state->idev)) - read_unlock_bh(&state->idev->lock); - state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; @@ -2709,8 +2898,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; - read_lock_bh(&state->idev->lock); - im = state->idev->mc_list; + im = rcu_dereference(state->idev->mc_list); } return im; } @@ -2744,10 +2932,8 @@ static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - if (likely(state->idev)) { - read_unlock_bh(&state->idev->lock); + if (likely(state->idev)) state->idev = NULL; - } state->dev = NULL; rcu_read_unlock(); } @@ -2762,8 +2948,8 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, - (im->mca_flags&MAF_TIMER_RUNNING) ? - jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); + (im->mca_flags & MAF_TIMER_RUNNING) ? + jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } @@ -2797,19 +2983,16 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; - read_lock_bh(&idev->lock); - im = idev->mc_list; + + im = rcu_dereference(idev->mc_list); if (likely(im)) { - spin_lock_bh(&im->mca_lock); - psf = im->mca_sources; + psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } - spin_unlock_bh(&im->mca_lock); } - read_unlock_bh(&idev->lock); } return psf; } @@ -2818,14 +3001,10 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - psf = psf->sf_next; + psf = rcu_dereference(psf->sf_next); while (!psf) { - spin_unlock_bh(&state->im->mca_lock); - state->im = state->im->next; + state->im = rcu_dereference(state->im->next); while (!state->im) { - if (likely(state->idev)) - read_unlock_bh(&state->idev->lock); - state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; @@ -2834,13 +3013,11 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; - read_lock_bh(&state->idev->lock); - state->im = state->idev->mc_list; + state->im = rcu_dereference(state->idev->mc_list); } if (!state->im) break; - spin_lock_bh(&state->im->mca_lock); - psf = state->im->mca_sources; + psf = rcu_dereference(state->im->mca_sources); } out: return psf; @@ -2877,14 +3054,12 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - if (likely(state->im)) { - spin_unlock_bh(&state->im->mca_lock); + + if (likely(state->im)) state->im = NULL; - } - if (likely(state->idev)) { - read_unlock_bh(&state->idev->lock); + if (likely(state->idev)) state->idev = NULL; - } + state->dev = NULL; rcu_read_unlock(); } @@ -2965,6 +3140,7 @@ static int __net_init igmp6_net_init(struct net *net) } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); @@ -3002,7 +3178,19 @@ static struct pernet_operations igmp6_net_ops = { int __init igmp6_init(void) { - return register_pernet_subsys(&igmp6_net_ops); + int err; + + err = register_pernet_subsys(&igmp6_net_ops); + if (err) + return err; + + mld_wq = create_workqueue("mld"); + if (!mld_wq) { + unregister_pernet_subsys(&igmp6_net_ops); + return -ENOMEM; + } + + return err; } int __init igmp6_late_init(void) @@ -3013,6 +3201,7 @@ int __init igmp6_late_init(void) void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); + destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index ab9a279dd6d4..6ab710b5a1a8 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,6 +24,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); + struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; int strict = (ipv6_addr_type(&iph->daddr) & @@ -38,6 +39,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff }; int err; + fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys); dst = ip6_route_output(net, sk, &fl6); err = dst->error; if (err) { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 262bb51a2d99..f22233e44ee9 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -69,7 +69,10 @@ config NF_REJECT_IPV6 config NF_LOG_IPV6 tristate "IPv6 packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON + select NF_LOG_SYSLOG + help + This is a backwards-compat option for the user's convenience + (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG. config IP6_NF_IPTABLES tristate "IP6 tables support (required for filtering)" diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 731a74c60dca..b85383606df7 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -18,9 +18,6 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o -# logging -obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o - # reject obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c129ad334eb3..a0108415275f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -15,28 +15,13 @@ #include <linux/errno.h> #include <linux/types.h> #include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/jiffies.h> #include <linux/net.h> -#include <linux/list.h> #include <linux/netdevice.h> -#include <linux/in6.h> #include <linux/ipv6.h> -#include <linux/icmpv6.h> -#include <linux/random.h> #include <linux/slab.h> -#include <net/sock.h> -#include <net/snmp.h> #include <net/ipv6_frag.h> -#include <net/protocol.h> -#include <net/transp_v6.h> -#include <net/rawv6.h> -#include <net/ndisc.h> -#include <net/addrconf.h> -#include <net/inet_ecn.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #include <linux/sysctl.h> #include <linux/netfilter.h> @@ -44,11 +29,18 @@ #include <linux/kernel.h> #include <linux/module.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/netns/generic.h> static const char nf_frags_cache_name[] = "nf-frags"; +unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; +static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) +{ + return net_generic(net, nf_frag_pernet_id); +} + #ifdef CONFIG_SYSCTL static struct ctl_table nf_ct_frag6_sysctl_table[] = { @@ -75,6 +67,7 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { static int nf_ct_frag6_sysctl_register(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag; struct ctl_table *table; struct ctl_table_header *hdr; @@ -86,18 +79,20 @@ static int nf_ct_frag6_sysctl_register(struct net *net) goto err_alloc; } - table[0].data = &net->nf_frag.fqdir->timeout; - table[1].data = &net->nf_frag.fqdir->low_thresh; - table[1].extra2 = &net->nf_frag.fqdir->high_thresh; - table[2].data = &net->nf_frag.fqdir->high_thresh; - table[2].extra1 = &net->nf_frag.fqdir->low_thresh; - table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + nf_frag = nf_frag_pernet(net); + + table[0].data = &nf_frag->fqdir->timeout; + table[1].data = &nf_frag->fqdir->low_thresh; + table[1].extra2 = &nf_frag->fqdir->high_thresh; + table[2].data = &nf_frag->fqdir->high_thresh; + table[2].extra1 = &nf_frag->fqdir->low_thresh; + table[2].extra2 = &nf_frag->fqdir->high_thresh; hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; - net->nf_frag_frags_hdr = hdr; + nf_frag->nf_frag_frags_hdr = hdr; return 0; err_reg: @@ -109,10 +104,11 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct ctl_table *table; - table = net->nf_frag_frags_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->nf_frag_frags_hdr); + table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } @@ -149,6 +145,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, const struct ipv6hdr *hdr, int iif) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, @@ -158,7 +155,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(net->nf_frag.fqdir, &key); + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; @@ -495,37 +492,44 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); int res; - res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); + res = fqdir_init(&nf_frag->fqdir, &nf_frags, net); if (res < 0) return res; - net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = nf_ct_frag6_sysctl_register(net); if (res < 0) - fqdir_exit(net->nf_frag.fqdir); + fqdir_exit(nf_frag->fqdir); return res; } static void nf_ct_net_pre_exit(struct net *net) { - fqdir_pre_exit(net->nf_frag.fqdir); + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); + + fqdir_pre_exit(nf_frag->fqdir); } static void nf_ct_net_exit(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); + nf_ct_frags6_sysctl_unregister(net); - fqdir_exit(net->nf_frag.fqdir); + fqdir_exit(nf_frag->fqdir); } static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .pre_exit = nf_ct_net_pre_exit, .exit = nf_ct_net_exit, + .id = &nf_frag_pernet_id, + .size = sizeof(struct nft_ct_frag6_pernet), }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 6646a87fb5dc..402dc4ca9504 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,6 +25,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> +extern unsigned int nf_frag_pernet_id; + static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -89,10 +91,12 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - if (net->nf.defrag_ipv6) { + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); + + if (nf_frag->users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - net->nf.defrag_ipv6 = false; + nf_frag->users = 0; } } @@ -130,21 +134,22 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; might_sleep(); - if (net->nf.defrag_ipv6) + if (nf_frag->users) return 0; mutex_lock(&defrag6_mutex); - if (net->nf.defrag_ipv6) + if (nf_frag->users) goto out_unlock; err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - net->nf.defrag_ipv6 = true; + nf_frag->users = 1; out_unlock: mutex_unlock(&defrag6_mutex); diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c deleted file mode 100644 index 8210ff34ed9b..000000000000 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/ipv6.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -static const struct nf_loginfo default_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_NOTICE, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - -/* One level of recursion won't kill us */ -static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb, unsigned int ip6hoff, - int recurse) -{ - u_int8_t currenthdr; - int fragment; - struct ipv6hdr _ip6h; - const struct ipv6hdr *ih; - unsigned int ptr; - unsigned int hdrlen = 0; - unsigned int logflags; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - else - logflags = NF_LOG_DEFAULT_MASK; - - ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); - if (ih == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); - - /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), - (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, - ih->hop_limit, - (ntohl(*(__be32 *)ih) & 0x000fffff)); - - fragment = 0; - ptr = ip6hoff + sizeof(struct ipv6hdr); - currenthdr = ih->nexthdr; - while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { - struct ipv6_opt_hdr _hdr; - const struct ipv6_opt_hdr *hp; - - hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); - if (hp == NULL) { - nf_log_buf_add(m, "TRUNCATED"); - return; - } - - /* Max length: 48 "OPT (...) " */ - if (logflags & NF_LOG_IPOPT) - nf_log_buf_add(m, "OPT ( "); - - switch (currenthdr) { - case IPPROTO_FRAGMENT: { - struct frag_hdr _fhdr; - const struct frag_hdr *fh; - - nf_log_buf_add(m, "FRAG:"); - fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), - &_fhdr); - if (fh == NULL) { - nf_log_buf_add(m, "TRUNCATED "); - return; - } - - /* Max length: 6 "65535 " */ - nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); - - /* Max length: 11 "INCOMPLETE " */ - if (fh->frag_off & htons(0x0001)) - nf_log_buf_add(m, "INCOMPLETE "); - - nf_log_buf_add(m, "ID:%08x ", - ntohl(fh->identification)); - - if (ntohs(fh->frag_off) & 0xFFF8) - fragment = 1; - - hdrlen = 8; - - break; - } - case IPPROTO_DSTOPTS: - case IPPROTO_ROUTING: - case IPPROTO_HOPOPTS: - if (fragment) { - if (logflags & NF_LOG_IPOPT) - nf_log_buf_add(m, ")"); - return; - } - hdrlen = ipv6_optlen(hp); - break; - /* Max Length */ - case IPPROTO_AH: - if (logflags & NF_LOG_IPOPT) { - struct ip_auth_hdr _ahdr; - const struct ip_auth_hdr *ah; - - /* Max length: 3 "AH " */ - nf_log_buf_add(m, "AH "); - - if (fragment) { - nf_log_buf_add(m, ")"); - return; - } - - ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), - &_ahdr); - if (ah == NULL) { - /* - * Max length: 26 "INCOMPLETE [65535 - * bytes] )" - */ - nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 15 "SPI=0xF1234567 */ - nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); - - } - - hdrlen = ipv6_authlen(hp); - break; - case IPPROTO_ESP: - if (logflags & NF_LOG_IPOPT) { - struct ip_esp_hdr _esph; - const struct ip_esp_hdr *eh; - - /* Max length: 4 "ESP " */ - nf_log_buf_add(m, "ESP "); - - if (fragment) { - nf_log_buf_add(m, ")"); - return; - } - - /* - * Max length: 26 "INCOMPLETE [65535 bytes] )" - */ - eh = skb_header_pointer(skb, ptr, sizeof(_esph), - &_esph); - if (eh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", - skb->len - ptr); - return; - } - - /* Length: 16 "SPI=0xF1234567 )" */ - nf_log_buf_add(m, "SPI=0x%x )", - ntohl(eh->spi)); - } - return; - default: - /* Max length: 20 "Unknown Ext Hdr 255" */ - nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); - return; - } - if (logflags & NF_LOG_IPOPT) - nf_log_buf_add(m, ") "); - - currenthdr = hp->nexthdr; - ptr += hdrlen; - } - - switch (currenthdr) { - case IPPROTO_TCP: - if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, - ptr, logflags)) - return; - break; - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) - return; - break; - case IPPROTO_ICMPV6: { - struct icmp6hdr _icmp6h; - const struct icmp6hdr *ic; - - /* Max length: 13 "PROTO=ICMPv6 " */ - nf_log_buf_add(m, "PROTO=ICMPv6 "); - - if (fragment) - break; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); - if (ic == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", - skb->len - ptr); - return; - } - - /* Max length: 18 "TYPE=255 CODE=255 " */ - nf_log_buf_add(m, "TYPE=%u CODE=%u ", - ic->icmp6_type, ic->icmp6_code); - - switch (ic->icmp6_type) { - case ICMPV6_ECHO_REQUEST: - case ICMPV6_ECHO_REPLY: - /* Max length: 19 "ID=65535 SEQ=65535 " */ - nf_log_buf_add(m, "ID=%u SEQ=%u ", - ntohs(ic->icmp6_identifier), - ntohs(ic->icmp6_sequence)); - break; - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - break; - - case ICMPV6_PARAMPROB: - /* Max length: 17 "POINTER=ffffffff " */ - nf_log_buf_add(m, "POINTER=%08x ", - ntohl(ic->icmp6_pointer)); - fallthrough; - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - /* Max length: 3+maxlen */ - if (recurse) { - nf_log_buf_add(m, "["); - dump_ipv6_packet(net, m, info, skb, - ptr + sizeof(_icmp6h), 0); - nf_log_buf_add(m, "] "); - } - - /* Max length: 10 "MTU=65535 " */ - if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { - nf_log_buf_add(m, "MTU=%u ", - ntohl(ic->icmp6_mtu)); - } - } - break; - } - /* Max length: 10 "PROTO=255 " */ - default: - nf_log_buf_add(m, "PROTO=%u ", currenthdr); - } - - /* Max length: 15 "UID=4294967295 " */ - if ((logflags & NF_LOG_UID) && recurse) - nf_log_dump_sk_uid_gid(net, m, skb->sk); - - /* Max length: 16 "MARK=0xFFFFFFFF " */ - if (recurse && skb->mark) - nf_log_buf_add(m, "MARK=0x%x ", skb->mark); -} - -static void dump_ipv6_mac_header(struct nf_log_buf *m, - const struct nf_loginfo *info, - const struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - unsigned int logflags = 0; - - if (info->type == NF_LOG_TYPE_LOG) - logflags = info->u.log.logflags; - - if (!(logflags & NF_LOG_MACDECODE)) - goto fallback; - - switch (dev->type) { - case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); - nf_log_dump_vlan(m, skb); - nf_log_buf_add(m, "MACPROTO=%04x ", - ntohs(eth_hdr(skb)->h_proto)); - return; - default: - break; - } - -fallback: - nf_log_buf_add(m, "MAC="); - if (dev->hard_header_len && - skb->mac_header != skb->network_header) { - const unsigned char *p = skb_mac_header(skb); - unsigned int len = dev->hard_header_len; - unsigned int i; - - if (dev->type == ARPHRD_SIT) { - p -= ETH_HLEN; - - if (p < skb->head) - p = NULL; - } - - if (p != NULL) { - nf_log_buf_add(m, "%02x", *p++); - for (i = 1; i < len; i++) - nf_log_buf_add(m, ":%02x", *p++); - } - nf_log_buf_add(m, " "); - - if (dev->type == ARPHRD_SIT) { - const struct iphdr *iph = - (struct iphdr *)skb_mac_header(skb); - nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, - &iph->daddr); - } - } else { - nf_log_buf_add(m, " "); - } -} - -static void nf_log_ip6_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - struct nf_log_buf *m; - - /* FIXME: Disabled from containers until syslog ns is supported */ - if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) - return; - - m = nf_log_buf_open(); - - if (!loginfo) - loginfo = &default_loginfo; - - nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, - loginfo, prefix); - - if (in != NULL) - dump_ipv6_mac_header(m, loginfo, skb); - - dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); - - nf_log_buf_close(m); -} - -static struct nf_logger nf_ip6_logger __read_mostly = { - .name = "nf_log_ipv6", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_ip6_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_ipv6_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); -} - -static void __net_exit nf_log_ipv6_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_ip6_logger); -} - -static struct pernet_operations nf_log_ipv6_net_ops = { - .init = nf_log_ipv6_net_init, - .exit = nf_log_ipv6_net_exit, -}; - -static int __init nf_log_ipv6_init(void) -{ - int ret; - - ret = register_pernet_subsys(&nf_log_ipv6_net_ops); - if (ret < 0) - return ret; - - ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); - if (ret < 0) { - pr_err("failed to register logger\n"); - goto err1; - } - - return 0; - -err1: - unregister_pernet_subsys(&nf_log_ipv6_net_ops); - return ret; -} - -static void __exit nf_log_ipv6_exit(void) -{ - unregister_pernet_subsys(&nf_log_ipv6_net_ops); - nf_log_unregister(&nf_ip6_logger); -} - -module_init(nf_log_ipv6_init); -module_exit(nf_log_ipv6_exit); - -MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter IPv6 packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 373d48073106..a22822bdbf39 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2085,13 +2085,10 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; - __u8 neigh_flags = 0; neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); - if (neigh) - neigh_flags = neigh->flags; - if (!(neigh_flags & NTF_ROUTER)) { + if (!(neigh && (neigh->flags & NTF_ROUTER))) { RT6_TRACE("purging route %p via non-router but gateway\n", rt); rt6_remove_exception(bucket, rt6_ex); @@ -2360,7 +2357,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, memset(&hash_keys, 0, sizeof(hash_keys)); - if (!flkeys) { + if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } @@ -2500,20 +2497,20 @@ struct dst_entry *ip6_route_output_flags(struct net *net, struct flowi6 *fl6, int flags) { - struct dst_entry *dst; - struct rt6_info *rt6; + struct dst_entry *dst; + struct rt6_info *rt6; - rcu_read_lock(); - dst = ip6_route_output_flags_noref(net, sk, fl6, flags); - rt6 = (struct rt6_info *)dst; - /* For dst cached in uncached_list, refcnt is already taken. */ - if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { - dst = &net->ipv6.ip6_null_entry->dst; - dst_hold(dst); - } - rcu_read_unlock(); + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); - return dst; + return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); @@ -6077,7 +6074,7 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send - * notfication. + * notification. */ return; diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index c2a0c78e84d4..bd7140885e60 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -119,12 +119,12 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt) return (struct seg6_local_lwt *)lwt->data; } -static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) +static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb, int flags) { struct ipv6_sr_hdr *srh; int len, srhoff = 0; - if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0) return NULL; if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) @@ -152,13 +152,10 @@ static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb) { struct ipv6_sr_hdr *srh; - srh = get_srh(skb); + srh = get_srh(skb, IP6_FH_F_SKIP_RH); if (!srh) return NULL; - if (srh->segments_left == 0) - return NULL; - #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) return NULL; @@ -172,7 +169,7 @@ static bool decap_and_validate(struct sk_buff *skb, int proto) struct ipv6_sr_hdr *srh; unsigned int off = 0; - srh = get_srh(skb); + srh = get_srh(skb, 0); if (srh && srh->segments_left > 0) return false; @@ -1478,7 +1475,7 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) /* Forcing the desc->optattrs *set* and the desc->attrs *set* to be * disjoined, this allow us to release acquired resources by optional * attributes and by required attributes independently from each other - * without any interfarence. + * without any interference. * In other terms, we are sure that we do not release some the acquired * resources twice. * diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9fdccf0718b5..aa98294a3ad3 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -218,8 +218,6 @@ static int ipip6_tunnel_create(struct net_device *dev) ipip6_tunnel_clone_6rd(dev, sitn); - dev_hold(dev); - ipip6_tunnel_link(sitn, t); return 0; @@ -325,7 +323,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) rcu_read_lock(); - ca = t->prl_count < cmax ? t->prl_count : cmax; + ca = min(t->prl_count, cmax); if (!kp) { /* We don't try hard to allocate much memory for @@ -1456,7 +1454,7 @@ static int ipip6_tunnel_init(struct net_device *dev) dev->tstats = NULL; return err; } - + dev_hold(dev); return 0; } @@ -1472,7 +1470,6 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; - dev_hold(dev); rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 263ab43ed06b..27102c3d6e1d 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,7 +23,6 @@ static int two = 2; static int flowlabel_reflect_max = 0x7; -static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, @@ -34,7 +33,7 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); @@ -45,39 +44,38 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data = &init_net.ipv6.sysctl.auto_flowlabels, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &auto_flowlabels_min, + .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", @@ -96,16 +94,16 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", @@ -147,7 +145,7 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, @@ -163,9 +161,9 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d0f007741e8e..5f47c0b6e3de 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -879,8 +879,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); + __be32 mrst = 0, *topt; struct dst_entry *dst; - __be32 *topt; __u32 mark = 0; if (tsecr) @@ -890,6 +890,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif +#ifdef CONFIG_MPTCP + if (rst && !key) { + mrst = mptcp_reset_option(skb); + + if (mrst) + tot_len += sizeof(__be32); + } +#endif + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (!buff) @@ -920,6 +929,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 *topt++ = htonl(tsecr); } + if (mrst) + *topt++ = mrst; + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -2139,6 +2151,9 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d25e5a9252fd..199b080d418a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -749,6 +749,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); + udp_post_segment_fix_csum(skb); ret = udpv6_queue_rcv_one_skb(sk, skb); if (ret > 0) ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, @@ -1713,6 +1714,9 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index faa823c24292..b3d9ed96e5ea 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -163,7 +163,8 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); - if (NAPI_GRO_CB(skb)->is_flist) { + /* do fraglist only if there is no outer UDP encap (or we already processed it) */ + if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) { uh->len = htons(skb->len - nhoff); skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6092d5cb7168..0fdb389c3390 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -621,7 +621,7 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, for_each_netdev_rcu(&init_net, dev) { if (!memcmp(dev->perm_addr, uid, 8)) { memcpy(iucv->src_user_id, sa->siucv_user_id, 8); - /* Check for unitialized siucv_name */ + /* Check for uninitialized siucv_name */ if (strncmp(sa->siucv_name, " ", 8) == 0) __iucv_auto_name(iucv); else @@ -2134,7 +2134,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, } /** - * afiucv_hs_callback_txnotify() - handle send notifcations from HiperSockets + * afiucv_hs_callback_txnotify() - handle send notifications from HiperSockets * transport **/ static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify n) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index d0b56ffbb057..6201965bd822 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -663,7 +663,7 @@ do_frag: /* Hard failure in sending message, abort this * psock since it has lost framing - * synchonization and retry sending the + * synchronization and retry sending the * message from the beginning. */ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, @@ -1419,7 +1419,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, write_lock_bh(&csk->sk_callback_lock); - /* Check if sk_user_data is aready by KCM or someone else. + /* Check if sk_user_data is already by KCM or someone else. * Must be done under lock to prevent race conditions. */ if (csk->sk_user_data) { diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 203890e378cb..2ee20743cb41 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -802,7 +802,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) u16 version; int length; - /* UDP has verifed checksum */ + /* UDP has verified checksum */ /* UDP always verifies the packet length. */ __skb_pull(skb, sizeof(struct udphdr)); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ad7730b68772..17927966abb3 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -103,7 +103,7 @@ unlock: EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id); /** - * l3mdev_master_ifindex - get index of L3 master device + * l3mdev_master_ifindex_rcu - get index of L3 master device * @dev: targeted interface */ @@ -136,7 +136,7 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev) EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); /** - * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master + * l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master * device * @net: network namespace for device index lookup * @ifindex: targeted interface diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 0511bbe4af7b..1078e14f1acf 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -122,8 +122,8 @@ static struct lapb_cb *lapb_create_cb(void) timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); - lapb->t1timer_stop = true; - lapb->t2timer_stop = true; + lapb->t1timer_running = false; + lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 0230b272b7d1..5be68869064d 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -40,7 +40,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; - lapb->t1timer_stop = false; + lapb->t1timer_running = true; add_timer(&lapb->t1timer); } @@ -51,25 +51,25 @@ void lapb_start_t2timer(struct lapb_cb *lapb) lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; - lapb->t2timer_stop = false; + lapb->t2timer_running = true; add_timer(&lapb->t2timer); } void lapb_stop_t1timer(struct lapb_cb *lapb) { - lapb->t1timer_stop = true; + lapb->t1timer_running = false; del_timer(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { - lapb->t2timer_stop = true; + lapb->t2timer_running = false; del_timer(&lapb->t2timer); } int lapb_t1timer_running(struct lapb_cb *lapb) { - return timer_pending(&lapb->t1timer); + return lapb->t1timer_running; } static void lapb_t2timer_expiry(struct timer_list *t) @@ -79,13 +79,14 @@ static void lapb_t2timer_expiry(struct timer_list *t) spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ goto out; - if (lapb->t2timer_stop) /* The timer has been stopped */ + if (!lapb->t2timer_running) /* The timer has been stopped */ goto out; if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; lapb_timeout_response(lapb); } + lapb->t2timer_running = false; out: spin_unlock_bh(&lapb->lock); @@ -98,7 +99,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ goto out; - if (lapb->t1timer_stop) /* The timer has been stopped */ + if (!lapb->t1timer_running) /* The timer has been stopped */ goto out; switch (lapb->state) { @@ -127,6 +128,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -151,6 +153,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -169,6 +172,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb_stop_t2timer(lapb); lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -186,6 +190,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c index 523fdd1cf781..d6627a80cb45 100644 --- a/net/llc/llc_c_ev.c +++ b/net/llc/llc_c_ev.c @@ -608,7 +608,7 @@ int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb) } /** - * conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window + * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window * @sk: current connection structure. * @skb: current event. * @@ -624,7 +624,7 @@ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb) } /** - * conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window + * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window * @sk: current connection structure. * @skb: current event. * diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c index 64d4bef04e73..6e387aadffce 100644 --- a/net/llc/llc_core.c +++ b/net/llc/llc_core.c @@ -59,10 +59,10 @@ out: } /** - * llc_sap_find - searchs a SAP in station + * llc_sap_find - searches a SAP in station * @sap_value: sap to be found * - * Searchs for a sap in the sap list of the LLC's station upon the sap ID. + * Searches for a sap in the sap list of the LLC's station upon the sap ID. * If the sap is found it will be refcounted and the user will have to do * a llc_sap_put after use. * Returns the sap or %NULL if not found. diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c index 792d195c8bae..63749dde542f 100644 --- a/net/llc/llc_pdu.c +++ b/net/llc/llc_pdu.c @@ -24,7 +24,7 @@ void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) } /** - * pdu_set_pf_bit - sets poll/final bit in LLC header + * llc_pdu_set_pf_bit - sets poll/final bit in LLC header * @skb: Frame to set bit in * @bit_value: poll/final bit (0 or 1). * diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 7ae4cc684d3a..b554f26c68ee 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -27,7 +27,7 @@ /** - * llc_sap_action_unit_data_ind - forward UI PDU to network layer + * llc_sap_action_unitdata_ind - forward UI PDU to network layer * @sap: SAP * @skb: the event to forward * diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index b31f1021ad9c..48c04f89de20 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -2,6 +2,7 @@ /* * AES-128-CMAC with TLen 16 for IEEE 802.11w BIP * Copyright 2008, Jouni Malinen <j@w1.fi> + * Copyright (C) 2020 Intel Corporation */ #include <linux/kernel.h> @@ -73,8 +74,14 @@ struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], struct crypto_shash *tfm; tfm = crypto_alloc_shash("cmac(aes)", 0, 0); - if (!IS_ERR(tfm)) - crypto_shash_setkey(tfm, key, key_len); + if (!IS_ERR(tfm)) { + int err = crypto_shash_setkey(tfm, key, key_len); + + if (err) { + crypto_free_shash(tfm); + return ERR_PTR(err); + } + } return tfm; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 860bc35383d5..7a99892e5aba 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1486,7 +1486,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME); /* auth flags will be set later for TDLS, - * and for unassociated stations that move to assocaited */ + * and for unassociated stations that move to associated */ if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) && !((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) && (set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) { diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5296898875ff..9245c0421bda 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -456,6 +456,7 @@ static const char *hw_flag_names[] = { FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), FLAG(SUPPORTS_RX_DECAP_OFFLOAD), + FLAG(SUPPORTS_CONC_MON_RX_DECAP), #undef FLAG }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 5a27c61a7b38..936c9dfa86c8 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -711,17 +711,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA"); switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) { - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT: - PRINT("MAX-AMPDU-LEN-EXP-USE-VHT"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0: + PRINT("MAX-AMPDU-LEN-EXP-USE-EXT-0"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1: - PRINT("MAX-AMPDU-LEN-EXP-VHT-1"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1: + PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-1"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2: - PRINT("MAX-AMPDU-LEN-EXP-VHT-2"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2: + PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-2"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED: - PRINT("MAX-AMPDU-LEN-EXP-RESERVED"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3: + PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-3"); break; } @@ -732,15 +732,15 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG"); PFLAG(MAC, 4, QTP, "QTP"); PFLAG(MAC, 4, BQR, "BQR"); - PFLAG(MAC, 4, SRP_RESP, "SRP-RESP"); + PFLAG(MAC, 4, PSR_RESP, "PSR-RESP"); PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); PFLAG(MAC, 4, OPS, "OPS"); - PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU"); + PFLAG(MAC, 4, AMSDU_IN_AMPDU, "AMSDU-IN-AMPDU"); PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); - PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION, - "SUBCHAN-SELECVITE-TRANSMISSION"); + PFLAG(MAC, 5, SUBCHAN_SELECTIVE_TRANSMISSION, + "SUBCHAN-SELECTIVE-TRANSMISSION"); PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU"); PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX"); PFLAG(MAC, 5, HE_DYNAMIC_SM_PS, "HE-DYNAMIC-SM-PS"); @@ -832,8 +832,8 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2"); - PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA, - "RX-HE-MU-PPDU-FROM-NON-AP-STA"); + PFLAG(PHY, 3, RX_PARTIAL_BW_SU_IN_20MHZ_MU, + "RX-PARTIAL-BW-SU-IN-20MHZ-MU"); PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE"); @@ -853,16 +853,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU"); PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU"); - PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB"); - PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB"); + PFLAG(PHY, 6, TRIG_SU_BEAMFORMING_FB, "TRIG-SU-BEAMFORMING-FB"); + PFLAG(PHY, 6, TRIG_MU_BEAMFORMING_PARTIAL_BW_FB, + "MU-BEAMFORMING-PARTIAL-BW-FB"); PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB"); PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE"); PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO, "PARTIAL-BANDWIDTH-DL-MUMIMO"); PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT"); - PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR"); - PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR"); + PFLAG(PHY, 7, PSR_BASED_SR, "PSR-BASED-SR"); + PFLAG(PHY, 7, POWER_BOOST_FACTOR_SUPP, "POWER-BOOST-FACTOR-SUPP"); PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI, "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI"); PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d"); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ecda126a7026..8fcbaa1eedf3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1660,6 +1660,8 @@ void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, void ieee80211_mgd_quiesce(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_handle_tspec_ac_params(struct ieee80211_sub_if_data *sdata); +void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, + u8 *bssid, u8 reason, bool tx); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b80c9b016b2b..b1c170939e44 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -807,7 +807,8 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; - if (local->monitors) + if (local->monitors && + !ieee80211_hw_check(&local->hw, SUPPORTS_CONC_MON_RX_DECAP)) flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; } else { flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1b9c82616606..62145e5f9628 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #include <net/mac80211.h> @@ -282,6 +282,13 @@ static void ieee80211_restart_work(struct work_struct *work) * Then we can have a race... */ cancel_work_sync(&sdata->u.mgd.csa_connection_drop_work); + if (sdata->vif.csa_active) { + sdata_lock(sdata); + ieee80211_sta_connection_lost(sdata, + sdata->u.mgd.associated->bssid, + WLAN_REASON_UNSPECIFIED, false); + sdata_unlock(sdata); + } } flush_delayed_work(&sdata->dec_tailroom_needed_wk); } @@ -1141,8 +1148,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->max_scan_ie_len) local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; - WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes, - local->hw.n_cipher_schemes)); + if (WARN_ON(!ieee80211_cs_list_valid(local->hw.cipher_schemes, + local->hw.n_cipher_schemes))) { + result = -EINVAL; + goto fail_workqueue; + } result = ieee80211_init_cipher_suites(local); if (result < 0) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 96f487fc0071..2480bd0577bb 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1295,6 +1295,11 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata) sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; + /* + * If the CSA IE is still present on the beacon after the switch, + * we need to consider it as a new CSA (possibly to self). + */ + ifmgd->beacon_crc_valid = false; ret = drv_post_channel_switch(sdata); if (ret) { @@ -1400,11 +1405,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, ch_switch.delay = csa_ie.max_switch_time; } - if (res < 0) { - ieee80211_queue_work(&local->hw, - &ifmgd->csa_connection_drop_work); - return; - } + if (res < 0) + goto lock_and_drop_connection; if (beacon && sdata->vif.csa_active && !ifmgd->csa_waiting_bcn) { if (res) @@ -4382,8 +4384,8 @@ static void ieee80211_sta_timer(struct timer_list *t) ieee80211_queue_work(&sdata->local->hw, &sdata->work); } -static void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, - u8 *bssid, u8 reason, bool tx) +void ieee80211_sta_connection_lost(struct ieee80211_sub_if_data *sdata, + u8 *bssid, u8 reason, bool tx) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index ecad9b10984f..6487b05da6fa 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -370,7 +370,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) - idx += 4; + idx += 4; goto out; } @@ -868,7 +868,6 @@ static u16 minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, u32 slow_rate_dur, int *slow_rate_ofs) { - struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; u32 max_duration = slow_rate_dur; int i, index, offset; @@ -886,7 +885,6 @@ minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, u8 type; group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); - mg = &mi->groups[group]; supported = mi->supported[group]; if (!supported) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3b3bcefbf657..0b719f3d2dec 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * Transmit and frame generation functions. */ @@ -1388,8 +1388,20 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, ieee80211_set_skb_enqueue_time(skb); spin_lock_bh(&fq->lock); - fq_tin_enqueue(fq, tin, flow_idx, skb, - fq_skb_free_func); + /* + * For management frames, don't really apply codel etc., + * we don't want to apply any shaping or anything we just + * want to simplify the driver API by having them on the + * txqi. + */ + if (unlikely(txqi->txq.tid == IEEE80211_NUM_TIDS)) { + IEEE80211_SKB_CB(skb)->control.flags |= + IEEE80211_TX_INTCFL_NEED_TXPROCESSING; + __skb_queue_tail(&txqi->frags, skb); + } else { + fq_tin_enqueue(fq, tin, flow_idx, skb, + fq_skb_free_func); + } spin_unlock_bh(&fq->lock); } @@ -1684,7 +1696,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; struct ieee80211_vif *vif; struct sk_buff *skb; - bool result = true; + bool result; __le16 fc; if (WARN_ON(skb_queue_empty(skbs))) @@ -2267,17 +2279,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, payload[7]); } - /* Initialize skb->priority for QoS frames. If the DONT_REORDER flag - * is set, stick to the default value for skb->priority to assure - * frames injected with this flag are not reordered relative to each - * other. - */ - if (ieee80211_is_data_qos(hdr->frame_control) && - !(info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER)) { - u8 *p = ieee80211_get_qos_ctl(hdr); - skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; - } - rcu_read_lock(); /* @@ -2341,6 +2342,15 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, info->band = chandef->chan->band; + /* Initialize skb->priority according to frame type and TID class, + * with respect to the sub interface that the frame will actually + * be transmitted on. If the DONT_REORDER flag is set, the original + * skb-priority is preserved to assure frames injected with this + * flag are not reordered relative to each other. + */ + ieee80211_select_queue_80211(sdata, skb, hdr); + skb_set_queue_mapping(skb, ieee80211_ac_from_tid(skb->priority)); + /* remove the injection radiotap header */ skb_pull(skb, len_rthdr); @@ -3580,10 +3590,16 @@ begin: /* Make sure fragments stay together. */ skb = __skb_dequeue(&txqi->frags); - if (skb) - goto out; + if (unlikely(skb)) { + if (!(IEEE80211_SKB_CB(skb)->control.flags & + IEEE80211_TX_INTCFL_NEED_TXPROCESSING)) + goto out; + IEEE80211_SKB_CB(skb)->control.flags &= + ~IEEE80211_TX_INTCFL_NEED_TXPROCESSING; + } else { + skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); + } - skb = fq_tin_dequeue(fq, tin, fq_tin_dequeue_func); if (!skb) goto out; @@ -3835,6 +3851,9 @@ bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw, if (!txq->sta) return true; + if (unlikely(txq->tid == IEEE80211_NUM_TIDS)) + return true; + sta = container_of(txq->sta, struct sta_info, sta); if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) < sta->airtime[txq->ac].aql_limit_low) @@ -4150,6 +4169,9 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, unsigned long flags; int q = info->hw_queue; + if (sta) + sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift); + if (ieee80211_queue_skb(local, sdata, sta, skb)) return true; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c0fa526a45b4..0a0481f5af48 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -888,18 +888,10 @@ EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata; - if (!vif) return NULL; - sdata = vif_to_sdata(vif); - - if (!ieee80211_sdata_running(sdata) || - !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) - return NULL; - - return &sdata->wdev; + return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index a014149aa323..20328920f6ed 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -22,7 +22,7 @@ config MPTCP_IPV6 depends on IPV6=y default y -config MPTCP_KUNIT_TESTS +config MPTCP_KUNIT_TEST tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a611968be4d7..e54daceac58b 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,11 +2,11 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o + mib.o pm_netlink.o sockopt.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o mptcp_crypto_test-objs := crypto_test.o mptcp_token_test-objs := token_test.o -obj-$(CONFIG_MPTCP_KUNIT_TESTS) += mptcp_crypto_test.o mptcp_token_test.o +obj-$(CONFIG_MPTCP_KUNIT_TEST) += mptcp_crypto_test.o mptcp_token_test.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index b472dc149856..a8931349933c 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -78,6 +78,6 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); } -#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_crypto_hmac_sha); #endif diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 3780c29c321d..eb2dc6dbe212 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -10,9 +10,12 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), + SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 72afbc135f8e..f0da4f060fe1 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -3,9 +3,12 @@ enum linux_mptcp_mib_field { MPTCP_MIB_NUM = 0, MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEACTIVE, /* Sent SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEACTIVEACK, /* Received SYN/ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_TOKENFALLBACKINIT, /* Could not init/allocate token */ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 89a4225ed321..99fc21406168 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -13,6 +13,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; @@ -26,6 +28,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, int expected_opsize; u8 version; u8 flags; + u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: @@ -219,45 +222,45 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_4; + mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_6; + mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_4; + mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) - mp_opt->family = MPTCP_ADDR_IPVERSION_6; + mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->add_addr = 1; - mp_opt->addr_id = *ptr++; - if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { - memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + mp_opt->addr.id = *ptr++; + if (mp_opt->addr.family == AF_INET) { + memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { - mp_opt->port = get_unaligned_be16(ptr); + mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { - memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { - mp_opt->port = get_unaligned_be16(ptr); + mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } @@ -267,19 +270,22 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", - (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "", - mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port); + (mp_opt->addr.family == AF_INET6) ? "6" : "", + mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: - if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || + opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->rm_addr = 1; - mp_opt->rm_id = *ptr++; - pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; + for (i = 0; i < mp_opt->rm_list.nr; i++) + mp_opt->rm_list.ids[i] = *ptr++; + pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: @@ -301,6 +307,18 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->fastclose = 1; break; + case MPTCPOPT_RST: + if (opsize != TCPOLEN_MPTCP_RST) + break; + + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + break; + mp_opt->reset = 1; + flags = *ptr++; + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; + mp_opt->reset_reason = *ptr; + break; + default: break; } @@ -319,10 +337,11 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->add_addr = 0; mp_opt->ahmac = 0; mp_opt->fastclose = 0; - mp_opt->port = 0; + mp_opt->addr.port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; mp_opt->mp_prio = 0; + mp_opt->reset = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -566,39 +585,32 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } -static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in_addr *addr, u16 port) -{ - u8 hmac[SHA256_DIGEST_SIZE]; - u8 msg[7]; - - msg[0] = addr_id; - memcpy(&msg[1], &addr->s_addr, 4); - msg[5] = port >> 8; - msg[6] = port & 0xFF; - - mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); - - return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); -} - -#if IS_ENABLED(CONFIG_MPTCP_IPV6) -static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, - struct in6_addr *addr, u16 port) +static u64 add_addr_generate_hmac(u64 key1, u64 key2, + struct mptcp_addr_info *addr) { + u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; + int i = 0; - msg[0] = addr_id; - memcpy(&msg[1], &addr->s6_addr, 16); - msg[17] = port >> 8; - msg[18] = port & 0xFF; + msg[i++] = addr->id; + if (addr->family == AF_INET) { + memcpy(&msg[i], &addr->addr.s_addr, 4); + i += 4; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) { + memcpy(&msg[i], &addr->addr6.s6_addr, 16); + i += 16; + } +#endif + msg[i++] = port >> 8; + msg[i++] = port & 0xFF; - mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } -#endif static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, @@ -609,13 +621,13 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; - struct mptcp_addr_info saddr; bool echo; bool port; int len; if ((mptcp_pm_should_add_signal_ipv6(msk) || - mptcp_pm_should_add_signal_port(msk)) && + mptcp_pm_should_add_signal_port(msk) || + mptcp_pm_should_add_signal_echo(msk)) && skb && skb_is_tcp_pure_ack(skb)) { pr_debug("drop other suboptions"); opts->suboptions = 0; @@ -626,45 +638,24 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) + !(mptcp_pm_add_addr_signal(msk, remaining, &opts->addr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, echo, port); + len = mptcp_add_addr_len(opts->addr.family, echo, port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) *size -= opt_size; - opts->addr_id = saddr.id; - if (port) - opts->port = ntohs(saddr.port); - if (saddr.family == AF_INET) { - opts->suboptions |= OPTION_MPTCP_ADD_ADDR; - opts->addr = saddr.addr; - if (!echo) { - opts->ahmac = add_addr_generate_hmac(msk->local_key, - msk->remote_key, - opts->addr_id, - &opts->addr, - opts->port); - } - } -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (saddr.family == AF_INET6) { - opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; - opts->addr6 = saddr.addr6; - if (!echo) { - opts->ahmac = add_addr6_generate_hmac(msk->local_key, - msk->remote_key, - opts->addr_id, - &opts->addr6, - opts->port); - } + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + if (!echo) { + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + &opts->addr); } -#endif pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", - opts->addr_id, opts->ahmac, echo, opts->port); + opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } @@ -676,20 +667,25 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - u8 rm_id; + struct mptcp_rm_list rm_list; + int i, len; if (!mptcp_pm_should_rm_signal(msk) || - !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id))) + !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; - if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) + len = mptcp_rm_addr_len(&rm_list); + if (len < 0) + return false; + if (remaining < len) return false; - *size = TCPOLEN_MPTCP_RM_ADDR_BASE; + *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; - opts->rm_id = rm_id; + opts->rm_list = rm_list; - pr_debug("rm_id=%d", opts->rm_id); + for (i = 0; i < opts->rm_list.nr; i++) + pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); return true; } @@ -717,6 +713,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (remaining < TCPOLEN_MPTCP_RST) + return; + + *size = TCPOLEN_MPTCP_RST; + opts->suboptions |= OPTION_MPTCP_RST; + opts->reset_transient = subflow->reset_transient; + opts->reset_reason = subflow->reset_reason; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -732,11 +744,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(__mptcp_check_fallback(msk))) return false; - /* prevent adding of any MPTCP related options on reset packet - * until we support MP_TCPRST/MP_FASTCLOSE - */ - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) - return false; + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { + mptcp_established_options_rst(sk, skb, size, remaining, opts); + return true; + } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) @@ -873,7 +884,7 @@ fully_established: subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); - mptcp_pm_subflow_established(msk, subflow); + mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); } @@ -934,6 +945,10 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_data_acked(sk); } mptcp_data_unlock(sk); + + trace_ack_update_msk(mp_opt->data_ack, + old_snd_una, new_snd_una, + new_wnd_end, msk->wnd_end); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -943,7 +958,7 @@ bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool us * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ - if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first)) + if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, @@ -961,18 +976,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, if (mp_opt->echo) return true; - if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) - hmac = add_addr_generate_hmac(msk->remote_key, - msk->local_key, - mp_opt->addr_id, &mp_opt->addr, - mp_opt->port); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - hmac = add_addr6_generate_hmac(msk->remote_key, - msk->local_key, - mp_opt->addr_id, &mp_opt->addr6, - mp_opt->port); -#endif + hmac = add_addr_generate_hmac(msk->remote_key, + msk->local_key, + &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, (unsigned long long)hmac, @@ -1013,36 +1019,23 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { - struct mptcp_addr_info addr; - - addr.port = htons(mp_opt.port); - addr.id = mp_opt.addr_id; - if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) { - addr.family = AF_INET; - addr.addr = mp_opt.addr; - } -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) { - addr.family = AF_INET6; - addr.addr6 = mp_opt.addr6; - } -#endif if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &addr); + mptcp_pm_add_addr_received(msk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { - mptcp_pm_del_add_timer(msk, &addr); + mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); + mptcp_pm_del_add_timer(msk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } - if (mp_opt.port) + if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); mp_opt.add_addr = 0; } if (mp_opt.rm_addr) { - mptcp_pm_rm_addr_received(msk, mp_opt.rm_id); + mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); mp_opt.rm_addr = 0; } @@ -1052,6 +1045,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mp_opt.mp_prio = 0; } + if (mp_opt.reset) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } + if (!mp_opt.dss) return; @@ -1160,20 +1159,16 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, } mp_capable_done: - if ((OPTION_MPTCP_ADD_ADDR -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - | OPTION_MPTCP_ADD_ADDR6 -#endif - ) & opts->suboptions) { + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) + if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif - if (opts->port) + if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { @@ -1182,28 +1177,30 @@ mp_capable_done: } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - len, echo, opts->addr_id); - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + len, echo, opts->addr.id); + if (opts->addr.family == AF_INET) { + memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + else if (opts->addr.family == AF_INET6) { + memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif - if (!opts->port) { + if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { + u16 port = ntohs(opts->addr.port); + if (opts->ahmac) { u8 *bptr = (u8 *)ptr; - put_unaligned_be16(opts->port, bptr); + put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; @@ -1212,7 +1209,7 @@ mp_capable_done: ptr += 3; } else { - put_unaligned_be32(opts->port << 16 | + put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; @@ -1221,9 +1218,23 @@ mp_capable_done: } if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + u8 i = 1; + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, - TCPOLEN_MPTCP_RM_ADDR_BASE, - 0, opts->rm_id); + TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, + 0, opts->rm_list.ids[0]); + + while (i < opts->rm_list.nr) { + u8 id1, id2, id3, id4; + + id1 = opts->rm_list.ids[i]; + id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; + id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; + id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; + put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); + ptr += 1; + i += 4; + } } if (OPTION_MPTCP_PRIO & opts->suboptions) { @@ -1265,6 +1276,12 @@ mp_capable_done: ptr += 5; } + if (OPTION_MPTCP_RST & opts->suboptions) + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -1316,3 +1333,20 @@ mp_capable_done: if (tp) mptcp_set_rwin(tp); } + +__be32 mptcp_get_reset_option(const struct sk_buff *skb) +{ + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags, reason; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + + return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + } + + return htonl(0u); +} +EXPORT_SYMBOL_GPL(mptcp_get_reset_option); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6fd4b2c1b076..9d00fa6d22e9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -14,7 +14,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo, bool port) + bool echo) { u8 add_addr = READ_ONCE(msk->pm.addr_signal); @@ -33,35 +33,36 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); if (addr->family == AF_INET6) add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); - if (port) + if (addr->port) add_addr |= BIT(MPTCP_ADD_ADDR_PORT); WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } -int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) +int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { u8 rm_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, local_id=%d", msk, local_id); + pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); if (rm_addr) { pr_warn("addr_signal error, rm_addr=%d", rm_addr); return -EINVAL; } - msk->pm.rm_id = local_id; + msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); + mptcp_pm_nl_addr_send_ack(msk); return 0; } -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id) +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { - pr_debug("msk=%p, local_id=%d", msk, local_id); + pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr); spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, local_id); + mptcp_pm_nl_rm_subflow_received(msk, rm_list); spin_unlock_bh(&msk->pm.lock); return 0; } @@ -152,8 +153,7 @@ void mptcp_pm_connection_closed(struct mptcp_sock *msk) pr_debug("msk=%p", msk); } -void mptcp_pm_subflow_established(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) +void mptcp_pm_subflow_established(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; @@ -188,7 +188,7 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_lock_bh(&pm->lock); if (!READ_ONCE(pm->accept_addr)) { - mptcp_pm_announce_addr(msk, addr, true, addr->port); + mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->remote = *addr; @@ -197,6 +197,21 @@ void mptcp_pm_add_addr_received(struct mptcp_sock *msk, spin_unlock_bh(&pm->lock); } +void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p", msk); + + spin_lock_bh(&pm->lock); + + if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) { if (!mptcp_pm_should_add_signal(msk)) @@ -205,17 +220,20 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } -void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id) +void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { struct mptcp_pm_data *pm = &msk->pm; + u8 i; - pr_debug("msk=%p remote_id=%d", msk, rm_id); + pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr); - mptcp_event_addr_removed(msk, rm_id); + for (i = 0; i < rm_list->nr; i++) + mptcp_event_addr_removed(msk, rm_list->ids[i]); spin_lock_bh(&pm->lock); mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED); - pm->rm_id = rm_id; + pm->rm_list_rx = *rm_list; spin_unlock_bh(&pm->lock); } @@ -258,9 +276,9 @@ out_unlock: } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id) + struct mptcp_rm_list *rm_list) { - int ret = false; + int ret = false, len; spin_lock_bh(&msk->pm.lock); @@ -268,10 +286,15 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; - if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) + len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); + if (len < 0) { + WRITE_ONCE(msk->pm.addr_signal, 0); + goto out_unlock; + } + if (remaining < len) goto out_unlock; - *rm_id = msk->pm.rm_id; + *rm_list = msk->pm.rm_list_tx; WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; @@ -291,7 +314,8 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) msk->pm.add_addr_accepted = 0; msk->pm.local_addr_used = 0; msk->pm.subflows = 0; - msk->pm.rm_id = 0; + msk->pm.rm_list_tx.nr = 0; + msk->pm.rm_list_rx.nr = 0; WRITE_ONCE(msk->pm.work_pending, false); WRITE_ONCE(msk->pm.addr_signal, 0); WRITE_ONCE(msk->pm.accept_addr, false); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 8e8e35fa4002..6ba040897738 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -25,6 +25,8 @@ static int pm_nl_pernet_id; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; + u8 flags; + int ifindex; struct rcu_head rcu; struct socket *lsk; }; @@ -56,8 +58,6 @@ struct pm_nl_pernet { #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 -static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk); - static bool addresses_equal(const struct mptcp_addr_info *a, struct mptcp_addr_info *b, bool use_port) { @@ -140,6 +140,24 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, return false; } +static bool lookup_subflow_by_daddr(const struct list_head *list, + struct mptcp_addr_info *daddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + remote_address(skc, &cur); + if (addresses_equal(&cur, daddr, daddr->port)) + return true; + } + + return false; +} + static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, struct mptcp_sock *msk) @@ -152,7 +170,7 @@ select_local_address(const struct pm_nl_pernet *pernet, rcu_read_lock(); __mptcp_flush_join_list(msk); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; if (entry->addr.family != sk->sk_family) { @@ -190,7 +208,7 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; if (i++ == pos) { ret = entry; @@ -245,9 +263,9 @@ static void check_work_pending(struct mptcp_sock *msk) WRITE_ONCE(msk->pm.work_pending, false); } -static struct mptcp_pm_add_entry * -lookup_anno_list_by_saddr(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; @@ -308,7 +326,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!mptcp_pm_should_add_signal(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false, entry->addr.port); + mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } @@ -319,6 +337,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer) spin_unlock_bh(&msk->pm.lock); + if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) + mptcp_pm_subflow_established(msk); + out: __sock_put(sk); } @@ -331,7 +352,7 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct sock *sk = (struct sock *)msk; spin_lock_bh(&msk->pm.lock); - entry = lookup_anno_list_by_saddr(msk, addr); + entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry) entry->retrans_times = ADD_ADDR_RETRANS_MAX; spin_unlock_bh(&msk->pm.lock); @@ -351,7 +372,7 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); - if (lookup_anno_list_by_saddr(msk, &entry->addr)) + if (mptcp_lookup_anno_list_by_saddr(msk, &entry->addr)) return false; add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -417,8 +438,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (local) { if (mptcp_pm_alloc_anno_list(msk, local)) { msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false, local->addr.port); - mptcp_pm_nl_add_addr_send_ack(msk); + mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_addr_send_ack(msk); } } else { /* pick failed, avoid fourther attempts later */ @@ -440,7 +461,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) check_work_pending(msk); remote_address((struct sock_common *)sk, &remote); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local->addr, &remote); + __mptcp_subflow_connect(sk, &local->addr, &remote, + local->flags, local->ifindex); spin_lock_bh(&msk->pm.lock); return; } @@ -468,7 +490,6 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_addr_info remote; struct mptcp_addr_info local; unsigned int subflows_max; - bool use_port = false; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); @@ -476,6 +497,10 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) pr_debug("accepted %d:%d remote family %d", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); + + if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) + goto add_addr_echo; + msk->pm.add_addr_accepted++; msk->pm.subflows++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || @@ -488,37 +513,37 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; - else - use_port = true; memset(&local, 0, sizeof(local)); local.family = remote.family; spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local, &remote); + __mptcp_subflow_connect(sk, &local, &remote, 0, 0); spin_lock_bh(&msk->pm.lock); - mptcp_pm_announce_addr(msk, &remote, true, use_port); - mptcp_pm_nl_add_addr_send_ack(msk); +add_addr_echo: + mptcp_pm_announce_addr(msk, &msk->pm.remote, true); + mptcp_pm_nl_addr_send_ack(msk); } -static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) +void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); - if (!mptcp_pm_should_add_signal(msk)) + if (!mptcp_pm_should_add_signal(msk) && + !mptcp_pm_should_rm_signal(msk)) return; __mptcp_flush_join_list(msk); subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u8 add_addr; spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for add_addr%s%s", + pr_debug("send ack for %s%s%s", + mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr", mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); @@ -526,13 +551,6 @@ static void mptcp_pm_nl_add_addr_send_ack(struct mptcp_sock *msk) tcp_send_ack(ssk); release_sock(ssk); spin_lock_bh(&msk->pm.lock); - - add_addr = READ_ONCE(msk->pm.addr_signal); - if (mptcp_pm_should_add_signal_ipv6(msk)) - add_addr &= ~BIT(MPTCP_ADD_ADDR_IPV6); - if (mptcp_pm_should_add_signal_port(msk)) - add_addr &= ~BIT(MPTCP_ADD_ADDR_PORT); - WRITE_ONCE(msk->pm.addr_signal, add_addr); } } @@ -571,43 +589,68 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; + u8 i; - pr_debug("address rm_id %d", msk->pm.rm_id); + pr_debug("%s rm_list_nr %d", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); - if (!msk->pm.rm_id) + if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + for (i = 0; i < rm_list->nr; i++) { + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + u8 id = subflow->local_id; - if (msk->pm.rm_id != subflow->remote_id) - continue; + if (rm_type == MPTCP_MIB_RMADDR) + id = subflow->remote_id; - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - msk->pm.add_addr_accepted--; - msk->pm.subflows--; - WRITE_ONCE(msk->pm.accept_addr, true); + if (rm_list->ids[i] != id) + continue; - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMADDR); + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", + i, rm_list->ids[i], subflow->local_id, subflow->remote_id); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, how); + mptcp_close_ssk(sk, ssk, subflow); + spin_lock_bh(&msk->pm.lock); - break; + if (rm_type == MPTCP_MIB_RMADDR) { + msk->pm.add_addr_accepted--; + WRITE_ONCE(msk->pm.accept_addr, true); + } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { + msk->pm.local_addr_used--; + } + msk->pm.subflows--; + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } } } +static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +{ + mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); +} + +void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) +{ + mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); +} + void mptcp_pm_nl_work(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; @@ -623,7 +666,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_add_addr_send_ack(msk); + mptcp_pm_nl_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); @@ -641,45 +684,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - - pr_debug("subflow rm_id %d", rm_id); - - msk_owned_by_me(msk); - - if (!rm_id) - return; - - if (list_empty(&msk->conn_list)) - return; - - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - - if (rm_id != subflow->local_id) - continue; - - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - msk->pm.local_addr_used--; - msk->pm.subflows--; - - __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); - - break; - } -} - static bool address_use_port(struct mptcp_pm_addr_entry *entry) { - return (entry->addr.flags & + return (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } @@ -731,11 +738,11 @@ find_next: if (entry->addr.id > pernet->next_id) pernet->next_id = entry->addr.id; - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max + 1); } @@ -837,10 +844,10 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return -ENOMEM; entry->addr = skc_local; - entry->addr.ifindex = 0; - entry->addr.flags = 0; entry->addr.id = 0; entry->addr.port = 0; + entry->ifindex = 0; + entry->flags = 0; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) @@ -955,14 +962,14 @@ skip_family: if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); - entry->addr.ifindex = val; + entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_ID]) entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) - entry->addr.flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); @@ -1071,12 +1078,15 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, struct mptcp_addr_info *addr, bool force) { + struct mptcp_rm_list list = { .nr = 0 }; bool ret; + list.ids[list.nr++] = addr->id; + ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); - mptcp_pm_remove_addr(msk, addr->id); + mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } return ret; @@ -1087,9 +1097,12 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; + struct mptcp_rm_list list = { .nr = 0 }; pr_debug("remove_id=%d", addr->id); + list.ids[list.nr++] = addr->id; + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; @@ -1103,7 +1116,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow); if (remove_subflow) - mptcp_pm_remove_subflow(msk, addr->id); + mptcp_pm_remove_subflow(msk, &list); release_sock(sk); next: @@ -1146,6 +1159,41 @@ static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry) } } +static int mptcp_nl_remove_id_zero_address(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + list.ids[list.nr++] = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info msk_local; + + if (list_empty(&msk->conn_list)) + goto next; + + local_address((struct sock_common *)msk, &msk_local); + if (!addresses_equal(&msk_local, addr, addr->port)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &list); + mptcp_pm_nl_rm_subflow_received(msk, &list); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; @@ -1158,6 +1206,14 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; + /* the zero id address is special: the first address used by the msk + * always gets such an id, so different subflows can have different zero + * id addresses. Additionally zero id is not accounted for in id_bitmap. + * Let's use an 'mptcp_rm_list' instead of the common remove code. + */ + if (addr.addr.id == 0) + return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); + spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { @@ -1165,11 +1221,11 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&pernet->lock); return -EINVAL; } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); } - if (entry->addr.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max - 1); } @@ -1185,14 +1241,61 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void __flush_addrs(struct net *net, struct list_head *list) +static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX && + slist.nr < MPTCP_RM_IDS_MAX) { + alist.ids[alist.nr++] = entry->addr.id; + slist.ids[slist.nr++] = entry->addr.id; + } else if (remove_anno_list_by_saddr(msk, &entry->addr) && + alist.nr < MPTCP_RM_IDS_MAX) { + alist.ids[alist.nr++] = entry->addr.id; + } + } + + if (alist.nr) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); + } + if (slist.nr) + mptcp_pm_remove_subflow(msk, &slist); +} + +static void mptcp_nl_remove_addrs_list(struct net *net, + struct list_head *rm_list) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (list_empty(rm_list)) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + mptcp_pm_remove_addrs_and_subflows(msk, rm_list); + release_sock(sk); + + sock_put(sk); + cond_resched(); + } +} + +static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); - mptcp_nl_remove_subflow_and_signal_addr(net, &cur->addr); list_del_rcu(&cur->list); mptcp_pm_free_addr_entry(cur); } @@ -1217,7 +1320,8 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); - __flush_addrs(sock_net(skb->sk), &free_list); + mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); + __flush_addrs(&free_list); return 0; } @@ -1237,10 +1341,10 @@ static int mptcp_nl_fill_addr(struct sk_buff *skb, goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->addr.flags)) + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; - if (entry->addr.ifindex && - nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->addr.ifindex)) + if (entry->ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && @@ -1468,7 +1572,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) if (ret < 0) return ret; - if (addr.addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; list_for_each_entry(entry, &pernet->local_addr_list, list) { @@ -1478,9 +1582,9 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) return ret; if (bkup) - entry->addr.flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; else - entry->addr.flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; } } @@ -1586,9 +1690,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { + const struct mptcp_subflow_context *sf; + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; + sf = mptcp_subflow_ctx(ssk); + if (!sf->reset_seen) + return 0; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) + return -EMSGSIZE; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) + return -EMSGSIZE; + return 0; } @@ -1814,7 +1930,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) /* net is removed from namespace list, can't race with * other modifiers */ - __flush_addrs(net, &pernet->local_addr_list); + __flush_addrs(&pernet->local_addr_list); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 61329b8181ea..29a2d690d8d5 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -25,6 +25,9 @@ #include "protocol.h" #include "mib.h" +#define CREATE_TRACE_POINTS +#include <trace/events/mptcp.h> + #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; @@ -90,16 +93,6 @@ static bool mptcp_is_tcpsk(struct sock *sk) return false; } -static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) -{ - sock_owned_by_me((const struct sock *)msk); - - if (likely(!__mptcp_check_fallback(msk))) - return NULL; - - return msk->first; -} - static int __mptcp_socket_create(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -417,18 +410,6 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } -static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ - if (subflow->request_join && !subflow->fully_established) - return false; - - /* only send if our side has not closed yet */ - return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); -} - static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & @@ -499,7 +480,7 @@ static bool mptcp_check_data_fin(struct sock *sk) u64 rcv_data_fin_seq; bool ret = false; - if (__mptcp_check_fallback(msk) || !msk->first) + if (__mptcp_check_fallback(msk)) return ret; /* Need to ack a DATA_FIN received from a peer while this side @@ -748,18 +729,47 @@ wake: sk->sk_data_ready(sk); } -void __mptcp_flush_join_list(struct mptcp_sock *msk) +static bool mptcp_do_flush_join_list(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; + bool ret = false; if (likely(list_empty(&msk->join_list))) - return; + return false; spin_lock_bh(&msk->join_list_lock); - list_for_each_entry(subflow, &msk->join_list, node) + list_for_each_entry(subflow, &msk->join_list, node) { + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow)); + if (READ_ONCE(msk->setsockopt_seq) != sseq) + ret = true; + } list_splice_tail_init(&msk->join_list, &msk->conn_list); spin_unlock_bh(&msk->join_list_lock); + + return ret; +} + +void __mptcp_flush_join_list(struct mptcp_sock *msk) +{ + if (likely(!mptcp_do_flush_join_list(msk))) + return; + + if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); +} + +static void mptcp_flush_join_list(struct mptcp_sock *msk) +{ + bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags); + + might_sleep(); + + if (!mptcp_do_flush_join_list(msk) && !sync_needed) + return; + + mptcp_sockopt_sync_all(msk); } static bool mptcp_timer_pending(struct sock *sk) @@ -1283,7 +1293,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, int avail_size; size_t ret = 0; - pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d", + pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); /* compute send limit */ @@ -1411,6 +1421,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) send_info[i].ratio = -1; } mptcp_for_each_subflow(msk, subflow) { + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; @@ -1431,10 +1442,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } } - pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", - msk, nr_active, send_info[0].ssk, send_info[0].ratio, - send_info[1].ssk, send_info[1].ratio); - /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[0].ssk = send_info[1].ssk; @@ -1475,7 +1482,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) int ret = 0; prev_ssk = ssk; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); /* try to keep the subflow socket lock across @@ -1615,9 +1622,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int ret = 0; long timeo; - if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) + /* we don't support FASTOPEN yet */ + if (msg->msg_flags & MSG_FASTOPEN) return -EOPNOTSUPP; + /* silently ignore everything else */ + msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1701,7 +1712,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!msk->first_pending) WRITE_ONCE(msk->first_pending, dfrag); } - pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk, + pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, !dfrag_collapsed); @@ -1740,36 +1751,41 @@ static void mptcp_wait_data(struct sock *sk, long *timeo) static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, - size_t len) + size_t len, int flags) { - struct sk_buff *skb; + struct sk_buff *skb, *tmp; int copied = 0; - while ((skb = skb_peek(&msk->receive_queue)) != NULL) { + skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); int err; - err = skb_copy_datagram_msg(skb, offset, msg, count); - if (unlikely(err < 0)) { - if (!copied) - return err; - break; + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_msg(skb, offset, msg, count); + if (unlikely(err < 0)) { + if (!copied) + return err; + break; + } } copied += count; if (count < data_len) { - MPTCP_SKB_CB(skb)->offset += count; + if (!(flags & MSG_PEEK)) + MPTCP_SKB_CB(skb)->offset += count; break; } - /* we will bulk release the skb memory later */ - skb->destructor = NULL; - msk->rmem_released += skb->truesize; - __skb_unlink(skb, &msk->receive_queue); - __kfree_skb(skb); + if (!(flags & MSG_PEEK)) { + /* we will bulk release the skb memory later */ + skb->destructor = NULL; + msk->rmem_released += skb->truesize; + __skb_unlink(skb, &msk->receive_queue); + __kfree_skb(skb); + } if (copied >= len) break; @@ -1901,7 +1917,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) unsigned int moved = 0; bool ret, done; - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); do { struct sock *ssk = mptcp_subflow_recv_lookup(msk); bool slowpath; @@ -1946,8 +1962,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int target; long timeo; - if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT)) - return -EOPNOTSUPP; + /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */ + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); if (unlikely(sk->sk_state == TCP_LISTEN)) { @@ -1963,7 +1980,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied); + bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2047,34 +2064,28 @@ out_err: pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", msk, test_bit(MPTCP_DATA_READY, &msk->flags), skb_queue_empty_lockless(&sk->sk_receive_queue), copied); - mptcp_rcv_space_adjust(msk, copied); + if (!(flags & MSG_PEEK)) + mptcp_rcv_space_adjust(msk, copied); release_sock(sk); return copied; } -static void mptcp_retransmit_handler(struct sock *sk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - set_bit(MPTCP_WORK_RTX, &msk->flags); - mptcp_schedule_work(sk); -} - static void mptcp_retransmit_timer(struct timer_list *t) { struct inet_connection_sock *icsk = from_timer(icsk, t, icsk_retransmit_timer); struct sock *sk = &icsk->icsk_inet.sk; + struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - mptcp_retransmit_handler(sk); + /* we need a process context to retransmit */ + if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags)) + mptcp_schedule_work(sk); } else { /* delegate our work to tcp_release_cb() */ - if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, - &sk->sk_tsq_flags)) - sock_hold(sk); + set_bit(MPTCP_RETRANSMIT, &msk->flags); } bh_unlock_sock(sk); sock_put(sk); @@ -2343,7 +2354,7 @@ static void mptcp_worker(struct work_struct *work) goto unlock; mptcp_check_data_fin_ack(sk); - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_check_fastclose(msk); @@ -2406,6 +2417,9 @@ static int __mptcp_init_sock(struct sock *sk) /* re-use the csk retrans timer for MPTCP-level retrans */ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0); + + tcp_assign_congestion_control(sk); + return 0; } @@ -2545,7 +2559,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk) } } - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -2601,6 +2615,8 @@ static void __mptcp_destroy_sock(struct sock *sk) WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); + + tcp_cleanup_congestion_control(sk); sk_refcnt_debug_release(sk); mptcp_dispose_initial_subflow(msk); sock_put(sk); @@ -2627,7 +2643,7 @@ static void mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) { + mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast(ssk); @@ -2682,7 +2698,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_flush_join_list(msk); + mptcp_do_flush_join_list(msk); + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2731,6 +2748,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->snd_nxt = msk->write_seq; msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; if (mp_opt->mp_capable) { msk->can_ack = true; @@ -2839,161 +2857,6 @@ static void mptcp_destroy(struct sock *sk) sk_sockets_allocated_dec(sk); } -static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - struct socket *ssock; - int ret; - - switch (optname) { - case SO_REUSEPORT: - case SO_REUSEADDR: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); - if (ret == 0) { - if (optname == SO_REUSEPORT) - sk->sk_reuseport = ssock->sk->sk_reuseport; - else if (optname == SO_REUSEADDR) - sk->sk_reuse = ssock->sk->sk_reuse; - } - release_sock(sk); - return ret; - } - - return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); -} - -static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct sock *sk = (struct sock *)msk; - int ret = -EOPNOTSUPP; - struct socket *ssock; - - switch (optname) { - case IPV6_V6ONLY: - lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - release_sock(sk); - return -EINVAL; - } - - ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); - if (ret == 0) - sk->sk_ipv6only = ssock->sk->sk_ipv6only; - - release_sock(sk); - break; - } - - return ret; -} - -static bool mptcp_unsupported(int level, int optname) -{ - if (level == SOL_IP) { - switch (optname) { - case IP_ADD_MEMBERSHIP: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - if (level == SOL_IPV6) { - switch (optname) { - case IPV6_ADDRFORM: - case IPV6_ADD_MEMBERSHIP: - case IPV6_DROP_MEMBERSHIP: - case IPV6_JOIN_ANYCAST: - case IPV6_LEAVE_ANYCAST: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_MSFILTER: - return true; - } - return false; - } - return false; -} - -static int mptcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - if (mptcp_unsupported(level, optname)) - return -ENOPROTOOPT; - - if (level == SOL_SOCKET) - return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when TCP socket options are passed through - * to the one remaining subflow. - */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_setsockopt(ssk, level, optname, optval, optlen); - - if (level == SOL_IPV6) - return mptcp_setsockopt_v6(msk, optname, optval, optlen); - - return -EOPNOTSUPP; -} - -static int mptcp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *option) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - struct sock *ssk; - - pr_debug("msk=%p", msk); - - /* @@ the meaning of setsockopt() when the socket is connected and - * there are multiple subflows is not yet defined. It is up to the - * MPTCP-level socket to configure the subflows until the subflow - * is in TCP fallback, when socket options are passed through - * to the one remaining subflow. - */ - lock_sock(sk); - ssk = __mptcp_tcp_fallback(msk); - release_sock(sk); - if (ssk) - return tcp_getsockopt(ssk, level, optname, optval, option); - - return -EOPNOTSUPP; -} - void __mptcp_data_acked(struct sock *sk) { if (!sock_owned_by_user(sk)) @@ -3022,17 +2885,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) } } -#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED) - /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) { - unsigned long flags, nflags; - for (;;) { - flags = 0; + unsigned long flags = 0; + if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) flags |= BIT(MPTCP_PUSH_PENDING); + if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags)) + flags |= BIT(MPTCP_RETRANSMIT); if (!flags) break; @@ -3047,6 +2909,8 @@ static void mptcp_release_cb(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); if (flags & BIT(MPTCP_PUSH_PENDING)) __mptcp_push_pending(sk, 0); + if (flags & BIT(MPTCP_RETRANSMIT)) + __mptcp_retrans(sk); cond_resched(); spin_lock_bh(&sk->sk_lock.slock); @@ -3062,20 +2926,6 @@ static void mptcp_release_cb(struct sock *sk) */ __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); - - do { - flags = sk->sk_tsq_flags; - if (!(flags & MPTCP_DEFERRED_ALL)) - return; - nflags = flags & ~MPTCP_DEFERRED_ALL; - } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - - sock_release_ownership(sk); - - if (flags & TCPF_WRITE_TIMER_DEFERRED) { - mptcp_retransmit_handler(sk); - __sock_put(sk); - } } void mptcp_subflow_process_delegated(struct sock *ssk) @@ -3174,14 +3024,18 @@ bool mptcp_finish_join(struct sock *ssk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (!mptcp_is_fully_established(parent)) + if (!mptcp_is_fully_established(parent)) { + subflow->reset_reason = MPTCP_RST_EMPTCP; return false; + } if (!msk->pm.server_side) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* active connections are already on conn_list, and we can't acquire * msk lock here. @@ -3195,8 +3049,10 @@ bool mptcp_finish_join(struct sock *ssk) sock_hold(ssk); } spin_unlock_bh(&msk->join_list_lock); - if (!ret) + if (!ret) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* attach to msk socket only after we are sure he will deal with us * at close time @@ -3308,8 +3164,12 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); do_connect: err = ssock->ops->connect(ssock, uaddr, addr_len, flags); @@ -3406,7 +3266,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. */ - __mptcp_flush_join_list(msk); + mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e21a5bc36cf0..edc0128730df 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -22,10 +22,10 @@ #define OPTION_MPTCP_MPJ_SYNACK BIT(4) #define OPTION_MPTCP_MPJ_ACK BIT(5) #define OPTION_MPTCP_ADD_ADDR BIT(6) -#define OPTION_MPTCP_ADD_ADDR6 BIT(7) -#define OPTION_MPTCP_RM_ADDR BIT(8) -#define OPTION_MPTCP_FASTCLOSE BIT(9) -#define OPTION_MPTCP_PRIO BIT(10) +#define OPTION_MPTCP_RM_ADDR BIT(7) +#define OPTION_MPTCP_FASTCLOSE BIT(8) +#define OPTION_MPTCP_PRIO BIT(9) +#define OPTION_MPTCP_RST BIT(10) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -36,6 +36,7 @@ #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 +#define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 @@ -61,10 +62,11 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 -#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 +#define TCPOLEN_MPTCP_RST 4 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -88,12 +90,13 @@ /* MPTCP ADD_ADDR flags */ #define MPTCP_ADDR_ECHO BIT(0) -#define MPTCP_ADDR_IPVERSION_4 4 -#define MPTCP_ADDR_IPVERSION_6 6 /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) +/* MPTCP TCPRST flags */ +#define MPTCP_RST_TRANSIENT BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@ -104,6 +107,8 @@ #define MPTCP_PUSH_PENDING 6 #define MPTCP_CLEAN_UNA 7 #define MPTCP_ERROR_REPORT 8 +#define MPTCP_RETRANSMIT 9 +#define MPTCP_WORK_SYNC_SETSOCKOPT 10 static inline bool before64(__u64 seq1, __u64 seq2) { @@ -122,11 +127,11 @@ struct mptcp_options_received { u16 mp_capable : 1, mp_join : 1, fastclose : 1, + reset : 1, dss : 1, add_addr : 1, rm_addr : 1, mp_prio : 1, - family : 4, echo : 1, backup : 1; u32 token; @@ -141,16 +146,11 @@ struct mptcp_options_received { ack64:1, mpc_map:1, __unused:2; - u8 addr_id; - u8 rm_id; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; + struct mptcp_addr_info addr; + struct mptcp_rm_list rm_list; u64 ahmac; - u16 port; + u8 reset_reason:4; + u8 reset_transient:1; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -159,20 +159,6 @@ static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) ((nib & 0xF) << 8) | field); } -struct mptcp_addr_info { - sa_family_t family; - __be16 port; - u8 id; - u8 flags; - int ifindex; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; -}; - enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, @@ -207,7 +193,8 @@ struct mptcp_pm_data { u8 local_addr_used; u8 subflows; u8 status; - u8 rm_id; + struct mptcp_rm_list rm_list_tx; + struct mptcp_rm_list rm_list_rx; }; struct mptcp_data_frag { @@ -269,6 +256,8 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + + u32 setsockopt_seq; }; #define mptcp_lock_sock(___sk, cb) do { \ @@ -420,10 +409,15 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; u8 local_id; u8 remote_id; + u8 reset_seen:1; + u8 reset_transient:1; + u8 reset_reason:4; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ + u32 setsockopt_seq; + struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ const struct inet_connection_sock_af_ops *icsk_af_ops; @@ -543,12 +537,25 @@ struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote); + const struct mptcp_addr_info *remote, + u8 flags, int ifindex); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); +static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */ + if (subflow->request_join && !subflow->fully_established) + return false; + + /* only send if our side has not closed yet */ + return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); +} + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { @@ -581,6 +588,11 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option); + void __mptcp_check_push(struct sock *sk, struct sock *ssk); void __mptcp_data_acked(struct sock *sk); void __mptcp_error_report(struct sock *sk); @@ -641,13 +653,16 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); -void mptcp_pm_subflow_established(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow); +void mptcp_pm_subflow_established(struct mptcp_sock *msk); void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); void mptcp_pm_add_addr_received(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); +void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, + struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, @@ -657,12 +672,15 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_addr_info *addr); +struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, + struct mptcp_addr_info *addr); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, - bool echo, bool port); -int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 local_id); + bool echo); +int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); @@ -709,23 +727,38 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) return len; } +static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) +{ + if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) + return -EINVAL; + + return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; +} + bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id); + struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_data_init(struct mptcp_sock *msk); void mptcp_pm_nl_work(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); unsigned int mptcp_pm_get_add_addr_signal_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); + +void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); +void mptcp_sockopt_sync_all(struct mptcp_sock *msk); + +static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c new file mode 100644 index 000000000000..00d941b66c1e --- /dev/null +++ b/net/mptcp/sockopt.c @@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2021, Red Hat. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <net/sock.h> +#include <net/protocol.h> +#include <net/tcp.h> +#include <net/mptcp.h> +#include "protocol.h" + +static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) +{ + sock_owned_by_me((const struct sock *)msk); + + if (likely(!__mptcp_check_fallback(msk))) + return NULL; + + return msk->first; +} + +static u32 sockopt_seq_reset(const struct sock *sk) +{ + sock_owned_by_me(sk); + + /* Highbits contain state. Allows to distinguish sockopt_seq + * of listener and established: + * s0 = new_listener() + * sockopt(s0) - seq is 1 + * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0) + * sockopt(s0) - seq increments to 2 on s0 + * sockopt(s1) // seq increments to 2 on s1 (different option) + * new ssk completes join, inherits options from s0 // seq 2 + * Needs sync from mptcp join logic, but ssk->seq == msk->seq + * + * Set High order bits to sk_state so ssk->seq == msk->seq test + * will fail. + */ + + return (u32)sk->sk_state << 24u; +} + +static void sockopt_seq_inc(struct mptcp_sock *msk) +{ + u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff; + + msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq; +} + +static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen, int *val) +{ + if (optlen < sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(val, optval, sizeof(*val))) + return -EFAULT; + + return 0; +} + +static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + sockopt_seq_inc(msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + switch (optname) { + case SO_DEBUG: + sock_valbool_flag(ssk, SOCK_DBG, !!val); + break; + case SO_KEEPALIVE: + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, !!val); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val); + break; + case SO_PRIORITY: + ssk->sk_priority = val; + break; + case SO_SNDBUF: + case SO_SNDBUFFORCE: + ssk->sk_userlocks |= SOCK_SNDBUF_LOCK; + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + break; + case SO_RCVBUF: + case SO_RCVBUFFORCE: + ssk->sk_userlocks |= SOCK_RCVBUF_LOCK; + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + break; + case SO_MARK: + if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + break; + case SO_INCOMING_CPU: + WRITE_ONCE(ssk->sk_incoming_cpu, val); + break; + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); +} + +static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val) +{ + sockptr_t optval = KERNEL_SOCKPTR(&val); + struct sock *sk = (struct sock *)msk; + int ret; + + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, + optval, sizeof(val)); + if (ret) + return ret; + + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; +} + +static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val) +{ + struct sock *sk = (struct sock *)msk; + + WRITE_ONCE(sk->sk_incoming_cpu, val); + + mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val); +} + +static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + int val, ret; + + ret = mptcp_get_int_option(msk, optval, optlen, &val); + if (ret) + return ret; + + switch (optname) { + case SO_KEEPALIVE: + mptcp_sol_socket_sync_intval(msk, optname, val); + return 0; + case SO_DEBUG: + case SO_MARK: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + return mptcp_sol_socket_intval(msk, optname, val); + case SO_INCOMING_CPU: + mptcp_so_incoming_cpu(msk, val); + return 0; + } + + return -ENOPROTOOPT; +} + +static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct linger ling; + sockptr_t kopt; + int ret; + + if (optlen < sizeof(ling)) + return -EINVAL; + + if (copy_from_sockptr(&ling, optval, sizeof(ling))) + return -EFAULT; + + kopt = KERNEL_SOCKPTR(&ling); + ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling)); + if (ret) + return ret; + + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow = lock_sock_fast(ssk); + + if (!ling.l_onoff) { + sock_reset_flag(ssk, SOCK_LINGER); + } else { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } + + subflow->setsockopt_seq = msk->setsockopt_seq; + unlock_sock_fast(ssk, slow); + } + + release_sock(sk); + return 0; +} + +static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret; + + switch (optname) { + case SO_REUSEPORT: + case SO_REUSEADDR: + case SO_BINDTODEVICE: + case SO_BINDTOIFINDEX: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); + if (ret == 0) { + if (optname == SO_REUSEPORT) + sk->sk_reuseport = ssock->sk->sk_reuseport; + else if (optname == SO_REUSEADDR) + sk->sk_reuse = ssock->sk->sk_reuse; + else if (optname == SO_BINDTODEVICE) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + else if (optname == SO_BINDTOIFINDEX) + sk->sk_bound_dev_if = ssock->sk->sk_bound_dev_if; + } + release_sock(sk); + return ret; + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_DEBUG: + return mptcp_setsockopt_sol_socket_int(msk, optname, optval, optlen); + case SO_LINGER: + return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen); + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return 0; + } + + return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen); +} + +static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = (struct sock *)msk; + int ret = -EOPNOTSUPP; + struct socket *ssock; + + switch (optname) { + case IPV6_V6ONLY: + lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) { + release_sock(sk); + return -EINVAL; + } + + ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); + if (ret == 0) + sk->sk_ipv6only = ssock->sk->sk_ipv6only; + + release_sock(sk); + break; + } + + return ret; +} + +static bool mptcp_supported_sockopt(int level, int optname) +{ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_DEBUG: + case SO_REUSEPORT: + case SO_REUSEADDR: + + /* the following ones need a better implementation, + * but are quite common we want to preserve them + */ + case SO_BINDTODEVICE: + case SO_SNDBUF: + case SO_SNDBUFFORCE: + case SO_RCVBUF: + case SO_RCVBUFFORCE: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_LINGER: + case SO_TIMESTAMP_OLD: + case SO_TIMESTAMP_NEW: + case SO_TIMESTAMPNS_OLD: + case SO_TIMESTAMPNS_NEW: + case SO_TIMESTAMPING_OLD: + case SO_TIMESTAMPING_NEW: + case SO_RCVLOWAT: + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + case SO_MARK: + case SO_INCOMING_CPU: + case SO_BINDTOIFINDEX: + case SO_BUSY_POLL: + case SO_PREFER_BUSY_POLL: + case SO_BUSY_POLL_BUDGET: + + /* next ones are no-op for plain TCP */ + case SO_NO_CHECK: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_BSDCOMPAT: + case SO_PASSCRED: + case SO_PASSSEC: + case SO_RXQ_OVFL: + case SO_WIFI_STATUS: + case SO_NOFCS: + case SO_SELECT_ERR_QUEUE: + return true; + } + + /* SO_OOBINLINE is not supported, let's avoid the related mess */ + /* SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF, + * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER, + * we must be careful with subflows + */ + /* SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks + * explicitly the sk_protocol field + */ + /* SO_PEEK_OFF is unsupported, as it is for plain TCP */ + /* SO_MAX_PACING_RATE is unsupported, we must be careful with subflows */ + /* SO_CNX_ADVICE is currently unsupported, could possibly be relevant, + * but likely needs careful design + */ + /* SO_ZEROCOPY is currently unsupported, TODO in sndmsg */ + /* SO_TXTIME is currently unsupported */ + return false; + } + if (level == SOL_IP) { + switch (optname) { + /* should work fine */ + case IP_FREEBIND: + case IP_TRANSPARENT: + + /* the following are control cmsg related */ + case IP_PKTINFO: + case IP_RECVTTL: + case IP_RECVTOS: + case IP_RECVOPTS: + case IP_RETOPTS: + case IP_PASSSEC: + case IP_RECVORIGDSTADDR: + case IP_CHECKSUM: + case IP_RECVFRAGSIZE: + + /* common stuff that need some love */ + case IP_TOS: + case IP_TTL: + case IP_BIND_ADDRESS_NO_PORT: + case IP_MTU_DISCOVER: + case IP_RECVERR: + + /* possibly less common may deserve some love */ + case IP_MINTTL: + + /* the following is apparently a no-op for plain TCP */ + case IP_RECVERR_RFC4884: + return true; + } + + /* IP_OPTIONS is not supported, needs subflow care */ + /* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */ + /* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF, + * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP, + * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE, + * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP, + * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, + * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal + * with mcast stuff + */ + /* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */ + return false; + } + if (level == SOL_IPV6) { + switch (optname) { + case IPV6_V6ONLY: + + /* the following are control cmsg related */ + case IPV6_RECVPKTINFO: + case IPV6_2292PKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_2292HOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_2292RTHDR: + case IPV6_RECVHOPOPTS: + case IPV6_2292HOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_2292DSTOPTS: + case IPV6_RECVTCLASS: + case IPV6_FLOWINFO: + case IPV6_RECVPATHMTU: + case IPV6_RECVORIGDSTADDR: + case IPV6_RECVFRAGSIZE: + + /* the following ones need some love but are quite common */ + case IPV6_TCLASS: + case IPV6_TRANSPARENT: + case IPV6_FREEBIND: + case IPV6_PKTINFO: + case IPV6_2292PKTOPTIONS: + case IPV6_UNICAST_HOPS: + case IPV6_MTU_DISCOVER: + case IPV6_MTU: + case IPV6_RECVERR: + case IPV6_FLOWINFO_SEND: + case IPV6_FLOWLABEL_MGR: + case IPV6_MINHOPCOUNT: + case IPV6_DONTFRAG: + case IPV6_AUTOFLOWLABEL: + + /* the following one is a no-op for plain TCP */ + case IPV6_RECVERR_RFC4884: + return true; + } + + /* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are + * not supported + */ + /* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF, + * IPV6_MULTICAST_IF, IPV6_ADDRFORM, + * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST, + * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP, + * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP, + * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER + * are not supported better not deal with mcast + */ + /* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */ + + /* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */ + /* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */ + return false; + } + if (level == SOL_TCP) { + switch (optname) { + /* the following are no-op or should work just fine */ + case TCP_THIN_DUPACK: + case TCP_DEFER_ACCEPT: + + /* the following need some love */ + case TCP_MAXSEG: + case TCP_NODELAY: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_CONGESTION: + case TCP_ULP: + case TCP_CORK: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_SAVE_SYN: + case TCP_LINGER2: + case TCP_WINDOW_CLAMP: + case TCP_QUICKACK: + case TCP_USER_TIMEOUT: + case TCP_TIMESTAMP: + case TCP_NOTSENT_LOWAT: + case TCP_TX_DELAY: + return true; + } + + /* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */ + + /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, + * TCP_REPAIR_WINDOW are not supported, better avoid this mess + */ + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE, + * are not supported fastopen is currently unsupported + */ + /* TCP_INQ is currently unsupported, needs some recvmsg work */ + } + return false; +} + +static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + char name[TCP_CA_NAME_MAX]; + bool cap_net_admin; + int ret; + + if (optlen < 1) + return -EINVAL; + + ret = strncpy_from_sockptr(name, optval, + min_t(long, TCP_CA_NAME_MAX - 1, optlen)); + if (ret < 0) + return -EFAULT; + + name[ret] = 0; + + cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN); + + ret = 0; + lock_sock(sk); + sockopt_seq_inc(msk); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + int err; + + lock_sock(ssk); + err = tcp_set_congestion_control(ssk, name, true, cap_net_admin); + if (err < 0 && ret == 0) + ret = err; + subflow->setsockopt_seq = msk->setsockopt_seq; + release_sock(ssk); + } + + if (ret == 0) + tcp_set_congestion_control(sk, name, false, cap_net_admin); + + release_sock(sk); + return ret; +} + +static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) +{ + switch (optname) { + case TCP_ULP: + return -EOPNOTSUPP; + case TCP_CONGESTION: + return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen); + } + + return -EOPNOTSUPP; +} + +int mptcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + if (!mptcp_supported_sockopt(level, optname)) + return -ENOPROTOOPT; + + if (level == SOL_SOCKET) + return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when TCP socket options are passed through + * to the one remaining subflow. + */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_setsockopt(ssk, level, optname, optval, optlen); + + if (level == SOL_IPV6) + return mptcp_setsockopt_v6(msk, optname, optval, optlen); + + if (level == SOL_TCP) + return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen); + + return -EOPNOTSUPP; +} + +static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (struct sock *)msk; + struct socket *ssock; + int ret = -EINVAL; + struct sock *ssk; + + lock_sock(sk); + ssk = msk->first; + if (ssk) { + ret = tcp_getsockopt(ssk, level, optname, optval, optlen); + goto out; + } + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + goto out; + + ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); + +out: + release_sock(sk); + return ret; +} + +static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + switch (optname) { + case TCP_ULP: + case TCP_CONGESTION: + case TCP_INFO: + case TCP_CC_INFO: + return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); + } + return -EOPNOTSUPP; +} + +int mptcp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk; + + pr_debug("msk=%p", msk); + + /* @@ the meaning of setsockopt() when the socket is connected and + * there are multiple subflows is not yet defined. It is up to the + * MPTCP-level socket to configure the subflows until the subflow + * is in TCP fallback, when socket options are passed through + * to the one remaining subflow. + */ + lock_sock(sk); + ssk = __mptcp_tcp_fallback(msk); + release_sock(sk); + if (ssk) + return tcp_getsockopt(ssk, level, optname, optval, option); + + if (level == SOL_TCP) + return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); + return -EOPNOTSUPP; +} + +static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) +{ + static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; + struct sock *sk = (struct sock *)msk; + + if (ssk->sk_prot->keepalive) { + if (sock_flag(sk, SOCK_KEEPOPEN)) + ssk->sk_prot->keepalive(ssk, 1); + else + ssk->sk_prot->keepalive(ssk, 0); + } + + ssk->sk_priority = sk->sk_priority; + ssk->sk_bound_dev_if = sk->sk_bound_dev_if; + ssk->sk_incoming_cpu = sk->sk_incoming_cpu; + + if (sk->sk_userlocks & tx_rx_locks) { + ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks; + if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) + WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf); + } + + if (sock_flag(sk, SOCK_LINGER)) { + ssk->sk_lingertime = sk->sk_lingertime; + sock_set_flag(ssk, SOCK_LINGER); + } else { + sock_reset_flag(ssk, SOCK_LINGER); + } + + if (sk->sk_mark != ssk->sk_mark) { + ssk->sk_mark = sk->sk_mark; + sk_dst_reset(ssk); + } + + sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG)); + + if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops) + tcp_set_congestion_control(ssk, inet_csk(sk)->icsk_ca_ops->name, false, true); +} + +static void __mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + bool slow = lock_sock_fast(ssk); + + sync_socket_options(msk, ssk); + + unlock_sock_fast(ssk, slow); +} + +void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + msk_owned_by_me(msk); + + if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + + subflow->setsockopt_seq = msk->setsockopt_seq; + } +} + +void mptcp_sockopt_sync_all(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + u32 seq; + + seq = sockopt_seq_reset(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u32 sseq = READ_ONCE(subflow->setsockopt_seq); + + if (sseq != msk->setsockopt_seq) { + __mptcp_sockopt_sync(msk, ssk); + WRITE_ONCE(subflow->setsockopt_seq, seq); + } else if (sseq != seq) { + WRITE_ONCE(subflow->setsockopt_seq, seq); + } + + cond_resched(); + } + + msk->setsockopt_seq = seq; +} diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d17d39ccdf34..82e91b00ad39 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -25,6 +25,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, @@ -115,6 +117,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } +static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) +{ + struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = reason; + } +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -165,6 +177,7 @@ again: if (mptcp_token_exists(subflow_req->token)) { if (retries-- > 0) goto again; + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else { subflow_req->mp_capable = 1; } @@ -176,6 +189,8 @@ again: subflow_req->mp_capable = 1; else if (retries-- > 0) goto again; + else + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); } else if (mp_opt.mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -187,8 +202,10 @@ again: subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. */ - if (!subflow_req->msk) + if (!subflow_req->msk) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; + } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", @@ -392,12 +409,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->remote_key = mp_opt.sndr_key; pr_debug("subflow=%p, remote_key=%llu", subflow, subflow->remote_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) + if (!mp_opt.mp_join) { + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; + } subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; @@ -406,6 +426,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -434,6 +455,7 @@ fallback: return; do_reset: + subflow->reset_transient = 0; mptcp_subflow_reset(sk); } @@ -650,13 +672,18 @@ create_child: * to reset the context to non MPTCP status. */ if (!ctx || fallback) { - if (fallback_is_fatal) + if (fallback_is_fatal) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; + } subflow_drop_ctx(child); goto out; } + /* ssk inherits options of listener sk */ + ctx->setsockopt_seq = listener->setsockopt_seq; + if (ctx->mp_capable) { /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space @@ -672,6 +699,7 @@ create_child: * created mptcp socket */ new_msk->sk_destruct = mptcp_sock_destruct; + mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; @@ -686,8 +714,10 @@ create_child: struct mptcp_sock *owner; owner = subflow_req->msk; - if (!owner) + if (!owner) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; + } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; @@ -834,9 +864,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, goto validate_seq; } - pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", - mpext->data_seq, mpext->dsn64, mpext->subflow_seq, - mpext->data_len, mpext->data_fin); + trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { @@ -974,8 +1002,6 @@ static bool subflow_check_data_avail(struct sock *ssk) struct mptcp_sock *msk; struct sk_buff *skb; - pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, - subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); if (!skb_peek(&ssk->sk_receive_queue)) subflow->data_avail = 0; if (subflow->data_avail) @@ -987,7 +1013,7 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 old_ack; status = get_mapping_status(ssk, msk); - pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; goto fatal; @@ -1052,6 +1078,8 @@ fatal: smp_wmb(); ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); subflow->data_avail = 0; return false; @@ -1081,7 +1109,7 @@ bool mptcp_subflow_data_available(struct sock *sk) * In mptcp, rwin is about the mptcp-level connection data. * * Data that is still on the ssk rx queue can thus be ignored, - * as far as mptcp peer is concerened that data is still inflight. + * as far as mptcp peer is concerned that data is still inflight. * DSS ACK is updated when skb is moved to the mptcp rx queue. */ void mptcp_space(const struct sock *ssk, int *space, int *full_space) @@ -1230,7 +1258,8 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote) + const struct mptcp_addr_info *remote, + u8 flags, int ifindex) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; @@ -1274,7 +1303,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk->sk_bound_dev_if = loc->ifindex; + ssk->sk_bound_dev_if = ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); if (err) goto failed; @@ -1286,10 +1315,11 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, subflow->local_id = local_id; subflow->remote_id = remote_id; subflow->request_join = 1; - subflow->request_bkup = !!(loc->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); mptcp_info2sockaddr(remote, &addr, ssk->sk_family); mptcp_add_pending_subflow(msk, subflow); + mptcp_sockopt_sync(msk, ssk); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) goto failed_unlink; diff --git a/net/mptcp/token.c b/net/mptcp/token.c index feb4b9ffd462..8f0270a780ce 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -402,7 +402,7 @@ void __init mptcp_token_init(void) } } -#if IS_MODULE(CONFIG_MPTCP_KUNIT_TESTS) +#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_token_new_request); EXPORT_SYMBOL_GPL(mptcp_token_new_connect); EXPORT_SYMBOL_GPL(mptcp_token_accept); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index e37102546be6..49031f804276 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -100,7 +100,7 @@ enum { struct ncsi_channel_version { u32 version; /* Supported BCD encoded NCSI version */ u32 alpha2; /* Supported BCD encoded NCSI version */ - u8 fw_name[12]; /* Firware name string */ + u8 fw_name[12]; /* Firmware name string */ u32 fw_version; /* Firmware version */ u16 pci_ids[4]; /* PCI identification */ u32 mf_id; /* Manufacture ID */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1a92063c73a4..fcd8682704c4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -71,12 +71,17 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. -config NF_LOG_COMMON - tristate - -config NF_LOG_NETDEV - tristate "Netdev packet logging" - select NF_LOG_COMMON +config NF_LOG_SYSLOG + tristate "Syslog packet logging" + default m if NETFILTER_ADVANCED=n + help + This option enable support for packet logging via syslog. + It supports IPv4, IPV6, ARP and common transport protocols such + as TCP and UDP. + This is a simpler but less flexible logging method compared to + CONFIG_NETFILTER_NETLINK_LOG. + If both are enabled the backend to use can be configured at run-time + by means of per-address-family sysctl tunables. if NF_CONNTRACK config NETFILTER_CONNCOUNT @@ -922,8 +927,7 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" - select NF_LOG_COMMON - select NF_LOG_IPV4 + select NF_LOG_SYSLOG select NF_LOG_IPV6 if IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 33da7bf1b68e..e80e010354b1 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -48,11 +48,7 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o -# generic transport layer logging -obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o - -# packet logging for netdev family -obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o +obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 89009c82a6b2..359ff8ec236a 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -963,20 +963,9 @@ static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd cmd) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - - nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), - sizeof(*nfmsg), flags); - if (!nlh) - return NULL; - - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_IPV4; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - return nlh; + return nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, + NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 0c132ff9b446..128690c512df 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2398,7 +2398,7 @@ static int __net_init __ip_vs_init(struct net *net) if (ipvs == NULL) return -ENOMEM; - /* Hold the beast until a service is registerd */ + /* Hold the beast until a service is registered */ ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index cf925906f59b..ef1f45e43b63 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -591,8 +591,6 @@ static int __net_init __ip_vs_ftp_init(struct net *net) ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; - pr_info("%s: loaded support on port[%d] = %u\n", - app->name, i, ports[i]); } return 0; diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 2ccda8ace796..91bc8df3e4b0 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* Accouting handling for netfilter. */ +/* Accounting handling for netfilter. */ /* * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ff0168736f6e..e0befcf8113a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -55,6 +55,8 @@ #include "nf_internals.h" +extern unsigned int nf_conntrack_net_id; + __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -85,6 +87,8 @@ static __read_mostly bool nf_conntrack_locks_all; static struct conntrack_gc_work conntrack_gc_work; +extern unsigned int nf_conntrack_net_id; + void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ @@ -656,6 +660,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + struct net *net; if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) return false; @@ -670,11 +675,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) * be done by event cache worker on redelivery. */ nf_ct_delete_from_lists(ct); - nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); + nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } - nf_conntrack_ecache_work(nf_ct_net(ct)); + net = nf_ct_net(ct); + if (nf_conntrack_ecache_dwork_pending(net)) + nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; @@ -1376,6 +1383,7 @@ static void gc_worker(struct work_struct *work) i = 0; hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + struct nf_conntrack_net *cnet; struct net *net; tmp = nf_ct_tuplehash_to_ctrack(h); @@ -1396,7 +1404,8 @@ static void gc_worker(struct work_struct *work) continue; net = nf_ct_net(tmp); - if (atomic_read(&net->ct.count) < nf_conntrack_max95) + cnet = net_generic(net, nf_conntrack_net_id); + if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; /* need to take reference to avoid possible races */ @@ -1475,17 +1484,18 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + unsigned int ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ - atomic_inc(&net->ct.count); + ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && - unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } @@ -1520,7 +1530,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out: - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); return ERR_PTR(-ENOMEM); } @@ -1537,6 +1547,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_net *cnet; /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU @@ -1545,8 +1556,10 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); + cnet = net_generic(net, nf_conntrack_net_id); + smp_mb__before_atomic(); - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -1567,6 +1580,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; + struct nf_conntrack_net *cnet; if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { pr_debug("Can't invert tuple.\n"); @@ -1600,7 +1614,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, GFP_ATOMIC); local_bh_disable(); - if (net->ct.expect_count) { + cnet = net_generic(net, nf_conntrack_net_id); + if (cnet->expect_count) { spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { @@ -2302,9 +2317,11 @@ __nf_ct_unconfirmed_destroy(struct net *net) void nf_ct_unconfirmed_destroy(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + might_sleep(); - if (atomic_read(&net->ct.count) > 0) { + if (atomic_read(&cnet->count) > 0) { __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); synchronize_net(); @@ -2316,11 +2333,12 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct iter_data d; might_sleep(); - if (atomic_read(&net->ct.count) == 0) + if (atomic_read(&cnet->count) == 0) return; d.iter = iter; @@ -2349,7 +2367,9 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) down_read(&net_rwsem); for_each_net(net) { - if (atomic_read(&net->ct.count) == 0) + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + if (atomic_read(&cnet->count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); @@ -2429,8 +2449,10 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + nf_ct_iterate_cleanup(kill_all, net, 0, 0); - if (atomic_read(&net->ct.count) != 0) + if (atomic_read(&cnet->count) != 0) busy = 1; } if (busy) { @@ -2711,12 +2733,13 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); int ret = -ENOMEM; int cpu; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); - atomic_set(&net->ct.count, 0); + atomic_set(&cnet->count, 0); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 7956c9f19899..759d87aef95f 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,6 +27,8 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_extend.h> +extern unsigned int nf_conntrack_net_id; + static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) @@ -96,8 +98,8 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) static void ecache_work(struct work_struct *work) { - struct netns_ct *ctnet = - container_of(work, struct netns_ct, ecache_dwork.work); + struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache_dwork.work); + struct netns_ct *ctnet = cnet->ct_net; int cpu, delay = -1; struct ct_pcpu *pcpu; @@ -127,7 +129,7 @@ static void ecache_work(struct work_struct *work) ctnet->ecache_dwork_pending = delay > 0; if (delay >= 0) - schedule_delayed_work(&ctnet->ecache_dwork, delay); + schedule_delayed_work(&cnet->ecache_dwork, delay); } int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, @@ -344,6 +346,20 @@ void nf_ct_expect_unregister_notifier(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); +void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + if (state == NFCT_ECACHE_DESTROY_FAIL && + !delayed_work_pending(&cnet->ecache_dwork)) { + schedule_delayed_work(&cnet->ecache_dwork, HZ); + net->ct.ecache_dwork_pending = true; + } else if (state == NFCT_ECACHE_DESTROY_SENT) { + net->ct.ecache_dwork_pending = false; + mod_delayed_work(system_wq, &cnet->ecache_dwork, 0); + } +} + #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; @@ -355,13 +371,18 @@ static const struct nf_ct_ext_type event_extend = { void nf_conntrack_ecache_pernet_init(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + net->ct.sysctl_events = nf_ct_events; - INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); + cnet->ct_net = &net->ct; + INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work); } void nf_conntrack_ecache_pernet_fini(struct net *net) { - cancel_delayed_work_sync(&net->ct.ecache_dwork); + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + cancel_delayed_work_sync(&cnet->ecache_dwork); } int nf_conntrack_ecache_init(void) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 42557d2b6a90..efdd391b3f72 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -43,18 +43,23 @@ unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static unsigned int nf_ct_expect_hashrnd __read_mostly; +extern unsigned int nf_conntrack_net_id; + /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - net->ct.expect_count--; + + cnet = net_generic(net, nf_conntrack_net_id); + cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; @@ -118,10 +123,11 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_conntrack_expect *i; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -158,10 +164,11 @@ nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -368,6 +375,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { + struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); @@ -389,7 +397,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - net->ct.expect_count++; + cnet = net_generic(net, nf_conntrack_net_id); + cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } @@ -415,6 +424,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; + struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; @@ -458,7 +468,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - if (net->ct.expect_count >= nf_ct_expect_max) { + cnet = net_generic(net, nf_conntrack_net_id); + if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } @@ -686,7 +697,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - net->ct.expect_count = 0; return exp_proc_init(net); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index b055187235f8..ac396cc8bfae 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -43,6 +43,8 @@ MODULE_PARM_DESC(nf_conntrack_helper, static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; +extern unsigned int nf_conntrack_net_id; + /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -212,8 +214,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); static struct nf_conntrack_helper * nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { - if (!net->ct.sysctl_auto_assign_helper) { - if (net->ct.auto_assign_helper_warned) + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + if (!cnet->sysctl_auto_assign_helper) { + if (cnet->auto_assign_helper_warned) return NULL; if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) return NULL; @@ -221,7 +225,7 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) "has been turned off for security reasons and CT-based " "firewall rule not found. Use the iptables CT target " "to attach helpers instead.\n"); - net->ct.auto_assign_helper_warned = 1; + cnet->auto_assign_helper_warned = true; return NULL; } @@ -556,8 +560,9 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - net->ct.auto_assign_helper_warned = false; - net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } int nf_conntrack_helper_init(void) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1d519b0e51a5..44e3cb80e2e0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -555,22 +555,17 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -713,7 +708,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; @@ -743,15 +737,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -2490,20 +2480,15 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || @@ -2574,21 +2559,17 @@ static int ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - unsigned int nr_conntracks = atomic_read(&net->ct.count); + unsigned int nr_conntracks; + struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - + nr_conntracks = nf_conntrack_count(net); if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; @@ -3085,19 +3066,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3117,7 +3093,6 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *skb; unsigned int type, group; int flags = 0; @@ -3140,15 +3115,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3716,20 +3687,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index db7479db8512..4f33307fa3cf 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -397,6 +397,7 @@ dccp_new(struct nf_conn *ct, const struct sk_buff *skb, msg = "not picking up existing connection "; goto out_invalid; } + break; case CT_DCCP_REQUEST: break; case CT_DCCP_INVALID: diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index ec23330687a5..318b8f723349 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -31,20 +31,6 @@ #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -/* "Be conservative in what you do, - be liberal in what you accept from others." - If it's non-zero, we mark only out of window RST segments as INVALID. */ -static int nf_ct_tcp_be_liberal __read_mostly = 0; - -/* If it is set to zero, we disable picking up already established - connections. */ -static int nf_ct_tcp_loose __read_mostly = 1; - -/* Max number of the retransmitted packets without receiving an (acceptable) - ACK from the destination. If this number is reached, a shorter timer - will be started. */ -static int nf_ct_tcp_max_retrans __read_mostly = 3; - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ @@ -1436,9 +1422,23 @@ void nf_conntrack_tcp_init_net(struct net *net) * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; - tn->tcp_loose = nf_ct_tcp_loose; - tn->tcp_be_liberal = nf_ct_tcp_be_liberal; - tn->tcp_max_retrans = nf_ct_tcp_max_retrans; + + /* If it is set to zero, we disable picking up already established + * connections. + */ + tn->tcp_loose = 1; + + /* "Be conservative in what you do, + * be liberal in what you accept from others." + * If it's non-zero, we mark only out of window RST segments as INVALID. + */ + tn->tcp_be_liberal = 0; + + /* Max number of the retransmitted packets without receiving an (acceptable) + * ACK from the destination. If this number is reached, a shorter timer + * will be started. + */ + tn->tcp_max_retrans = 3; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index c6c0cb465664..aaa55246d0ca 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -425,14 +425,16 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); - unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; + unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } + nr_conntracks = nf_conntrack_count(net); + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, @@ -508,13 +510,19 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ +u32 nf_conntrack_count(const struct net *net) +{ + const struct nf_conntrack_net *cnet; + + cnet = net_generic(net, nf_conntrack_net_id); + + return atomic_read(&cnet->count); +} +EXPORT_SYMBOL_GPL(nf_conntrack_count); + /* Sysctl support */ #ifdef CONFIG_SYSCTL -/* Log invalid packets of a given protocol */ -static int log_invalid_proto_min __read_mostly; -static int log_invalid_proto_max __read_mostly = 255; - /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; @@ -615,7 +623,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", - .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, @@ -630,20 +637,18 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, + .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", @@ -655,18 +660,17 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", - .data = &init_net.ct.sysctl_auto_assign_helper, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -674,9 +678,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -685,9 +689,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -760,25 +764,25 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", @@ -905,9 +909,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1028,6 +1032,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; @@ -1038,11 +1043,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (!table) return -ENOMEM; - table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; + table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper; + table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif @@ -1060,21 +1065,15 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); - /* Don't allow unprivileged users to alter certain sysctls */ - if (net->user_ns != &init_user_ns) { + /* Don't allow non-init_net ns to alter global sysctls */ + if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_MAX].mode = 0444; table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444; - table[NF_SYSCTL_CT_HELPER].mode = 0444; -#ifdef CONFIG_NF_CONNTRACK_EVENTS - table[NF_SYSCTL_CT_EVENTS].mode = 0444; -#endif - table[NF_SYSCTL_CT_BUCKETS].mode = 0444; - } else if (!net_eq(&init_net, net)) { table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } - net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); - if (!net->ct.sysctl_header) + cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); + if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; @@ -1086,10 +1085,11 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct ctl_table *table; - table = net->ct.sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.sysctl_header); + table = cnet->sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index c77ba8690ed8..39c02d1aeedf 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -74,16 +74,25 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) +{ + const struct rt6_info *rt; + + if (flow_tuple->l3proto == NFPROTO_IPV6) { + rt = (const struct rt6_info *)flow_tuple->dst_cache; + return rt6_get_cookie(rt); + } + + return 0; +} + static int flow_offload_fill_route(struct flow_offload *flow, const struct nf_flow_route *route, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; struct dst_entry *dst = route->tuple[dir].dst; - - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; + int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: @@ -94,12 +103,50 @@ static int flow_offload_fill_route(struct flow_offload *flow, break; } - flow_tuple->iifidx = other_dst->dev->ifindex; - flow_tuple->dst_cache = dst; + flow_tuple->iifidx = route->tuple[dir].in.ifindex; + for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { + flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; + flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; + if (route->tuple[dir].in.ingress_vlans & BIT(i)) + flow_tuple->in_vlan_ingress |= BIT(j); + j++; + } + flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + + switch (route->tuple[dir].xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, + ETH_ALEN); + memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, + ETH_ALEN); + flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; + flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; + break; + case FLOW_OFFLOAD_XMIT_XFRM: + case FLOW_OFFLOAD_XMIT_NEIGH: + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + flow_tuple->dst_cache = dst; + flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); + break; + default: + WARN_ON_ONCE(1); + break; + } + flow_tuple->xmit_type = route->tuple[dir].xmit_type; return 0; } +static void nft_flow_dst_release(struct flow_offload *flow, + enum flow_offload_tuple_dir dir) +{ + if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) + dst_release(flow->tuplehash[dir].tuple.dst_cache); +} + int flow_offload_route_init(struct flow_offload *flow, const struct nf_flow_route *route) { @@ -118,7 +165,7 @@ int flow_offload_route_init(struct flow_offload *flow, return 0; err_route_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); return err; } @@ -169,8 +216,8 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) static void flow_offload_route_release(struct flow_offload *flow) { - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) @@ -359,11 +406,33 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } +static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple) +{ + struct dst_entry *dst; + + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { + dst = tuple->dst_cache; + if (!dst_check(dst, tuple->dst_cookie)) + return true; + } + + return false; +} + +static bool nf_flow_has_stale_dst(struct flow_offload *flow) +{ + return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) || + flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple); +} + static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) + if (nf_flow_has_expired(flow) || + nf_ct_is_dying(flow->ct) || + nf_flow_has_stale_dst(flow)) set_bit(NF_FLOW_TEARDOWN, &flow->flags); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { @@ -389,29 +458,20 @@ static void nf_flow_offload_work_gc(struct work_struct *work) queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } - -static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) { struct tcphdr *tcph; - if (skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); - - return 0; } -static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) { struct udphdr *udph; - if (skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace2(&udph->check, skb, port, @@ -419,37 +479,28 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, - u8 protocol, __be16 port, __be16 new_port) +static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: - if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: - if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } - - return 0; } -int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; - if (skb_try_make_writable(skb, thoff + sizeof(*hdr))) - return -1; - hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { @@ -463,24 +514,19 @@ int nf_flow_snat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; - default: - return -1; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); -int nf_flow_dnat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, u8 protocol, + enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; - if (skb_try_make_writable(skb, thoff + sizeof(*hdr))) - return -1; - hdr = (void *)(skb_network_header(skb) + thoff); switch (dir) { @@ -494,11 +540,9 @@ int nf_flow_dnat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; - default: - return -1; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index a698dbe28ef5..889cf88d3dba 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -7,6 +7,9 @@ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> +#include <linux/if_ether.h> +#include <linux/if_pppox.h> +#include <linux/ppp_defs.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> @@ -34,28 +37,20 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; } -static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct tcphdr *tcph; - if (skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); - - return 0; } -static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct udphdr *udph; - if (skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, @@ -63,31 +58,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, - unsigned int thoff, __be32 addr, - __be32 new_addr) +static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) { switch (iph->protocol) { case IPPROTO_TCP: - if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr); break; case IPPROTO_UDP: - if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_udp(skb, thoff, addr, new_addr); break; } - - return 0; } -static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_snat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -102,17 +91,15 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; iph->daddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -127,31 +114,24 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; iph->saddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, - unsigned int thoff, enum flow_offload_tuple_dir dir) +static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, enum flow_offload_tuple_dir dir, + struct iphdr *iph) { - struct iphdr *iph = ip_hdr(skb); - - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) - return -1; - - iph = ip_hdr(skb); - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_snat_ip(flow, skb, iph, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_dnat_ip(flow, skb, iph, thoff, dir); + } } static bool ip_has_options(unsigned int thoff) @@ -159,29 +139,58 @@ static bool ip_has_options(unsigned int thoff) return thoff != sizeof(struct iphdr); } +static void nf_flow_tuple_encap(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + struct vlan_ethhdr *veth; + struct pppoe_hdr *phdr; + int i = 0; + + if (skb_vlan_tag_present(skb)) { + tuple->encap[i].id = skb_vlan_tag_get(skb); + tuple->encap[i].proto = skb->vlan_proto; + i++; + } + switch (skb->protocol) { + case htons(ETH_P_8021Q): + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + tuple->encap[i].id = ntohs(veth->h_vlan_TCI); + tuple->encap[i].proto = skb->protocol; + break; + case htons(ETH_P_PPP_SES): + phdr = (struct pppoe_hdr *)skb_mac_header(skb); + tuple->encap[i].id = ntohs(phdr->sid); + tuple->encap[i].proto = skb->protocol; + break; + } +} + static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) + struct flow_offload_tuple *tuple, u32 *hdrsize, + u32 offset) { - unsigned int thoff, hdrsize; struct flow_ports *ports; + unsigned int thoff; struct iphdr *iph; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (!pskb_may_pull(skb, sizeof(*iph) + offset)) return -1; - iph = ip_hdr(skb); - thoff = iph->ihl * 4; + iph = (struct iphdr *)(skb_network_header(skb) + offset); + thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; + thoff += offset; + switch (iph->protocol) { case IPPROTO_TCP: - hdrsize = sizeof(struct tcphdr); + *hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - hdrsize = sizeof(struct udphdr); + *hdrsize = sizeof(struct udphdr); break; default: return -1; @@ -190,11 +199,10 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (iph->ttl <= 1) return -1; - thoff = iph->ihl * 4; - if (!pskb_may_pull(skb, thoff + hdrsize)) + if (!pskb_may_pull(skb, thoff + *hdrsize)) return -1; - iph = ip_hdr(skb); + iph = (struct iphdr *)(skb_network_header(skb) + offset); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v4.s_addr = iph->saddr; @@ -204,6 +212,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, tuple->l3proto = AF_INET; tuple->l4proto = iph->protocol; tuple->iifidx = dev->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } @@ -220,14 +229,6 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } -static int nf_flow_offload_dst_check(struct dst_entry *dst) -{ - if (unlikely(dst_xfrm(dst))) - return dst_check(dst, 0) ? 0 : -1; - - return 0; -} - static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, const struct nf_hook_state *state, struct dst_entry *dst) @@ -238,6 +239,91 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +{ + __be16 proto; + + proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); + switch (proto) { + case htons(PPP_IP): + return htons(ETH_P_IP); + case htons(PPP_IPV6): + return htons(ETH_P_IPV6); + } + + return 0; +} + +static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, + u32 *offset) +{ + struct vlan_ethhdr *veth; + + switch (skb->protocol) { + case htons(ETH_P_8021Q): + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + if (veth->h_vlan_encapsulated_proto == proto) { + *offset += VLAN_HLEN; + return true; + } + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_proto(skb) == proto) { + *offset += PPPOE_SES_HLEN; + return true; + } + break; + } + + return false; +} + +static void nf_flow_encap_pop(struct sk_buff *skb, + struct flow_offload_tuple_rhash *tuplehash) +{ + struct vlan_hdr *vlan_hdr; + int i; + + for (i = 0; i < tuplehash->tuple.encap_num; i++) { + if (skb_vlan_tag_present(skb)) { + __vlan_hwaccel_clear_tag(skb); + continue; + } + switch (skb->protocol) { + case htons(ETH_P_8021Q): + vlan_hdr = (struct vlan_hdr *)skb->data; + __skb_pull(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vlan_hdr); + skb_reset_network_header(skb); + break; + case htons(ETH_P_PPP_SES): + skb->protocol = nf_flow_pppoe_proto(skb); + skb_pull(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + break; + } + } +} + +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + const struct flow_offload_tuple_rhash *tuplehash, + unsigned short type) +{ + struct net_device *outdev; + + outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); + if (!outdev) + return NF_DROP; + + skb->dev = outdev; + dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, + tuplehash->tuple.out.h_source, skb->len); + dev_queue_xmit(skb); + + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -248,15 +334,18 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, enum flow_offload_tuple_dir dir; struct flow_offload *flow; struct net_device *outdev; + u32 hdrsize, offset = 0; + unsigned int thoff, mtu; struct rtable *rt; - unsigned int thoff; struct iphdr *iph; __be32 nexthop; + int ret; - if (skb->protocol != htons(ETH_P_IP)) + if (skb->protocol != htons(ETH_P_IP) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset)) return NF_ACCEPT; - if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0) return NF_ACCEPT; tuplehash = flow_offload_lookup(flow_table, &tuple); @@ -265,77 +354,80 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + mtu = flow->tuplehash[dir].tuple.mtu + offset; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return NF_ACCEPT; - if (skb_try_make_writable(skb, sizeof(*iph))) - return NF_DROP; - - thoff = ip_hdr(skb)->ihl * 4; - if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) + iph = (struct iphdr *)(skb_network_header(skb) + offset); + thoff = (iph->ihl * 4) + offset; + if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; - flow_offload_refresh(flow_table, flow); + if (skb_try_make_writable(skb, thoff + hdrsize)) + return NF_DROP; - if (nf_flow_offload_dst_check(&rt->dst)) { - flow_offload_teardown(flow); - return NF_ACCEPT; - } + flow_offload_refresh(flow_table, flow); - if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) - return NF_DROP; + nf_flow_encap_pop(skb, tuplehash); + thoff -= offset; iph = ip_hdr(skb); + nf_flow_nat_ip(flow, skb, thoff, dir, iph); + ip_decrease_ttl(iph); skb->tstamp = 0; if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = (struct rtable *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + switch (tuplehash->tuple.xmit_type) { + case FLOW_OFFLOAD_XMIT_NEIGH: + rt = (struct rtable *)tuplehash->tuple.dst_cache; + outdev = rt->dst.dev; + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + skb_dst_set_noref(skb, &rt->dst); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + ret = NF_STOLEN; + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); + if (ret == NF_DROP) + flow_offload_teardown(flow); + break; + } - return NF_STOLEN; + return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); -static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr, + struct ipv6hdr *ip6h) { struct tcphdr *tcph; - if (skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, new_addr->s6_addr32, true); - - return 0; } -static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) { struct udphdr *udph; - if (skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, @@ -343,32 +435,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) { switch (ip6h->nexthdr) { case IPPROTO_TCP: - if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h); break; case IPPROTO_UDP: - if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr); break; } - - return 0; } -static int nf_flow_snat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -383,17 +469,15 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; ip6h->daddr = new_addr; break; - default: - return -1; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_dnat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -408,52 +492,48 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; ip6h->saddr = new_addr; break; - default: - return -1; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_nat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, - enum flow_offload_tuple_dir dir) +static void nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir, + struct ipv6hdr *ip6h) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); unsigned int thoff = sizeof(*ip6h); - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) - return -1; - - ip6h = ipv6_hdr(skb); - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir); + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir); + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir); + } } static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) + struct flow_offload_tuple *tuple, u32 *hdrsize, + u32 offset) { - unsigned int thoff, hdrsize; struct flow_ports *ports; struct ipv6hdr *ip6h; + unsigned int thoff; - if (!pskb_may_pull(skb, sizeof(*ip6h))) + thoff = sizeof(*ip6h) + offset; + if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = ipv6_hdr(skb); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); switch (ip6h->nexthdr) { case IPPROTO_TCP: - hdrsize = sizeof(struct tcphdr); + *hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: - hdrsize = sizeof(struct udphdr); + *hdrsize = sizeof(struct udphdr); break; default: return -1; @@ -462,11 +542,10 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (ip6h->hop_limit <= 1) return -1; - thoff = sizeof(*ip6h); - if (!pskb_may_pull(skb, thoff + hdrsize)) + if (!pskb_may_pull(skb, thoff + *hdrsize)) return -1; - ip6h = ipv6_hdr(skb); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; @@ -476,6 +555,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, tuple->l3proto = AF_INET6; tuple->l4proto = ip6h->nexthdr; tuple->iifidx = dev->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } @@ -491,13 +571,17 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct in6_addr *nexthop; struct flow_offload *flow; struct net_device *outdev; + unsigned int thoff, mtu; + u32 hdrsize, offset = 0; struct ipv6hdr *ip6h; struct rt6_info *rt; + int ret; - if (skb->protocol != htons(ETH_P_IPV6)) + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset)) return NF_ACCEPT; - if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0) return NF_ACCEPT; tuplehash = flow_offload_lookup(flow_table, &tuple); @@ -506,48 +590,57 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + mtu = flow->tuplehash[dir].tuple.mtu + offset; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return NF_ACCEPT; - if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb, - sizeof(*ip6h))) + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + thoff = sizeof(*ip6h) + offset; + if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return NF_ACCEPT; - flow_offload_refresh(flow_table, flow); - - if (nf_flow_offload_dst_check(&rt->dst)) { - flow_offload_teardown(flow); - return NF_ACCEPT; - } - - if (skb_try_make_writable(skb, sizeof(*ip6h))) + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; - if (nf_flow_nat_ipv6(flow, skb, dir) < 0) - return NF_DROP; + flow_offload_refresh(flow_table, flow); + + nf_flow_encap_pop(skb, tuplehash); ip6h = ipv6_hdr(skb); + nf_flow_nat_ipv6(flow, skb, dir, ip6h); + ip6h->hop_limit--; skb->tstamp = 0; if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + switch (tuplehash->tuple.xmit_type) { + case FLOW_OFFLOAD_XMIT_NEIGH: + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + outdev = rt->dst.dev; + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + skb_dst_set_noref(skb, &rt->dst); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + ret = NF_STOLEN; + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); + if (ret == NF_DROP) + flow_offload_teardown(flow); + break; + } - return NF_STOLEN; + return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 1c5460e7bce8..2af7bdb38407 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -13,7 +13,9 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> -static struct workqueue_struct *nf_flow_offload_wq; +static struct workqueue_struct *nf_flow_offload_add_wq; +static struct workqueue_struct *nf_flow_offload_del_wq; +static struct workqueue_struct *nf_flow_offload_stats_wq; struct flow_offload_work { struct list_head list; @@ -76,6 +78,16 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, match->dissector.used_keys |= enc_keys; } +static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key, + struct flow_dissector_key_vlan *mask, + u16 vlan_id, __be16 proto) +{ + key->vlan_id = vlan_id; + mask->vlan_id = VLAN_VID_MASK; + key->vlan_tpid = proto; + mask->vlan_tpid = 0xffff; +} + static int nf_flow_rule_match(struct nf_flow_match *match, const struct flow_offload_tuple *tuple, struct dst_entry *other_dst) @@ -83,6 +95,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; struct ip_tunnel_info *tun_info; + bool vlan_encap = false; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); @@ -100,6 +113,32 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->meta.ingress_ifindex = tuple->iifidx; mask->meta.ingress_ifindex = 0xffffffff; + if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && + tuple->encap[0].proto == htons(ETH_P_8021Q)) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[0].id, + tuple->encap[0].proto); + vlan_encap = true; + } + + if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) && + tuple->encap[1].proto == htons(ETH_P_8021Q)) { + if (vlan_encap) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN, + cvlan); + nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } else { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, + vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } + } + switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -175,28 +214,45 @@ static int flow_offload_eth_src(struct net *net, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple; struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); - struct net_device *dev; + const struct flow_offload_tuple *other_tuple, *this_tuple; + struct net_device *dev = NULL; + const unsigned char *addr; u32 mask, val; u16 val16; - dev = dev_get_by_index(net, tuple->iifidx); - if (!dev) - return -ENOENT; + this_tuple = &flow->tuplehash[dir].tuple; + + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + addr = this_tuple->out.h_source; + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + dev = dev_get_by_index(net, other_tuple->iifidx); + if (!dev) + return -ENOENT; + + addr = dev->dev_addr; + break; + default: + return -EOPNOTSUPP; + } mask = ~0xffff0000; - memcpy(&val16, dev->dev_addr, 2); + memcpy(&val16, addr, 2); val = val16 << 16; flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); mask = ~0xffffffff; - memcpy(&val, dev->dev_addr + 2, 4); + memcpy(&val, addr + 2, 4); flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, &val, &mask); - dev_put(dev); + + if (dev) + dev_put(dev); return 0; } @@ -208,27 +264,40 @@ static int flow_offload_eth_dst(struct net *net, { struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule); struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule); - const void *daddr = &flow->tuplehash[!dir].tuple.src_v4; + const struct flow_offload_tuple *other_tuple, *this_tuple; const struct dst_entry *dst_cache; unsigned char ha[ETH_ALEN]; struct neighbour *n; + const void *daddr; u32 mask, val; u8 nud_state; u16 val16; - dst_cache = flow->tuplehash[dir].tuple.dst_cache; - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -ENOENT; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); + this_tuple = &flow->tuplehash[dir].tuple; - if (!(nud_state & NUD_VALID)) { + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + ether_addr_copy(ha, this_tuple->out.h_dest); + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + daddr = &other_tuple->src_v4; + dst_cache = this_tuple->dst_cache; + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -ENOENT; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); neigh_release(n); - return -ENOENT; + + if (!(nud_state & NUD_VALID)) + return -ENOENT; + break; + default: + return -EOPNOTSUPP; } mask = ~0xffffffff; @@ -241,7 +310,6 @@ static int flow_offload_eth_dst(struct net *net, val = val16; flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4, &val, &mask); - neigh_release(n); return 0; } @@ -463,27 +531,52 @@ static void flow_offload_ipv4_checksum(struct net *net, } } -static void flow_offload_redirect(const struct flow_offload *flow, +static void flow_offload_redirect(struct net *net, + const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - struct flow_action_entry *entry = flow_action_entry_next(flow_rule); - struct rtable *rt; + const struct flow_offload_tuple *this_tuple, *other_tuple; + struct flow_action_entry *entry; + struct net_device *dev; + int ifindex; + + this_tuple = &flow->tuplehash[dir].tuple; + switch (this_tuple->xmit_type) { + case FLOW_OFFLOAD_XMIT_DIRECT: + this_tuple = &flow->tuplehash[dir].tuple; + ifindex = this_tuple->out.hw_ifidx; + break; + case FLOW_OFFLOAD_XMIT_NEIGH: + other_tuple = &flow->tuplehash[!dir].tuple; + ifindex = other_tuple->iifidx; + break; + default: + return; + } + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return; - rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_REDIRECT; - entry->dev = rt->dst.dev; - dev_hold(rt->dst.dev); + entry->dev = dev; } static void flow_offload_encap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *this_tuple; struct flow_action_entry *entry; struct dst_entry *dst; - dst = flow->tuplehash[dir].tuple.dst_cache; + this_tuple = &flow->tuplehash[dir].tuple; + if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) + return; + + dst = this_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; @@ -500,10 +593,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *other_tuple; struct flow_action_entry *entry; struct dst_entry *dst; - dst = flow->tuplehash[!dir].tuple.dst_cache; + other_tuple = &flow->tuplehash[!dir].tuple; + if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) + return; + + dst = other_tuple->dst_cache; if (dst && dst->lwtstate) { struct ip_tunnel_info *tun_info; @@ -515,10 +613,15 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow, } } -int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, - enum flow_offload_tuple_dir dir, - struct nf_flow_rule *flow_rule) +static int +nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) { + const struct flow_offload_tuple *other_tuple; + const struct flow_offload_tuple *tuple; + int i; + flow_offload_decap_tunnel(flow, dir, flow_rule); flow_offload_encap_tunnel(flow, dir, flow_rule); @@ -526,6 +629,53 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) return -1; + tuple = &flow->tuplehash[dir].tuple; + + for (i = 0; i < tuple->encap_num; i++) { + struct flow_action_entry *entry; + + if (tuple->in_vlan_ingress & BIT(i)) + continue; + + if (tuple->encap[i].proto == htons(ETH_P_8021Q)) { + entry = flow_action_entry_next(flow_rule); + entry->id = FLOW_ACTION_VLAN_POP; + } + } + + other_tuple = &flow->tuplehash[!dir].tuple; + + for (i = 0; i < other_tuple->encap_num; i++) { + struct flow_action_entry *entry; + + if (other_tuple->in_vlan_ingress & BIT(i)) + continue; + + entry = flow_action_entry_next(flow_rule); + + switch (other_tuple->encap[i].proto) { + case htons(ETH_P_PPP_SES): + entry->id = FLOW_ACTION_PPPOE_PUSH; + entry->pppoe.sid = other_tuple->encap[i].id; + break; + case htons(ETH_P_8021Q): + entry->id = FLOW_ACTION_VLAN_PUSH; + entry->vlan.vid = other_tuple->encap[i].id; + entry->vlan.proto = other_tuple->encap[i].proto; + break; + } + } + + return 0; +} + +int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) +{ + if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) + return -1; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { flow_offload_ipv4_snat(net, flow, dir, flow_rule); flow_offload_port_snat(net, flow, dir, flow_rule); @@ -538,7 +688,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, test_bit(NF_FLOW_DNAT, &flow->flags)) flow_offload_ipv4_checksum(net, flow, flow_rule); - flow_offload_redirect(flow, dir, flow_rule); + flow_offload_redirect(net, flow, dir, flow_rule); return 0; } @@ -548,11 +698,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, enum flow_offload_tuple_dir dir, struct nf_flow_rule *flow_rule) { - flow_offload_decap_tunnel(flow, dir, flow_rule); - flow_offload_encap_tunnel(flow, dir, flow_rule); - - if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 || - flow_offload_eth_dst(net, flow, dir, flow_rule) < 0) + if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0) return -1; if (test_bit(NF_FLOW_SNAT, &flow->flags)) { @@ -564,7 +710,7 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, flow_offload_port_dnat(net, flow, dir, flow_rule); } - flow_offload_redirect(flow, dir, flow_rule); + flow_offload_redirect(net, flow, dir, flow_rule); return 0; } @@ -578,10 +724,10 @@ nf_flow_offload_rule_alloc(struct net *net, enum flow_offload_tuple_dir dir) { const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload_tuple *tuple, *other_tuple; const struct flow_offload *flow = offload->flow; - const struct flow_offload_tuple *tuple; + struct dst_entry *other_dst = NULL; struct nf_flow_rule *flow_rule; - struct dst_entry *other_dst; int err = -ENOMEM; flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL); @@ -597,7 +743,10 @@ nf_flow_offload_rule_alloc(struct net *net, flow_rule->rule->match.key = &flow_rule->match.key; tuple = &flow->tuplehash[dir].tuple; - other_dst = flow->tuplehash[!dir].tuple.dst_cache; + other_tuple = &flow->tuplehash[!dir].tuple; + if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + other_dst = other_tuple->dst_cache; + err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst); if (err < 0) goto err_flow_match; @@ -826,7 +975,12 @@ static void flow_offload_work_handler(struct work_struct *work) static void flow_offload_queue_work(struct flow_offload_work *offload) { - queue_work(nf_flow_offload_wq, &offload->work); + if (offload->cmd == FLOW_CLS_REPLACE) + queue_work(nf_flow_offload_add_wq, &offload->work); + else if (offload->cmd == FLOW_CLS_DESTROY) + queue_work(nf_flow_offload_del_wq, &offload->work); + else + queue_work(nf_flow_offload_stats_wq, &offload->work); } static struct flow_offload_work * @@ -898,8 +1052,11 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { - if (nf_flowtable_hw_offload(flowtable)) - flush_workqueue(nf_flow_offload_wq); + if (nf_flowtable_hw_offload(flowtable)) { + flush_workqueue(nf_flow_offload_add_wq); + flush_workqueue(nf_flow_offload_del_wq); + flush_workqueue(nf_flow_offload_stats_wq); + } } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, @@ -1011,15 +1168,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); int nf_flow_table_offload_init(void) { - nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", - WQ_UNBOUND, 0); - if (!nf_flow_offload_wq) + nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_add_wq) return -ENOMEM; + nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_del_wq) + goto err_del_wq; + + nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats", + WQ_UNBOUND | WQ_SYSFS, 0); + if (!nf_flow_offload_stats_wq) + goto err_stats_wq; + return 0; + +err_stats_wq: + destroy_workqueue(nf_flow_offload_del_wq); +err_del_wq: + destroy_workqueue(nf_flow_offload_add_wq); + return -ENOMEM; } void nf_flow_table_offload_exit(void) { - destroy_workqueue(nf_flow_offload_wq); + destroy_workqueue(nf_flow_offload_add_wq); + destroy_workqueue(nf_flow_offload_del_wq); + destroy_workqueue(nf_flow_offload_stats_wq); } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 6cb9f9474b05..edee7fa944c1 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -151,13 +151,6 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf) } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_logger_request_module(int pf, enum nf_log_type type) -{ - if (loggers[pf][type] == NULL) - request_module("nf-logger-%u-%u", pf, type); -} -EXPORT_SYMBOL_GPL(nf_logger_request_module); - int nf_logger_find_get(int pf, enum nf_log_type type) { struct nf_logger *logger; @@ -177,9 +170,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type) return 0; } - if (rcu_access_pointer(loggers[pf][type]) == NULL) - request_module("nf-logger-%u-%u", pf, type); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); if (logger == NULL) diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c deleted file mode 100644 index fd7c5f0f5c25..000000000000 --- a/net/netfilter/nf_log_common.c +++ /dev/null @@ -1,224 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/tcp.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_bridge.h> -#include <linux/netfilter/xt_LOG.h> -#include <net/netfilter/nf_log.h> - -int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset) -{ - struct udphdr _udph; - const struct udphdr *uh; - - if (proto == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - nf_log_buf_add(m, "PROTO=UDP "); - else /* Max length: 14 "PROTO=UDPLITE " */ - nf_log_buf_add(m, "PROTO=UDPLITE "); - - if (fragment) - goto out; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); - if (uh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", - ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); - -out: - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_udp_header); - -int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags) -{ - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - nf_log_buf_add(m, "PROTO=TCP "); - - if (fragment) - return 0; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (th == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u ", - ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & NF_LOG_TCPSEQ) { - nf_log_buf_add(m, "SEQ=%u ACK=%u ", - ntohl(th->seq), ntohl(th->ack_seq)); - } - - /* Max length: 13 "WINDOW=65535 " */ - nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & - TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - nf_log_buf_add(m, "CWR "); - if (th->ece) - nf_log_buf_add(m, "ECE "); - if (th->urg) - nf_log_buf_add(m, "URG "); - if (th->ack) - nf_log_buf_add(m, "ACK "); - if (th->psh) - nf_log_buf_add(m, "PSH "); - if (th->rst) - nf_log_buf_add(m, "RST "); - if (th->syn) - nf_log_buf_add(m, "SYN "); - if (th->fin) - nf_log_buf_add(m, "FIN "); - /* Max length: 11 "URGP=65535 " */ - nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - nf_log_buf_add(m, "OPT (TRUNCATED)"); - return 1; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - nf_log_buf_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - nf_log_buf_add(m, "%02X", op[i]); - - nf_log_buf_add(m, ") "); - } - - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); - -void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, - struct sock *sk) -{ - if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) - return; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; - nf_log_buf_add(m, "UID=%u GID=%u ", - from_kuid_munged(&init_user_ns, cred->fsuid), - from_kgid_munged(&init_user_ns, cred->fsgid)); - } - read_unlock_bh(&sk->sk_callback_lock); -} -EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid); - -void -nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) -{ - const struct net_device *physoutdev __maybe_unused; - const struct net_device *physindev __maybe_unused; - - nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", - '0' + loginfo->u.log.level, prefix, - in ? in->name : "", - out ? out->name : ""); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); - if (physindev && in != physindev) - nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = nf_bridge_get_physoutdev(skb); - if (physoutdev && out != physoutdev) - nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); -#endif -} -EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); - -void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) -{ - u16 vid; - - if (!skb_vlan_tag_present(skb)) - return; - - vid = skb_vlan_tag_get(skb); - nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); -} -EXPORT_SYMBOL_GPL(nf_log_dump_vlan); - -/* bridge and netdev logging families share this code. */ -void nf_log_l2packet(struct net *net, u_int8_t pf, - __be16 protocol, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - switch (protocol) { - case htons(ETH_P_IP): - nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_IPV6): - nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_ARP): - case htons(ETH_P_RARP): - nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - } -} -EXPORT_SYMBOL_GPL(nf_log_l2packet); - -static int __init nf_log_common_init(void) -{ - return 0; -} - -static void __exit nf_log_common_exit(void) {} - -module_init(nf_log_common_init); -module_exit(nf_log_common_exit); - -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c deleted file mode 100644 index 968dafa684c9..000000000000 --- a/net/netfilter/nf_log_netdev.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2016 by Pablo Neira Ayuso <pablo@netfilter.org> - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter.h> -#include <net/netfilter/nf_log.h> - -static void nf_log_netdev_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out, - loginfo, prefix); -} - -static struct nf_logger nf_netdev_logger __read_mostly = { - .name = "nf_log_netdev", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_netdev_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_netdev_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); -} - -static void __net_exit nf_log_netdev_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_netdev_logger); -} - -static struct pernet_operations nf_log_netdev_net_ops = { - .init = nf_log_netdev_net_init, - .exit = nf_log_netdev_net_exit, -}; - -static int __init nf_log_netdev_init(void) -{ - int ret; - - /* Request to load the real packet loggers. */ - nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); - - ret = register_pernet_subsys(&nf_log_netdev_net_ops); - if (ret < 0) - return ret; - - nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); - return 0; -} - -static void __exit nf_log_netdev_exit(void) -{ - unregister_pernet_subsys(&nf_log_netdev_net_ops); - nf_log_unregister(&nf_netdev_logger); -} - -module_init(nf_log_netdev_init); -module_exit(nf_log_netdev_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_DESCRIPTION("Netfilter netdev packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c new file mode 100644 index 000000000000..2518818ed479 --- /dev/null +++ b/net/netfilter/nf_log_syslog.c @@ -0,0 +1,1089 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv6.h> +#include <linux/netfilter/xt_LOG.h> +#include <net/netfilter/nf_log.h> + +static const struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = LOGLEVEL_NOTICE, + .logflags = NF_LOG_DEFAULT_MASK, + }, + }, +}; + +struct arppayload { + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +static void noinline_for_stack +dump_arp_packet(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int nhoff) +{ + const struct arppayload *ap; + struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (!ah) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); + /* If it's for Ethernet and the lengths are OK, then log the ARP + * payload. + */ + if (ah->ar_hrd != htons(ARPHRD_ETHER) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != sizeof(__be32)) + return; + + ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); + if (!ap) { + nf_log_buf_add(m, " INCOMPLETE [%zu bytes]", + skb->len - sizeof(_arph)); + return; + } + nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4", + ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); +} + +static void +nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix) +{ + const struct net_device *physoutdev __maybe_unused; + const struct net_device *physindev __maybe_unused; + + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + physindev = nf_bridge_get_physindev(skb); + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); +#endif +} + +static void nf_log_arp_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo, + prefix); + dump_arp_packet(m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_arp_logger __read_mostly = { + .name = "nf_log_arp", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_arp_packet, + .me = THIS_MODULE, +}; + +static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) +{ + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + + nf_log_buf_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static noinline_for_stack int +nf_log_dump_tcp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + nf_log_buf_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!th) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & NF_LOG_TCPSEQ) { + nf_log_buf_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + } + + /* Max length: 13 "WINDOW=65535 " */ + nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + nf_log_buf_add(m, "CWR "); + if (th->ece) + nf_log_buf_add(m, "ECE "); + if (th->urg) + nf_log_buf_add(m, "URG "); + if (th->ack) + nf_log_buf_add(m, "ACK "); + if (th->psh) + nf_log_buf_add(m, "PSH "); + if (th->rst) + nf_log_buf_add(m, "RST "); + if (th->syn) + nf_log_buf_add(m, "SYN "); + if (th->fin) + nf_log_buf_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); + u8 _opt[60 - sizeof(struct tcphdr)]; + unsigned int i; + const u8 *op; + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (!op) { + nf_log_buf_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + + nf_log_buf_add(m, ") "); + } + + return 0; +} + +static noinline_for_stack int +nf_log_dump_udp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + nf_log_buf_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + nf_log_buf_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!uh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); + +out: + return 0; +} + +/* One level of recursion won't kill us */ +static noinline_for_stack void +dump_ipv4_packet(struct net *net, struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int iphoff) +{ + const struct iphdr *ih; + unsigned int logflags; + struct iphdr _iph; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (!ih) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. + * Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " + */ + nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + nf_log_buf_add(m, "CE "); + if (ntohs(ih->frag_off) & IP_DF) + nf_log_buf_add(m, "DF "); + if (ntohs(ih->frag_off) & IP_MF) + nf_log_buf_add(m, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & NF_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; + const unsigned char *op; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff + sizeof(_iph), + optsize, _opt); + if (!op) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for (i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + nf_log_buf_add(m, ") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff + ih->ihl * 4, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, ih->protocol, + ntohs(ih->frag_off) & IP_OFFSET, + iphoff + ih->ihl * 4)) + return; + break; + case IPPROTO_ICMP: { + static const size_t required_len[NR_ICMP_TYPES + 1] = { + [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + const struct icmphdr *ich; + struct icmphdr _icmph; + + /* Max length: 11 "PROTO=ICMP " */ + nf_log_buf_add(m, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (!ich) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len - iphoff - ih->ihl * 4 < required_len[ich->type]) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + nf_log_buf_add(m, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); + fallthrough; + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + nf_log_buf_add(m, "["); + dump_ipv4_packet(net, m, info, skb, + iphoff + ih->ihl * 4 + sizeof(_icmph)); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) { + nf_log_buf_add(m, "MTU=%u ", + ntohs(ich->un.frag.mtu)); + } + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + const struct ip_auth_hdr *ah; + struct ip_auth_hdr _ahdr; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + nf_log_buf_add(m, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_ahdr), &_ahdr); + if (!ah) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + const struct ip_esp_hdr *eh; + struct ip_esp_hdr _esph; + + /* Max length: 10 "PROTO=ESP " */ + nf_log_buf_add(m, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_esph), &_esph); + if (!eh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl * 4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & NF_LOG_UID) && !iphoff) + nf_log_dump_sk_uid_gid(net, m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (!iphoff && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* UDPLITE: 14+max(25,20) = 39 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static noinline_for_stack void +dump_ipv6_packet(struct net *net, struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + const struct ipv6hdr *ih; + unsigned int hdrlen = 0; + unsigned int logflags; + struct ipv6hdr _ip6h; + unsigned int ptr; + u8 currenthdr; + int fragment; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (!ih) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ + nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(__be32 *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && nf_ip6_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr; + const struct ipv6_opt_hdr *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (!hp) { + nf_log_buf_add(m, "TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, "OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr; + const struct frag_hdr *fh; + + nf_log_buf_add(m, "FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (!fh) { + nf_log_buf_add(m, "TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + nf_log_buf_add(m, "INCOMPLETE "); + + nf_log_buf_add(m, "ID:%08x ", + ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, ")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (logflags & NF_LOG_IPOPT) { + struct ip_auth_hdr _ahdr; + const struct ip_auth_hdr *ah; + + /* Max length: 3 "AH " */ + nf_log_buf_add(m, "AH "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (!ah) { + /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi)); + } + + hdrlen = ipv6_authlen(hp); + break; + case IPPROTO_ESP: + if (logflags & NF_LOG_IPOPT) { + struct ip_esp_hdr _esph; + const struct ip_esp_hdr *eh; + + /* Max length: 4 "ESP " */ + nf_log_buf_add(m, "ESP "); + + if (fragment) { + nf_log_buf_add(m, ")"); + return; + } + + /* Max length: 26 "INCOMPLETE [65535 bytes] )" */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (!eh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + nf_log_buf_add(m, "SPI=0x%x )", + ntohl(eh->spi)); + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr); + return; + } + if (logflags & NF_LOG_IPOPT) + nf_log_buf_add(m, ") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: + if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment, + ptr, logflags)) + return; + break; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr)) + return; + break; + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h; + const struct icmp6hdr *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + nf_log_buf_add(m, "PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (!ic) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", + skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + nf_log_buf_add(m, "TYPE=%u CODE=%u ", + ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + nf_log_buf_add(m, "ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + nf_log_buf_add(m, "POINTER=%08x ", + ntohl(ic->icmp6_pointer)); + fallthrough; + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + nf_log_buf_add(m, "["); + dump_ipv6_packet(net, m, info, skb, + ptr + sizeof(_icmp6h), 0); + nf_log_buf_add(m, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) { + nf_log_buf_add(m, "MTU=%u ", + ntohl(ic->icmp6_mtu)); + } + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + nf_log_buf_add(m, "PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & NF_LOG_UID) && recurse) + nf_log_dump_sk_uid_gid(net, m, skb->sk); + + /* Max length: 16 "MARK=0xFFFFFFFF " */ + if (recurse && skb->mark) + nf_log_buf_add(m, "MARK=0x%x ", skb->mark); +} + +static void dump_ipv4_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & NF_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int i; + + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < dev->hard_header_len; i++, p++) + nf_log_buf_add(m, ":%02x", *p); + } + nf_log_buf_add(m, " "); +} + +static void nf_log_ip_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, + out, loginfo, prefix); + + if (in) + dump_ipv4_mac_header(m, loginfo, skb); + + dump_ipv4_packet(net, m, loginfo, skb, 0); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip_logger __read_mostly = { + .name = "nf_log_ipv4", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip_packet, + .me = THIS_MODULE, +}; + +static void dump_ipv6_mac_header(struct nf_log_buf *m, + const struct nf_loginfo *info, + const struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + unsigned int logflags = 0; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + + if (!(logflags & NF_LOG_MACDECODE)) + goto fallback; + + switch (dev->type) { + case ARPHRD_ETHER: + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + return; + default: + break; + } + +fallback: + nf_log_buf_add(m, "MAC="); + if (dev->hard_header_len && + skb->mac_header != skb->network_header) { + const unsigned char *p = skb_mac_header(skb); + unsigned int len = dev->hard_header_len; + unsigned int i; + + if (dev->type == ARPHRD_SIT) { + p -= ETH_HLEN; + + if (p < skb->head) + p = NULL; + } + + if (p) { + nf_log_buf_add(m, "%02x", *p++); + for (i = 1; i < len; i++) + nf_log_buf_add(m, ":%02x", *p++); + } + nf_log_buf_add(m, " "); + + if (dev->type == ARPHRD_SIT) { + const struct iphdr *iph = + (struct iphdr *)skb_mac_header(skb); + nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, + &iph->daddr); + } + } else { + nf_log_buf_add(m, " "); + } +} + +static void nf_log_ip6_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + struct nf_log_buf *m; + + /* FIXME: Disabled from containers until syslog ns is supported */ + if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns) + return; + + m = nf_log_buf_open(); + + if (!loginfo) + loginfo = &default_loginfo; + + nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, + loginfo, prefix); + + if (in) + dump_ipv6_mac_header(m, loginfo, skb); + + dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1); + + nf_log_buf_close(m); +} + +static struct nf_logger nf_ip6_logger __read_mostly = { + .name = "nf_log_ipv6", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_ip6_packet, + .me = THIS_MODULE, +}; + +static void nf_log_netdev_packet(struct net *net, u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, + const char *prefix) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_IPV6): + nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + } +} + +static struct nf_logger nf_netdev_logger __read_mostly = { + .name = "nf_log_netdev", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_netdev_packet, + .me = THIS_MODULE, +}; + +static struct nf_logger nf_bridge_logger __read_mostly = { + .name = "nf_log_bridge", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_netdev_packet, + .me = THIS_MODULE, +}; + +static int __net_init nf_log_syslog_net_init(struct net *net) +{ + int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); + + if (ret) + return ret; + + ret = nf_log_set(net, NFPROTO_ARP, &nf_arp_logger); + if (ret) + goto err1; + + ret = nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger); + if (ret) + goto err2; + + ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); + if (ret) + goto err3; + + ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret) + goto err4; + return 0; +err4: + nf_log_unset(net, &nf_netdev_logger); +err3: + nf_log_unset(net, &nf_ip6_logger); +err2: + nf_log_unset(net, &nf_arp_logger); +err1: + nf_log_unset(net, &nf_ip_logger); + return ret; +} + +static void __net_exit nf_log_syslog_net_exit(struct net *net) +{ + nf_log_unset(net, &nf_ip_logger); + nf_log_unset(net, &nf_arp_logger); + nf_log_unset(net, &nf_ip6_logger); + nf_log_unset(net, &nf_netdev_logger); +} + +static struct pernet_operations nf_log_syslog_net_ops = { + .init = nf_log_syslog_net_init, + .exit = nf_log_syslog_net_exit, +}; + +static int __init nf_log_syslog_init(void) +{ + int ret; + + ret = register_pernet_subsys(&nf_log_syslog_net_ops); + if (ret < 0) + return ret; + + ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger); + if (ret < 0) + goto err1; + + ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger); + if (ret < 0) + goto err2; + + ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger); + if (ret < 0) + goto err3; + + ret = nf_log_register(NFPROTO_NETDEV, &nf_netdev_logger); + if (ret < 0) + goto err4; + + ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret < 0) + goto err5; + + return 0; +err5: + nf_log_unregister(&nf_netdev_logger); +err4: + nf_log_unregister(&nf_ip6_logger); +err3: + nf_log_unregister(&nf_arp_logger); +err2: + nf_log_unregister(&nf_ip_logger); +err1: + pr_err("failed to register logger\n"); + unregister_pernet_subsys(&nf_log_syslog_net_ops); + return ret; +} + +static void __exit nf_log_syslog_exit(void) +{ + unregister_pernet_subsys(&nf_log_syslog_net_ops); + nf_log_unregister(&nf_ip_logger); + nf_log_unregister(&nf_arp_logger); + nf_log_unregister(&nf_ip6_logger); + nf_log_unregister(&nf_netdev_logger); + nf_log_unregister(&nf_bridge_logger); +} + +module_init(nf_log_syslog_init); +module_exit(nf_log_syslog_exit); + +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("Netfilter syslog packet logging"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("nf_log_arp"); +MODULE_ALIAS("nf_log_bridge"); +MODULE_ALIAS("nf_log_ipv4"); +MODULE_ALIAS("nf_log_ipv6"); +MODULE_ALIAS("nf_log_netdev"); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); +MODULE_ALIAS_NF_LOGGER(AF_INET, 0); +MODULE_ALIAS_NF_LOGGER(3, 0); +MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ +MODULE_ALIAS_NF_LOGGER(AF_INET6, 0); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 589d2f6978d3..357443b3c0e4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -21,10 +21,13 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/sock.h> #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +unsigned int nf_tables_net_id __read_mostly; + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -66,9 +69,46 @@ static const struct rhashtable_params nft_objname_ht_params = { .automatic_shrinking = true, }; +struct nft_audit_data { + struct nft_table *table; + int entries; + int op; + struct list_head list; +}; + +static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types + [NFT_MSG_NEWTABLE] = AUDIT_NFT_OP_TABLE_REGISTER, + [NFT_MSG_GETTABLE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELTABLE] = AUDIT_NFT_OP_TABLE_UNREGISTER, + [NFT_MSG_NEWCHAIN] = AUDIT_NFT_OP_CHAIN_REGISTER, + [NFT_MSG_GETCHAIN] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELCHAIN] = AUDIT_NFT_OP_CHAIN_UNREGISTER, + [NFT_MSG_NEWRULE] = AUDIT_NFT_OP_RULE_REGISTER, + [NFT_MSG_GETRULE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELRULE] = AUDIT_NFT_OP_RULE_UNREGISTER, + [NFT_MSG_NEWSET] = AUDIT_NFT_OP_SET_REGISTER, + [NFT_MSG_GETSET] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELSET] = AUDIT_NFT_OP_SET_UNREGISTER, + [NFT_MSG_NEWSETELEM] = AUDIT_NFT_OP_SETELEM_REGISTER, + [NFT_MSG_GETSETELEM] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELSETELEM] = AUDIT_NFT_OP_SETELEM_UNREGISTER, + [NFT_MSG_NEWGEN] = AUDIT_NFT_OP_GEN_REGISTER, + [NFT_MSG_GETGEN] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_TRACE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_NEWOBJ] = AUDIT_NFT_OP_OBJ_REGISTER, + [NFT_MSG_GETOBJ] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELOBJ] = AUDIT_NFT_OP_OBJ_UNREGISTER, + [NFT_MSG_GETOBJ_RESET] = AUDIT_NFT_OP_OBJ_RESET, + [NFT_MSG_NEWFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_REGISTER, + [NFT_MSG_GETFLOWTABLE] = AUDIT_NFT_OP_INVALID, + [NFT_MSG_DELFLOWTABLE] = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, +}; + static void nft_validate_state_update(struct net *net, u8 new_validate_state) { - switch (net->nft.validate_state) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; @@ -79,7 +119,7 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) return; } - net->nft.validate_state = new_validate_state; + nft_net->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); @@ -134,13 +174,15 @@ static void nft_trans_destroy(struct nft_trans *trans) static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; - list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) @@ -234,6 +276,14 @@ static void nf_tables_unregister_hook(struct net *net, nf_unregister_net_hook(net, &basechain->ops); } +static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) +{ + struct nftables_pernet *nft_net; + + nft_net = net_generic(net, nf_tables_net_id); + list_add_tail(&trans->list, &nft_net->commit_list); +} + static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -245,7 +295,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -278,7 +328,7 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) } } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -351,7 +401,7 @@ static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -417,7 +467,7 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -449,7 +499,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -482,7 +532,7 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, flowtable); nft_trans_flowtable(trans) = flowtable; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -510,13 +560,15 @@ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask, u32 nlpid) { + struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list, - lockdep_is_held(&net->nft.commit_mutex)) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list, + lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) { @@ -535,9 +587,11 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net; struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) return table; @@ -586,10 +640,11 @@ struct nft_module_request { }; #ifdef CONFIG_MODULES -static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, - ...) +__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, + ...) { char module_name[MODULE_NAME_LEN]; + struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; @@ -600,7 +655,8 @@ static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, if (ret >= MODULE_NAME_LEN) return 0; - list_for_each_entry(req, &net->nft.module_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; @@ -616,10 +672,11 @@ static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, req->done = false; strlcpy(req->module, module_name, MODULE_NAME_LEN); - list_add_tail(&req->list, &net->nft.module_list); + list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } +EXPORT_SYMBOL_GPL(nft_request_module); #endif static void lockdep_nfnl_nft_mutex_not_held(void) @@ -652,6 +709,13 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } +static __be16 nft_base_seq(const struct net *net) +{ + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return htons(nft_net->base_seq & 0xffff); +} + static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_TABLE_MAXNAMELEN - 1 }, @@ -666,18 +730,13 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || @@ -715,19 +774,9 @@ static void nft_notify_enqueue(struct sk_buff *skb, bool report, static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { + struct nftables_pernet *nft_net; struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;?:0", - ctx->table->name, ctx->table->handle); - - audit_log_nfcfg(buf, - ctx->family, - ctx->table->use, - event == NFT_MSG_NEWTABLE ? - AUDIT_NFT_OP_TABLE_REGISTER : - AUDIT_NFT_OP_TABLE_UNREGISTER, - GFP_KERNEL); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -744,7 +793,8 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -754,15 +804,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -900,6 +952,12 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) nft_table_disable(net, table, 0); } +enum { + NFT_TABLE_STATE_UNCHANGED = 0, + NFT_TABLE_STATE_DORMANT, + NFT_TABLE_STATE_WAKEUP +}; + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -929,21 +987,19 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { - nft_trans_table_enable(trans) = false; + nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ctx->table->flags &= ~NFT_TABLE_F_DORMANT; ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret >= 0) - nft_trans_table_enable(trans) = true; - else - ctx->table->flags |= NFT_TABLE_F_DORMANT; + nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP; } if (ret < 0) goto err; + nft_trans_table_flags(trans) = flags; nft_trans_table_update(trans) = true; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err: nft_trans_destroy(trans); @@ -1006,6 +1062,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -1015,7 +1072,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, u32 flags = 0; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); @@ -1076,7 +1133,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (err < 0) goto err_trans; - list_add_tail_rcu(&table->list, &net->nft.tables); + list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); @@ -1164,11 +1221,12 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_table *table, *nt; const struct nlattr * const *nla = ctx->nla; int err = 0; - list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; @@ -1287,7 +1345,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - return lockdep_is_held(&net->nft.commit_mutex); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif @@ -1434,18 +1494,13 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_chain *chain) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), @@ -1495,20 +1550,9 @@ nla_put_failure: static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { + struct nftables_pernet *nft_net; struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", - ctx->table->name, ctx->table->handle, - ctx->chain->name, ctx->chain->handle); - - audit_log_nfcfg(buf, - ctx->family, - ctx->chain->use, - event == NFT_MSG_NEWCHAIN ? - AUDIT_NFT_OP_CHAIN_REGISTER : - AUDIT_NFT_OP_CHAIN_UNREGISTER, - GFP_KERNEL); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -1526,7 +1570,8 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -1541,11 +1586,13 @@ static int nf_tables_dump_chains(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -1861,11 +1908,12 @@ static int nft_chain_parse_hook(struct net *net, struct nft_chain_hook *hook, u8 family, bool autoload) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, @@ -2254,6 +2302,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_trans *tmp; char *name; @@ -2263,7 +2312,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, goto err; err = -EEXIST; - list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) { + list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && tmp->ctx.table == table && nft_trans_chain_update(tmp) && @@ -2277,7 +2326,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_name(trans) = name; } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err: @@ -2289,10 +2338,11 @@ err: static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nlattr *nla) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWCHAIN && @@ -2307,6 +2357,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -2318,7 +2369,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u64 handle = 0; u32 flags = 0; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -2803,20 +2854,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, const struct nft_rule *prule) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, + nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) @@ -2832,6 +2878,9 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_stats(chain, rule); + list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; @@ -2859,20 +2908,9 @@ nla_put_failure: static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", - ctx->table->name, ctx->table->handle, - ctx->chain->name, ctx->chain->handle); - - audit_log_nfcfg(buf, - ctx->family, - rule->handle, - event == NFT_MSG_NEWRULE ? - AUDIT_NFT_OP_RULE_REGISTER : - AUDIT_NFT_OP_RULE_UNREGISTER, - GFP_KERNEL); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -2890,7 +2928,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -2948,11 +2986,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -3183,6 +3223,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; @@ -3200,7 +3241,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, int err, rem; u64 handle, pos_handle; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -3372,7 +3413,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, kvfree(info); chain->use++; - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { @@ -3401,10 +3442,11 @@ err1: static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nlattr *nla) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { struct nft_rule *rule = nft_trans_rule(trans); if (trans->msg_type == NFT_MSG_NEWRULE && @@ -3517,13 +3559,14 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, enum nft_set_policies policy) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; u32 flags = 0; int i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); if (nla[NFTA_SET_FLAGS] != NULL) @@ -3661,10 +3704,11 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWSET) { struct nft_set *set = nft_trans_set(trans); @@ -3799,7 +3843,6 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; u32 portid = ctx->portid; struct nlattr *nest; @@ -3807,16 +3850,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -3904,21 +3942,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; u32 portid = ctx->portid; int err; - char *buf = kasprintf(gfp_flags, "%s:%llu;%s:%llu", - ctx->table->name, ctx->table->handle, - set->name, set->handle); - - audit_log_nfcfg(buf, - ctx->family, - set->field_count, - event == NFT_MSG_NEWSET ? - AUDIT_NFT_OP_SET_REGISTER : - AUDIT_NFT_OP_SET_UNREGISTER, - gfp_flags); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -3934,7 +3961,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -3947,14 +3974,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; + struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; @@ -4793,18 +4822,19 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; bool set_found = false; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; int event; rcu_read_lock(); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; @@ -4830,16 +4860,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - NLM_F_MULTI); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, + table->family, NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = table->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) @@ -4896,22 +4921,16 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -5099,22 +5118,11 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set_elem *elem, int event, u16 flags) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", - ctx->table->name, ctx->table->handle, - set->name, set->handle); - - audit_log_nfcfg(buf, - ctx->family, - set->handle, - event == NFT_MSG_NEWSETELEM ? - AUDIT_NFT_OP_SETELEM_REGISTER : - AUDIT_NFT_OP_SETELEM_UNREGISTER, - GFP_KERNEL); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; @@ -5130,7 +5138,8 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -5620,7 +5629,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_set_full: @@ -5651,6 +5660,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; @@ -5679,7 +5689,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, ctx.table); return 0; @@ -5815,7 +5825,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_set_elem_deactivate(ctx->net, set, &elem); nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; fail_ops: @@ -5849,7 +5859,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, nft_set_elem_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err1: @@ -6143,7 +6153,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) = newobj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -6263,19 +6273,14 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || @@ -6310,6 +6315,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) struct nft_obj_filter *filter = cb->data; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; struct nft_object *obj; bool reset = false; @@ -6317,9 +6323,10 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) reset = true; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -6338,12 +6345,11 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) filter->type != NFT_OBJECT_UNSPEC && obj->ops->type->type != filter->type) goto cont; - if (reset) { char *buf = kasprintf(GFP_ATOMIC, - "%s:%llu;?:0", + "%s:%u", table->name, - table->handle); + nft_net->base_seq); audit_log_nfcfg(buf, family, @@ -6464,8 +6470,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, reset = true; if (reset) { - char *buf = kasprintf(GFP_ATOMIC, "%s:%llu;?:0", - table->name, table->handle); + const struct nftables_pernet *nft_net; + char *buf; + + nft_net = net_generic(net, nf_tables_net_id); + buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, @@ -6551,17 +6560,18 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, int family, int report, gfp_t gfp) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct sk_buff *skb; int err; - char *buf = kasprintf(gfp, "%s:%llu;?:0", - table->name, table->handle); + char *buf = kasprintf(gfp, "%s:%u", + table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, obj->handle, event == NFT_MSG_NEWOBJ ? - AUDIT_NFT_OP_OBJ_REGISTER : - AUDIT_NFT_OP_OBJ_UNREGISTER, + AUDIT_NFT_OP_OBJ_REGISTER : + AUDIT_NFT_OP_OBJ_UNREGISTER, gfp); kfree(buf); @@ -6580,7 +6590,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, goto err; } - nft_notify_enqueue(skb, report, &net->nft.notify_list); + nft_notify_enqueue(skb, report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -6912,7 +6922,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7100,7 +7110,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_flowtable_hook_release(&flowtable_hook); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7176,20 +7186,15 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, struct list_head *hook_list) { struct nlattr *nest, *nest_devs; - struct nfgenmsg *nfmsg; struct nft_hook *hook; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || @@ -7237,12 +7242,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; const struct nft_table *table; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -7377,20 +7384,9 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct list_head *hook_list, int event) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; int err; - char *buf = kasprintf(GFP_KERNEL, "%s:%llu;%s:%llu", - flowtable->table->name, flowtable->table->handle, - flowtable->name, flowtable->handle); - - audit_log_nfcfg(buf, - ctx->family, - flowtable->hooknum, - event == NFT_MSG_NEWFLOWTABLE ? - AUDIT_NFT_OP_FLOWTABLE_REGISTER : - AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, - GFP_KERNEL); - kfree(buf); if (!ctx->report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES)) @@ -7408,7 +7404,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -7433,21 +7429,17 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -7482,6 +7474,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; struct nft_table *table; struct net *net; @@ -7489,13 +7482,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this, return 0; net = dev_net(dev); - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } @@ -7511,9 +7505,6 @@ static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb, struct sk_buff *skb2; int err; - audit_log_nfcfg("?:0;?:0", 0, net->nft.base_seq, - AUDIT_NFT_OP_GEN_REGISTER, GFP_KERNEL); - if (!nlmsg_report(nlh) && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) return; @@ -7679,16 +7670,17 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - switch (net->nft.validate_state) { + switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: break; case NFT_VALIDATE_NEED: nft_validate_state_update(net, NFT_VALIDATE_DO); fallthrough; case NFT_VALIDATE_DO: - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_validate(net, table) < 0) return -EAGAIN; } @@ -7863,9 +7855,10 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha static void nf_tables_commit_chain_prepare_cancel(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWRULE || @@ -7974,10 +7967,11 @@ static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, static void nf_tables_module_autoload_cleanup(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); - list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); @@ -7986,6 +7980,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; /* all side effects have to be made visible. @@ -7995,35 +7990,36 @@ static void nf_tables_commit_release(struct net *net) * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. */ - if (list_empty(&net->nft.commit_list)) { + if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return; } - trans = list_last_entry(&net->nft.commit_list, + trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); get_net(trans->ctx.net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); schedule_work(&trans_destroy_work); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static void nft_commit_notify(struct net *net, u32 portid) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; - list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) { + list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) { if (!batch_skb) { new_batch: batch_skb = skb; @@ -8049,19 +8045,72 @@ new_batch: NFT_CB(batch_skb).report, GFP_KERNEL); } - WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); +} + +static int nf_tables_commit_audit_alloc(struct list_head *adl, + struct nft_table *table) +{ + struct nft_audit_data *adp; + + list_for_each_entry(adp, adl, list) { + if (adp->table == table) + return 0; + } + adp = kzalloc(sizeof(*adp), GFP_KERNEL); + if (!adp) + return -ENOMEM; + adp->table = table; + list_add(&adp->list, adl); + return 0; +} + +static void nf_tables_commit_audit_collect(struct list_head *adl, + struct nft_table *table, u32 op) +{ + struct nft_audit_data *adp; + + list_for_each_entry(adp, adl, list) { + if (adp->table == table) + goto found; + } + WARN_ONCE(1, "table=%s not expected in commit list", table->name); + return; +found: + adp->entries++; + if (!adp->op || adp->op > op) + adp->op = op; +} + +#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22) + +static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) +{ + struct nft_audit_data *adp, *adn; + char aubuf[AUNFTABLENAMELEN]; + + list_for_each_entry_safe(adp, adn, adl, list) { + snprintf(aubuf, AUNFTABLENAMELEN, "%s:%u", adp->table->name, + generation); + audit_log_nfcfg(aubuf, adp->table->family, adp->entries, + nft2audit_op[adp->op], GFP_KERNEL); + list_del(&adp->list); + kfree(adp); + } } static int nf_tables_commit(struct net *net, struct sk_buff *skb) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; + LIST_HEAD(adl); int err; - if (list_empty(&net->nft.commit_list)) { - mutex_unlock(&net->nft.commit_mutex); + if (list_empty(&nft_net->commit_list)) { + mutex_unlock(&nft_net->commit_mutex); return 0; } @@ -8074,9 +8123,14 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return err; /* 1. Allocate space for next generation rules_gen_X[] */ - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { int ret; + ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table); + if (ret) { + nf_tables_commit_chain_prepare_cancel(net); + return ret; + } if (trans->msg_type == NFT_MSG_NEWRULE || trans->msg_type == NFT_MSG_DELRULE) { chain = trans->ctx.chain; @@ -8090,7 +8144,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } /* step 2. Make rules_gen_X visible to packet path */ - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_commit_chain(net, chain); } @@ -8099,20 +8153,22 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - while (++net->nft.base_seq == 0); + while (++nft_net->base_seq == 0) + ; /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { + nf_tables_commit_audit_collect(&adl, trans->ctx.table, + trans->msg_type); switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; - } + if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT) + nf_tables_table_disable(net, trans->ctx.table); + + trans->ctx.table->flags = nft_trans_table_flags(trans); } else { nft_clear(net, trans->ctx.table); } @@ -8258,6 +8314,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); nf_tables_commit_release(net); return 0; @@ -8265,17 +8322,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; LIST_HEAD(module_list); - list_splice_init(&net->nft.module_list, &module_list); - mutex_unlock(&net->nft.commit_mutex); + list_splice_init(&nft_net->module_list, &module_list); + mutex_unlock(&nft_net->commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { request_module("%s", req->module); req->done = true; } - mutex_lock(&net->nft.commit_mutex); - list_splice(&module_list, &net->nft.module_list); + mutex_lock(&nft_net->commit_mutex); + list_splice(&module_list, &nft_net->module_list); } static void nf_tables_abort_release(struct nft_trans *trans) @@ -8312,6 +8370,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; @@ -8320,16 +8379,14 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_validate(net) < 0) return -EAGAIN; - list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; - } + if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP) + nf_tables_table_disable(net, trans->ctx.table); + nft_trans_destroy(trans); } else { list_del_rcu(&trans->ctx.table->list); @@ -8446,7 +8503,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, - &net->nft.commit_list, list) { + &nft_net->commit_list, list) { list_del(&trans->list); nf_tables_abort_release(trans); } @@ -8467,22 +8524,24 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); int ret = __nf_tables_abort(net, action); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); bool genid_ok; - mutex_lock(&net->nft.commit_mutex); + mutex_lock(&nft_net->commit_mutex); - genid_ok = genid == 0 || net->nft.base_seq == genid; + genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); /* else, commit mutex has to be released by commit or abort function */ return genid_ok; @@ -8599,6 +8658,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, data->verdict.chain); if (err < 0) return err; + break; default: break; } @@ -8657,15 +8717,6 @@ int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) } EXPORT_SYMBOL_GPL(nft_parse_u32_check); -/** - * nft_parse_register - parse a register value from a netlink attribute - * - * @attr: netlink attribute - * - * Parse and translate a register value from a netlink attribute. - * Registers used to be 128 bit wide, these register numbers will be - * mapped to the corresponding 32 bit register numbers. - */ static unsigned int nft_parse_register(const struct nlattr *attr) { unsigned int reg; @@ -8701,15 +8752,6 @@ int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg) } EXPORT_SYMBOL_GPL(nft_dump_register); -/** - * nft_validate_register_load - validate a load from a register - * - * @reg: the register number - * @len: the length of the data - * - * Validate that the input register is one of the general purpose - * registers and that the length of the load is within the bounds. - */ static int nft_validate_register_load(enum nft_registers reg, unsigned int len) { if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE) @@ -8737,20 +8779,6 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) } EXPORT_SYMBOL_GPL(nft_parse_register_load); -/** - * nft_validate_register_store - validate an expressions' register store - * - * @ctx: context of the expression performing the load - * @reg: the destination register number - * @data: the data to load - * @type: the data type - * @len: the length of the data - * - * Validate that a data load uses the appropriate data type for - * the destination register and the length is within the bounds. - * A value of NULL for the data means that its runtime gathered - * data. - */ static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, @@ -9068,9 +9096,10 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) static void __nft_release_hooks(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; @@ -9127,9 +9156,10 @@ static void __nft_release_table(struct net *net, struct nft_table *table) static void __nft_release_tables(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table, *nt; - list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; @@ -9140,6 +9170,7 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; struct nft_table *table, *nt; struct net *net = n->net; @@ -9148,8 +9179,9 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); @@ -9158,13 +9190,13 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, } if (release) { synchronize_rcu(); - list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) __nft_release_table(net, table); } } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } @@ -9175,13 +9207,15 @@ static struct notifier_block nft_nl_notifier = { static int __net_init nf_tables_init_net(struct net *net) { - INIT_LIST_HEAD(&net->nft.tables); - INIT_LIST_HEAD(&net->nft.commit_list); - INIT_LIST_HEAD(&net->nft.module_list); - INIT_LIST_HEAD(&net->nft.notify_list); - mutex_init(&net->nft.commit_mutex); - net->nft.base_seq = 1; - net->nft.validate_state = NFT_VALIDATE_SKIP; + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + INIT_LIST_HEAD(&nft_net->tables); + INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->module_list); + INIT_LIST_HEAD(&nft_net->notify_list); + mutex_init(&nft_net->commit_mutex); + nft_net->base_seq = 1; + nft_net->validate_state = NFT_VALIDATE_SKIP; return 0; } @@ -9193,27 +9227,30 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - mutex_lock(&net->nft.commit_mutex); - if (!list_empty(&net->nft.commit_list)) + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + mutex_lock(&nft_net->commit_mutex); + if (!list_empty(&nft_net->commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); - mutex_unlock(&net->nft.commit_mutex); - WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.module_list)); - WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), }; static int __init nf_tables_module_init(void) { int err; - spin_lock_init(&nf_tables_destroy_list_lock); err = register_pernet_subsys(&nf_tables_net_ops); if (err < 0) return err; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9ae14270c543..19215e81dd66 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -7,6 +7,8 @@ #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> +extern unsigned int nf_tables_net_id; + static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; @@ -45,6 +47,48 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, offsetof(struct nft_flow_key, control); } +struct nft_offload_ethertype { + __be16 value; + __be16 mask; +}; + +static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow) +{ + struct nft_flow_match *match = &flow->match; + struct nft_offload_ethertype ethertype; + + if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_CONTROL) && + match->key.basic.n_proto != htons(ETH_P_8021Q) && + match->key.basic.n_proto != htons(ETH_P_8021AD)) + return; + + ethertype.value = match->key.basic.n_proto; + ethertype.mask = match->mask.basic.n_proto; + + if (match->dissector.used_keys & BIT(FLOW_DISSECTOR_KEY_VLAN) && + (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || + match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { + match->key.basic.n_proto = match->key.cvlan.vlan_tpid; + match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; + match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; + match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; + match->key.vlan.vlan_tpid = ethertype.value; + match->mask.vlan.vlan_tpid = ethertype.mask; + match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = + offsetof(struct nft_flow_key, cvlan); + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CVLAN); + } else { + match->key.basic.n_proto = match->key.vlan.vlan_tpid; + match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; + match->key.vlan.vlan_tpid = ethertype.value; + match->mask.vlan.vlan_tpid = ethertype.mask; + match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = + offsetof(struct nft_flow_key, vlan); + match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_VLAN); + } +} + struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { @@ -89,6 +133,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_next(expr); } + nft_flow_rule_transfer_vlan(ctx, flow); + flow->proto = ctx->dep.l3num; kfree(ctx); @@ -197,26 +243,56 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, cls_flow->rule = flow->rule; } -static int nft_flow_offload_rule(struct nft_chain *chain, - struct nft_rule *rule, - struct nft_flow_rule *flow, - enum flow_cls_command command) +static int nft_flow_offload_cmd(const struct nft_chain *chain, + const struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command, + struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; - struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); - return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } +static int nft_flow_offload_rule(const struct nft_chain *chain, + struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct flow_cls_offload cls_flow; + + return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); +} + +int nft_flow_rule_stats(const struct nft_chain *chain, + const struct nft_rule *rule) +{ + struct flow_cls_offload cls_flow = {}; + struct nft_expr *expr, *next; + int err; + + err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, + &cls_flow); + if (err < 0) + return err; + + nft_rule_for_each_expr(expr, next, rule) { + if (expr->ops->offload_stats) + expr->ops->offload_stats(expr, &cls_flow.stats); + } + + return 0; +} + static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { @@ -307,16 +383,18 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); - mutex_lock(&net->nft.commit_mutex); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, @@ -412,9 +490,10 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); int err = 0; - list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -460,11 +539,12 @@ static void nft_flow_rule_offload_abort(struct net *net, int nft_flow_rule_offload_commit(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; int err = 0; u8 policy; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -516,7 +596,7 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -536,15 +616,15 @@ int nft_flow_rule_offload_commit(struct net *net) return err; } -static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, + struct net_device *dev) { struct nft_base_chain *basechain; - struct net *net = dev_net(dev); struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -576,19 +656,21 @@ static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 87b36da5cd98..0cf3278007ba 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -183,7 +183,6 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; @@ -219,15 +218,11 @@ void nft_trace_notify(struct nft_traceinfo *info) return; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family, + NFNETLINK_V0, 0); if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = info->basechain->type->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt)))) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index d3df66a39b5e..06f5886f652e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -28,6 +28,7 @@ #include <linux/sched/signal.h> #include <net/netlink.h> +#include <net/netns/generic.h> #include <linux/netfilter/nfnetlink.h> MODULE_LICENSE("GPL"); @@ -41,6 +42,12 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket"); #define NFNL_MAX_ATTR_COUNT 32 +static unsigned int nfnetlink_pernet_id __read_mostly; + +struct nfnl_net { + struct sock *nfnl; +}; + static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; @@ -75,6 +82,11 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, }; +static struct nfnl_net *nfnl_pernet(struct net *net) +{ + return net_generic(net, nfnetlink_pernet_id); +} + void nfnl_lock(__u8 subsys_id) { mutex_lock(&table[subsys_id].mutex); @@ -149,28 +161,35 @@ nfnetlink_find_client(u16 type, const struct nfnetlink_subsystem *ss) int nfnetlink_has_listeners(struct net *net, unsigned int group) { - return netlink_has_listeners(net->nfnl, group); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return netlink_has_listeners(nfnlnet->nfnl, group); } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return nlmsg_notify(nfnlnet->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_set_err(net->nfnl, portid, group, error); + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + return netlink_set_err(nfnlnet->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) { + struct nfnl_net *nfnlnet = nfnl_pernet(net); int err; - err = nlmsg_unicast(net->nfnl, skb, portid); + err = nlmsg_unicast(nfnlnet->nfnl, skb, portid); if (err == -EAGAIN) err = -ENOBUFS; @@ -178,6 +197,15 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) } EXPORT_SYMBOL_GPL(nfnetlink_unicast); +void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, + __u32 group, gfp_t allocation) +{ + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + netlink_broadcast(nfnlnet->nfnl, skb, portid, group, allocation); +} +EXPORT_SYMBOL_GPL(nfnetlink_broadcast); + /* Process one complete nfnetlink message. */ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) @@ -194,6 +222,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, type = nlh->nlmsg_type; replay: rcu_read_lock(); + ss = nfnetlink_get_subsys(type); if (!ss) { #ifdef CONFIG_MODULES @@ -217,6 +246,7 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nfnl_net *nfnlnet = nfnl_pernet(net); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; @@ -238,7 +268,7 @@ replay: } if (nc->call_rcu) { - err = nc->call_rcu(net, net->nfnl, skb, nlh, + err = nc->call_rcu(net, nfnlnet->nfnl, skb, nlh, (const struct nlattr **)cda, extack); rcu_read_unlock(); @@ -249,7 +279,7 @@ replay: nfnetlink_find_client(type, ss) != nc) err = -EAGAIN; else if (nc->call) - err = nc->call(net, net->nfnl, skb, nlh, + err = nc->call(net, nfnlnet->nfnl, skb, nlh, (const struct nlattr **)cda, extack); else @@ -453,7 +483,9 @@ replay_abort: goto ack; if (nc->call_batch) { - err = nc->call_batch(net, net->nfnl, skb, nlh, + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + err = nc->call_batch(net, nfnlnet->nfnl, skb, nlh, (const struct nlattr **)cda, &extack); } @@ -622,7 +654,7 @@ static int nfnetlink_bind(struct net *net, int group) static int __net_init nfnetlink_net_init(struct net *net) { - struct sock *nfnl; + struct nfnl_net *nfnlnet = nfnl_pernet(net); struct netlink_kernel_cfg cfg = { .groups = NFNLGRP_MAX, .input = nfnetlink_rcv, @@ -631,28 +663,29 @@ static int __net_init nfnetlink_net_init(struct net *net) #endif }; - nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); - if (!nfnl) + nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); + if (!nfnlnet->nfnl) return -ENOMEM; - net->nfnl_stash = nfnl; - rcu_assign_pointer(net->nfnl, nfnl); return 0; } static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) { + struct nfnl_net *nfnlnet; struct net *net; - list_for_each_entry(net, net_exit_list, exit_list) - RCU_INIT_POINTER(net->nfnl, NULL); - synchronize_net(); - list_for_each_entry(net, net_exit_list, exit_list) - netlink_kernel_release(net->nfnl_stash); + list_for_each_entry(net, net_exit_list, exit_list) { + nfnlnet = nfnl_pernet(net); + + netlink_kernel_release(nfnlnet->nfnl); + } } static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, + .id = &nfnetlink_pernet_id, + .size = sizeof(struct nfnl_net), }; static int __init nfnetlink_init(void) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 0fa1653b5f19..6895f31c5fbb 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -145,21 +145,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_acct *acct) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; @@ -474,8 +469,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) kfree_skb(skb); return; } - netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, - GFP_ATOMIC); + nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 0f94fce1d3ed..22f6f7fcc724 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -526,20 +526,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 89a381f7f945..46da5548d0b3 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -20,6 +20,7 @@ #include <linux/netfilter.h> #include <net/netlink.h> +#include <net/netns/generic.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -30,6 +31,12 @@ #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> +static unsigned int nfct_timeout_id __read_mostly; + +struct nfct_timeout_pernet { + struct list_head nfct_timeout_list; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning"); @@ -42,6 +49,11 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = { [CTA_TIMEOUT_DATA] = { .type = NLA_NESTED }, }; +static struct nfct_timeout_pernet *nfct_timeout_pernet(struct net *net) +{ + return net_generic(net, nfct_timeout_id); +} + static int ctnl_timeout_parse_policy(void *timeout, const struct nf_conntrack_l4proto *l4proto, @@ -77,6 +89,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; @@ -94,7 +107,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO])); l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); - list_for_each_entry(timeout, &net->nfct_timeout_list, head) { + list_for_each_entry(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -146,7 +159,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, timeout->timeout.l3num = l3num; timeout->timeout.l4proto = l4proto; refcount_set(&timeout->refcnt, 1); - list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); + list_add_tail_rcu(&timeout->head, &pernet->nfct_timeout_list); return 0; err: @@ -160,22 +173,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || @@ -206,6 +214,7 @@ nla_put_failure: static int ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct nfct_timeout_pernet *pernet; struct net *net = sock_net(skb->sk); struct ctnl_timeout *cur, *last; @@ -217,7 +226,8 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) { + pernet = nfct_timeout_pernet(net); + list_for_each_entry_rcu(cur, &pernet->nfct_timeout_list, head) { if (last) { if (cur != last) continue; @@ -244,6 +254,7 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; @@ -259,7 +270,7 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, return -EINVAL; name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &net->nfct_timeout_list, head) { + list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) @@ -315,12 +326,13 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, const struct nlattr * const cda[], struct netlink_ext_ack *extack) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; if (!cda[CTA_TIMEOUT_NAME]) { - list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) ctnl_timeout_try_del(net, cur); @@ -328,7 +340,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, } name = nla_data(cda[CTA_TIMEOUT_NAME]); - list_for_each_entry(cur, &net->nfct_timeout_list, head) { + list_for_each_entry(cur, &pernet->nfct_timeout_list, head) { if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -382,21 +394,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, const unsigned int *timeouts) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; @@ -513,9 +520,10 @@ err: static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net, const char *name) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *timeout, *matching = NULL; - list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) { + list_for_each_entry_rcu(timeout, &pernet->nfct_timeout_list, head) { if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; @@ -573,19 +581,22 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT); static int __net_init cttimeout_net_init(struct net *net) { - INIT_LIST_HEAD(&net->nfct_timeout_list); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + + INIT_LIST_HEAD(&pernet->nfct_timeout_list); return 0; } static void __net_exit cttimeout_net_exit(struct net *net) { + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; nf_ct_unconfirmed_destroy(net); nf_ct_untimeout(net, NULL); - list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) { + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { list_del_rcu(&cur->head); if (refcount_dec_and_test(&cur->refcnt)) @@ -596,6 +607,8 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .exit = cttimeout_net_exit, + .id = &nfct_timeout_id, + .size = sizeof(struct nfct_timeout_pernet), }; static int __init cttimeout_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 26776b88a539..d5f458d0ff3d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -456,20 +456,15 @@ __build_packet_message(struct nfnl_log_net *log, { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - nlh = nlmsg_put(inst->skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(inst->skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), + 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(inst->group_num); memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 48a07914fd94..37e81d895e61 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -383,7 +383,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; @@ -471,18 +470,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nlmsg_failure; } - nlh = nlmsg_put(skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), + 0, entry->state.pf, NFNETLINK_V0, + htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index ff8528ad3dc6..7a9aa57b195b 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> @@ -10,6 +11,8 @@ #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> +extern unsigned int nf_tables_net_id; + #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, @@ -355,6 +358,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_chain *chain, *nr; struct nft_ctx ctx = { @@ -365,8 +369,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - mutex_lock(&ctx.net->nft.commit_mutex); - list_for_each_entry(table, &ctx.net->nft.tables, list) { + nft_net = net_generic(ctx.net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -380,7 +385,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - mutex_unlock(&ctx.net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index eb6a43a180bb..47b6d05f1ae6 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -114,19 +114,56 @@ nla_put_failure: return -1; } +union nft_cmp_offload_data { + u16 val16; + u32 val32; + u64 val64; +}; + +static void nft_payload_n2h(union nft_cmp_offload_data *data, + const u8 *val, u32 len) +{ + switch (len) { + case 2: + data->val16 = ntohs(*((u16 *)val)); + break; + case 4: + data->val32 = ntohl(*((u32 *)val)); + break; + case 8: + data->val64 = be64_to_cpu(*((u64 *)val)); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + static int __nft_cmp_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_cmp_expr *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + union nft_cmp_offload_data _data, _datamask; u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; + u8 *data, *datamask; if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; - memcpy(key + reg->offset, &priv->data, reg->len); - memcpy(mask + reg->offset, ®->mask, reg->len); + if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) { + nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len); + nft_payload_n2h(&_datamask, (u8 *)®->mask, reg->len); + data = (u8 *)&_data; + datamask = (u8 *)&_datamask; + } else { + data = (u8 *)&priv->data; + datamask = (u8 *)®->mask; + } + + memcpy(key + reg->offset, data, reg->len); + memcpy(mask + reg->offset, datamask, reg->len); flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8e56f353ff35..b8dbd20a6a4c 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -591,19 +591,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int rev, int target) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 85ed461ec24e..8edd3b3c173d 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -13,6 +13,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_offload.h> struct nft_counter { s64 bytes; @@ -248,6 +249,32 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) return 0; } +static int nft_counter_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + /* No specific offload action is needed, but report success. */ + return 0; +} + +static void nft_counter_offload_stats(struct nft_expr *expr, + const struct flow_stats *stats) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter *this_cpu; + seqcount_t *myseq; + + preempt_disable(); + this_cpu = this_cpu_ptr(priv->counter); + myseq = this_cpu_ptr(&nft_counter_seq); + + write_seqcount_begin(myseq); + this_cpu->packets += stats->pkts; + this_cpu->bytes += stats->bytes; + write_seqcount_end(myseq); + preempt_enable(); +} + static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, @@ -258,6 +285,8 @@ static const struct nft_expr_ops nft_counter_ops = { .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, + .offload = nft_counter_offload, + .offload_stats = nft_counter_offload_stats, }; static struct nft_expr_type nft_counter_type __read_mostly = { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 882fe8648653..0592a9456084 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -527,6 +527,7 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) case NFT_CT_ZONE: if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + break; #endif default: break; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index d44a70c11b3f..f9437a0dcfef 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -11,6 +11,9 @@ #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +#include <net/netns/generic.h> + +extern unsigned int nf_tables_net_id; struct nft_dynset { struct nft_set *set; @@ -161,13 +164,14 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err, i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3a6c84fb2c90..4843dd2b410c 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -19,10 +19,205 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + const struct net_device *hw_outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->outdev = path->dev; + info->encap[info->num_encaps].id = path->encap.id; + info->encap[info->num_encaps].proto = path->encap.proto; + info->num_encaps++; + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + info->num_encaps--; + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + if (!info->outdev) + info->outdev = info->indev; + + info->hw_outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = true; + break; + } + + return found; +} + +static void nft_dev_forward_path(struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].out.ifindex = info.outdev->ifindex; + route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; + route->tuple[dir].xmit_type = info.xmit_type; + } +} + static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, - enum ip_conntrack_dir dir) + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) { struct dst_entry *this_dst = skb_dst(pkt->skb); struct dst_entry *other_dst = NULL; @@ -44,8 +239,14 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, if (!other_dst) return -ENOENT; - route->tuple[dir].dst = this_dst; - route->tuple[!dir].dst = other_dst; + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && + route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { + nft_dev_forward_path(route, ct, dir, ft); + nft_dev_forward_path(route, ct, !dir, ft); + } return 0; } @@ -74,8 +275,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; + struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; - struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; @@ -112,7 +313,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; dir = CTINFO2DIR(ctinfo); - if (nft_flow_route(pkt, ct, &route, dir) < 0) + if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index a06a46b039c5..54f6c2035e84 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -128,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; +static int nft_log_modprobe(struct net *net, enum nf_log_type t) +{ + switch (t) { + case NF_LOG_TYPE_LOG: + return nft_request_module(net, "%s", "nf_log_syslog"); + case NF_LOG_TYPE_ULOG: + return nft_request_module(net, "%s", "nfnetlink_log"); + case NF_LOG_TYPE_MAX: + break; + } + + return -ENOENT; +} + static int nft_log_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -197,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; err = nf_logger_find_get(ctx->family, li->type); - if (err < 0) + if (err < 0) { + if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN) + err = -EAGAIN; + goto err1; + } return 0; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index cb1c8c231880..501c5b24cc39 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -226,8 +226,9 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, - vlan_tci, sizeof(__be16), reg); + NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) @@ -241,16 +242,18 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, - vlan_tci, sizeof(__be16), reg); + NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; default: return -EOPNOTSUPP; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 92e9d4ebc5e8..b7f8d2ed3cc2 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -24,6 +24,7 @@ #include <linux/audit.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp.h> @@ -38,6 +39,10 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) +struct xt_pernet { + struct list_head tables[NFPROTO_NUMPROTO]; +}; + struct compat_delta { unsigned int offset; /* offset in kernel */ int delta; /* delta in 32bit user land */ @@ -55,7 +60,8 @@ struct xt_af { #endif }; -static struct xt_af *xt; +static unsigned int xt_pernet_id __read_mostly; +static struct xt_af *xt __read_mostly; static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_UNSPEC] = "x", @@ -1197,10 +1203,11 @@ EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table *t, *found = NULL; mutex_lock(&xt[af].mutex); - list_for_each_entry(t, &net->xt.tables[af], list) + list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; @@ -1208,7 +1215,8 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, goto out; /* Table doesn't exist in this netns, re-try init */ - list_for_each_entry(t, &init_net.xt.tables[af], list) { + xt_net = net_generic(&init_net, xt_pernet_id); + list_for_each_entry(t, &xt_net->tables[af], list) { int err; if (strcmp(t->name, name)) @@ -1231,8 +1239,9 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, if (!found) goto out; + xt_net = net_generic(net, xt_pernet_id); /* and once again: */ - list_for_each_entry(t, &net->xt.tables[af], list) + list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0) return t; @@ -1417,9 +1426,10 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table_info *bootstrap, struct xt_table_info *newinfo) { - int ret; + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); struct xt_table_info *private; struct xt_table *t, *table; + int ret; /* Don't add one object to multiple lists. */ table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL); @@ -1430,7 +1440,7 @@ struct xt_table *xt_register_table(struct net *net, mutex_lock(&xt[table->af].mutex); /* Don't autoload: we'd eat our tail... */ - list_for_each_entry(t, &net->xt.tables[table->af], list) { + list_for_each_entry(t, &xt_net->tables[table->af], list) { if (strcmp(t->name, table->name) == 0) { ret = -EEXIST; goto unlock; @@ -1449,7 +1459,7 @@ struct xt_table *xt_register_table(struct net *net, /* save number of initial entries */ private->initial_entries = private->number; - list_add(&table->list, &net->xt.tables[table->af]); + list_add(&table->list, &xt_net->tables[table->af]); mutex_unlock(&xt[table->af].mutex); return table; @@ -1480,19 +1490,25 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { + u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); struct net *net = seq_file_net(seq); - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + struct xt_pernet *xt_net; + + xt_net = net_generic(net, xt_pernet_id); mutex_lock(&xt[af].mutex); - return seq_list_start(&net->xt.tables[af], *pos); + return seq_list_start(&xt_net->tables[af], *pos); } static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); struct net *net = seq_file_net(seq); - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + struct xt_pernet *xt_net; + + xt_net = net_generic(net, xt_pernet_id); - return seq_list_next(v, &net->xt.tables[af], pos); + return seq_list_next(v, &xt_net->tables[af], pos); } static void xt_table_seq_stop(struct seq_file *seq, void *v) @@ -1858,24 +1874,28 @@ EXPORT_SYMBOL_GPL(xt_percpu_counter_free); static int __net_init xt_net_init(struct net *net) { + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) - INIT_LIST_HEAD(&net->xt.tables[i]); + INIT_LIST_HEAD(&xt_net->tables[i]); return 0; } static void __net_exit xt_net_exit(struct net *net) { + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); int i; for (i = 0; i < NFPROTO_NUMPROTO; i++) - WARN_ON_ONCE(!list_empty(&net->xt.tables[i])); + WARN_ON_ONCE(!list_empty(&xt_net->tables[i])); } static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, + .id = &xt_pernet_id, + .size = sizeof(struct xt_pernet), }; static int __init xt_init(void) diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index a1e79b517c01..2ff75f7637b0 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -108,3 +108,4 @@ MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); MODULE_DESCRIPTION("Xtables: IPv4/IPv6 packet logging"); MODULE_ALIAS("ipt_LOG"); MODULE_ALIAS("ip6t_LOG"); +MODULE_SOFTDEP("pre: nf_log_syslog"); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 6e83ce3000db..fb5793208059 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -79,3 +79,4 @@ static void __exit nflog_tg_exit(void) module_init(nflog_tg_init); module_exit(nflog_tg_exit); +MODULE_SOFTDEP("pre: nfnetlink_log"); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 349ab5609b1b..5582dce98cae 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -52,3 +52,4 @@ static void __exit trace_tg_exit(void) module_init(trace_tg_init); module_exit(trace_tg_exit); +MODULE_SOFTDEP("pre: nf_log_syslog"); diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index df1b41ed73fd..ca52f5085989 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -63,7 +63,7 @@ static const struct nla_policy netlbl_mgmt_genl_policy[NLBL_MGMT_A_MAX + 1] = { */ /** - * netlbl_mgmt_add - Handle an ADD message + * netlbl_mgmt_add_common - Handle an ADD message * @info: the Generic NETLINK info block * @audit_info: NetLabel audit information * diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index da7e2112771f..5044c7db577e 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -457,7 +457,7 @@ static void digital_add_poll_tech(struct nfc_digital_dev *ddev, u8 rf_tech, } /** - * start_poll operation + * digital_start_poll - start_poll operation * @nfc_dev: device to be polled * @im_protocols: bitset of nfc initiator protocols to be used for polling * @tm_protocols: bitset of nfc transport protocols to be used for polling diff --git a/net/nfc/digital_dep.c b/net/nfc/digital_dep.c index 5971fb6f51cc..1150731126e2 100644 --- a/net/nfc/digital_dep.c +++ b/net/nfc/digital_dep.c @@ -1217,7 +1217,7 @@ static void digital_tg_recv_dep_req(struct nfc_digital_dev *ddev, void *arg, /* ACK */ if (ddev->atn_count) { - /* The target has previously recevied one or more ATN + /* The target has previously received one or more ATN * PDUs. */ ddev->atn_count = 0; diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 59257400697d..9a585332ea84 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1507,7 +1507,7 @@ static void nci_rx_work(struct work_struct *work) } } - /* check if a data exchange timout has occurred */ + /* check if a data exchange timeout has occurred */ if (test_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags)) { /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 1204c438e87d..6af5752cde09 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -234,7 +234,7 @@ static void nci_uart_tty_wakeup(struct tty_struct *tty) * Called by tty low level driver when receive data is * available. * - * Arguments: tty pointer to tty isntance data + * Arguments: tty pointer to tty instance data * data pointer to received data * flags pointer to flags for data * count count of received data in bytes @@ -374,7 +374,7 @@ static int nci_uart_default_recv_buf(struct nci_uart *nu, const u8 *data, data += chunk_len; count -= chunk_len; - /* Chcek if packet is fully received */ + /* Check if packet is fully received */ if (nu->rx_packet_len == nu->rx_skb->len) { /* Pass RX packet to driver */ if (nu->ops.recv(nu, nu->rx_skb) != 0) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d217bd91176b..cadb6a29b285 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -809,8 +809,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, err = nf_nat_packet(ct, ctinfo, hooknum, skb); push: - skb_push(skb, nh_off); - skb_postpush_rcsum(skb, skb->data, nh_off); + skb_push_rcsum(skb, nh_off); return err; } @@ -1322,8 +1321,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, else err = ovs_ct_lookup(net, key, info, skb); - skb_push(skb, nh_ofs); - skb_postpush_rcsum(skb, skb->data, nh_ofs); + skb_push_rcsum(skb, nh_ofs); if (err) kfree_skb(skb); return err; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 57d6436e6f6a..8e1a88f13622 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -44,10 +44,9 @@ static void netdev_port_receive(struct sk_buff *skb) if (unlikely(!skb)) return; - if (skb->dev->type == ARPHRD_ETHER) { - skb_push(skb, ETH_HLEN); - skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - } + if (skb->dev->type == ARPHRD_ETHER) + skb_push_rcsum(skb, ETH_HLEN); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 4ed7e52c7012..88deb5b41429 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -497,10 +497,12 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) if (unlikely(packet_length(skb, vport->dev) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - vport->dev->name, - packet_length(skb, vport->dev), mtu); vport->dev->stats.tx_errors++; + if (vport->dev->flags & IFF_UP) + net_warn_ratelimited("%s: dropped over-mtu packet: " + "%d > %d\n", vport->dev->name, + packet_length(skb, vport->dev), + mtu); goto drop; } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1eb7495ac5b4..8a930ca6d6b1 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -20,7 +20,7 @@ struct vport; struct vport_parms; -/* The following definitions are for users of the vport subsytem: */ +/* The following definitions are for users of the vport subsystem: */ int ovs_vport_init(void); void ovs_vport_exit(void); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e24b2841c643..ba96db1880ea 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1359,7 +1359,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; - po = pkt_sk(f->arr[idx]); + po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); @@ -1371,7 +1371,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f, i = j = min_t(int, po->rollover->sock, num - 1); do { - po_next = pkt_sk(f->arr[i]); + po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !READ_ONCE(po_next->pressure) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) @@ -1466,7 +1466,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); - po = pkt_sk(f->arr[idx]); + po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } @@ -1480,7 +1480,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po) struct packet_fanout *f = po->fanout; spin_lock(&f->lock); - f->arr[f->num_members] = sk; + rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) @@ -1495,11 +1495,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { - if (f->arr[i] == sk) + if (rcu_dereference_protected(f->arr[i], + lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); - f->arr[i] = f->arr[f->num_members - 1]; + rcu_assign_pointer(f->arr[i], + rcu_dereference_protected(f->arr[f->num_members - 1], + lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); @@ -2057,7 +2060,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh - * sequencially, so that if we return skb to original state on exit, + * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ diff --git a/net/packet/internal.h b/net/packet/internal.h index 5f61e59ebbff..48af35b1aed2 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -94,7 +94,7 @@ struct packet_fanout { spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; - struct sock *arr[]; + struct sock __rcu *arr[]; }; struct packet_rollover { diff --git a/net/psample/psample.c b/net/psample/psample.c index 482c07f2766b..118d5d2a81a0 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/module.h> +#include <linux/timekeeping.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -356,9 +357,12 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) #endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate) + u32 sample_rate, const struct psample_metadata *md) { + ktime_t tstamp = ktime_get_real(); + int out_ifindex = md->out_ifindex; + int in_ifindex = md->in_ifindex; + u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif @@ -370,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + + (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ - nla_total_size(sizeof(u32)); /* seq */ + nla_total_size(sizeof(u32)) + /* seq */ + nla_total_size_64bit(sizeof(u64)) + /* timestamp */ + nla_total_size(sizeof(u16)); /* protocol */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -423,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, if (unlikely(ret < 0)) goto error; + if (md->out_tc_valid) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); + if (unlikely(ret < 0)) + goto error; + } + + if (md->out_tc_occ_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, + md->out_tc_occ, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + if (md->latency_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, + md->latency, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, + ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, + be16_to_cpu(skb->protocol)); + if (unlikely(ret < 0)) + goto error; + if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 1e4fb568fa84..c0477bec09bd 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -20,6 +20,8 @@ /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff +#define QRTR_EPH_PORT_RANGE \ + XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 @@ -106,8 +108,7 @@ static LIST_HEAD(qrtr_all_nodes); static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ -static DEFINE_IDR(qrtr_ports); -static DEFINE_MUTEX(qrtr_port_lock); +static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node @@ -656,7 +657,7 @@ static struct qrtr_sock *qrtr_port_lookup(int port) port = 0; rcu_read_lock(); - ipc = idr_find(&qrtr_ports, port); + ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); @@ -698,9 +699,7 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) __sock_put(&ipc->sk); - mutex_lock(&qrtr_port_lock); - idr_remove(&qrtr_ports, port); - mutex_unlock(&qrtr_port_lock); + xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ @@ -719,29 +718,20 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { - u32 min_port; int rc; - mutex_lock(&qrtr_port_lock); if (!*port) { - min_port = QRTR_MIN_EPH_SOCKET; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, QRTR_MAX_EPH_SOCKET, GFP_ATOMIC); - if (!rc) - *port = min_port; + rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, + GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { - min_port = 0; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, 0, GFP_ATOMIC); + rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { - min_port = *port; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, *port, GFP_ATOMIC); - if (!rc) - *port = min_port; + rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } - mutex_unlock(&qrtr_port_lock); - if (rc == -ENOSPC) + if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; @@ -755,20 +745,16 @@ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; - int id; - - mutex_lock(&qrtr_port_lock); - idr_for_each_entry(&qrtr_ports, ipc, id) { - /* Don't reset control port */ - if (id == 0) - continue; + unsigned long index; + rcu_read_lock(); + xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; ipc->sk.sk_error_report(&ipc->sk); sock_put(&ipc->sk); } - mutex_unlock(&qrtr_port_lock); + rcu_read_unlock(); } /* Bind socket to address. diff --git a/net/rds/recv.c b/net/rds/recv.c index aba4afe4dfed..4db109fb6ec2 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -722,8 +722,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg->msg_name) { if (ipv6_addr_v4mapped(&inc->i_saddr)) { - sin = (struct sockaddr_in *)msg->msg_name; - sin->sin_family = AF_INET; sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = @@ -731,8 +729,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); msg->msg_namelen = sizeof(*sin); } else { - sin6 = (struct sockaddr_in6 *)msg->msg_name; - sin6->sin6_family = AF_INET6; sin6->sin6_port = inc->i_hdr.h_sport; sin6->sin6_addr = inc->i_saddr; diff --git a/net/rds/send.c b/net/rds/send.c index fe5264b9d4b3..ee7214ea0fdb 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1225,7 +1225,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } /* If the socket is already bound to a link local address, * it can only send to peers on the same link. But allow - * communicating beween link local and non-link local address. + * communicating between link local and non-link local address. */ if (scope_id != rs->rs_bound_scope_id) { if (!scope_id) { diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 4b01baea1d4a..598d0a61bda7 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -36,7 +36,7 @@ module_param_named(master_switch_mode, rfkill_master_switch_mode, uint, 0); MODULE_PARM_DESC(master_switch_mode, "SW_RFKILL_ALL ON should: 0=do nothing (only unlock); 1=restore; 2=unblock all"); -static spinlock_t rfkill_op_lock; +static DEFINE_SPINLOCK(rfkill_op_lock); static bool rfkill_op_pending; static unsigned long rfkill_sw_pending[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; static unsigned long rfkill_sw_state[BITS_TO_LONGS(NUM_RFKILL_TYPES)]; @@ -330,8 +330,6 @@ int __init rfkill_handler_init(void) return -EINVAL; } - spin_lock_init(&rfkill_op_lock); - /* Avoid delay at first schedule */ rfkill_last_scheduled = jiffies - msecs_to_jiffies(RFKILL_OPS_DELAY) - 1; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 6e35703ff353..c0e04c261a15 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -347,6 +347,7 @@ static int rose_del_node(struct rose_route_struct *rose_route, case 1: rose_node->neighbour[1] = rose_node->neighbour[2]; + break; case 2: break; } @@ -508,6 +509,7 @@ void rose_rt_device_down(struct net_device *dev) fallthrough; case 1: t->neighbour[1] = t->neighbour[2]; + break; case 2: break; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8d8452b1cdd4..0fab8de176d2 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, [TCA_POLICE_RATE64] = { .type = NLA_U64 }, [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, + [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 }, + [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, bool exists = false; u32 index; u64 rate64, prate64; + u64 pps, ppsburst; if (nla == NULL) return -EINVAL; @@ -142,6 +145,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } } + if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) || + (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) { + NL_SET_ERR_MSG(extack, + "Both or neither packet-per-second burst and rate must be provided"); + err = -EINVAL; + goto failure; + } + + if (tb[TCA_POLICE_PKTRATE64] && R_tab) { + NL_SET_ERR_MSG(extack, + "packet-per-second and byte-per-second rate limits not allowed in same action"); + err = -EINVAL; + goto failure; + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -183,6 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + if (tb[TCA_POLICE_PKTRATE64]) { + pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]); + ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]); + new->pps_present = true; + new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst); + psched_ppscfg_precompute(&new->ppsrate, pps); + } + spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -217,8 +243,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); + s64 now, toks, ppstoks = 0, ptoks = 0; struct tcf_police_params *p; - s64 now, toks, ptoks = 0; int ret; tcf_lastuse_update(&police->tcf_tm); @@ -236,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { - if (!p->rate_present) { + if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; } @@ -251,14 +277,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += police->tcfp_toks; - if (toks > p->tcfp_burst) - toks = p->tcfp_burst; - toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); - if ((toks|ptoks) >= 0) { + if (p->rate_present) { + toks += police->tcfp_toks; + if (toks > p->tcfp_burst) + toks = p->tcfp_burst; + toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); + } else if (p->pps_present) { + ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst); + ppstoks += police->tcfp_pkttoks; + if (ppstoks > p->tcfp_pkt_burst) + ppstoks = p->tcfp_pkt_burst; + ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1); + } + if ((toks | ptoks | ppstoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + police->tcfp_pkttoks = ppstoks; spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; @@ -331,6 +366,16 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, TCA_POLICE_PAD)) goto nla_put_failure; } + if (p->pps_present) { + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64, + police->params->ppsrate.rate_pkts_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64, + PSCHED_NS2TICKS(p->tcfp_pkt_burst), + TCA_POLICE_PAD)) + goto nla_put_failure; + } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3ebf9ede3cf1..6a0c16e4351d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -158,10 +158,8 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; + struct psample_metadata md = {}; int retval; - int size; - int iif; - int oif; tcf_lastuse_update(&s->tcf_tm); bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); @@ -172,20 +170,18 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { if (!skb_at_tc_ingress(skb)) { - iif = skb->skb_iif; - oif = skb->dev->ifindex; + md.in_ifindex = skb->skb_iif; + md.out_ifindex = skb->dev->ifindex; } else { - iif = skb->dev->ifindex; - oif = 0; + md.in_ifindex = skb->dev->ifindex; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); - size = s->truncate ? s->trunc_size : skb->len; - psample_sample_packet(psample_group, skb, size, iif, oif, - s->rate); + md.trunc_size = s->truncate ? s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, s->rate, &md); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); @@ -194,6 +190,16 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, return retval; } +static void tcf_sample_stats_update(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, u64 lastuse, bool hw) +{ + struct tcf_sample *s = to_sample(a); + struct tcf_t *tm = &s->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, drops, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -280,6 +286,7 @@ static struct tc_action_ops act_sample_ops = { .id = TCA_ID_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, + .stats_update = tcf_sample_stats_update, .dump = tcf_sample_dump, .init = tcf_sample_init, .cleanup = tcf_sample_cleanup, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 340d5af86e87..40fbea626dfd 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3662,6 +3662,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + entry->police.burst_pkt = tcf_police_burst_pkt(act); + entry->police.rate_pkt_ps = + tcf_police_rate_pkt_ps(act); entry->police.mtu = tcf_police_tcfp_mtu(act); entry->police.index = act->tcfa_index; } else if (is_tcf_ct(act)) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c69a4ba9c33f..d7869a984881 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -209,16 +209,16 @@ static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter, struct fl_flow_key *key, struct fl_flow_key *mkey) { - __be16 min_mask, max_mask, min_val, max_val; + u16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_range.tp_min.dst); - max_mask = htons(filter->mask->key.tp_range.tp_max.dst); - min_val = htons(filter->key.tp_range.tp_min.dst); - max_val = htons(filter->key.tp_range.tp_max.dst); + min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst); + max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst); + min_val = ntohs(filter->key.tp_range.tp_min.dst); + max_val = ntohs(filter->key.tp_range.tp_max.dst); if (min_mask && max_mask) { - if (htons(key->tp_range.tp.dst) < min_val || - htons(key->tp_range.tp.dst) > max_val) + if (ntohs(key->tp_range.tp.dst) < min_val || + ntohs(key->tp_range.tp.dst) > max_val) return false; /* skb does not have min and max values */ @@ -232,16 +232,16 @@ static bool fl_range_port_src_cmp(struct cls_fl_filter *filter, struct fl_flow_key *key, struct fl_flow_key *mkey) { - __be16 min_mask, max_mask, min_val, max_val; + u16 min_mask, max_mask, min_val, max_val; - min_mask = htons(filter->mask->key.tp_range.tp_min.src); - max_mask = htons(filter->mask->key.tp_range.tp_max.src); - min_val = htons(filter->key.tp_range.tp_min.src); - max_val = htons(filter->key.tp_range.tp_max.src); + min_mask = ntohs(filter->mask->key.tp_range.tp_min.src); + max_mask = ntohs(filter->mask->key.tp_range.tp_max.src); + min_val = ntohs(filter->key.tp_range.tp_min.src); + max_val = ntohs(filter->key.tp_range.tp_max.src); if (min_mask && max_mask) { - if (htons(key->tp_range.tp.src) < min_val || - htons(key->tp_range.tp.src) > max_val) + if (ntohs(key->tp_range.tp.src) < min_val || + ntohs(key->tp_range.tp.src) > max_val) return false; /* skb does not have min and max values */ @@ -783,16 +783,16 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src)); if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst && - htons(key->tp_range.tp_max.dst) <= - htons(key->tp_range.tp_min.dst)) { + ntohs(key->tp_range.tp_max.dst) <= + ntohs(key->tp_range.tp_min.dst)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_FLOWER_KEY_PORT_DST_MIN], "Invalid destination port range (min must be strictly smaller than max)"); return -EINVAL; } if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src && - htons(key->tp_range.tp_max.src) <= - htons(key->tp_range.tp_min.src)) { + ntohs(key->tp_range.tp_max.src) <= + ntohs(key->tp_range.tp_min.src)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_FLOWER_KEY_PORT_SRC_MIN], "Invalid source port range (min must be strictly smaller than max)"); @@ -1044,8 +1044,8 @@ static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, return -EINVAL; } - key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); - mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); + key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS])); + mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); *flags_key = 0; *flags_mask = 0; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 320b3d31fa97..b79a7e27bb31 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -263,7 +263,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 3+n. If classifier selected a link sharing class, * apply agency specific classifier. - * Repeat this procdure until we hit a leaf node. + * Repeat this procedure until we hit a leaf node. */ head = cl; } @@ -859,7 +859,7 @@ cbq_dequeue(struct Qdisc *sch) return NULL; } -/* CBQ class maintanance routines */ +/* CBQ class maintenance routines */ static void cbq_adjust_levels(struct cbq_class *this) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 49eae93d1489..44991ea726fc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1325,6 +1325,48 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } +/** + * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division + * @rate: Rate to compute reciprocal division values of + * @mult: Multiplier for reciprocal division + * @shift: Shift for reciprocal division + * + * The multiplier and shift for reciprocal division by rate are stored + * in mult and shift. + * + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. + * + * reciprocal_value() is not used here it doesn't handle 64-bit values. + */ +static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) +{ + u64 factor = NSEC_PER_SEC; + + *mult = 1; + *shift = 0; + + if (rate <= 0) + return; + + for (;;) { + *mult = div64_u64(factor, rate); + if (*mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + (*shift)++; + } +} + void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) @@ -1333,34 +1375,17 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, r->overhead = conf->overhead; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); - r->mult = 1; - /* - * The deal here is to replace a divide by a reciprocal one - * in fast path (a reciprocal divide is a multiply and a shift) - * - * Normal formula would be : - * time_in_ns = (NSEC_PER_SEC * len) / rate_bps - * - * We compute mult/shift to use instead : - * time_in_ns = (len * mult) >> shift; - * - * We try to get the highest possible mult value for accuracy, - * but have to make sure no overflows will ever happen. - */ - if (r->rate_bytes_ps > 0) { - u64 factor = NSEC_PER_SEC; - - for (;;) { - r->mult = div64_u64(factor, r->rate_bytes_ps); - if (r->mult & (1U << 31) || factor & (1ULL << 63)) - break; - factor <<= 1; - r->shift++; - } - } + psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) +{ + r->rate_pkts_ps = pktrate64; + psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); +} +EXPORT_SYMBOL(psched_ppscfg_precompute); + static void mini_qdisc_rcu_func(struct rcu_head *head) { } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 909c798b7403..5c91df52b8c2 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -411,18 +411,10 @@ done: return txtime; } -static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) +static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); - struct Qdisc *child; - int queue; - - queue = skb_get_queue_mapping(skb); - - child = q->qdiscs[queue]; - if (unlikely(!child)) - return qdisc_drop(skb, sch, to_free); if (skb->sk && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) @@ -439,6 +431,58 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return qdisc_enqueue(skb, child, to_free); } +static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct Qdisc *child; + int queue; + + queue = skb_get_queue_mapping(skb); + + child = q->qdiscs[queue]; + if (unlikely(!child)) + return qdisc_drop(skb, sch, to_free); + + /* Large packets might not be transmitted when the transmission duration + * exceeds any configured interval. Therefore, segment the skb into + * smaller chunks. Skip it for the full offload case, as the driver + * and/or the hardware is expected to handle this. + */ + if (skb_is_gso(skb) && !FULL_OFFLOAD_IS_ENABLED(q->flags)) { + unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); + netdev_features_t features = netif_skb_features(skb); + struct sk_buff *segs, *nskb; + int ret; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + qdisc_skb_cb(segs)->pkt_len = segs->len; + slen += segs->len; + + ret = taprio_enqueue_one(segs, sch, child, to_free); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + qdisc_qstats_drop(sch); + } else { + numsegs++; + } + } + + if (numsegs > 1) + qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); + consume_skb(skb); + + return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; + } + + return taprio_enqueue_one(skb, sch, child, to_free); +} + static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f77484df097b..5f9a7c028274 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3147,7 +3147,7 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, * primary. */ if (af->is_any(&addr)) - memcpy(&addr.v4, sctp_source(asconf), sizeof(addr)); + memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, @@ -3217,7 +3217,7 @@ bool sctp_verify_asconf(const struct sctp_association *asoc, return false; break; default: - /* This is unkown to us, reject! */ + /* This is unknown to us, reject! */ return false; } } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index af2b7041fa4e..7632714c1e5b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1452,7 +1452,7 @@ static char sctp_tietags_compare(struct sctp_association *new_asoc, return 'E'; } -/* Common helper routine for both duplicate and simulataneous INIT +/* Common helper routine for both duplicate and simultaneous INIT * chunk handling. */ static enum sctp_disposition sctp_sf_do_unexpected_init( @@ -1685,7 +1685,7 @@ enum sctp_disposition sctp_sf_do_5_2_1_siminit( void *arg, struct sctp_cmd_seq *commands) { - /* Call helper to do the real work for both simulataneous and + /* Call helper to do the real work for both simultaneous and * duplicate INIT chunk handling. */ return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands); @@ -1740,7 +1740,7 @@ enum sctp_disposition sctp_sf_do_5_2_2_dupinit( void *arg, struct sctp_cmd_seq *commands) { - /* Call helper to do the real work for both simulataneous and + /* Call helper to do the real work for both simultaneous and * duplicate INIT chunk handling. */ return sctp_sf_do_unexpected_init(net, ep, asoc, type, arg, commands); @@ -2221,11 +2221,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( break; } - /* Delete the tempory new association. */ + /* Delete the temporary new association. */ sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); - /* Restore association pointer to provide SCTP command interpeter + /* Restore association pointer to provide SCTP command interpreter * with a valid context in case it needs to manipulate * the queues */ sctp_add_cmd_sf(commands, SCTP_CMD_SET_ASOC, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b9b3d899a611..b7b90135c36a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9324,7 +9324,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) net_enable_timestamp(); - /* Set newsk security attributes from orginal sk and connection + /* Set newsk security attributes from original sk and connection * security attribute from ep. */ security_sctp_sk_clone(ep, sk, newsk); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index e8e448771f85..6d6fd1397c87 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -410,7 +410,6 @@ static inline void smc_set_pci_values(struct pci_dev *pci_dev, struct smc_sock; struct smc_clc_msg_accept_confirm; -struct smc_clc_msg_local; void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); diff --git a/net/socket.c b/net/socket.c index 84a8049c2b09..27e3e7d53f8e 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3568,7 +3568,7 @@ EXPORT_SYMBOL(kernel_accept); * @addrlen: address length * @flags: flags (O_NONBLOCK, ...) * - * For datagram sockets, @addr is the addres to which datagrams are sent + * For datagram sockets, @addr is the address to which datagrams are sent * by default, and the only address from which datagrams are received. * For stream sockets, attempts to connect to @addr. * Returns 0 or an error code. diff --git a/net/sysctl_net.c b/net/sysctl_net.c index d14dab8b6774..f6cb0d4d114c 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -115,9 +115,57 @@ out1: goto out; } +/* Verify that sysctls for non-init netns are safe by either: + * 1) being read-only, or + * 2) having a data pointer which points outside of the global kernel/module + * data segment, and rather into the heap where a per-net object was + * allocated. + */ +static void ensure_safe_net_sysctl(struct net *net, const char *path, + struct ctl_table *table) +{ + struct ctl_table *ent; + + pr_debug("Registering net sysctl (net %p): %s\n", net, path); + for (ent = table; ent->procname; ent++) { + unsigned long addr; + const char *where; + + pr_debug(" procname=%s mode=%o proc_handler=%ps data=%p\n", + ent->procname, ent->mode, ent->proc_handler, ent->data); + + /* If it's not writable inside the netns, then it can't hurt. */ + if ((ent->mode & 0222) == 0) { + pr_debug(" Not writable by anyone\n"); + continue; + } + + /* Where does data point? */ + addr = (unsigned long)ent->data; + if (is_module_address(addr)) + where = "module"; + else if (core_kernel_data(addr)) + where = "kernel"; + else + continue; + + /* If it is writable and points to kernel/module global + * data, then it's probably a netns leak. + */ + WARN(1, "sysctl %s/%s: data points to %s global data: %ps\n", + path, ent->procname, where, ent->data); + + /* Make it "safe" by dropping writable perms */ + ent->mode &= ~0222; + } +} + struct ctl_table_header *register_net_sysctl(struct net *net, const char *path, struct ctl_table *table) { + if (!net_eq(net, &init_net)) + ensure_safe_net_sysctl(net, path, table); + return __register_sysctl_table(&net->sysctls, path, table); } EXPORT_SYMBOL_GPL(register_net_sysctl); diff --git a/net/tipc/addr.c b/net/tipc/addr.c index abe29d1aa23a..fd0796269eed 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 1a11831bef62..0772cfadaa0d 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -3,7 +3,7 @@ * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems - * Copyright (c) 2020, Red Hat Inc + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -44,6 +44,50 @@ #include <net/netns/generic.h> #include "core.h" +/* Struct tipc_uaddr: internal version of struct sockaddr_tipc. + * Must be kept aligned both regarding field positions and size. + */ +struct tipc_uaddr { + unsigned short family; + unsigned char addrtype; + signed char scope; + union { + struct { + struct tipc_service_addr sa; + u32 lookup_node; + }; + struct tipc_service_range sr; + struct tipc_socket_addr sk; + }; +}; + +static inline void tipc_uaddr(struct tipc_uaddr *ua, u32 atype, u32 scope, + u32 type, u32 lower, u32 upper) +{ + ua->family = AF_TIPC; + ua->addrtype = atype; + ua->scope = scope; + ua->sr.type = type; + ua->sr.lower = lower; + ua->sr.upper = upper; +} + +static inline bool tipc_uaddr_valid(struct tipc_uaddr *ua, int len) +{ + u32 atype; + + if (len < sizeof(struct sockaddr_tipc)) + return false; + atype = ua->addrtype; + if (ua->family != AF_TIPC) + return false; + if (atype == TIPC_SERVICE_ADDR || atype == TIPC_SOCKET_ADDR) + return true; + if (atype == TIPC_SERVICE_RANGE) + return ua->sr.upper >= ua->sr.lower; + return false; +} + static inline u32 tipc_own_addr(struct net *net) { return tipc_net(net)->node_addr; diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index a4389ef08a98..443f8e5b9477 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -240,10 +240,12 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * @disc_domain: bearer domain * @prio: bearer priority * @attr: nlattr array + * @extack: netlink extended ack */ static int tipc_enable_bearer(struct net *net, const char *name, u32 disc_domain, u32 prio, - struct nlattr *attr[]) + struct nlattr *attr[], + struct netlink_ext_ack *extack) { struct tipc_net *tn = tipc_net(net); struct tipc_bearer_names b_names; @@ -254,20 +256,24 @@ static int tipc_enable_bearer(struct net *net, const char *name, int bearer_id = 0; int res = -EINVAL; char *errstr = ""; + u32 i; if (!bearer_name_validate(name, &b_names)) { errstr = "illegal name"; + NL_SET_ERR_MSG(extack, "Illegal name"); goto rejected; } if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) { errstr = "illegal priority"; + NL_SET_ERR_MSG(extack, "Illegal priority"); goto rejected; } m = tipc_media_find(b_names.media_name); if (!m) { errstr = "media not registered"; + NL_SET_ERR_MSG(extack, "Media not registered"); goto rejected; } @@ -275,33 +281,43 @@ static int tipc_enable_bearer(struct net *net, const char *name, prio = m->priority; /* Check new bearer vs existing ones and find free bearer id if any */ - while (bearer_id < MAX_BEARERS) { - b = rtnl_dereference(tn->bearer_list[bearer_id]); - if (!b) - break; + bearer_id = MAX_BEARERS; + i = MAX_BEARERS; + while (i-- != 0) { + b = rtnl_dereference(tn->bearer_list[i]); + if (!b) { + bearer_id = i; + continue; + } if (!strcmp(name, b->name)) { errstr = "already enabled"; + NL_SET_ERR_MSG(extack, "Already enabled"); goto rejected; } - bearer_id++; - if (b->priority != prio) - continue; - if (++with_this_prio <= 2) - continue; - pr_warn("Bearer <%s>: already 2 bearers with priority %u\n", - name, prio); - if (prio == TIPC_MIN_LINK_PRI) { - errstr = "cannot adjust to lower"; - goto rejected; + + if (b->priority == prio && + (++with_this_prio > 2)) { + pr_warn("Bearer <%s>: already 2 bearers with priority %u\n", + name, prio); + + if (prio == TIPC_MIN_LINK_PRI) { + errstr = "cannot adjust to lower"; + NL_SET_ERR_MSG(extack, "Cannot adjust to lower"); + goto rejected; + } + + pr_warn("Bearer <%s>: trying with adjusted priority\n", + name); + prio--; + bearer_id = MAX_BEARERS; + i = MAX_BEARERS; + with_this_prio = 1; } - pr_warn("Bearer <%s>: trying with adjusted priority\n", name); - prio--; - bearer_id = 0; - with_this_prio = 1; } if (bearer_id >= MAX_BEARERS) { errstr = "max 3 bearers permitted"; + NL_SET_ERR_MSG(extack, "Max 3 bearers permitted"); goto rejected; } @@ -315,6 +331,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, if (res) { kfree(b); errstr = "failed to enable media"; + NL_SET_ERR_MSG(extack, "Failed to enable media"); goto rejected; } @@ -331,6 +348,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, if (res) { bearer_disable(net, b); errstr = "failed to create discoverer"; + NL_SET_ERR_MSG(extack, "Failed to create discoverer"); goto rejected; } @@ -909,6 +927,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) bearer = tipc_bearer_find(net, name); if (!bearer) { err = -EINVAL; + NL_SET_ERR_MSG(info->extack, "Bearer not found"); goto err_out; } @@ -948,8 +967,10 @@ int __tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); bearer = tipc_bearer_find(net, name); - if (!bearer) + if (!bearer) { + NL_SET_ERR_MSG(info->extack, "Bearer not found"); return -EINVAL; + } bearer_disable(net, bearer); @@ -1007,7 +1028,8 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); } - return tipc_enable_bearer(net, bearer, domain, prio, attrs); + return tipc_enable_bearer(net, bearer, domain, prio, attrs, + info->extack); } int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) @@ -1046,6 +1068,7 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) b = tipc_bearer_find(net, name); if (!b) { rtnl_unlock(); + NL_SET_ERR_MSG(info->extack, "Bearer not found"); return -EINVAL; } @@ -1086,8 +1109,10 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); b = tipc_bearer_find(net, name); - if (!b) + if (!b) { + NL_SET_ERR_MSG(info->extack, "Bearer not found"); return -EINVAL; + } if (attrs[TIPC_NLA_BEARER_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; @@ -1106,12 +1131,18 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_WIN]) b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { - if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) + if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { + NL_SET_ERR_MSG(info->extack, + "MTU property is unsupported"); return -EINVAL; + } #ifdef CONFIG_TIPC_MEDIA_UDP if (tipc_udp_mtu_bad(nla_get_u32 - (props[TIPC_NLA_PROP_MTU]))) + (props[TIPC_NLA_PROP_MTU]))) { + NL_SET_ERR_MSG(info->extack, + "MTU value is out-of-range"); return -EINVAL; + } b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU); #endif @@ -1239,6 +1270,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); media = tipc_media_find(name); if (!media) { + NL_SET_ERR_MSG(info->extack, "Media not found"); err = -EINVAL; goto err_out; } @@ -1275,9 +1307,10 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); m = tipc_media_find(name); - if (!m) + if (!m) { + NL_SET_ERR_MSG(info->extack, "Media not found"); return -EINVAL; - + } if (attrs[TIPC_NLA_MEDIA_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; @@ -1293,12 +1326,18 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_WIN]) m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { - if (m->type_id != TIPC_MEDIA_TYPE_UDP) + if (m->type_id != TIPC_MEDIA_TYPE_UDP) { + NL_SET_ERR_MSG(info->extack, + "MTU property is unsupported"); return -EINVAL; + } #ifdef CONFIG_TIPC_MEDIA_UDP if (tipc_udp_mtu_bad(nla_get_u32 - (props[TIPC_NLA_PROP_MTU]))) + (props[TIPC_NLA_PROP_MTU]))) { + NL_SET_ERR_MSG(info->extack, + "MTU value is out-of-range"); return -EINVAL; + } m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); #endif } diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 97710ce36047..e5c43d4d5a75 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -317,7 +317,7 @@ static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ do { \ - typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ + struct tipc_aead *__tmp = rcu_dereference_protected((rcu_ptr), \ lockdep_is_held(lock)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ tipc_aead_put(__tmp); \ @@ -798,7 +798,7 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) - salt ^= ehdr->addr; /* __be32 */ + salt ^= __be32_to_cpu(ehdr->addr); else if (__dnode) salt ^= tipc_node_get_addr(__dnode); memcpy(iv, &salt, 4); @@ -929,7 +929,7 @@ static int tipc_aead_decrypt(struct net *net, struct tipc_aead *aead, ehdr = (struct tipc_ehdr *)skb->data; salt = aead->salt; if (aead->mode == CLUSTER_KEY) - salt ^= ehdr->addr; /* __be32 */ + salt ^= __be32_to_cpu(ehdr->addr); else if (ehdr->destined) salt ^= tipc_own_addr(net); memcpy(iv, &salt, 4); @@ -1492,6 +1492,8 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { + if (c->wq) + destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } @@ -1951,12 +1953,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, } if (unlikely(err)) { - tipc_aead_users_dec(aead, INT_MIN); + tipc_aead_users_dec((struct tipc_aead __force __rcu *)aead, INT_MIN); goto free_skb; } /* Set the RX key's user */ - tipc_aead_users_set(aead, 1); + tipc_aead_users_set((struct tipc_aead __force __rcu *)aead, 1); /* Mark this point, RX works */ rx->timer1 = jiffies; diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 48fac3b17e40..407619697292 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -104,6 +104,36 @@ static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); +static inline u16 mon_cpu_to_le16(u16 val) +{ + return (__force __u16)htons(val); +} + +static inline u32 mon_cpu_to_le32(u32 val) +{ + return (__force __u32)htonl(val); +} + +static inline u64 mon_cpu_to_le64(u64 val) +{ + return (__force __u64)cpu_to_be64(val); +} + +static inline u16 mon_le16_to_cpu(u16 val) +{ + return ntohs((__force __be16)val); +} + +static inline u32 mon_le32_to_cpu(u32 val) +{ + return ntohl((__force __be32)val); +} + +static inline u64 mon_le64_to_cpu(u64 val) +{ + return be64_to_cpu((__force __be64)val); +} + /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) @@ -260,16 +290,16 @@ static void mon_update_local_domain(struct tipc_monitor *mon) diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); - cache->members[i] = htonl(peer->addr); + cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; - cache->len = htons(dom->len); - cache->gen = htons(dom->gen); - cache->member_cnt = htons(member_cnt); - cache->up_map = cpu_to_be64(dom->up_map); + cache->len = mon_cpu_to_le16(dom->len); + cache->gen = mon_cpu_to_le16(dom->gen); + cache->member_cnt = mon_cpu_to_le16(member_cnt); + cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } @@ -455,10 +485,11 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; - u16 new_member_cnt = ntohs(arrv_dom->member_cnt); + u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); - u16 new_gen = ntohs(arrv_dom->gen); - u16 acked_gen = ntohs(arrv_dom->ack_gen); + u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); + u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); + u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; @@ -469,7 +500,7 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; - if ((dlen < new_dlen) || ntohs(arrv_dom->len) != new_dlen) + if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ @@ -517,9 +548,9 @@ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, dom->len = new_dlen; dom->gen = new_gen; dom->member_cnt = new_member_cnt; - dom->up_map = be64_to_cpu(arrv_dom->up_map); + dom->up_map = mon_le64_to_cpu(arrv_dom->up_map); for (i = 0; i < new_member_cnt; i++) - dom->members[i] = ntohl(arrv_dom->members[i]); + dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]); /* Update peers affected by this domain record */ applied_bef = peer->applied; @@ -548,19 +579,19 @@ void tipc_mon_prep(struct net *net, void *data, int *dlen, if (likely(state->acked_gen == gen)) { len = dom_rec_len(dom, 0); *dlen = len; - dom->len = htons(len); - dom->gen = htons(gen); - dom->ack_gen = htons(state->peer_gen); + dom->len = mon_cpu_to_le16(len); + dom->gen = mon_cpu_to_le16(gen); + dom->ack_gen = mon_cpu_to_le16(state->peer_gen); dom->member_cnt = 0; return; } /* Send the full record */ read_lock_bh(&mon->lock); - len = ntohs(mon->cache.len); + len = mon_le16_to_cpu(mon->cache.len); *dlen = len; memcpy(data, &mon->cache, len); read_unlock_bh(&mon->lock); - dom->ack_gen = htons(state->peer_gen); + dom->ack_gen = mon_cpu_to_le16(state->peer_gen); } void tipc_mon_get_state(struct net *net, u32 addr, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index e9263280a2d4..3f0a25345a7c 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -707,8 +707,11 @@ bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy) bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { struct tipc_msg *msg = buf_msg(skb); - u32 dport, dnode; - u32 onode = tipc_own_addr(net); + u32 scope = msg_lookup_scope(msg); + u32 self = tipc_own_addr(net); + u32 inst = msg_nameinst(msg); + struct tipc_socket_addr sk; + struct tipc_uaddr ua; if (!msg_isdata(msg)) return false; @@ -722,16 +725,16 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; - dnode = tipc_scope2node(net, msg_lookup_scope(msg)); - dport = tipc_nametbl_translate(net, msg_nametype(msg), - msg_nameinst(msg), &dnode); - if (!dport) + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, scope, + msg_nametype(msg), inst, inst); + sk.node = tipc_scope2node(net, scope); + if (!tipc_nametbl_lookup_anycast(net, &ua, &sk)) return false; msg_incr_reroute_cnt(msg); - if (dnode != onode) - msg_set_prevnode(msg, onode); - msg_set_destnode(msg, dnode); - msg_set_destport(msg, dport); + if (sk.node != self) + msg_set_prevnode(msg, self); + msg_set_destnode(msg, sk.node); + msg_set_destport(msg, sk.ref); *err = TIPC_OK; return true; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 6cf57c3bfa27..bda902caa814 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -1,8 +1,9 @@ /* * net/tipc/name_distr.c: TIPC name distribution code * - * Copyright (c) 2000-2006, 2014, Ericsson AB + * Copyright (c) 2000-2006, 2014-2019, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -55,10 +56,10 @@ struct distr_queue_item { */ static void publ_to_item(struct distr_item *i, struct publication *p) { - i->type = htonl(p->type); - i->lower = htonl(p->lower); - i->upper = htonl(p->upper); - i->port = htonl(p->port); + i->type = htonl(p->sr.type); + i->lower = htonl(p->sr.lower); + i->upper = htonl(p->sr.upper); + i->port = htonl(p->sk.ref); i->key = htonl(p->key); } @@ -90,20 +91,20 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, /** * tipc_named_publish - tell other nodes about a new publication by this node * @net: the associated network namespace - * @publ: the new publication + * @p: the new publication */ -struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) +struct sk_buff *tipc_named_publish(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; - if (publ->scope == TIPC_NODE_SCOPE) { - list_add_tail_rcu(&publ->binding_node, &nt->node_scope); + if (p->scope == TIPC_NODE_SCOPE) { + list_add_tail_rcu(&p->binding_node, &nt->node_scope); return NULL; } write_lock_bh(&nt->cluster_scope_lock); - list_add_tail(&publ->binding_node, &nt->cluster_scope); + list_add_tail(&p->binding_node, &nt->cluster_scope); write_unlock_bh(&nt->cluster_scope_lock); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { @@ -113,25 +114,25 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); - publ_to_item(item, publ); + publ_to_item(item, p); return skb; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node * @net: the associated network namespace - * @publ: the withdrawn publication + * @p: the withdrawn publication */ -struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) +struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); - list_del(&publ->binding_node); + list_del(&p->binding_node); write_unlock_bh(&nt->cluster_scope_lock); - if (publ->scope == TIPC_NODE_SCOPE) + if (p->scope == TIPC_NODE_SCOPE) return NULL; skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); @@ -142,7 +143,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); - publ_to_item(item, publ); + publ_to_item(item, p); return skb; } @@ -233,33 +234,27 @@ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) /** * tipc_publ_purge - remove publication associated with a failed node * @net: the associated network namespace - * @publ: the publication to remove + * @p: the publication to remove * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. */ -static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) +static void tipc_publ_purge(struct net *net, struct publication *p, u32 addr) { struct tipc_net *tn = tipc_net(net); - struct publication *p; + struct publication *_p; + struct tipc_uaddr ua; + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, p->scope, p->sr.type, + p->sr.lower, p->sr.upper); spin_lock_bh(&tn->nametbl_lock); - p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper, - publ->node, publ->key); - if (p) - tipc_node_unsubscribe(net, &p->binding_node, addr); + _p = tipc_nametbl_remove_publ(net, &ua, &p->sk, p->key); + if (_p) + tipc_node_unsubscribe(net, &_p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); - - if (p != publ) { - pr_err("Unable to remove publication from failed node\n" - " (type=%u, lower=%u, node=0x%x, port=%u, key=%u)\n", - publ->type, publ->lower, publ->node, publ->port, - publ->key); - } - - if (p) - kfree_rcu(p, rcu); + if (_p) + kfree_rcu(_p, rcu); } void tipc_publ_notify(struct net *net, struct list_head *nsub_list, @@ -293,30 +288,30 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { struct publication *p = NULL; - u32 lower = ntohl(i->lower); - u32 upper = ntohl(i->upper); - u32 type = ntohl(i->type); - u32 port = ntohl(i->port); + struct tipc_socket_addr sk; + struct tipc_uaddr ua; u32 key = ntohl(i->key); + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, + ntohl(i->type), ntohl(i->lower), ntohl(i->upper)); + sk.ref = ntohl(i->port); + sk.node = node; + if (dtype == PUBLICATION) { - p = tipc_nametbl_insert_publ(net, type, lower, upper, - TIPC_CLUSTER_SCOPE, node, - port, key); + p = tipc_nametbl_insert_publ(net, &ua, &sk, key); if (p) { tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { - p = tipc_nametbl_remove_publ(net, type, lower, - upper, node, key); + p = tipc_nametbl_remove_publ(net, &ua, &sk, key); if (p) { tipc_node_unsubscribe(net, &p->binding_node, node); kfree_rcu(p, rcu); return true; } - pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n", - type, lower, node); + pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n", + ua.sr.type, ua.sr.lower, node); } else { pr_warn("Unrecognized name table message received\n"); } @@ -410,15 +405,15 @@ void tipc_named_reinit(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); - struct publication *publ; + struct publication *p; u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); - list_for_each_entry_rcu(publ, &nt->node_scope, binding_node) - publ->node = self; - list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) - publ->node = self; + list_for_each_entry_rcu(p, &nt->node_scope, binding_node) + p->sk.node = self; + list_for_each_entry_rcu(p, &nt->cluster_scope, binding_node) + p->sk.node = self; nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index ee5ac40ea2b6..b4017945d3e5 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -3,7 +3,7 @@ * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems - * Copyright (c) 2020, Red Hat Inc + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -222,59 +222,57 @@ static int hash(int x) /** * tipc_publ_create - create a publication structure - * @type: name sequence type - * @lower: name sequence lower bound - * @upper: name sequence upper bound - * @scope: publication scope - * @node: network address of publishing socket - * @port: publishing port + * @ua: the service range the user is binding to + * @sk: the address of the socket that is bound * @key: publication key */ -static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, - u32 scope, u32 node, u32 port, +static struct publication *tipc_publ_create(struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, u32 key) { - struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC); + struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); - if (!publ) + if (!p) return NULL; - publ->type = type; - publ->lower = lower; - publ->upper = upper; - publ->scope = scope; - publ->node = node; - publ->port = port; - publ->key = key; - INIT_LIST_HEAD(&publ->binding_sock); - INIT_LIST_HEAD(&publ->binding_node); - INIT_LIST_HEAD(&publ->local_publ); - INIT_LIST_HEAD(&publ->all_publ); - INIT_LIST_HEAD(&publ->list); - return publ; + p->sr = ua->sr; + p->sk = *sk; + p->scope = ua->scope; + p->key = key; + INIT_LIST_HEAD(&p->binding_sock); + INIT_LIST_HEAD(&p->binding_node); + INIT_LIST_HEAD(&p->local_publ); + INIT_LIST_HEAD(&p->all_publ); + INIT_LIST_HEAD(&p->list); + return p; } /** * tipc_service_create - create a service structure for the specified 'type' - * @type: service type - * @hd: name_table services list + * @net: network namespace + * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. */ -static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) +static struct tipc_service *tipc_service_create(struct net *net, + struct tipc_uaddr *ua) { - struct tipc_service *service = kzalloc(sizeof(*service), GFP_ATOMIC); + struct name_table *nt = tipc_name_table(net); + struct tipc_service *service; + struct hlist_head *hd; + service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); - service->type = type; + service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); + hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } @@ -282,13 +280,13 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, - u32 lower, u32 upper) + struct tipc_uaddr *ua) { struct service_range *sr; - service_range_foreach_match(sr, sc, lower, upper) { + service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ - if (sr->lower == lower && sr->upper == upper) + if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } @@ -296,10 +294,12 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc, } static struct service_range *tipc_service_create_range(struct tipc_service *sc, - u32 lower, u32 upper) + struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; + u32 lower = p->sr.lower; + u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { @@ -327,64 +327,68 @@ static struct service_range *tipc_service_create_range(struct tipc_service *sc, return sr; } -static struct publication *tipc_service_insert_publ(struct net *net, - struct tipc_service *sc, - u32 type, u32 lower, - u32 upper, u32 scope, - u32 node, u32 port, - u32 key) +static bool tipc_service_insert_publ(struct net *net, + struct tipc_service *sc, + struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; - struct publication *p; + struct publication *_p; + u32 node = p->sk.node; bool first = false; + bool res = false; + u32 key = p->key; - sr = tipc_service_create_range(sc, lower, upper); + spin_lock_bh(&sc->lock); + sr = tipc_service_create_range(sc, p); if (!sr) - goto err; + goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ - list_for_each_entry(p, &sr->all_publ, all_publ) { - if (p->key == key && (!p->node || p->node == node)) - return NULL; + list_for_each_entry(_p, &sr->all_publ, all_publ) { + if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { + pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", + p->sr.type, p->sr.lower, p->sr.upper, + node, p->sk.ref, key); + goto exit; + } } - /* Create and insert publication */ - p = tipc_publ_create(type, lower, upper, scope, node, port, key); - if (!p) - goto err; - /* Suppose there shouldn't be a huge gap btw publs i.e. >INT_MAX */ - p->id = sc->publ_cnt++; - if (in_own_node(net, node)) + if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); + p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { - tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_PUBLISHED, - p->port, p->node, p->scope, first); + tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } - return p; -err: - pr_warn("Failed to bind to %u,%u,%u, no memory\n", type, lower, upper); - return NULL; + res = true; +exit: + if (!res) + pr_warn("Failed to bind to %u,%u,%u\n", + p->sr.type, p->sr.lower, p->sr.upper); + spin_unlock_bh(&sc->lock); + return res; } /** * tipc_service_remove_publ - remove a publication from a service - * @sr: service_range to remove publication from - * @node: target node + * @r: service_range to remove publication from + * @sk: address publishing socket * @key: target publication key */ -static struct publication *tipc_service_remove_publ(struct service_range *sr, - u32 node, u32 key) +static struct publication *tipc_service_remove_publ(struct service_range *r, + struct tipc_socket_addr *sk, + u32 key) { struct publication *p; + u32 node = sk->node; - list_for_each_entry(p, &sr->all_publ, all_publ) { - if (p->key != key || (node && node != p->node)) + list_for_each_entry(p, &r->all_publ, all_publ) { + if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); @@ -417,17 +421,14 @@ static int tipc_publ_sort(void *priv, struct list_head *a, static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { - struct tipc_subscr *sb = &sub->evt.s; struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; - struct tipc_service_range r; - u32 filter; + u32 filter, lower, upper; - r.type = tipc_sub_read(sb, seq.type); - r.lower = tipc_sub_read(sb, seq.lower); - r.upper = tipc_sub_read(sb, seq.upper); - filter = tipc_sub_read(sb, filter); + filter = sub->s.filter; + lower = sub->s.seq.lower; + upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); @@ -436,7 +437,7 @@ static void tipc_service_subscribe(struct tipc_service *service, return; INIT_LIST_HEAD(&publ_list); - service_range_foreach_match(sr, service, r.lower, r.upper) { + service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) @@ -452,80 +453,74 @@ static void tipc_service_subscribe(struct tipc_service *service, /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { - tipc_sub_report_overlap(sub, p->lower, p->upper, - TIPC_PUBLISHED, p->port, p->node, - p->scope, true); + tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } -static struct tipc_service *tipc_service_find(struct net *net, u32 type) +static struct tipc_service *tipc_service_find(struct net *net, + struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; - service_head = &nt->services[hash(type)]; + service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { - if (service->type == type) + if (service->type == ua->sr.type) return service; } return NULL; }; -struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, - u32 lower, u32 upper, - u32 scope, u32 node, - u32 port, u32 key) +struct publication *tipc_nametbl_insert_publ(struct net *net, + struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, + u32 key) { - struct name_table *nt = tipc_name_table(net); struct tipc_service *sc; struct publication *p; - if (scope > TIPC_NODE_SCOPE || lower > upper) { - pr_debug("Failed to bind illegal {%u,%u,%u} with scope %u\n", - type, lower, upper, scope); - return NULL; - } - sc = tipc_service_find(net, type); - if (!sc) - sc = tipc_service_create(type, &nt->services[hash(type)]); - if (!sc) + p = tipc_publ_create(ua, sk, key); + if (!p) return NULL; - spin_lock_bh(&sc->lock); - p = tipc_service_insert_publ(net, sc, type, lower, upper, - scope, node, port, key); - spin_unlock_bh(&sc->lock); - return p; + sc = tipc_service_find(net, ua); + if (!sc) + sc = tipc_service_create(net, ua); + if (sc && tipc_service_insert_publ(net, sc, p)) + return p; + kfree(p); + return NULL; } -struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 upper, - u32 node, u32 key) +struct publication *tipc_nametbl_remove_publ(struct net *net, + struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, + u32 key) { - struct tipc_service *sc = tipc_service_find(net, type); struct tipc_subscription *sub, *tmp; - struct service_range *sr = NULL; struct publication *p = NULL; + struct service_range *sr; + struct tipc_service *sc; bool last; + sc = tipc_service_find(net, ua); if (!sc) - return NULL; + goto exit; spin_lock_bh(&sc->lock); - sr = tipc_service_find_range(sc, lower, upper); + sr = tipc_service_find_range(sc, ua); if (!sr) - goto exit; - p = tipc_service_remove_publ(sr, node, key); + goto unlock; + p = tipc_service_remove_publ(sr, sk, key); if (!p) - goto exit; + goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { - tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN, - p->port, node, p->scope, last); + tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ @@ -534,77 +529,85 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, kfree(sr); } - /* Delete service item if this no more publications and subscriptions */ + /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } -exit: +unlock: spin_unlock_bh(&sc->lock); +exit: + if (!p) { + pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", + ua->sr.type, ua->sr.lower, ua->sr.upper, + sk->node, sk->ref, key); + } return p; } /** - * tipc_nametbl_translate - perform service instance to socket translation + * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace - * @type: message type - * @instance: message instance - * @dnode: the search domain used during translation + * @ua: service address to look up + * @sk: address to socket we want to find + * + * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be + * performed, which may not be this one. * * On exit: - * - if translation is deferred to another node, leave 'dnode' unchanged and - * return 0 - * - if translation is attempted and succeeds, set 'dnode' to the publishing - * node and return the published (non-zero) port number - * - if translation is attempted and fails, set 'dnode' to 0 and return 0 + * + * - If lookup is deferred to another node, leave 'sk->node' unchanged and + * return 'true'. + * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which + * represent the bound socket and return 'true'. + * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the - * 'closest-first' lookup algorithm must be maintained, i.e., if dnode is 0 + * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ -u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) +bool tipc_nametbl_lookup_anycast(struct net *net, + struct tipc_uaddr *ua, + struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); - struct service_range *sr; + u32 inst = ua->sa.instance; + struct service_range *r; struct tipc_service *sc; - struct list_head *list; struct publication *p; - u32 port = 0; - u32 node = 0; + struct list_head *l; + bool res = false; - if (!tipc_in_scope(legacy, *dnode, self)) - return 0; + if (!tipc_in_scope(legacy, sk->node, self)) + return true; rcu_read_lock(); - sc = tipc_service_find(net, type); + sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); - service_range_foreach_match(sr, sc, instance, instance) { + service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ - if (*dnode == self) { - list = &sr->local_publ; - if (list_empty(list)) + if (sk->node == self) { + l = &r->local_publ; + if (list_empty(l)) continue; - p = list_first_entry(list, struct publication, - local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { - list = &sr->local_publ; - p = list_first_entry(list, struct publication, - local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); + p = list_first_entry(l, struct publication, local_publ); + list_move_tail(&p->local_publ, &r->local_publ); + } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { + l = &r->local_publ; + p = list_first_entry(l, struct publication, local_publ); + list_move_tail(&p->local_publ, &r->local_publ); } else { - list = &sr->all_publ; - p = list_first_entry(list, struct publication, - all_publ); - list_move_tail(&p->all_publ, &sr->all_publ); + l = &r->all_publ; + p = list_first_entry(l, struct publication, all_publ); + list_move_tail(&p->all_publ, &r->all_publ); } - port = p->port; - node = p->node; + *sk = p->sk; + res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ @@ -614,40 +617,45 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) exit: rcu_read_unlock(); - *dnode = node; - return port; + return res; } -bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, - struct list_head *dsts, int *dstcnt, u32 exclude, - bool all) +/* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group + * Returns a list of one (== group anycast) or more (== group multicast) + * destination socket/node pairs matching the given address. + * The requester may or may not want to exclude himself from the list. + */ +bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, + struct list_head *dsts, int *dstcnt, + u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); + u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); - sc = tipc_service_find(net, type); + sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ - sr = service_range_match_first(sc->ranges.rb_node, instance, instance); + sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { - if (p->scope != scope) + if (p->scope != ua->scope) continue; - if (p->port == exclude && p->node == self) + if (p->sk.ref == exclude && p->sk.node == self) continue; - tipc_dest_push(dsts, p->node, p->port); + tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; - if (all) + if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; @@ -659,23 +667,29 @@ exit: return !list_empty(dsts); } -void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports) +/* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets + * matching the given address + * Used on nodes which have received a multicast/broadcast message + * Returns a list of local sockets + */ +void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, + bool exact, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; + u32 scope = ua->scope; rcu_read_lock(); - sc = tipc_service_find(net, type); + sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); - service_range_foreach_match(sr, sc, lower, upper) { + service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) - tipc_dest_push(dports, 0, p->port); + tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); @@ -683,26 +697,27 @@ exit: rcu_read_unlock(); } -/* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes - * - Creates list of nodes that overlap the given multicast address - * - Determines if any node local destinations overlap +/* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching + * the given address. Used in sending node. + * Used on nodes which are sending out a multicast/broadcast message + * Returns a list of nodes, including own node if applicable */ -void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, struct tipc_nlist *nodes) +void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, + struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); - sc = tipc_service_find(net, type); + sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); - service_range_foreach_match(sr, sc, lower, upper) { + service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_nlist_add(nodes, p->node); + tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); @@ -713,7 +728,7 @@ exit: /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, - u32 type, u32 scope) + struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; @@ -721,7 +736,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct rb_node *n; rcu_read_lock(); - sc = tipc_service_find(net, type); + sc = tipc_service_find(net, ua); if (!sc) goto exit; @@ -729,9 +744,10 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { - if (p->scope != scope) + if (p->scope != ua->scope) continue; - tipc_group_add_member(grp, p->node, p->port, p->lower); + tipc_group_add_member(grp, p->sk.node, p->sk.ref, + p->sr.lower); } } spin_unlock_bh(&sc->lock); @@ -741,9 +757,8 @@ exit: /* tipc_nametbl_publish - add service binding to name table */ -struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port, - u32 key) +struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); @@ -758,8 +773,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, goto exit; } - p = tipc_nametbl_insert_publ(net, type, lower, upper, scope, - tipc_own_addr(net), port, key); + p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); @@ -777,41 +791,33 @@ exit: /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace - * @type: service type - * @lower: service range lower bound - * @upper: service range upper bound + * @ua: service address/range being unbound + * @sk: address of the socket being unbound from * @key: target publication key */ -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, - u32 upper, u32 key) +void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); - u32 self = tipc_own_addr(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); - p = tipc_nametbl_remove_publ(net, type, lower, upper, self, key); + p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); - } else { - pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", - type, lower, upper, key); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); - if (skb) { + if (skb) tipc_node_broadcast(net, skb, rc_dests); - return 1; - } - return 0; } /** @@ -820,25 +826,25 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { - struct name_table *nt = tipc_name_table(sub->net); struct tipc_net *tn = tipc_net(sub->net); - struct tipc_subscr *s = &sub->evt.s; - u32 type = tipc_sub_read(s, seq.type); + u32 type = sub->s.seq.type; struct tipc_service *sc; + struct tipc_uaddr ua; bool res = true; + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, + sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); - sc = tipc_service_find(sub->net, type); + sc = tipc_service_find(sub->net, &ua); if (!sc) - sc = tipc_service_create(type, &nt->services[hash(type)]); + sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { - pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, - tipc_sub_read(s, seq.lower), - tipc_sub_read(s, seq.upper)); + pr_warn("Failed to subscribe for {%u,%u,%u}\n", + type, sub->s.seq.lower, sub->s.seq.upper); res = false; } spin_unlock_bh(&tn->nametbl_lock); @@ -852,12 +858,13 @@ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); - struct tipc_subscr *s = &sub->evt.s; - u32 type = tipc_sub_read(s, seq.type); struct tipc_service *sc; + struct tipc_uaddr ua; + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, + sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); - sc = tipc_service_find(sub->net, type); + sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; @@ -909,7 +916,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { - tipc_service_remove_publ(sr, p->node, p->key); + tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); @@ -993,9 +1000,9 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->port)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; @@ -1046,6 +1053,7 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; + struct tipc_uaddr ua; int err; int i; @@ -1059,7 +1067,9 @@ static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { - service = tipc_service_find(net, *last_type); + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, + *last_type, *last_lower, *last_lower); + service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 5a82a01369d6..c7c9a3ddd420 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -3,6 +3,7 @@ * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -41,6 +42,7 @@ struct tipc_subscription; struct tipc_plist; struct tipc_nlist; struct tipc_group; +struct tipc_uaddr; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -50,13 +52,10 @@ struct tipc_group; #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ /** - * struct publication - info about a published (name or) name sequence - * @type: name sequence type - * @lower: name sequence lower bound - * @upper: name sequence upper bound + * struct publication - info about a published service address or range + * @sr: service range represented by this publication + * @sk: address of socket bound to this publication * @scope: scope of publication, TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE - * @node: network address of publishing socket's node - * @port: publishing port * @key: publication key, unique across the cluster * @id: publication id * @binding_node: all publications from the same node which bound this one @@ -74,12 +73,9 @@ struct tipc_group; * @rcu: RCU callback head used for deferred freeing */ struct publication { - u32 type; - u32 lower; - u32 upper; - u32 scope; - u32 node; - u32 port; + struct tipc_service_range sr; + struct tipc_socket_addr sk; + u16 scope; u32 key; u32 id; struct list_head binding_node; @@ -114,28 +110,29 @@ struct name_table { }; int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); - -u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports); +bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, + struct tipc_socket_addr *sk); +void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, + bool exact, struct list_head *dports); +void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, + struct tipc_nlist *nodes); +bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, + struct list_head *dsts, int *dstcnt, + u32 exclude, bool mcast); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, - u32 type, u32 domain); -void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, - u32 upper, struct tipc_nlist *nodes); -bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, - struct list_head *dsts, int *dstcnt, u32 exclude, - bool all); -struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port, - u32 key); -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper, - u32 key); -struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, - u32 lower, u32 upper, u32 scope, - u32 node, u32 ref, u32 key); -struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 upper, - u32 node, u32 key); + struct tipc_uaddr *ua); +struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, u32 key); +void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, u32 key); +struct publication *tipc_nametbl_insert_publ(struct net *net, + struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, + u32 key); +struct publication *tipc_nametbl_remove_publ(struct net *net, + struct tipc_uaddr *ua, + struct tipc_socket_addr *sk, + u32 key); bool tipc_nametbl_subscribe(struct tipc_subscription *s); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); diff --git a/net/tipc/net.c b/net/tipc/net.c index faf6bf554514..a130195af188 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -125,6 +125,11 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) static void tipc_net_finalize(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); + struct tipc_socket_addr sk = {0, addr}; + struct tipc_uaddr ua; + + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, + TIPC_NODE_STATE, addr, addr); if (cmpxchg(&tn->node_addr, 0, addr)) return; @@ -132,8 +137,7 @@ static void tipc_net_finalize(struct net *net, u32 addr) tipc_named_reinit(net); tipc_sk_reinit(net); tipc_mon_reinit_self(net); - tipc_nametbl_publish(net, TIPC_NODE_STATE, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); + tipc_nametbl_publish(net, &ua, &sk, addr); } void tipc_net_finalize_work(struct work_struct *work) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5a1ce64039f7..0749df80454d 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -696,7 +696,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, if (err) return err; - link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); + link_info.dest = htonl(nla_get_flag(link[TIPC_NLA_LINK_DEST])); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); diff --git a/net/tipc/node.c b/net/tipc/node.c index e0ee83263a39..8217905348f4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -372,42 +372,49 @@ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) } static void tipc_node_read_lock(struct tipc_node *n) + __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) + __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) + __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) + __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) + __releases(n->lock) { + struct tipc_socket_addr sk; struct net *net = n->net; - u32 addr = 0; u32 flags = n->action_flags; - u32 link_id = 0; - u32 bearer_id; struct list_head *publ_list; + struct tipc_uaddr ua; + u32 bearer_id; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } - addr = n->addr; - link_id = n->link_id; - bearer_id = link_id & 0xffff; + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, + TIPC_LINK_STATE, n->addr, n->addr); + sk.ref = n->link_id; + sk.node = n->addr; + bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | @@ -416,20 +423,18 @@ static void tipc_node_write_unlock(struct tipc_node *n) write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) - tipc_publ_notify(net, publ_list, addr, n->capabilities); + tipc_publ_notify(net, publ_list, n->addr, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) - tipc_named_node_up(net, addr, n->capabilities); + tipc_named_node_up(net, n->addr, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { - tipc_mon_peer_up(net, addr, bearer_id); - tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, - TIPC_NODE_SCOPE, link_id, link_id); + tipc_mon_peer_up(net, n->addr, bearer_id); + tipc_nametbl_publish(net, &ua, &sk, n->link_id); } if (flags & TIPC_NOTIFY_LINK_DOWN) { - tipc_mon_peer_down(net, addr, bearer_id); - tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, - addr, link_id); + tipc_mon_peer_down(net, n->addr, bearer_id); + tipc_nametbl_withdraw(net, &ua, &sk, n->link_id); } } @@ -2009,7 +2014,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, return true; } - /* No synching needed if only one link */ + /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 022999e0202d..58935cd0d068 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3,7 +3,7 @@ * * Copyright (c) 2001-2007, 2012-2019, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems - * Copyright (c) 2020, Red Hat Inc + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -111,7 +111,6 @@ struct tipc_sock { struct sock sk; u32 conn_type; u32 conn_instance; - int published; u32 max_pkt; u32 maxnagle; u32 portid; @@ -141,6 +140,7 @@ struct tipc_sock { bool expect_ack; bool nodelay; bool group_is_open; + bool published; }; static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); @@ -151,10 +151,8 @@ static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, bool kern); static void tipc_sk_timeout(struct timer_list *t); -static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_service_range const *seq); -static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_service_range const *seq); +static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua); +static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua); static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); @@ -644,7 +642,7 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; tipc_sk_leave(tsk); - tipc_sk_withdraw(tsk, 0, NULL); + tipc_sk_withdraw(tsk, NULL); __skb_queue_purge(&tsk->mc_method.deferredq); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); @@ -677,22 +675,31 @@ static int tipc_release(struct socket *sock) */ static int __tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) { - struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr; struct tipc_sock *tsk = tipc_sk(sock->sk); + bool unbind = false; if (unlikely(!alen)) - return tipc_sk_withdraw(tsk, 0, NULL); + return tipc_sk_withdraw(tsk, NULL); - if (addr->addrtype == TIPC_SERVICE_ADDR) - addr->addr.nameseq.upper = addr->addr.nameseq.lower; + if (ua->addrtype == TIPC_SERVICE_ADDR) { + ua->addrtype = TIPC_SERVICE_RANGE; + ua->sr.upper = ua->sr.lower; + } + if (ua->scope < 0) { + unbind = true; + ua->scope = -ua->scope; + } + /* Users may still use deprecated TIPC_ZONE_SCOPE */ + if (ua->scope != TIPC_NODE_SCOPE) + ua->scope = TIPC_CLUSTER_SCOPE; if (tsk->group) return -EACCES; - if (addr->scope >= 0) - return tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq); - else - return tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); + if (unbind) + return tipc_sk_withdraw(tsk, ua); + return tipc_sk_publish(tsk, ua); } int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) @@ -707,18 +714,17 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) { - struct sockaddr_tipc *addr = (struct sockaddr_tipc *)skaddr; + struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr; + u32 atype = ua->addrtype; if (alen) { - if (alen < sizeof(struct sockaddr_tipc)) + if (!tipc_uaddr_valid(ua, alen)) return -EINVAL; - if (addr->family != AF_TIPC) + if (atype == TIPC_SOCKET_ADDR) return -EAFNOSUPPORT; - if (addr->addrtype > TIPC_SERVICE_ADDR) - return -EAFNOSUPPORT; - if (addr->addr.nameseq.type < TIPC_RESERVED_TYPES) { + if (ua->sr.type < TIPC_RESERVED_TYPES) { pr_warn_once("Can't bind to reserved service type %u\n", - addr->addr.nameseq.type); + ua->sr.type); return -EACCES; } } @@ -826,7 +832,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, /** * tipc_sendmcast - send multicast message * @sock: socket structure - * @seq: destination address + * @ua: destination address struct * @msg: message to send * @dlen: length of data to send * @timeout: timeout to wait for wakeup @@ -834,7 +840,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock, * Called from function tipc_sendmsg(), which has done all sanity checks * Return: the number of bytes sent on success, or errno */ -static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, +static int tipc_sendmcast(struct socket *sock, struct tipc_uaddr *ua, struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; @@ -842,7 +848,6 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); int mtu = tipc_bcast_get_mtu(net); - struct tipc_mc_method *method = &tsk->mc_method; struct sk_buff_head pkts; struct tipc_nlist dsts; int rc; @@ -857,8 +862,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, /* Lookup destination nodes */ tipc_nlist_init(&dsts, tipc_own_addr(net)); - tipc_nametbl_lookup_dst_nodes(net, seq->type, seq->lower, - seq->upper, &dsts); + tipc_nametbl_lookup_mcast_nodes(net, ua, &dsts); if (!dsts.local && !dsts.remote) return -EHOSTUNREACH; @@ -868,9 +872,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); msg_set_destport(hdr, 0); msg_set_destnode(hdr, 0); - msg_set_nametype(hdr, seq->type); - msg_set_namelower(hdr, seq->lower); - msg_set_nameupper(hdr, seq->upper); + msg_set_nametype(hdr, ua->sr.type); + msg_set_namelower(hdr, ua->sr.lower); + msg_set_nameupper(hdr, ua->sr.upper); /* Build message as chain of buffers */ __skb_queue_head_init(&pkts); @@ -880,7 +884,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_service_range *seq, if (unlikely(rc == dlen)) { trace_tipc_sk_sendmcast(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_mcast_xmit(net, &pkts, method, &dsts, + rc = tipc_mcast_xmit(net, &pkts, &tsk->mc_method, &dsts, &tsk->cong_link_cnt); } @@ -954,7 +958,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { struct sock *sk = sock->sk; - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name; int blks = tsk_blocks(GROUP_H_SIZE + dlen); struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); @@ -962,8 +966,8 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, u32 node, port; int rc; - node = dest->addr.id.node; - port = dest->addr.id.ref; + node = ua->sk.node; + port = ua->sk.ref; if (!port && !node) return -EHOSTUNREACH; @@ -997,7 +1001,7 @@ static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name; struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct list_head *cong_links = &tsk->cong_links; @@ -1008,16 +1012,13 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, struct net *net = sock_net(sk); u32 node, port, exclude; struct list_head dsts; - u32 type, inst, scope; int lookups = 0; int dstcnt, rc; bool cong; INIT_LIST_HEAD(&dsts); - - type = msg_nametype(hdr); - inst = dest->addr.name.name.instance; - scope = msg_lookup_scope(hdr); + ua->sa.type = msg_nametype(hdr); + ua->scope = msg_lookup_scope(hdr); while (++lookups < 4) { exclude = tipc_group_exclude(tsk->group); @@ -1026,8 +1027,8 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, /* Look for a non-congested destination member, if any */ while (1) { - if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, - &dstcnt, exclude, false)) + if (!tipc_nametbl_lookup_group(net, ua, &dsts, &dstcnt, + exclude, false)) return -EHOSTUNREACH; tipc_dest_pop(&dsts, &node, &port); cong = tipc_group_cong(tsk->group, node, port, blks, @@ -1082,7 +1083,7 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name; struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); @@ -1107,9 +1108,9 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, return -EHOSTUNREACH; /* Complete message header */ - if (dest) { + if (ua) { msg_set_type(hdr, TIPC_GRP_MCAST_MSG); - msg_set_nameinst(hdr, dest->addr.name.name.instance); + msg_set_nameinst(hdr, ua->sa.instance); } else { msg_set_type(hdr, TIPC_GRP_BCAST_MSG); msg_set_nameinst(hdr, 0); @@ -1156,29 +1157,25 @@ static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, int dlen, long timeout) { + struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name; struct sock *sk = sock->sk; - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - u32 type, inst, scope, exclude; struct list_head dsts; - u32 dstcnt; + u32 dstcnt, exclude; INIT_LIST_HEAD(&dsts); - - type = msg_nametype(hdr); - inst = dest->addr.name.name.instance; - scope = msg_lookup_scope(hdr); + ua->sa.type = msg_nametype(hdr); + ua->scope = msg_lookup_scope(hdr); exclude = tipc_group_exclude(grp); - if (!tipc_nametbl_lookup(net, type, inst, scope, &dsts, - &dstcnt, exclude, true)) + if (!tipc_nametbl_lookup_group(net, ua, &dsts, &dstcnt, exclude, true)) return -EHOSTUNREACH; if (dstcnt == 1) { - tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); + tipc_dest_pop(&dsts, &ua->sk.node, &ua->sk.ref); return tipc_send_group_unicast(sock, m, dlen, timeout); } @@ -1198,17 +1195,18 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { u32 self = tipc_own_addr(net); - u32 type, lower, upper, scope; struct sk_buff *skb, *_skb; u32 portid, onode; struct sk_buff_head tmpq; struct list_head dports; struct tipc_msg *hdr; + struct tipc_uaddr ua; int user, mtyp, hlen; bool exact; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); + ua.addrtype = TIPC_SERVICE_RANGE; skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { @@ -1217,7 +1215,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, mtyp = msg_type(hdr); hlen = skb_headroom(skb) + msg_hdr_sz(hdr); onode = msg_orignode(hdr); - type = msg_nametype(hdr); + ua.sr.type = msg_nametype(hdr); if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { spin_lock_bh(&inputq->lock); @@ -1232,24 +1230,23 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /* Group messages require exact scope match */ if (msg_in_group(hdr)) { - lower = 0; - upper = ~0; - scope = msg_lookup_scope(hdr); + ua.sr.lower = 0; + ua.sr.upper = ~0; + ua.scope = msg_lookup_scope(hdr); exact = true; } else { /* TIPC_NODE_SCOPE means "any scope" in this context */ if (onode == self) - scope = TIPC_NODE_SCOPE; + ua.scope = TIPC_NODE_SCOPE; else - scope = TIPC_CLUSTER_SCOPE; + ua.scope = TIPC_CLUSTER_SCOPE; exact = false; - lower = msg_namelower(hdr); - upper = msg_nameupper(hdr); + ua.sr.lower = msg_namelower(hdr); + ua.sr.upper = msg_nameupper(hdr); } /* Create destination port list: */ - tipc_nametbl_mc_lookup(net, type, lower, upper, - scope, exact, &dports); + tipc_nametbl_lookup_mcast_sockets(net, &ua, exact, &dports); /* Clone message per destination */ while (tipc_dest_pop(&dports, NULL, &portid)) { @@ -1417,44 +1414,43 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_uaddr *ua = (struct tipc_uaddr *)m->msg_name; long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_service_range *seq; + struct tipc_socket_addr skaddr; struct sk_buff_head pkts; - u32 dport = 0, dnode = 0; - u32 type = 0, inst = 0; - int mtu, rc; + int atype, mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; - if (likely(dest)) { - if (unlikely(m->msg_namelen < sizeof(*dest))) - return -EINVAL; - if (unlikely(dest->family != AF_TIPC)) + if (ua) { + if (!tipc_uaddr_valid(ua, m->msg_namelen)) return -EINVAL; + atype = ua->addrtype; } + /* If socket belongs to a communication group follow other paths */ if (grp) { - if (!dest) + if (!ua) return tipc_send_group_bcast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_SERVICE_ADDR) + if (atype == TIPC_SERVICE_ADDR) return tipc_send_group_anycast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_SOCKET_ADDR) + if (atype == TIPC_SOCKET_ADDR) return tipc_send_group_unicast(sock, m, dlen, timeout); - if (dest->addrtype == TIPC_ADDR_MCAST) + if (atype == TIPC_SERVICE_RANGE) return tipc_send_group_mcast(sock, m, dlen, timeout); return -EINVAL; } - if (unlikely(!dest)) { - dest = &tsk->peer; - if (!syn && dest->family != AF_TIPC) + if (!ua) { + ua = (struct tipc_uaddr *)&tsk->peer; + if (!syn && ua->family != AF_TIPC) return -EDESTADDRREQ; + atype = ua->addrtype; } if (unlikely(syn)) { @@ -1464,54 +1460,51 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -EISCONN; if (tsk->published) return -EOPNOTSUPP; - if (dest->addrtype == TIPC_SERVICE_ADDR) { - tsk->conn_type = dest->addr.name.name.type; - tsk->conn_instance = dest->addr.name.name.instance; + if (atype == TIPC_SERVICE_ADDR) { + tsk->conn_type = ua->sa.type; + tsk->conn_instance = ua->sa.instance; } msg_set_syn(hdr, 1); } - seq = &dest->addr.nameseq; - if (dest->addrtype == TIPC_ADDR_MCAST) - return tipc_sendmcast(sock, seq, m, dlen, timeout); - - if (dest->addrtype == TIPC_SERVICE_ADDR) { - type = dest->addr.name.name.type; - inst = dest->addr.name.name.instance; - dnode = dest->addr.name.domain; - dport = tipc_nametbl_translate(net, type, inst, &dnode); - if (unlikely(!dport && !dnode)) + /* Determine destination */ + if (atype == TIPC_SERVICE_RANGE) { + return tipc_sendmcast(sock, ua, m, dlen, timeout); + } else if (atype == TIPC_SERVICE_ADDR) { + skaddr.node = ua->lookup_node; + ua->scope = tipc_node2scope(skaddr.node); + if (!tipc_nametbl_lookup_anycast(net, ua, &skaddr)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_SOCKET_ADDR) { - dnode = dest->addr.id.node; + } else if (atype == TIPC_SOCKET_ADDR) { + skaddr = ua->sk; } else { return -EINVAL; } /* Block or return if destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, - !tipc_dest_find(clinks, dnode, 0)); + !tipc_dest_find(clinks, skaddr.node, 0)); if (unlikely(rc)) return rc; - if (dest->addrtype == TIPC_SERVICE_ADDR) { + /* Finally build message header */ + msg_set_destnode(hdr, skaddr.node); + msg_set_destport(hdr, skaddr.ref); + if (atype == TIPC_SERVICE_ADDR) { msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); - msg_set_nametype(hdr, type); - msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dport); + msg_set_nametype(hdr, ua->sa.type); + msg_set_nameinst(hdr, ua->sa.instance); + msg_set_lookup_scope(hdr, ua->scope); } else { /* TIPC_SOCKET_ADDR */ msg_set_type(hdr, TIPC_DIRECT_MSG); msg_set_lookup_scope(hdr, 0); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dest->addr.id.ref); msg_set_hdr_sz(hdr, BASIC_H_SIZE); } + /* Add message body */ __skb_queue_head_init(&pkts); - mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true); + mtu = tipc_node_get_mtu(net, skaddr.node, tsk->portid, true); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); if (unlikely(rc != dlen)) return rc; @@ -1520,10 +1513,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) return -ENOMEM; } + /* Send message */ trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " "); - rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + rc = tipc_node_xmit(net, &pkts, skaddr.node, tsk->portid); if (unlikely(rc == -ELINKCONG)) { - tipc_dest_push(clinks, dnode, 0); + tipc_dest_push(clinks, skaddr.node, 0); tsk->cong_link_cnt++; rc = 0; } @@ -2891,66 +2885,62 @@ static void tipc_sk_timeout(struct timer_list *t) sock_put(sk); } -static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, - struct tipc_service_range const *seq) +static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); - struct publication *publ; + struct tipc_socket_addr skaddr; + struct publication *p; u32 key; - if (scope != TIPC_NODE_SCOPE) - scope = TIPC_CLUSTER_SCOPE; - if (tipc_sk_connected(sk)) return -EINVAL; key = tsk->portid + tsk->pub_count + 1; if (key == tsk->portid) return -EADDRINUSE; - - publ = tipc_nametbl_publish(net, seq->type, seq->lower, seq->upper, - scope, tsk->portid, key); - if (unlikely(!publ)) + skaddr.ref = tsk->portid; + skaddr.node = tipc_own_addr(net); + p = tipc_nametbl_publish(net, ua, &skaddr, key); + if (unlikely(!p)) return -EINVAL; - list_add(&publ->binding_sock, &tsk->publications); + list_add(&p->binding_sock, &tsk->publications); tsk->pub_count++; - tsk->published = 1; + tsk->published = true; return 0; } -static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, - struct tipc_service_range const *seq) +static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua) { struct net *net = sock_net(&tsk->sk); - struct publication *publ; - struct publication *safe; + struct publication *safe, *p; + struct tipc_uaddr _ua; int rc = -EINVAL; - if (scope != TIPC_NODE_SCOPE) - scope = TIPC_CLUSTER_SCOPE; - - list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) { - if (seq) { - if (publ->scope != scope) - continue; - if (publ->type != seq->type) - continue; - if (publ->lower != seq->lower) - continue; - if (publ->upper != seq->upper) - break; - tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->upper, publ->key); - rc = 0; - break; + list_for_each_entry_safe(p, safe, &tsk->publications, binding_sock) { + if (!ua) { + tipc_uaddr(&_ua, TIPC_SERVICE_RANGE, p->scope, + p->sr.type, p->sr.lower, p->sr.upper); + tipc_nametbl_withdraw(net, &_ua, &p->sk, p->key); + continue; } - tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->upper, publ->key); + /* Unbind specific publication */ + if (p->scope != ua->scope) + continue; + if (p->sr.type != ua->sr.type) + continue; + if (p->sr.lower != ua->sr.lower) + continue; + if (p->sr.upper != ua->sr.upper) + break; + tipc_nametbl_withdraw(net, ua, &p->sk, p->key); rc = 0; + break; } - if (list_empty(&tsk->publications)) + if (list_empty(&tsk->publications)) { tsk->published = 0; + rc = 0; + } return rc; } @@ -3067,13 +3057,15 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; - struct tipc_service_range seq; + struct tipc_uaddr ua; int rc; if (mreq->type < TIPC_RESERVED_TYPES) return -EACCES; if (mreq->scope > TIPC_NODE_SCOPE) return -EINVAL; + if (mreq->scope != TIPC_NODE_SCOPE) + mreq->scope = TIPC_CLUSTER_SCOPE; if (grp) return -EACCES; grp = tipc_group_create(net, tsk->portid, mreq, &tsk->group_is_open); @@ -3083,11 +3075,10 @@ static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) msg_set_lookup_scope(hdr, mreq->scope); msg_set_nametype(hdr, mreq->type); msg_set_dest_droppable(hdr, true); - seq.type = mreq->type; - seq.lower = mreq->instance; - seq.upper = seq.lower; - tipc_nametbl_build_group(net, grp, mreq->type, mreq->scope); - rc = tipc_sk_publish(tsk, mreq->scope, &seq); + tipc_uaddr(&ua, TIPC_SERVICE_RANGE, mreq->scope, + mreq->type, mreq->instance, mreq->instance); + tipc_nametbl_build_group(net, grp, &ua); + rc = tipc_sk_publish(tsk, &ua); if (rc) { tipc_group_delete(net, grp); tsk->group = NULL; @@ -3104,15 +3095,17 @@ static int tipc_sk_leave(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct tipc_group *grp = tsk->group; - struct tipc_service_range seq; + struct tipc_uaddr ua; int scope; if (!grp) return -EINVAL; - tipc_group_self(grp, &seq, &scope); + ua.addrtype = TIPC_SERVICE_RANGE; + tipc_group_self(grp, &ua.sr, &scope); + ua.scope = scope; tipc_group_delete(net, grp); tsk->group = NULL; - tipc_sk_withdraw(tsk, scope, &seq); + tipc_sk_withdraw(tsk, &ua); return 0; } @@ -3711,11 +3704,11 @@ static int __tipc_nl_add_sk_publ(struct sk_buff *skb, if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key)) goto attr_msg_cancel; - if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->type)) + if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->sr.type)) goto attr_msg_cancel; - if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->lower)) + if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->sr.lower)) goto attr_msg_cancel; - if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->upper)) + if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->sr.upper)) goto attr_msg_cancel; nla_nest_end(skb, attrs); @@ -3863,9 +3856,9 @@ bool tipc_sk_filtering(struct sock *sk) p = list_first_entry_or_null(&tsk->publications, struct publication, binding_sock); if (p) { - type = p->type; - lower = p->lower; - upper = p->upper; + type = p->sr.type; + lower = p->sr.lower; + upper = p->sr.upper; } } @@ -3964,9 +3957,9 @@ int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf) if (tsk->published) { p = list_first_entry_or_null(&tsk->publications, struct publication, binding_sock); - i += scnprintf(buf + i, sz - i, " %u", (p) ? p->type : 0); - i += scnprintf(buf + i, sz - i, " %u", (p) ? p->lower : 0); - i += scnprintf(buf + i, sz - i, " %u", (p) ? p->upper : 0); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.type : 0); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.lower : 0); + i += scnprintf(buf + i, sz - i, " %u", (p) ? p->sr.upper : 0); } i += scnprintf(buf + i, sz - i, " | %u", tsk->snd_win); i += scnprintf(buf + i, sz - i, " %u", tsk->rcv_win); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f6ad0005218c..8e00d739f03a 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -3,7 +3,7 @@ * * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems - * Copyright (c) 2020, Red Hat Inc + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -40,77 +40,75 @@ #include "subscr.h" static void tipc_sub_send_event(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node) + struct publication *p, + u32 event) { + struct tipc_subscr *s = &sub->evt.s; struct tipc_event *evt = &sub->evt; if (sub->inactive) return; tipc_evt_write(evt, event, event); - tipc_evt_write(evt, found_lower, found_lower); - tipc_evt_write(evt, found_upper, found_upper); - tipc_evt_write(evt, port.ref, port); - tipc_evt_write(evt, port.node, node); + if (p) { + tipc_evt_write(evt, found_lower, p->sr.lower); + tipc_evt_write(evt, found_upper, p->sr.upper); + tipc_evt_write(evt, port.ref, p->sk.ref); + tipc_evt_write(evt, port.node, p->sk.node); + } else { + tipc_evt_write(evt, found_lower, s->seq.lower); + tipc_evt_write(evt, found_upper, s->seq.upper); + tipc_evt_write(evt, port.ref, 0); + tipc_evt_write(evt, port.node, 0); + } tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** * tipc_sub_check_overlap - test for subscription overlap with the given values - * @seq: tipc_name_seq to check - * @found_lower: lower value to test - * @found_upper: upper value to test + * @subscribed: the service range subscribed for + * @found: the service range we are checning for match * - * Return: 1 if there is overlap, otherwise 0. + * Returns true if there is overlap, otherwise false. */ -int tipc_sub_check_overlap(struct tipc_service_range *seq, u32 found_lower, - u32 found_upper) +static bool tipc_sub_check_overlap(struct tipc_service_range *subscribed, + struct tipc_service_range *found) { - if (found_lower < seq->lower) - found_lower = seq->lower; - if (found_upper > seq->upper) - found_upper = seq->upper; - if (found_lower > found_upper) - return 0; - return 1; + u32 found_lower = found->lower; + u32 found_upper = found->upper; + + if (found_lower < subscribed->lower) + found_lower = subscribed->lower; + if (found_upper > subscribed->upper) + found_upper = subscribed->upper; + return found_lower <= found_upper; } void tipc_sub_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node, - u32 scope, int must) + struct publication *p, + u32 event, bool must) { - struct tipc_subscr *s = &sub->evt.s; - u32 filter = tipc_sub_read(s, filter); - struct tipc_service_range seq; + struct tipc_service_range *sr = &sub->s.seq; + u32 filter = sub->s.filter; - seq.type = tipc_sub_read(s, seq.type); - seq.lower = tipc_sub_read(s, seq.lower); - seq.upper = tipc_sub_read(s, seq.upper); - - if (!tipc_sub_check_overlap(&seq, found_lower, found_upper)) + if (!tipc_sub_check_overlap(sr, &p->sr)) return; - if (!must && !(filter & TIPC_SUB_PORTS)) return; - if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) + if (filter & TIPC_SUB_CLUSTER_SCOPE && p->scope == TIPC_NODE_SCOPE) return; - if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) + if (filter & TIPC_SUB_NODE_SCOPE && p->scope != TIPC_NODE_SCOPE) return; spin_lock(&sub->lock); - tipc_sub_send_event(sub, found_lower, found_upper, - event, port, node); + tipc_sub_send_event(sub, p, event); spin_unlock(&sub->lock); } static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); - struct tipc_subscr *s = &sub->evt.s; spin_lock(&sub->lock); - tipc_sub_send_event(sub, s->seq.lower, s->seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); + tipc_sub_send_event(sub, NULL, TIPC_SUBSCR_TIMEOUT); sub->inactive = true; spin_unlock(&sub->lock); } @@ -134,12 +132,14 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid) { + u32 lower = tipc_sub_read(s, seq.lower); + u32 upper = tipc_sub_read(s, seq.upper); u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; u32 timeout; if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || - (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) { + lower > upper) { pr_warn("Subscription rejected, illegal request\n"); return NULL; } @@ -154,6 +154,12 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net, sub->conid = conid; sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); + sub->s.seq.type = tipc_sub_read(s, seq.type); + sub->s.seq.lower = lower; + sub->s.seq.upper = upper; + sub->s.filter = filter; + sub->s.timeout = tipc_sub_read(s, timeout); + memcpy(sub->s.usr_handle, s->usr_handle, 8); spin_lock_init(&sub->lock); kref_init(&sub->kref); if (!tipc_nametbl_subscribe(sub)) { diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 3ded27391d54..60b877531b66 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -3,7 +3,7 @@ * * Copyright (c) 2003-2017, Ericsson AB * Copyright (c) 2005-2007, 2012-2013, Wind River Systems - * Copyright (c) 2020, Red Hat Inc + * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -43,28 +43,31 @@ #define TIPC_MAX_SUBSCR 65535 #define TIPC_MAX_PUBL 65535 +struct publication; struct tipc_subscription; struct tipc_conn; /** * struct tipc_subscription - TIPC network topology subscription object + * @s: host-endian copy of the user subscription + * @evt: template for events generated by subscription * @kref: reference count for this subscription * @net: network namespace associated with subscription * @timer: timer governing subscription duration (optional) * @service_list: adjacent subscriptions in name sequence's subscription list * @sub_list: adjacent subscriptions in subscriber's subscription list - * @evt: template for events generated by subscription * @conid: connection identifier of topology server * @inactive: true if this subscription is inactive * @lock: serialize up/down and timer events */ struct tipc_subscription { + struct tipc_subscr s; + struct tipc_event evt; struct kref kref; struct net *net; struct timer_list timer; struct list_head service_list; struct list_head sub_list; - struct tipc_event evt; int conid; bool inactive; spinlock_t lock; @@ -74,13 +77,9 @@ struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid); void tipc_sub_unsubscribe(struct tipc_subscription *sub); - -int tipc_sub_check_overlap(struct tipc_service_range *seq, - u32 found_lower, u32 found_upper); void tipc_sub_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node, - u32 scope, int must); + struct publication *p, + u32 event, bool must); int __net_init tipc_topsrv_init_net(struct net *net); void __net_exit tipc_topsrv_exit_net(struct net *net); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 21e75e28e86a..e556d2cdc064 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -414,8 +414,10 @@ static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote) err = ip_mc_join_group(sk, &mreqn); #if IS_ENABLED(CONFIG_IPV6) } else { + lock_sock(sk); err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex, &remote->ipv6); + release_sock(sk); #endif } return err; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index d9cd229aa111..790c6b7ecb26 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -601,7 +601,7 @@ struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, if (!info || before(seq, info->end_seq - info->len)) { /* if retransmit_hint is irrelevant start - * from the beggining of the list + * from the beginning of the list */ info = list_first_entry_or_null(&context->records_list, struct tls_record_info, list); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 01d933ae5f16..1dcb34dfd56b 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, - msg, len, flags); + int ret = sk_msg_recvmsg(sk, psock, msg, len, + flags); if (ret > 0) { decrypted += ret; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bc7fb9bf3351..92a72f0e0d94 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1855,7 +1855,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (!transport || sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a - * peer has not connected or a local shutdown occured with the + * peer has not connected or a local shutdown occurred with the * SOCK_DONE flag. */ if (sock_flag(sk, SOCK_DONE)) diff --git a/net/wireless/core.c b/net/wireless/core.c index a2785379df6e..adfbcb33fb8f 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1092,14 +1092,15 @@ void wiphy_free(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_free); -void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) +void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, + enum rfkill_hard_block_reasons reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - if (rfkill_set_hw_state(rdev->rfkill, blocked)) + if (rfkill_set_hw_state_reason(rdev->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } -EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); +EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); void cfg80211_cqm_config_free(struct wireless_dev *wdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b1df42e4f1eb..7e811a3b0987 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -313,6 +313,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, }; static const struct nla_policy @@ -411,9 +412,10 @@ static const struct nla_policy nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000), [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000), - NLA_POLICY_RANGE(NLA_BINARY, - NL80211_FILS_DISCOVERY_TMPL_MIN_LEN, - IEEE80211_MAX_DATA_LEN), + [NL80211_FILS_DISCOVERY_ATTR_TMPL] = + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_FILS_DISCOVERY_TMPL_MIN_LEN, + IEEE80211_MAX_DATA_LEN), }; static const struct nla_policy diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a95c79d18349..6bdd96408022 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #ifndef __PMSR_H #define __PMSR_H @@ -158,6 +158,16 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + out->ftm.lmr_feedback = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; + if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && + out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK], + "FTM: LMR feedback set for EDCA based ranging"); + return -EINVAL; + } + return 0; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 21536c48deec..0406ce7334fa 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -126,7 +126,7 @@ static int reg_num_devs_support_basehint; * is relevant for all registered devices. */ static bool reg_is_indoor; -static spinlock_t reg_indoor_lock; +static DEFINE_SPINLOCK(reg_indoor_lock); /* Used to track the userspace process controlling the indoor setting */ static u32 reg_is_indoor_portid; @@ -210,11 +210,11 @@ static struct regulatory_request *get_last_request(void) /* Used to queue up regulatory hints */ static LIST_HEAD(reg_requests_list); -static spinlock_t reg_requests_lock; +static DEFINE_SPINLOCK(reg_requests_lock); /* Used to queue up beacon hints for review */ static LIST_HEAD(reg_pending_beacons); -static spinlock_t reg_pending_beacons_lock; +static DEFINE_SPINLOCK(reg_pending_beacons_lock); /* Used to keep track of processed beacon hints */ static LIST_HEAD(reg_beacon_list); @@ -3404,7 +3404,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy) } /* - * Restoring regulatory settings involves ingoring any + * Restoring regulatory settings involves ignoring any * possibly stale country IE information and user regulatory * settings if so desired, this includes any beacon hints * learned as we could have traveled outside to another country @@ -4262,10 +4262,6 @@ int __init regulatory_init(void) if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); - spin_lock_init(®_requests_lock); - spin_lock_init(®_pending_beacons_lock); - spin_lock_init(®_indoor_lock); - rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 758eb7d2a706..4f06c1825029 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -589,7 +589,7 @@ static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, elem = cfg80211_find_elem(WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data, ies->len); - if (!elem || elem->datalen > IEEE80211_MAX_SSID_LEN) + if (!elem) return 0; pos = elem->data; @@ -1751,6 +1751,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { + if (!list_empty(&new->hidden_list)) + list_del(&new->hidden_list); kfree(new); goto drop; } diff --git a/net/wireless/util.c b/net/wireless/util.c index 1bf0200f562a..382c5262d997 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -24,7 +24,7 @@ #include "rdev-ops.h" -struct ieee80211_rate * +const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ff687b97b2d9..44d6566dd23e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1018,7 +1018,7 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, /* * current neighbour/link might impose additional limits - * on certain facilties + * on certain facilities */ x25_limit_facilities(&facilities, nb); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4faabd1ecfd1..cd62d4ba87a9 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -30,7 +30,7 @@ #include "xdp_umem.h" #include "xsk.h" -#define TX_BATCH_SIZE 16 +#define TX_BATCH_SIZE 32 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list); @@ -445,6 +445,97 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct xsk_buff_pool *pool = xs->pool; + u32 hr, len, ts, offset, copy, copied; + struct sk_buff *skb; + struct page *page; + void *buffer; + int err, i; + u64 addr; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + + addr = desc->addr; + len = desc->len; + ts = pool->unaligned ? len : pool->chunk_size; + + buffer = xsk_buff_raw_get_data(pool, addr); + offset = offset_in_page(buffer); + addr = buffer - pool->addrs; + + for (copied = 0, i = 0; copied < len; i++) { + page = pool->umem->pgs[addr >> PAGE_SHIFT]; + get_page(page); + + copy = min_t(u32, PAGE_SIZE - offset, len - copied); + skb_fill_page_desc(skb, i, page, offset, copy); + + copied += copy; + addr += copy; + offset = 0; + } + + skb->len += len; + skb->data_len += len; + skb->truesize += ts; + + refcount_add(ts, &xs->sk.sk_wmem_alloc); + + return skb; +} + +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, + struct xdp_desc *desc) +{ + struct net_device *dev = xs->dev; + struct sk_buff *skb; + + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { + skb = xsk_build_skb_zerocopy(xs, desc); + if (IS_ERR(skb)) + return skb; + } else { + u32 hr, tr, len; + void *buffer; + int err; + + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + len = desc->len; + + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); + + skb_reserve(skb, hr); + skb_put(skb, len); + + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + return ERR_PTR(err); + } + } + + skb->dev = dev; + skb->priority = xs->sk.sk_priority; + skb->mark = xs->sk.sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; + skb->destructor = xsk_destruct_skb; + + return skb; +} + static int xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); @@ -461,43 +552,30 @@ static int xsk_generic_xmit(struct sock *sk) goto out; while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { - char *buffer; - u64 addr; - u32 len; - if (max_batch-- == 0) { err = -EAGAIN; goto out; } - len = desc.len; - skb = sock_alloc_send_skb(sk, len, 1, &err); - if (unlikely(!skb)) + skb = xsk_build_skb(xs, &desc); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); goto out; + } - skb_put(skb, len); - addr = desc.addr; - buffer = xsk_buff_raw_get_data(xs->pool, addr); - err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + if (xskq_prod_reserve(xs->pool->cq)) { spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - skb->dev = xs->dev; - skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; - skb->destructor = xsk_destruct_skb; - err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 2823b7c3302d..2ac3802c2cd7 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -47,19 +47,18 @@ struct xsk_queue { u64 queue_empty_descs; }; -/* The structure of the shared state of the rings are the same as the - * ring buffer in kernel/events/ring_buffer.c. For the Rx and completion - * ring, the kernel is the producer and user space is the consumer. For - * the Tx and fill rings, the kernel is the consumer and user space is - * the producer. +/* The structure of the shared state of the rings are a simple + * circular buffer, as outlined in + * Documentation/core-api/circular-buffers.rst. For the Rx and + * completion ring, the kernel is the producer and user space is the + * consumer. For the Tx and fill rings, the kernel is the consumer and + * user space is the producer. * * producer consumer * - * if (LOAD ->consumer) { LOAD ->producer - * (A) smp_rmb() (C) + * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data - * smp_wmb() (B) smp_mb() (D) - * STORE ->producer STORE ->consumer + * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). @@ -78,7 +77,8 @@ struct xsk_queue { * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no - * room in the buffer to store $data we do not. So no barrier is needed. + * room in the buffer to store $data we do not. The dependency will + * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory @@ -227,15 +227,13 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, static inline void __xskq_cons_release(struct xsk_queue *q) { - smp_mb(); /* D, matches A */ - WRITE_ONCE(q->ring->consumer, q->cached_cons); + smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ - q->cached_prod = READ_ONCE(q->ring->producer); - smp_rmb(); /* C, matches B */ + q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) @@ -397,9 +395,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { - smp_wmb(); /* B, matches C */ - - WRITE_ONCE(q->ring->producer, idx); + smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 113fd9017203..67b4ce504852 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - bpf_clear_redirect_map(map); synchronize_net(); bpf_map_area_free(m); } @@ -125,6 +124,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(m->xsk_map[key]); +} + static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -215,6 +224,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); +} + void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry) { @@ -247,4 +261,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "xsk_map", .map_btf_id = &xsk_map_btf_id, + .map_redirect = xsk_map_redirect, }; diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 4d422447aadc..2e8afe078d61 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -41,19 +41,16 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) const int plen = skb->len; int dlen = IPCOMP_SCRATCH_SIZE; const u8 *start = skb->data; - const int cpu = get_cpu(); - u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu); - struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu); + u8 *scratch = *this_cpu_ptr(ipcomp_scratches); + struct crypto_comp *tfm = *this_cpu_ptr(ipcd->tfms); int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); int len; if (err) - goto out; + return err; - if (dlen < (plen + sizeof(struct ip_comp_hdr))) { - err = -EINVAL; - goto out; - } + if (dlen < (plen + sizeof(struct ip_comp_hdr))) + return -EINVAL; len = dlen - plen; if (len > skb_tailroom(skb)) @@ -68,16 +65,14 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_frag_t *frag; struct page *page; - err = -EMSGSIZE; if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) - goto out; + return -EMSGSIZE; frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; page = alloc_page(GFP_ATOMIC); - err = -ENOMEM; if (!page) - goto out; + return -ENOMEM; __skb_frag_set_page(frag, page); @@ -96,11 +91,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_shinfo(skb)->nr_frags++; } - err = 0; - -out: - put_cpu(); - return err; + return 0; } int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b74f28cabe24..ce500f847b99 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -688,7 +688,7 @@ static void xfrm_hash_resize(struct work_struct *work) } /* Make sure *pol can be inserted into fastbin. - * Useful to check that later insert requests will be sucessful + * Useful to check that later insert requests will be successful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * @@ -3326,39 +3326,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) fl4->fl4_icmp_code = icmp[1]; } break; - case IPPROTO_ESP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr; - - xprth = skb_network_header(skb) + ihl * 4; - ehdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ehdr[0]; - } - break; - case IPPROTO_AH: - if (xprth + 8 < skb->data || - pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr; - - xprth = skb_network_header(skb) + ihl * 4; - ah_hdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ah_hdr[1]; - } - break; - case IPPROTO_COMP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr; - - xprth = skb_network_header(skb) + ihl * 4; - ipcomp_hdr = (__be16 *)xprth; - - fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); - } - break; case IPPROTO_GRE: if (xprth + 12 < skb->data || pskb_may_pull(skb, xprth + 12 - skb->data)) { @@ -3377,7 +3344,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) } break; default: - fl4->fl4_ipsec_spi = 0; break; } } @@ -3470,12 +3436,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) fl6->flowi6_proto = nexthdr; return; #endif - /* XXX Why are there these headers? */ - case IPPROTO_AH: - case IPPROTO_ESP: - case IPPROTO_COMP: default: - fl6->fl6_ipsec_spi = 0; fl6->flowi6_proto = nexthdr; return; } @@ -4173,9 +4134,6 @@ void __init xfrm_init(void) #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif - - RCU_INIT_POINTER(xfrm_if_cb, NULL); - synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5a0ef4361e43..f0aecee4d539 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1761,7 +1761,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, /* shouldn't excl be based on nlh flags?? * Aha! this is anti-netlink really i.e more pfkey derived - * in netlink excl is a flag and you wouldnt need + * in netlink excl is a flag and you wouldn't need * a type XFRM_MSG_UPDPOLICY - JHS */ excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY; err = xfrm_policy_insert(p->dir, xp, excl); @@ -3480,18 +3480,22 @@ static int __net_init xfrm_user_net_init(struct net *net) return 0; } +static void __net_exit xfrm_user_net_pre_exit(struct net *net) +{ + RCU_INIT_POINTER(net->xfrm.nlsk, NULL); +} + static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) { struct net *net; - list_for_each_entry(net, net_exit_list, exit_list) - RCU_INIT_POINTER(net->xfrm.nlsk, NULL); - synchronize_net(); + list_for_each_entry(net, net_exit_list, exit_list) netlink_kernel_release(net->xfrm.nlsk_stash); } static struct pernet_operations xfrm_user_net_ops = { .init = xfrm_user_net_init, + .pre_exit = xfrm_user_net_pre_exit, .exit_batch = xfrm_user_net_exit, }; |