diff options
Diffstat (limited to 'net')
132 files changed, 2371 insertions, 1383 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 788076b002b3..e40aa3e3641c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -319,8 +319,7 @@ static void vlan_transfer_features(struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); - netif_set_gso_max_size(vlandev, dev->gso_max_size); - netif_set_gso_max_segs(vlandev, dev->gso_max_segs); + netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e5d23e75572a..839f2020b015 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -573,8 +573,7 @@ static int vlan_dev_init(struct net_device *dev) NETIF_F_ALL_FCOE; dev->features |= dev->hw_features | NETIF_F_LLTX; - netif_set_gso_max_size(dev, real_dev->gso_max_size); - netif_set_gso_max_segs(dev, real_dev->gso_max_segs); + netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); diff --git a/net/Kconfig.debug b/net/Kconfig.debug index 2f50611df858..a5781cf63b16 100644 --- a/net/Kconfig.debug +++ b/net/Kconfig.debug @@ -17,3 +17,10 @@ config NET_NS_REFCNT_TRACKER help Enable debugging feature to track netns references. This adds memory and cpu costs. + +config DEBUG_NET + bool "Add generic networking debug" + depends on DEBUG_KERNEL + help + Enable extra sanity checks in networking. + This is mostly used by fuzzers, but is safe to select. diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index d2a244e1c260..b80fccbac62a 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -115,23 +115,13 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - spin_unlock_bh(&ax25_dev_lock); - ax25_dev_put(ax25_dev); - dev->ax25_ptr = NULL; - dev_put_track(dev, &ax25_dev->dev_tracker); - ax25_dev_put(ax25_dev); - return; + goto unlock_put; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - spin_unlock_bh(&ax25_dev_lock); - ax25_dev_put(ax25_dev); - dev->ax25_ptr = NULL; - dev_put_track(dev, &ax25_dev->dev_tracker); - ax25_dev_put(ax25_dev); - return; + goto unlock_put; } s = s->next; @@ -139,6 +129,14 @@ void ax25_dev_device_down(struct net_device *dev) spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; ax25_dev_put(ax25_dev); + return; + +unlock_put: + spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); + dev->ax25_ptr = NULL; + dev_put_track(dev, &ax25_dev->dev_tracker); + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 7f8a14d99cdb..37ce6cfb3520 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -65,7 +65,7 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv, */ static inline u32 batadv_choose_claim(const void *data, u32 size) { - struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; + const struct batadv_bla_claim *claim = data; u32 hash = 0; hash = jhash(&claim->addr, sizeof(claim->addr), hash); @@ -86,7 +86,7 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) const struct batadv_bla_backbone_gw *gw; u32 hash = 0; - gw = (struct batadv_bla_backbone_gw *)data; + gw = data; hash = jhash(&gw->orig, sizeof(gw->orig), hash); hash = jhash(&gw->vid, sizeof(gw->vid), hash); diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 0899a729a23f..c120c7c6d25f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -475,6 +475,17 @@ int batadv_frag_send_packet(struct sk_buff *skb, goto free_skb; } + /* GRO might have added fragments to the fragment list instead of + * frags[]. But this is not handled by skb_split and must be + * linearized to avoid incorrect length information after all + * batman-adv fragments were created and submitted to the + * hard-interface + */ + if (skb_has_frag_list(skb) && __skb_linearize(skb)) { + ret = -ENOMEM; + goto free_skb; + } + /* Create one header to be copied to all fragments */ frag_header.packet_type = BATADV_UNICAST_FRAG; frag_header.version = BATADV_COMPAT_VERSION; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index f3be82999f1f..23f3d53f4b51 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2022.1" +#define BATADV_SOURCE_VERSION "2022.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8478034d3abf..01d30c1e412c 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -103,10 +103,10 @@ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) */ static inline u32 batadv_choose_tt(const void *data, u32 size) { - struct batadv_tt_common_entry *tt; + const struct batadv_tt_common_entry *tt; u32 hash = 0; - tt = (struct batadv_tt_common_entry *)data; + tt = data; hash = jhash(&tt->addr, ETH_ALEN, hash); hash = jhash(&tt->vid, sizeof(tt->vid), hash); @@ -2766,7 +2766,7 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, u32 i; tt_tot = batadv_tt_entries(tt_len); - tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff; + tt_change = tvlv_buff; if (!valid_cb) return; @@ -3994,7 +3994,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, if (tvlv_value_len < sizeof(*tt_data)) return; - tt_data = (struct batadv_tvlv_tt_data *)tvlv_value; + tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); num_vlan = ntohs(tt_data->num_vlan); @@ -4037,7 +4037,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, if (tvlv_value_len < sizeof(*tt_data)) return NET_RX_SUCCESS; - tt_data = (struct batadv_tvlv_tt_data *)tvlv_value; + tt_data = tvlv_value; tvlv_value_len -= sizeof(*tt_data); tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data); @@ -4129,7 +4129,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, goto out; batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX); - roaming_adv = (struct batadv_tvlv_roam_adv *)tvlv_value; + roaming_adv = tvlv_value; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received ROAMING_ADV from %pM (client %pM)\n", diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b4782a6c1025..45c2dd2e1590 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2555,10 +2555,10 @@ int hci_register_dev(struct hci_dev *hdev) */ switch (hdev->dev_type) { case HCI_PRIMARY: - id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL); + id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); break; case HCI_AMP: - id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL); + id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); break; default: return -EINVAL; @@ -2567,7 +2567,7 @@ int hci_register_dev(struct hci_dev *hdev) if (id < 0) return id; - sprintf(hdev->name, "hci%d", id); + snprintf(hdev->name, sizeof(hdev->name), "hci%d", id); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 8d54fef9a568..9b5a1f630bb0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1001,7 +1001,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || - __skb->wire_len > GSO_MAX_SIZE) + __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 1a3d583fbc8e..e7f4fccb6adb 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1253,7 +1253,8 @@ static int __br_fdb_delete(struct net_bridge *br, /* Remove neighbor entry with RTM_DELNEIGH */ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 55f47cadb114..47fcbade7389 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -517,16 +517,16 @@ void br_mtu_auto_adjust(struct net_bridge *br) static void br_set_gso_limits(struct net_bridge *br) { - unsigned int gso_max_size = GSO_MAX_SIZE; - u16 gso_max_segs = GSO_MAX_SEGS; + unsigned int tso_max_size = TSO_MAX_SIZE; const struct net_bridge_port *p; + u16 tso_max_segs = TSO_MAX_SEGS; list_for_each_entry(p, &br->port_list, list) { - gso_max_size = min(gso_max_size, p->dev->gso_max_size); - gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); + tso_max_size = min(tso_max_size, p->dev->tso_max_size); + tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); } - netif_set_gso_max_size(br->dev, gso_max_size); - netif_set_gso_max_segs(br->dev, gso_max_segs); + netif_set_tso_max_size(br->dev, tso_max_size); + netif_set_tso_max_segs(br->dev, tso_max_segs); } /* diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 196417859c4a..68b3e850bcb9 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -39,6 +39,13 @@ static int br_pass_frame_up(struct sk_buff *skb) dev_sw_netstats_rx_add(brdev, skb->len); vg = br_vlan_group_rcu(br); + + /* Reset the offload_fwd_mark because there could be a stacked + * bridge above, and it should not think this bridge it doing + * that bridge's work forwarding out its ports. + */ + br_switchdev_frame_unmark(skb); + /* Bridge is just like any other port. Make sure the * packet is allowed except in promisc mode when someone * may be running packet capture. diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 6ae882cfae1c..06e5f6faa431 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -793,7 +793,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, const unsigned char *addr, u16 vid); + struct net_device *dev, const unsigned char *addr, u16 vid, + struct netlink_ext_ack *extack); int br_fdb_delete_bulk(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, u16 vid, struct netlink_ext_ack *extack); diff --git a/net/can/isotp.c b/net/can/isotp.c index 11a65a9d3906..4a4007f10970 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -104,6 +104,7 @@ MODULE_ALIAS("can-proto-6"); #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) +#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ @@ -159,6 +160,23 @@ static inline struct isotp_sock *isotp_sk(const struct sock *sk) return (struct isotp_sock *)sk; } +static u32 isotp_bc_flags(struct isotp_sock *so) +{ + return so->opt.flags & ISOTP_ALL_BC_FLAGS; +} + +static bool isotp_register_rxid(struct isotp_sock *so) +{ + /* no broadcast modes => register rx_id for FC frame reception */ + return (isotp_bc_flags(so) == 0); +} + +static bool isotp_register_txecho(struct isotp_sock *so) +{ + /* all modes but SF_BROADCAST register for tx echo skbs */ + return (isotp_bc_flags(so) != CAN_ISOTP_SF_BROADCAST); +} + static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, @@ -803,7 +821,6 @@ static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; - so->tx.state = ISOTP_WAIT_FIRST_FC; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) @@ -936,7 +953,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ - if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && + if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; @@ -1000,12 +1017,41 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) /* don't enable wait queue for a single frame transmission */ wait_tx_done = 0; } else { - /* send first frame and wait for FC */ + /* send first frame */ isotp_create_fframe(cf, so, ae); - /* start timeout for FC */ - hrtimer_sec = 1; + if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { + /* set timer for FC-less operation (STmin = 0) */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_set(0, so->force_tx_stmin); + else + so->tx_gap = ktime_set(0, so->frame_txtime); + + /* disable wait for FCs due to activated block size */ + so->txfc.bs = 0; + + /* cfecho should have been zero'ed by init */ + if (so->cfecho) + pr_notice_once("can-isotp: no fc cfecho %08X\n", + so->cfecho); + + /* set consecutive frame echo tag */ + so->cfecho = *(u32 *)cf->data; + + /* switch directly to ISOTP_SENDING state */ + so->tx.state = ISOTP_SENDING; + + /* start timeout for unlikely lost echo skb */ + hrtimer_sec = 2; + } else { + /* standard flow control check */ + so->tx.state = ISOTP_WAIT_FIRST_FC; + + /* start timeout for FC */ + hrtimer_sec = 1; + } + hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); } @@ -1025,6 +1071,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (hrtimer_sec) hrtimer_cancel(&so->txtimer); + /* reset consecutive frame echo tag */ + so->cfecho = 0; + goto err_out_drop; } @@ -1120,15 +1169,17 @@ static int isotp_release(struct socket *sock) lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { + if (so->bound && isotp_register_txecho(so)) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { - can_rx_unregister(net, dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (isotp_register_rxid(so)) + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); @@ -1164,7 +1215,6 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) canid_t tx_id, rx_id; int err = 0; int notify_enetdown = 0; - int do_rx_reg = 1; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; @@ -1182,24 +1232,26 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) else rx_id &= CAN_SFF_MASK; + /* give feedback on wrong CAN-ID values */ + if (tx_id != addr->can_addr.tp.tx_id || + rx_id != addr->can_addr.tp.rx_id) + return -EINVAL; + if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); - /* do not register frame reception for functional addressing */ - if (so->opt.flags & CAN_ISOTP_SF_BROADCAST) - do_rx_reg = 0; - - /* do not validate rx address for functional addressing */ - if (do_rx_reg && rx_id == tx_id) { - err = -EADDRNOTAVAIL; + if (so->bound) { + err = -EINVAL; goto out; } - if (so->bound && addr->can_ifindex == so->ifindex && - rx_id == so->rxid && tx_id == so->txid) + /* ensure different CAN IDs when the rx_id is to be registered */ + if (isotp_register_rxid(so) && rx_id == tx_id) { + err = -EADDRNOTAVAIL; goto out; + } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { @@ -1221,10 +1273,11 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; - if (do_rx_reg) { + if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); + if (isotp_register_txecho(so)) { /* no consecutive frame echo skb in flight */ so->cfecho = 0; @@ -1235,22 +1288,6 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) dev_put(dev); - if (so->bound && do_rx_reg) { - /* unregister old filter */ - if (so->ifindex) { - dev = dev_get_by_index(net, so->ifindex); - if (dev) { - can_rx_unregister(net, dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); - can_rx_unregister(net, dev, so->txid, - SINGLE_MASK(so->txid), - isotp_rcv_echo, sk); - dev_put(dev); - } - } - } - /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; @@ -1309,6 +1346,15 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + /* these broadcast flags are not allowed together */ + if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { + /* CAN_ISOTP_SF_BROADCAST is prioritized */ + so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; + + /* give user feedback on wrong config attempt */ + ret = -EINVAL; + } + /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) @@ -1459,10 +1505,12 @@ static void isotp_notify(struct isotp_sock *so, unsigned long msg, case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ - if (so->bound && (!(so->opt.flags & CAN_ISOTP_SF_BROADCAST))) { - can_rx_unregister(dev_net(dev), dev, so->rxid, - SINGLE_MASK(so->rxid), - isotp_rcv, sk); + if (so->bound && isotp_register_txecho(so)) { + if (isotp_register_rxid(so)) + can_rx_unregister(dev_net(dev), dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); diff --git a/net/can/raw.c b/net/can/raw.c index b7dbb57557f3..d1bd9cc51ebe 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -772,6 +772,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; int ifindex; @@ -817,11 +818,18 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - skb_setup_tx_timestamp(skb, sk->sk_tsflags); + sockcm_init(&sockc, sk); + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto free_skb; + } skb->dev = dev; - skb->sk = sk; skb->priority = sk->sk_priority; + skb->tstamp = sockc.transmit_time; + + skb_setup_tx_timestamp(skb, sockc.tsflags); err = can_send(skb, ro->loopback); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1c5815530e0d..83eb97c94e83 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2385,7 +2385,11 @@ again: if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) { err = -ENOSPC; } else { - pr_warn_ratelimited("FULL or reached pool quota\n"); + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) + pr_warn_ratelimited("cluster is full (osdmap FULL)\n"); + else + pr_warn_ratelimited("pool %lld is full or reached quota\n", + req->r_t.base_oloc.pool); req->r_t.paused = true; maybe_request_map(osdc); } diff --git a/net/core/dev.c b/net/core/dev.c index c2d73595a7c3..721ba9c26554 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -682,11 +682,11 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, const struct net_device *last_dev; struct net_device_path_ctx ctx = { .dev = dev, - .daddr = daddr, }; struct net_device_path *path; int ret = 0; + memcpy(ctx.daddr, daddr, sizeof(ctx.daddr)); stack->num_paths = 0; while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { last_dev = ctx.dev; @@ -2993,6 +2993,52 @@ undo_rx: EXPORT_SYMBOL(netif_set_real_num_queues); /** + * netif_set_tso_max_size() - set the max size of TSO frames supported + * @dev: netdev to update + * @size: max skb->len of a TSO frame + * + * Set the limit on the size of TSO super-frames the device can handle. + * Unless explicitly set the stack will assume the value of + * %GSO_LEGACY_MAX_SIZE. + */ +void netif_set_tso_max_size(struct net_device *dev, unsigned int size) +{ + dev->tso_max_size = min(GSO_MAX_SIZE, size); + if (size < READ_ONCE(dev->gso_max_size)) + netif_set_gso_max_size(dev, size); +} +EXPORT_SYMBOL(netif_set_tso_max_size); + +/** + * netif_set_tso_max_segs() - set the max number of segs supported for TSO + * @dev: netdev to update + * @segs: max number of TCP segments + * + * Set the limit on the number of TCP segments the device can generate from + * a single TSO super-frame. + * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS. + */ +void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs) +{ + dev->tso_max_segs = segs; + if (segs < READ_ONCE(dev->gso_max_segs)) + netif_set_gso_max_segs(dev, segs); +} +EXPORT_SYMBOL(netif_set_tso_max_segs); + +/** + * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper + * @to: netdev to update + * @from: netdev from which to copy the limits + */ +void netif_inherit_tso_max(struct net_device *to, const struct net_device *from) +{ + netif_set_tso_max_size(to, from->tso_max_size); + netif_set_tso_max_segs(to, from->tso_max_segs); +} +EXPORT_SYMBOL(netif_inherit_tso_max); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * * Default value is the number of physical cores if there are only 1 or 2, or @@ -3220,12 +3266,18 @@ int skb_checksum_help(struct sk_buff *skb) } offset = skb_checksum_start_offset(skb); - BUG_ON(offset >= skb_headlen(skb)); + ret = -EINVAL; + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } csum = skb_checksum(skb, offset, skb->len - offset, 0); offset += skb->csum_offset; - BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - + if (WARN_ON_ONCE(offset + sizeof(__sum16) > skb_headlen(skb))) { + DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false); + goto out; + } ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); if (ret) goto out; @@ -4086,30 +4138,25 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, } /** - * __dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * @sb_dev: suboordinate device used for L2 forwarding offload - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. + * __dev_queue_xmit() - transmit a buffer + * @skb: buffer to transmit + * @sb_dev: suboordinate device used for L2 forwarding offload * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. * - * ----------------------------------------------------------------------------------- - * I notice this method can also return errors from the queue disciplines, - * including NET_XMIT_DROP, which is a positive value. So, errors can also - * be positive. + * When calling this method, interrupts MUST be enabled. This is because + * the BH enable code must have IRQs enabled so that it will not deadlock. * - * Regardless of the return value, the skb is consumed, so it is currently - * difficult to retry a send to this method. (You can bump the ref count - * before sending to hold a reference for retry if you are careful.) + * Regardless of the return value, the skb is consumed, so it is currently + * difficult to retry a send to this method. (You can bump the ref count + * before sending to hold a reference for retry if you are careful.) * - * When calling this method, interrupts MUST be enabled. This is because - * the BH enable code must have IRQs enabled so that it will not deadlock. - * --BLG + * Return: + * * 0 - buffer successfully transmitted + * * positive qdisc return code - NET_XMIT_DROP etc. + * * negative errno - other errors */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { @@ -4283,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; +unsigned int sysctl_skb_defer_max __read_mostly = 64; int netdev_budget __read_mostly = 300; /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; @@ -4535,9 +4583,12 @@ static void rps_trigger_softirq(void *data) #endif /* CONFIG_RPS */ /* Called from hardirq (IPI) context */ -static void trigger_rx_softirq(void *data __always_unused) +static void trigger_rx_softirq(void *data) { + struct softnet_data *sd = data; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + smp_store_release(&sd->defer_ipi_scheduled, 0); } /* @@ -6583,7 +6634,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) while (skb != NULL) { next = skb->next; - __kfree_skb(skb); + napi_consume_skb(skb, 1); skb = next; } } @@ -6604,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) for (;;) { struct napi_struct *n; + skb_defer_free_flush(sd); + if (list_empty(&list)) { if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) goto end; @@ -6633,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) __raise_softirq_irqoff(NET_RX_SOFTIRQ); net_rps_action_and_irq_enable(sd); -end: - skb_defer_free_flush(sd); +end:; } struct netdev_adjacent { @@ -9881,22 +9933,14 @@ void netif_tx_stop_all_queues(struct net_device *dev) EXPORT_SYMBOL(netif_tx_stop_all_queues); /** - * register_netdevice - register a network device - * @dev: device to register + * register_netdevice() - register a network device + * @dev: device to register * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. + * Take a prepared network device structure and make it externally accessible. + * A %NETDEV_REGISTER message is sent to the netdev notifier chain. + * Callers must hold the rtnl lock - you may want register_netdev() + * instead of this. */ - int register_netdevice(struct net_device *dev) { int ret; @@ -10557,9 +10601,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_size = GSO_LEGACY_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gro_max_size = GRO_MAX_SIZE; + dev->gro_max_size = GRO_LEGACY_MAX_SIZE; + dev->tso_max_size = TSO_LEGACY_MAX_SIZE; + dev->tso_max_segs = TSO_MAX_SEGS; dev->upper_level = 1; dev->lower_level = 1; #ifdef CONFIG_LOCKDEP @@ -11341,7 +11387,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif - INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL); + INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); init_gro_hash(&sd->backlog); diff --git a/net/core/dev.h b/net/core/dev.h index 27923df00637..cbb8a925175a 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern int netdev_budget; extern unsigned int netdev_budget_usecs; - +extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; @@ -88,4 +88,25 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +static inline void netif_set_gso_max_size(struct net_device *dev, + unsigned int size) +{ + /* dev->gso_max_size is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_size, size); +} + +static inline void netif_set_gso_max_segs(struct net_device *dev, + unsigned int segs) +{ + /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ + WRITE_ONCE(dev->gso_max_segs, segs); +} + +static inline void netif_set_gro_max_size(struct net_device *dev, + unsigned int size) +{ + /* This pairs with the READ_ONCE() in skb_gro_receive() */ + WRITE_ONCE(dev->gro_max_size, size); +} + #endif diff --git a/net/core/devlink.c b/net/core/devlink.c index 5f441a0e34f4..5cc88490f18f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -83,11 +83,10 @@ struct devlink_linecard { const struct devlink_linecard_ops *ops; void *priv; enum devlink_linecard_state state; - struct mutex state_lock; /* Protects state and device_list */ + struct mutex state_lock; /* Protects state */ const char *type; struct devlink_linecard_type *types; unsigned int types_count; - struct list_head device_list; }; /** @@ -2059,56 +2058,6 @@ struct devlink_linecard_type { const void *priv; }; -struct devlink_linecard_device { - struct list_head list; - unsigned int index; - void *priv; -}; - -static int -devlink_nl_linecard_device_fill(struct sk_buff *msg, - struct devlink_linecard_device *linecard_device) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return -EMSGSIZE; - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_fill(struct sk_buff *msg, - struct devlink_linecard *linecard) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_fill(msg, linecard_device); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_linecard *linecard, @@ -2119,7 +2068,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, struct devlink_linecard_type *linecard_type; struct nlattr *attr; void *hdr; - int err; int i; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2152,10 +2100,6 @@ static int devlink_nl_linecard_fill(struct sk_buff *msg, nla_nest_end(msg, attr); } - err = devlink_nl_linecard_devices_fill(msg, linecard); - if (err) - goto nla_put_failure; - genlmsg_end(msg, hdr); return 0; @@ -2425,191 +2369,6 @@ static int devlink_nl_cmd_linecard_set_doit(struct sk_buff *skb, return 0; } -struct devlink_info_req { - struct sk_buff *msg; -}; - -static int -devlink_nl_linecard_device_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device, - struct netlink_ext_ack *extack) -{ - struct nlattr *attr; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE); - if (!attr) - return -EMSGSIZE; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_DEVICE_INDEX, - linecard_device->index)) { - nla_nest_cancel(msg, attr); - return -EMSGSIZE; - } - if (linecard->ops->device_info_get) { - struct devlink_info_req req; - int err; - - req.msg = msg; - err = linecard->ops->device_info_get(linecard_device, - linecard_device->priv, - &req, extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int devlink_nl_linecard_devices_info_fill(struct sk_buff *msg, - struct devlink_linecard *linecard, - struct netlink_ext_ack *extack) -{ - struct devlink_linecard_device *linecard_device; - struct nlattr *attr; - int err; - - if (list_empty(&linecard->device_list)) - return 0; - - attr = nla_nest_start(msg, DEVLINK_ATTR_LINECARD_DEVICE_LIST); - if (!attr) - return -EMSGSIZE; - list_for_each_entry(linecard_device, &linecard->device_list, list) { - err = devlink_nl_linecard_device_info_fill(msg, linecard, - linecard_device, - extack); - if (err) { - nla_nest_cancel(msg, attr); - return err; - } - } - nla_nest_end(msg, attr); - - return 0; -} - -static int -devlink_nl_linecard_info_fill(struct sk_buff *msg, struct devlink *devlink, - struct devlink_linecard *linecard, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, struct netlink_ext_ack *extack) -{ - struct devlink_info_req req; - void *hdr; - int err; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - err = -EMSGSIZE; - if (devlink_nl_put_handle(msg, devlink)) - goto nla_put_failure; - if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index)) - goto nla_put_failure; - - req.msg = msg; - err = linecard->ops->info_get(linecard, linecard->priv, &req, extack); - if (err) - goto nla_put_failure; - - err = devlink_nl_linecard_devices_info_fill(msg, linecard, extack); - if (err) - goto nla_put_failure; - - genlmsg_end(msg, hdr); - return 0; - -nla_put_failure: - genlmsg_cancel(msg, hdr); - return err; -} - -static int devlink_nl_cmd_linecard_info_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink_linecard *linecard = info->user_ptr[1]; - struct devlink *devlink = linecard->devlink; - struct sk_buff *msg; - int err; - - if (!linecard->ops->info_get) - return -EOPNOTSUPP; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - info->snd_portid, info->snd_seq, 0, - info->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int devlink_nl_cmd_linecard_info_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_linecard *linecard; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - goto retry; - - mutex_lock(&devlink->linecards_lock); - list_for_each_entry(linecard, &devlink->linecard_list, list) { - if (idx < start || !linecard->ops->info_get) { - idx++; - continue; - } - mutex_lock(&linecard->state_lock); - err = devlink_nl_linecard_info_fill(msg, devlink, linecard, - DEVLINK_CMD_LINECARD_INFO_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); - mutex_unlock(&linecard->state_lock); - if (err) { - mutex_unlock(&devlink->linecards_lock); - devlink_put(devlink); - goto out; - } - idx++; - } - mutex_unlock(&devlink->linecards_lock); -retry: - devlink_put(devlink); - } -out: - mutex_unlock(&devlink_mutex); - - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; - return msg->len; -} - static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_sb *devlink_sb, enum devlink_command cmd, u32 portid, @@ -6602,6 +6361,10 @@ out_dev: return err; } +struct devlink_info_req { + struct sk_buff *msg; +}; + int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); @@ -9322,13 +9085,6 @@ static const struct genl_small_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, }, { - .cmd = DEVLINK_CMD_LINECARD_INFO_GET, - .doit = devlink_nl_cmd_linecard_info_get_doit, - .dumpit = devlink_nl_cmd_linecard_info_get_dumpit, - .internal_flags = DEVLINK_NL_FLAG_NEED_LINECARD, - /* can be retrieved by unprivileged users */ - }, - { .cmd = DEVLINK_CMD_SB_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = devlink_nl_cmd_sb_get_doit, @@ -10508,7 +10264,6 @@ devlink_linecard_create(struct devlink *devlink, unsigned int linecard_index, linecard->priv = priv; linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; mutex_init(&linecard->state_lock); - INIT_LIST_HEAD(&linecard->device_list); err = devlink_linecard_types_init(linecard); if (err) { @@ -10536,7 +10291,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) struct devlink *devlink = linecard->devlink; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL); - WARN_ON(!list_empty(&linecard->device_list)); mutex_lock(&devlink->linecards_lock); list_del(&linecard->list); devlink_linecard_types_fini(linecard); @@ -10546,52 +10300,6 @@ void devlink_linecard_destroy(struct devlink_linecard *linecard) EXPORT_SYMBOL_GPL(devlink_linecard_destroy); /** - * devlink_linecard_device_create - Create a device on linecard - * - * @linecard: devlink linecard - * @device_index: index of the linecard device - * @priv: user priv pointer - * - * Return: Line card device structure or an ERR_PTR() encoded error code. - */ -struct devlink_linecard_device * -devlink_linecard_device_create(struct devlink_linecard *linecard, - unsigned int device_index, void *priv) -{ - struct devlink_linecard_device *linecard_device; - - linecard_device = kzalloc(sizeof(*linecard_device), GFP_KERNEL); - if (!linecard_device) - return ERR_PTR(-ENOMEM); - linecard_device->index = device_index; - linecard_device->priv = priv; - mutex_lock(&linecard->state_lock); - list_add_tail(&linecard_device->list, &linecard->device_list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - return linecard_device; -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_create); - -/** - * devlink_linecard_device_destroy - Destroy device on linecard - * - * @linecard: devlink linecard - * @linecard_device: devlink linecard device - */ -void -devlink_linecard_device_destroy(struct devlink_linecard *linecard, - struct devlink_linecard_device *linecard_device) -{ - mutex_lock(&linecard->state_lock); - list_del(&linecard_device->list); - devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - mutex_unlock(&linecard->state_lock); - kfree(linecard_device); -} -EXPORT_SYMBOL_GPL(devlink_linecard_device_destroy); - -/** * devlink_linecard_provision_set - Set provisioning on linecard * * @linecard: devlink linecard @@ -10623,7 +10331,6 @@ EXPORT_SYMBOL_GPL(devlink_linecard_provision_set); void devlink_linecard_provision_clear(struct devlink_linecard *linecard) { mutex_lock(&linecard->state_lock); - WARN_ON(!list_empty(&linecard->device_list)); linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED; linecard->type = NULL; devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index b89e3e95bffc..41cac0e4834e 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -517,7 +517,7 @@ static void net_dm_packet_trace_kfree_skb_hit(void *ignore, if (!nskb) return; - if ((unsigned int)reason >= SKB_DROP_REASON_MAX) + if (unlikely(reason >= SKB_DROP_REASON_MAX || reason <= 0)) reason = SKB_DROP_REASON_NOT_SPECIFIED; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; diff --git a/net/core/gro.c b/net/core/gro.c index 78110edf5d4b..b4190eb08467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -167,6 +167,14 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; + if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { + if (p->protocol != htons(ETH_P_IPV6) || + skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || + ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + p->encapsulation) + return -E2BIG; + } + lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f64ebd050f6c..47b6c1f0fdbb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3728,7 +3728,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ]; char *p_name; - t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL); + t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto err; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index bdbadfaee867..f18e6e771993 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -704,8 +704,10 @@ struct page *page_pool_alloc_frag(struct page_pool *pool, if (page && *offset + size > max_size) { page = page_pool_drain_frag(pool, page); - if (page) + if (page) { + alloc_stat_inc(pool, fast); goto frag_reset; + } } if (!page) { @@ -727,6 +729,7 @@ frag_reset: pool->frag_users++; pool->frag_offset = *offset + size; + alloc_stat_inc(pool, fast); return page; } EXPORT_SYMBOL(page_pool_alloc_frag); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eea5ed09e1bb..ac45328607f7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1064,6 +1064,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */ + + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ @@ -1769,6 +1771,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || nla_put_u32(skb, IFLA_GRO_MAX_SIZE, dev->gro_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SIZE, dev->tso_max_size) || + nla_put_u32(skb, IFLA_TSO_MAX_SEGS, dev->tso_max_segs) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif @@ -1922,6 +1926,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING }, [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, + [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2306,6 +2312,19 @@ invalid_attr: return -EINVAL; } +static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate, + int max_tx_rate) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_set_vf_rate) + return -EOPNOTSUPP; + if (max_tx_rate && max_tx_rate < min_tx_rate) + return -EINVAL; + + return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate); +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { @@ -2341,14 +2360,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], } } - if (tb[IFLA_GRO_MAX_SIZE]) { - u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]); - - if (gro_max_size > GRO_MAX_SIZE) { - NL_SET_ERR_MSG(extack, "too big gro_max_size"); - return -EINVAL; - } - } return 0; } @@ -2443,11 +2454,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (err < 0) return err; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivf.min_tx_rate, - ivt->rate); + err = rtnl_set_vf_rate(dev, ivt->vf, + ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } @@ -2457,11 +2465,9 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) if (ivt->vf >= INT_MAX) return -EINVAL; - err = -EOPNOTSUPP; - if (ops->ndo_set_vf_rate) - err = ops->ndo_set_vf_rate(dev, ivt->vf, - ivt->min_tx_rate, - ivt->max_tx_rate); + + err = rtnl_set_vf_rate(dev, ivt->vf, + ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } @@ -2803,7 +2809,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); - if (max_size > GSO_MAX_SIZE) { + if (max_size > dev->tso_max_size) { err = -EINVAL; goto errout; } @@ -2817,7 +2823,7 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); - if (max_segs > GSO_MAX_SEGS) { + if (max_segs > GSO_MAX_SEGS || max_segs > dev->tso_max_segs) { err = -EINVAL; goto errout; } @@ -4258,7 +4264,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = br_dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); } else { if (ops->ndo_fdb_del_bulk) err = ops->ndo_fdb_del_bulk(ndm, tb, dev, vid, @@ -4276,7 +4282,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ops = dev->netdev_ops; if (!del_bulk) { if (ops->ndo_fdb_del) - err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); + err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid, extack); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); } else { diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 9b8443774449..5f85e01d4093 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -22,6 +22,8 @@ static siphash_aligned_key_t net_secret; static siphash_aligned_key_t ts_secret; +#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) + static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); @@ -94,17 +96,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, } EXPORT_SYMBOL(secure_tcpv6_seq); -u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, +u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; + unsigned int timeseed; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, - .dport = dport + .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + .dport = dport, }; net_secret_init(); return siphash(&combined, offsetofend(typeof(combined), dport), @@ -142,11 +146,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr, } EXPORT_SYMBOL_GPL(secure_tcp_seq); -u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) +u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { net_secret_init(); - return siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u16)dport, &net_secret); + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, + jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, + &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 475183f37891..1d10bb4adec1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -80,6 +80,7 @@ #include <linux/user_namespace.h> #include <linux/indirect_call_wrapper.h> +#include "dev.h" #include "sock_destructor.h" struct kmem_cache *skbuff_head_cache __ro_after_init; @@ -771,6 +772,8 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) if (!skb_unref(skb)) return; + DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } @@ -1165,7 +1168,7 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; struct sk_buff *skb; @@ -1196,7 +1199,6 @@ struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return uarg; } -EXPORT_SYMBOL_GPL(msg_zerocopy_alloc); static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) { @@ -3890,7 +3892,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int delta_len = 0; struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; - int err; + int len_diff, err; skb_push(skb, -skb_network_offset(skb) + offset); @@ -3930,9 +3932,11 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); + len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb); __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); + nskb->transport_header += len_diff; skb_copy_from_linear_data_offset(skb, -tnl_hlen, nskb->data - tnl_hlen, offset + tnl_hlen); @@ -6493,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb) int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned long flags; + unsigned int defer_max; bool kick; if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu) || cpu == raw_smp_processor_id()) { - __kfree_skb(skb); +nodefer: __kfree_skb(skb); return; } sd = &per_cpu(softnet_data, cpu); + defer_max = READ_ONCE(sysctl_skb_defer_max); + if (READ_ONCE(sd->defer_count) >= defer_max) + goto nodefer; + /* We do not send an IPI or any signal. * Remote cpu will eventually call skb_defer_free_flush() */ @@ -6512,18 +6521,14 @@ void skb_attempt_defer_free(struct sk_buff *skb) WRITE_ONCE(sd->defer_list, skb); sd->defer_count++; - /* kick every time queue length reaches 128. - * This should avoid blocking in smp_call_function_single_async(). - * This condition should hardly be bit under normal conditions, - * unless cpu suddenly stopped to receive NIC interrupts. - */ - kick = sd->defer_count == 128; + /* Send an IPI every time queue reaches half capacity. */ + kick = sd->defer_count == (defer_max >> 1); spin_unlock_irqrestore(&sd->defer_lock, flags); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ - if (unlikely(kick)) + if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) smp_call_function_single_async(cpu, &sd->defer_csd); } diff --git a/net/core/sock.c b/net/core/sock.c index be20a1af20e5..2ff40dd0a7a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -635,7 +635,9 @@ static int sock_bindtoindex_locked(struct sock *sk, int ifindex) if (ifindex < 0) goto out; - sk->sk_bound_dev_if = ifindex; + /* Paired with all READ_ONCE() done locklessly. */ + WRITE_ONCE(sk->sk_bound_dev_if, ifindex); + if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); sk_dst_reset(sk); @@ -713,10 +715,11 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES + int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); struct net *net = sock_net(sk); char devname[IFNAMSIZ]; - if (sk->sk_bound_dev_if == 0) { + if (bound_dev_if == 0) { len = 0; goto zero; } @@ -725,7 +728,7 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, if (len < IFNAMSIZ) goto out; - ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); + ret = netdev_get_name(net, devname, bound_dev_if); if (ret) goto out; @@ -1315,6 +1318,12 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } + sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; @@ -1855,7 +1864,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_BINDTOIFINDEX: - v.val = sk->sk_bound_dev_if; + v.val = READ_ONCE(sk->sk_bound_dev_if); break; case SO_NETNS_COOKIE: @@ -2287,6 +2296,19 @@ void sk_free_unlock_clone(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +static void sk_trim_gso_size(struct sock *sk) +{ + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) + return; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + sk_is_tcp(sk) && + !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return; +#endif + sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; +} + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -2306,6 +2328,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 195ca5c28771..ca8d38325e1e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, + { + .procname = "skb_defer_max", + .data = &sysctl_skb_defer_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, { } }; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 82696ab86f74..da6e3b20cd75 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -628,7 +628,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ir_mark = inet_request_mark(sk, skb); ireq->ireq_family = AF_INET; - ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); /* * Step 3: Process LISTEN state @@ -1029,9 +1029,15 @@ static void __net_exit dccp_v4_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v4_ctl_sk); } +static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET); +} + static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, + .exit_batch = dccp_v4_exit_batch, .id = &dccp_v4_pernet_id, .size = sizeof(struct dccp_v4_pernet), }; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4d95b6400915..fd44638ec16b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -374,10 +374,10 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) refcount_inc(&skb->users); ireq->pktopts = skb; } - ireq->ir_iif = sk->sk_bound_dev_if; + ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); /* So that link locals have meaning */ - if (!sk->sk_bound_dev_if && + if (!ireq->ir_iif && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); @@ -1115,9 +1115,15 @@ static void __net_exit dccp_v6_exit_net(struct net *net) inet_ctl_sock_destroy(pn->v6_ctl_sk); } +static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&dccp_hashinfo, AF_INET6); +} + static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, + .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 58421f94427e..eb8e128e43e8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1110,7 +1110,6 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); - inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 0ee7d4c0c955..a09ba642b5e7 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -854,7 +854,7 @@ static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) memcpy(msg->neighbor, dn_hiord, ETH_ALEN); if (dn_db->router) { - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); dn_dn2eth(msg->neighbor, dn->addr); } @@ -902,7 +902,7 @@ static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) { int n; struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr); - struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct dn_neigh *dn = container_of(dn_db->router, struct dn_neigh, n); struct sk_buff *skb; size_t size; unsigned char *ptr; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 94b306f6d551..fbd98ac853ea 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -426,7 +426,8 @@ int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) if (!dn_db->router) { dn_db->router = neigh_clone(neigh); } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + if (msg->priority > container_of(dn_db->router, + struct dn_neigh, n)->priority) neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); } } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 7e85f2a1ae25..552a53f1d5d0 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -159,7 +159,7 @@ static void dn_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how struct neighbour *n = rt->n; if (n && n->dev == dev) { - n->dev = dev_net(dev)->loopback_dev; + n->dev = blackhole_netdev; dev_hold(n->dev); dev_put(dev); } @@ -1120,7 +1120,7 @@ source_ok: /* Ok then, we assume its directly connected and move on */ select_source: if (neigh) - gateway = ((struct dn_neigh *)neigh)->addr; + gateway = container_of(neigh, struct dn_neigh, n)->addr; if (gateway == 0) gateway = fld.daddr; if (fld.saddr == 0) { @@ -1429,7 +1429,7 @@ static int dn_route_input_slow(struct sk_buff *skb) /* Use the default router if there is one */ neigh = neigh_clone(dn_db->router); if (neigh) { - gateway = ((struct dn_neigh *)neigh)->addr; + gateway = container_of(neigh, struct dn_neigh, n)->addr; goto make_route; } diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0c6ae32742ec..be7b320cda76 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -458,46 +458,6 @@ struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) } EXPORT_SYMBOL_GPL(dsa_port_from_netdev); -int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - int err = 0; - - mutex_lock(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->fdbs, list) { - err = cb(ds, port, a->addr, a->vid, a->db); - if (err) - break; - } - - mutex_unlock(&dp->addr_lists_lock); - - return err; -} -EXPORT_SYMBOL_GPL(dsa_port_walk_fdbs); - -int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - int err = 0; - - mutex_lock(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->mdbs, list) { - err = cb(ds, port, a->addr, a->vid, a->db); - if (err) - break; - } - - mutex_unlock(&dp->addr_lists_lock); - - return err; -} -EXPORT_SYMBOL_GPL(dsa_port_walk_mdbs); - bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) { if (a->type != b->type) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index cf933225df32..d0a2452a1e24 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -809,22 +809,18 @@ static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) { const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *cpu_dp; int err; if (tag_ops->proto == dst->default_proto) goto connect; - dsa_switch_for_each_cpu_port(cpu_dp, ds) { - rtnl_lock(); - err = ds->ops->change_tag_protocol(ds, cpu_dp->index, - tag_ops->proto); - rtnl_unlock(); - if (err) { - dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", - tag_ops->name, ERR_PTR(err)); - return err; - } + rtnl_lock(); + err = ds->ops->change_tag_protocol(ds, tag_ops->proto); + rtnl_unlock(); + if (err) { + dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", + tag_ops->name, ERR_PTR(err)); + return err; } connect: diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 7c9abd5a0ab9..d9722e49864b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -291,6 +291,7 @@ int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); +void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 48e5a309ca5c..3738f2d40a0b 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -503,6 +503,7 @@ out_rollback_unoffload: switchdev_bridge_port_unoffload(brport_dev, dp, &dsa_slave_switchdev_notifier, &dsa_slave_switchdev_blocking_notifier); + dsa_flush_workqueue(); out_rollback_unbridge: dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); out_rollback: @@ -919,6 +920,14 @@ int dsa_port_bridge_flags(struct dsa_port *dp, return 0; } +void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc) +{ + struct dsa_switch *ds = dp->ds; + + if (ds->ops->port_set_host_flood) + ds->ops->port_set_host_flood(ds, dp->index, uc, mc); +} + int dsa_port_vlan_msti(struct dsa_port *dp, const struct switchdev_vlan_msti *msti) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5ee0aced9410..801a5d445833 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -262,37 +262,13 @@ static int dsa_slave_close(struct net_device *dev) return 0; } -/* Keep flooding enabled towards this port's CPU port as long as it serves at - * least one port in the tree that requires it. - */ -static void dsa_port_manage_cpu_flood(struct dsa_port *dp) +static void dsa_slave_manage_host_flood(struct net_device *dev) { - struct switchdev_brport_flags flags = { - .mask = BR_FLOOD | BR_MCAST_FLOOD, - }; - struct dsa_switch_tree *dst = dp->ds->dst; - struct dsa_port *cpu_dp = dp->cpu_dp; - struct dsa_port *other_dp; - int err; - - list_for_each_entry(other_dp, &dst->ports, list) { - if (!dsa_port_is_user(other_dp)) - continue; - - if (other_dp->cpu_dp != cpu_dp) - continue; - - if (other_dp->slave->flags & IFF_ALLMULTI) - flags.val |= BR_MCAST_FLOOD; - if (other_dp->slave->flags & IFF_PROMISC) - flags.val |= BR_FLOOD | BR_MCAST_FLOOD; - } - - err = dsa_port_pre_bridge_flags(dp, flags, NULL); - if (err) - return; + bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); + struct dsa_port *dp = dsa_slave_to_port(dev); + bool uc = dev->flags & IFF_PROMISC; - dsa_port_bridge_flags(cpu_dp, flags, NULL); + dsa_port_set_host_flood(dp, uc, mc); } static void dsa_slave_change_rx_flags(struct net_device *dev, int change) @@ -310,7 +286,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) - dsa_port_manage_cpu_flood(dp); + dsa_slave_manage_host_flood(dev); } static void dsa_slave_set_rx_mode(struct net_device *dev) diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 704975e5c1c2..2b56218fc57c 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -809,14 +809,12 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, ASSERT_RTNL(); - dsa_switch_for_each_cpu_port(cpu_dp, ds) { - err = ds->ops->change_tag_protocol(ds, cpu_dp->index, - tag_ops->proto); - if (err) - return err; + err = ds->ops->change_tag_protocol(ds, tag_ops->proto); + if (err) + return err; + dsa_switch_for_each_cpu_port(cpu_dp, ds) dsa_port_set_tag_protocol(cpu_dp, tag_ops); - } /* Now that changing the tag protocol can no longer fail, let's update * the remaining bits which are "duplicated for faster access", and the diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 53a6b14dc50a..89141ba5e9ee 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2573,7 +2573,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; - t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL); + t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index d747166bb291..b21238df3301 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -705,7 +705,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) static inline int esp_remove_trailer(struct sk_buff *skb) { struct xfrm_state *x = xfrm_input_state(skb); - struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen, hlen, elen; int padlen, trimlen; @@ -717,11 +716,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb) hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); elen = skb->len - hlen; - if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { - ret = xo->proto; - goto out; - } - if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2)) BUG(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1ba8ebc439f3..b65d074d9620 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2403,9 +2403,10 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); - kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); + if (psl) + kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ @@ -2507,11 +2508,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); - kfree_rcu(psl, rcu); - } else + } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); + } rcu_assign_pointer(pmc->sflist, newpsl); + if (psl) + kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1e5b53c2bb26..53f5f956d948 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -155,10 +155,14 @@ static int inet_csk_bind_conflict(const struct sock *sk, */ sk_for_each_bound(sk2, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + int bound_dev_if2; + + if (sk == sk2) + continue; + bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); + if ((!sk->sk_bound_dev_if || + !bound_dev_if2 || + sk->sk_bound_dev_if == bound_dev_if2)) { if (reuse && sk2->sk_reuse && sk2->sk_state != TCP_LISTEN) { if ((!relax || diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 581b5b2d72a5..b812eb36f0e3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1028,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; num = 0; - ilb = &hashinfo->listening_hash[i]; + ilb = &hashinfo->lhash2[i]; + spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 17440840a791..87354e20009a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -193,42 +193,6 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) return inet_lhash2_bucket(h, hash); } -static void inet_hash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - if (sk->sk_reuseport && sk->sk_family == AF_INET6) - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - else - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - ilb2->count++; - spin_unlock(&ilb2->lock); -} - -static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk) -{ - struct inet_listen_hashbucket *ilb2; - - if (!h->lhash2 || - WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node))) - return; - - ilb2 = inet_lhash2_bucket_sk(h, sk); - - spin_lock(&ilb2->lock); - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); - ilb2->count--; - spin_unlock(&ilb2->lock); -} - static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) @@ -282,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, @@ -410,13 +373,11 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (likely(INET_MATCH(sk, net, acookie, - saddr, daddr, ports, dif, sdif))) { + if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) { if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET_MATCH(sk, net, acookie, - saddr, daddr, ports, - dif, sdif))) { + if (unlikely(!inet_match(net, sk, acookie, + ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -465,8 +426,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET_MATCH(sk2, net, acookie, - saddr, daddr, ports, dif, sdif))) { + if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) @@ -504,7 +464,7 @@ not_unique: return -EADDRNOTAVAIL; } -static u32 inet_sk_port_offset(const struct sock *sk) +static u64 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -532,16 +492,14 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, if (esk->sk_hash != sk->sk_hash) continue; if (sk->sk_family == AF_INET) { - if (unlikely(INET_MATCH(esk, net, acookie, - sk->sk_daddr, - sk->sk_rcv_saddr, + if (unlikely(inet_match(net, esk, acookie, ports, dif, sdif))) { return true; } } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { - if (unlikely(INET6_MATCH(esk, net, + if (unlikely(inet6_match(net, esk, &sk->sk_v6_daddr, &sk->sk_v6_rcv_saddr, ports, dif, sdif))) { @@ -633,7 +591,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb; + struct inet_listen_hashbucket *ilb2; int err = 0; if (sk->sk_state != TCP_LISTEN) { @@ -643,25 +601,23 @@ int __inet_hash(struct sock *sk, struct sock *osk) return 0; } WARN_ON(!sk_unhashed(sk)); - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); - spin_lock(&ilb->lock); + spin_lock(&ilb2->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb); + err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); - inet_hash2(hashinfo, sk); - ilb->count++; + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: - spin_unlock(&ilb->lock); + spin_unlock(&ilb2->lock); return err; } @@ -678,23 +634,6 @@ int inet_hash(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_hash); -static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) -{ - if (sk_unhashed(sk)) - return; - - if (rcu_access_pointer(sk->sk_reuseport_cb)) - reuseport_stop_listen_sock(sk); - if (ilb) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - - inet_unhash2(hashinfo, sk); - ilb->count--; - } - __sk_nulls_del_node_init_rcu(sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -} - void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; @@ -703,20 +642,34 @@ void inet_unhash(struct sock *sk) return; if (sk->sk_state == TCP_LISTEN) { - struct inet_listen_hashbucket *ilb; + struct inet_listen_hashbucket *ilb2; - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ - spin_lock(&ilb->lock); - __inet_unhash(sk, ilb); - spin_unlock(&ilb->lock); + spin_lock(&ilb2->lock); + if (sk_unhashed(sk)) { + spin_unlock(&ilb2->lock); + return; + } + + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_stop_listen_sock(sk); + + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + spin_unlock(&ilb2->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - __inet_unhash(sk, NULL); + if (sk_unhashed(sk)) { + spin_unlock_bh(lock); + return; + } + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); } } @@ -726,15 +679,17 @@ EXPORT_SYMBOL_GPL(inet_unhash); * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this * property might be used by clever attacker. - * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, - * we use 256 instead to really give more isolation and - * privacy, this only consumes 1 KB of kernel memory. + * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though + * attacks were since demonstrated, thus we use 65536 instead to really + * give more isolation and privacy, at the expense of 256kB of kernel + * memory. */ -#define INET_TABLE_PERTURB_SHIFT 8 -static u32 table_perturb[1 << INET_TABLE_PERTURB_SHIFT]; +#define INET_TABLE_PERTURB_SHIFT 16 +#define INET_TABLE_PERTURB_SIZE (1 << INET_TABLE_PERTURB_SHIFT) +static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, - struct sock *sk, u32 port_offset, + struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)) { @@ -774,10 +729,13 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - net_get_random_once(table_perturb, sizeof(table_perturb)); - index = hash_32(port_offset, INET_TABLE_PERTURB_SHIFT); + net_get_random_once(table_perturb, + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); + index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); + + offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); + offset %= remaining; - offset = (READ_ONCE(table_perturb[index]) + port_offset) % remaining; /* In first pass we try ports of @low parity. * inet_csk_get_port() does the opposite choice. */ @@ -831,11 +789,12 @@ next_port: return -EADDRNOTAVAIL; ok: - /* If our first attempt found a candidate, skip next candidate - * in 1/16 of cases to add some noise. + /* Here we want to add a little bit of randomness to the next source + * port that will be chosen. We use a max() with a random here so that + * on low contention the randomness is maximal and on high contention + * it may be inexistent. */ - if (!i && !(prandom_u32() % 16)) - i = 2; + i = max_t(int, i, (prandom_u32() & 7) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ @@ -859,7 +818,7 @@ ok: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - u32 port_offset = 0; + u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); @@ -868,29 +827,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, } EXPORT_SYMBOL_GPL(inet_hash_connect); -void inet_hashinfo_init(struct inet_hashinfo *h) -{ - int i; - - for (i = 0; i < INET_LHTABLE_SIZE; i++) { - spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, - i + LISTENING_NULLS_BASE); - h->listening_hash[i].count = 0; - } - - h->lhash2 = NULL; -} -EXPORT_SYMBOL_GPL(inet_hashinfo_init); - static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); - h->lhash2[i].count = 0; + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, + i + LISTENING_NULLS_BASE); } } @@ -909,6 +853,12 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, low_limit, high_limit); init_hashinfo_lhash2(h); + + /* this one is used for source ports of outgoing connections */ + table_perturb = kmalloc_array(INET_TABLE_PERTURB_SIZE, + sizeof(*table_perturb), GFP_KERNEL); + if (!table_perturb) + panic("TCP: failed to alloc table_perturb"); } int inet_hashinfo2_init_mod(struct inet_hashinfo *h) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 9e0bbd026560..0ec501845cb3 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -52,7 +52,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) spin_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); @@ -111,12 +112,8 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - /* Cache inet_bhashfn(), because 'struct net' might be no longer - * available later in inet_twsk_kill(). - */ - tw->tw_bslot = inet_bhashfn(twsk_net(tw), inet->inet_num, - hashinfo->bhash_size); - bhead = &hashinfo->bhash[tw->tw_bslot]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); @@ -257,3 +254,50 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); + +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_nulls_node *node; + unsigned int slot; + + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + cond_resched(); + rcu_read_lock(); +restart: + sk_nulls_for_each_rcu(sk, node, &head->chain) { + if (sk->sk_state != TCP_TIME_WAIT) + continue; + tw = inet_twsk(sk); + if ((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count)) + continue; + + if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + refcount_read(&twsk_net(tw)->ns.count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); + local_bh_disable(); + inet_twsk_deschedule_put(tw); + local_bh_enable(); + goto restart_rcu; + } + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); + } +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 4eed5afca392..918c61fda0f3 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -80,6 +80,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct iphdr *niph; struct icmphdr *icmph; unsigned int len; + int dataoff; __wsum csum; u8 proto; @@ -99,10 +100,11 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len))) return NULL; + dataoff = ip_hdrlen(oldskb); proto = ip_hdr(oldskb)->protocol; if (!skb_csum_unnecessary(oldskb) && - nf_reject_verify_csum(proto) && + nf_reject_verify_csum(oldskb, dataoff, proto) && nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto)) return NULL; @@ -311,6 +313,7 @@ EXPORT_SYMBOL_GPL(nf_send_reset); void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) { struct iphdr *iph = ip_hdr(skb_in); + int dataoff = ip_hdrlen(skb_in); u8 proto = iph->protocol; if (iph->frag_off & htons(IP_OFFSET)) @@ -320,12 +323,13 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) nf_reject_fill_skb_dst(skb_in) < 0) return; - if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { + if (skb_csum_unnecessary(skb_in) || + !nf_reject_verify_csum(skb_in, dataoff, proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; } - if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0) + if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0) icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); } EXPORT_SYMBOL_GPL(nf_send_unreach); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 5f8cad2978b3..1a43ca73f94d 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -305,6 +305,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + u32 tb_id = RT_TABLE_LOCAL; int chk_addr_ret; if (addr_len < sizeof(*addr)) @@ -318,7 +319,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk), addr->sin_addr.s_addr, @@ -355,6 +357,14 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return -ENODEV; } } + + if (!dev && sk->sk_bound_dev_if) { + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) { + rcu_read_unlock(); + return -ENODEV; + } + } has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev, scoped); rcu_read_unlock(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ffbe2e4f8c89..356f535f3443 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1727,6 +1727,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; struct rtable *rth; + bool no_policy; u32 itag = 0; int err; @@ -1737,8 +1738,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (our) flags |= RTCF_LOCAL; + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) return -ENOBUFS; @@ -1754,6 +1759,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, #endif RT_CACHE_STAT_INC(in_slow_mc); + skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return 0; } @@ -1796,7 +1802,7 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - bool do_cache; + bool do_cache, no_policy; u32 itag = 0; /* get a working reference to the output device */ @@ -1841,6 +1847,10 @@ static int __mkroute_input(struct sk_buff *skb, } } + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) @@ -1853,8 +1863,7 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev->dev, 0, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), + rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; @@ -2229,6 +2238,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct rtable *rth; struct flowi4 fl4; bool do_cache = true; + bool no_policy; /* IP on this device is disabled. */ @@ -2347,6 +2357,10 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: + no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY); + if (no_policy) + IPCB(skb)->flags |= IPSKB_NOPOLICY; + do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); @@ -2361,7 +2375,7 @@ local_input: rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, - IN_DEV_ORCONF(in_dev, NOPOLICY), false); + no_policy, false); if (!rth) goto e_nobufs; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b44fde435bd1..028513d3e2a2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4595,7 +4595,6 @@ void __init tcp_init(void) timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); - inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c7d30a3bbd81..075e744bfb48 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -310,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) */ bytes = min_t(unsigned long, sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); return min(segs, 0x7FU); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index b0918839bee7..68178e7280ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -372,7 +372,7 @@ static void cubictcp_state(struct sock *sk, u8 new_state) * We apply another 100% factor because @rate is doubled at this point. * We cap the cushion to 1ms. */ -static u32 hystart_ack_delay(struct sock *sk) +static u32 hystart_ack_delay(const struct sock *sk) { unsigned long rate; @@ -380,7 +380,7 @@ static u32 hystart_ack_delay(struct sock *sk) if (!rate) return 0; return min_t(u64, USEC_PER_MSEC, - div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); + div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate)); } static void hystart_update(struct sock *sk, u32 delay) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 918816ec5dd4..dac2650f3863 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2101,6 +2101,7 @@ bad_packet: } discard_it: + SKB_DR_OR(drop_reason, NOT_SPECIFIED); /* Discard frame. */ kfree_skb_reason(skb, drop_reason); return 0; @@ -2283,16 +2284,15 @@ static void *listening_get_first(struct seq_file *seq) st->offset = 0; for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk; ilb2 = &tcp_hashinfo.lhash2[st->bucket]; - if (hlist_empty(&ilb2->head)) + if (hlist_nulls_empty(&ilb2->nulls_head)) continue; spin_lock(&ilb2->lock); - inet_lhash2_for_each_icsk(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each(sk, node, &ilb2->nulls_head) { if (seq_sk_match(seq, sk)) return sk; } @@ -2311,15 +2311,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk = cur; ++st->num; ++st->offset; - icsk = inet_csk(sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } @@ -2728,16 +2727,15 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; - icsk = inet_csk(start_sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); @@ -3171,6 +3169,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + list_for_each_entry(net, net_exit_list, exit_list) tcp_fastopen_ctx_destroy(net); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5f91a9536e00..b4b2284ed4a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -445,12 +445,13 @@ struct tcp_out_options { struct mptcp_out_options mptcp; }; -static void mptcp_options_write(__be32 *ptr, const struct tcp_sock *tp, +static void mptcp_options_write(struct tcphdr *th, __be32 *ptr, + struct tcp_sock *tp, struct tcp_out_options *opts) { #if IS_ENABLED(CONFIG_MPTCP) if (unlikely(OPTION_MPTCP & opts->options)) - mptcp_write_options(ptr, tp, &opts->mptcp); + mptcp_write_options(th, ptr, tp, &opts->mptcp); #endif } @@ -606,9 +607,10 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, * At least SACK_PERM as the first option is known to lead to a disaster * (but it may well be that other scenarios fail similarly). */ -static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, +static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts) { + __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ if (unlikely(OPTION_MD5 & options)) { @@ -702,7 +704,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, smc_options_write(ptr, &options); - mptcp_options_write(ptr, tp, opts); + mptcp_options_write(th, ptr, tp, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -1355,7 +1357,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, th->window = htons(min(tp->rcv_wnd, 65535U)); } - tcp_options_write((__be32 *)(th + 1), tp, &opts); + tcp_options_write(th, tp, &opts); #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ @@ -1551,7 +1553,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, * SO_SNDBUF values. * Also allow first and last skb in retransmit queue to be split. */ - limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE); if (unlikely((sk->sk_wmem_queued >> 1) > limit && tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && skb != tcp_rtx_queue_head(sk) && @@ -3591,7 +3593,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rsk_rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), NULL, &opts); + tcp_options_write(th, NULL, &opts); th->doff = (tcp_header_size >> 2); __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9d5071c79c95..aa9f2ec3dc46 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2563,8 +2563,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, struct sock *sk; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, rmt_addr, - loc_addr, ports, dif, sdif)) + if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 7dee662d7019..cde242dca530 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -335,7 +335,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) { int i; - idev->stats.ipv6 = alloc_percpu(struct ipstats_mib); + idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT); if (!idev->stats.ipv6) goto err_ip; @@ -351,7 +351,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev) if (!idev->stats.icmpv6dev) goto err_icmp; idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!idev->stats.icmpv6msgdev) goto err_icmpmsg; @@ -375,7 +375,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); - ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL); + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT); if (!ndev) return ERR_PTR(err); @@ -7060,7 +7060,7 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; - table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL); + table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT); if (!table) goto out; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 39b2327edc4e..df665d4e8f0f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -218,11 +218,11 @@ ipv4_connected: err = -EINVAL; goto out; } - sk->sk_bound_dev_if = usin->sin6_scope_id; + WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id); } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) - sk->sk_bound_dev_if = np->mcast_oif; + WRITE_ONCE(sk->sk_bound_dev_if, np->mcast_oif); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { @@ -798,7 +798,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && - (sk->sk_bound_dev_if != fl6->flowi6_oif || + (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index f2120e92caf1..36e1d0f8dd06 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -741,7 +741,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) static inline int esp_remove_trailer(struct sk_buff *skb) { struct xfrm_state *x = xfrm_input_state(skb); - struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen, hlen, elen; int padlen, trimlen; @@ -753,11 +752,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb) hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); elen = skb->len - hlen; - if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) { - ret = xo->proto; - goto out; - } - ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2); BUG_ON(ret); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 4740afecf7c6..7d53d62783b1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -71,12 +71,12 @@ begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; - if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif)) + if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; - if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) { + if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } @@ -138,12 +138,11 @@ static struct sock *inet6_lhash2_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, @@ -269,7 +268,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, if (sk2->sk_hash != hash) continue; - if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports, + if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); @@ -308,7 +307,7 @@ not_unique: return -EADDRNOTAVAIL; } -static u32 inet6_sk_port_offset(const struct sock *sk) +static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); @@ -320,7 +319,7 @@ static u32 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { - u32 port_offset = 0; + u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index c4fc03c1ac99..d12dba2dd535 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto; + int proto, nexthdr; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -87,6 +87,28 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); + nexthdr = ipv6_has_hopopt_jumbo(skb); + if (nexthdr) { + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int err; + + err = skb_cow_head(skb, 0); + if (err < 0) + return ERR_PTR(err); + + /* remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][TCP header] + */ + memmove(skb_mac_header(skb) + hophdr_len, + skb_mac_header(skb), + ETH_HLEN + sizeof(struct ipv6hdr)); + skb->data += hophdr_len; + skb->len -= hophdr_len; + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + ipv6h = (struct ipv6hdr *)skb->data; + ipv6h->nexthdr = nexthdr; + } nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; @@ -320,15 +342,43 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head, INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; - struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); + struct ipv6hdr *iph; int err = -ENOSYS; + u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } - iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); + payload_len = skb->len - nhoff - sizeof(*iph); + if (unlikely(payload_len > IPV6_MAXPLEN)) { + struct hop_jumbo_hdr *hop_jumbo; + int hoplen = sizeof(*hop_jumbo); + + /* Move network header left */ + memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), + skb->transport_header - skb->mac_header); + skb->data -= hoplen; + skb->len += hoplen; + skb->mac_header -= hoplen; + skb->network_header -= hoplen; + iph = (struct ipv6hdr *)(skb->data + nhoff); + hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); + + /* Build hop-by-hop options */ + hop_jumbo->nexthdr = iph->nexthdr; + hop_jumbo->hdrlen = 0; + hop_jumbo->tlv_type = IPV6_TLV_JUMBO; + hop_jumbo->tlv_len = 4; + hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); + + iph->nexthdr = NEXTHDR_HOP; + iph->payload_len = 0; + } else { + iph = (struct ipv6hdr *)(skb->data + nhoff); + iph->payload_len = htons(payload_len); + } nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index afa5bd4ad167..4081b12a01ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -182,7 +182,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff #endif mtu = ip6_skb_dst_mtu(skb); - if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + if (skb_is_gso(skb) && + !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && + !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); if ((skb->len > mtu && !skb_is_gso(skb)) || @@ -252,6 +254,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); + struct hop_jumbo_hdr *hop_jumbo; + int hoplen = sizeof(*hop_jumbo); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; @@ -259,7 +263,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int hlimit = -1; u32 mtu; - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); + head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; @@ -282,6 +286,20 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, &fl6->saddr); } + if (unlikely(seg_len > IPV6_MAXPLEN)) { + hop_jumbo = skb_push(skb, hoplen); + + hop_jumbo->nexthdr = proto; + hop_jumbo->hdrlen = 0; + hop_jumbo->tlv_type = IPV6_TLV_JUMBO; + hop_jumbo->tlv_len = 4; + hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); + + proto = IPPROTO_HOPOPTS; + seg_len = 0; + IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 909f937befd7..7f695c39d9a8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -460,10 +460,10 @@ int ip6_mc_source(int add, int omode, struct sock *sk, newpsl->sl_addr[i] = psl->sl_addr[i]; atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); - kfree_rcu(psl, rcu); } + rcu_assign_pointer(pmc->sflist, newpsl); + kfree_rcu(psl, rcu); psl = newpsl; - rcu_assign_pointer(pmc->sflist, psl); } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { @@ -565,12 +565,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, psl->sl_count, psl->sl_addr, 0); atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); - kfree_rcu(psl, rcu); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); } - mutex_unlock(&idev->mc_lock); rcu_assign_pointer(pmc->sflist, newpsl); + mutex_unlock(&idev->mc_lock); + kfree_rcu(psl, rcu); pmc->sfmode = gsf->gf_fmode; err = 0; done: diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index dffeaaaadcde..f61d4f18e1cf 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -31,7 +31,7 @@ static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; - if (!nf_reject_verify_csum(proto)) + if (!nf_reject_verify_csum(skb, thoff, proto)) return true; return nf_ip6_checksum(skb, hook, thoff, proto) == 0; @@ -388,7 +388,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook) if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; - if (!nf_reject_verify_csum(proto)) + if (!nf_reject_verify_csum(skb, thoff, proto)) return true; return nf_ip6_checksum(skb, hook, thoff, proto) == 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60bdec257ba7..28e47ca1e26d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1509,6 +1509,7 @@ reset: discard: if (opt_skb) __kfree_skb(opt_skb); + SKB_DR_OR(reason, NOT_SPECIFIED); kfree_skb_reason(skb, reason); return 0; csum_err: @@ -2206,9 +2207,15 @@ static void __net_exit tcpv6_net_exit(struct net *net) inet_ctl_sock_destroy(net->ipv6.tcp_sk); } +static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, AF_INET6); +} + static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, + .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3fc97d4621ac..55afd7f39c04 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -105,7 +105,7 @@ static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *daddr, unsigned short hnum, int dif, int sdif) { - int score; + int bound_dev_if, score; struct inet_sock *inet; bool dev_match; @@ -132,10 +132,11 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif); if (!dev_match) return -1; - if (sk->sk_bound_dev_if) + if (bound_dev_if) score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) @@ -789,7 +790,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport && inet->inet_dport != rmt_port) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || - !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif) || + !udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) || (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) && !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))) return false; @@ -1043,7 +1044,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && - INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif)) + inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; @@ -1433,7 +1434,7 @@ do_udp_sendmsg: } if (!fl6->flowi6_oif) - fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; diff --git a/net/key/af_key.c b/net/key/af_key.c index 175a162eec58..11e1a3a3e442 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2826,8 +2826,10 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, - BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); + err = pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); + if (err) + return err; memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); @@ -2898,7 +2900,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t) break; if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg)) + if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); @@ -2916,7 +2918,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!ealg->pfkey_supported) continue; - if (!(ealg_tmpl_set(t, ealg))) + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { @@ -2927,7 +2929,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t) if (!aalg->pfkey_supported) continue; - if (aalg_tmpl_set(t, aalg)) + if (aalg_tmpl_set(t, aalg) && aalg->available) sz += sizeof(struct sadb_comb); } } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 6af09e188e52..4db5a554bdbd 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -50,11 +50,13 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, sk_for_each_bound(sk, &l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); + int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && dif && bound_dev_if != dif) continue; if (inet->inet_rcv_saddr && laddr && diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 217c7192691e..c6ff8bf9b55f 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -62,11 +62,13 @@ static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); + int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && dif && bound_dev_if != dif) continue; if (sk_laddr && !ipv6_addr_any(sk_laddr) && @@ -445,7 +447,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_conn_id = lsk->conn_id; } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) - lsa->l2tp_scope_id = sk->sk_bound_dev_if; + lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if); return sizeof(*lsa); } @@ -560,7 +562,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if (fl6.flowi6_oif == 0) - fl6.flowi6_oif = sk->sk_bound_dev_if; + fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if); if (msg->msg_controllen) { opt = &opt_space; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ae5a1e34e7d2..58d48dcae030 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3649,6 +3649,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, cbss->transmitted_bss->bssid); bss_conf->bssid_indicator = cbss->max_bssid_indicator; bss_conf->bssid_index = cbss->bssid_index; + } else { + bss_conf->nontransmitted = false; + memset(bss_conf->transmitter_bssid, 0, + sizeof(bss_conf->transmitter_bssid)); + bss_conf->bssid_indicator = 0; + bss_conf->bssid_index = 0; } /* diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 959a36fd658b..3c08ae04ddbc 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1405,8 +1405,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, goto dont_reorder; /* not part of a BA session */ - if (ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_BLOCKACK && - ack_policy != IEEE80211_QOS_CTL_ACK_POLICY_NORMAL) + if (ack_policy == IEEE80211_QOS_CTL_ACK_POLICY_NOACK) goto dont_reorder; /* new, potentially un-ordered, ampdu frame - process it */ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index e54daceac58b..cb7f53f6ab22 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o + mib.o pm_netlink.o sockopt.o pm_userspace.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index d93a8c9996fd..0dac2863c6e1 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -56,6 +56,10 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), + SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), + SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), + SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), + SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_SENTINEL }; diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 529d07af9e14..2be3596374f4 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -49,6 +49,12 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ + MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDSHARED, /* Subflow rcv wnd is overridden by msk's one */ + MPTCP_MIB_RCVWNDCONFLICTUPDATE, /* subflow rcv wnd is overridden by msk's one due to + * conflict with another subflow while updating msk rcv wnd + */ + MPTCP_MIB_RCVWNDCONFLICT, /* Conflict with while updating msk rcv wnd */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index dbb6d876a203..7f9a71780437 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -83,13 +83,13 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba struct net *net = sock_net(skb->sk); int i; - for (i = diag_ctx->l_slot; i < INET_LHTABLE_SIZE; i++) { + for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk; int num = 0; - ilb = &tcp_hashinfo.listening_hash[i]; + ilb = &tcp_hashinfo.lhash2[i]; rcu_read_lock(); spin_lock(&ilb->lock); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index e05d9458a025..be3b918a6d15 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -107,7 +107,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } @@ -221,7 +221,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; - mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); + mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } @@ -1224,23 +1224,65 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) return true; } -static void mptcp_set_rwin(const struct tcp_sock *tp) +static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; - const struct mptcp_subflow_context *subflow; + struct mptcp_subflow_context *subflow; + u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; - u64 ack_seq; + u32 new_win; + u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); - ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd; + ack_seq = READ_ONCE(msk->ack_seq); + rcv_wnd_new = ack_seq + tp->rcv_wnd; + + rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); + if (after64(rcv_wnd_new, rcv_wnd_old)) { + u64 rcv_wnd; + + for (;;) { + rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); + + if (rcv_wnd == rcv_wnd_old) + break; + if (before64(rcv_wnd_new, rcv_wnd)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); + goto raise_win; + } + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); + rcv_wnd_old = rcv_wnd; + } + return; + } + + if (rcv_wnd_new != rcv_wnd_old) { +raise_win: + win = rcv_wnd_old - ack_seq; + tp->rcv_wnd = min_t(u64, win, U32_MAX); + new_win = tp->rcv_wnd; - if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent))) - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (unlikely(th->syn)) + new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; + if (!tp->rx_opt.rcv_wscale && + sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + th->window = htons(new_win); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); + } } -u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; @@ -1256,16 +1298,26 @@ u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); - return (__force u16)csum_fold(csum); + return csum_fold(csum); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } -void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, +static void put_len_csum(u16 len, __sum16 csum, void *data) +{ + __sum16 *sumptr = data + 2; + __be16 *ptr = data; + + put_unaligned_be16(len, ptr); + + put_unaligned(csum, sumptr); +} + +void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; @@ -1343,9 +1395,9 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ - put_unaligned_be32(mpext->data_len << 16 | - (mpext->data_len ? mptcp_make_csum(mpext) : 0), - ptr); + put_len_csum(mpext->data_len, + (mpext->data_len ? mptcp_make_csum(mpext) : 0), + ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); @@ -1396,11 +1448,12 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->data_len << 16 | - __mptcp_make_csum(opts->data_seq, - opts->subflow_seq, - opts->data_len, - ~csum_unfold(opts->csum)), ptr); + put_len_csum(opts->data_len, + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + ~csum_unfold(opts->csum)), + ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); @@ -1554,7 +1607,7 @@ mp_capable_done: } if (tp) - mptcp_set_rwin(tp); + mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 5d6832c4d9f2..59fdab2d0c27 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -181,15 +181,14 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk, struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; - update_subflows = (ssk->sk_state == TCP_CLOSE) && - (subflow->request_join || subflow->mp_join) && + update_subflows = (subflow->request_join || subflow->mp_join) && mptcp_pm_is_kernel(msk); if (!READ_ONCE(pm->work_pending) && !update_subflows) return; spin_lock_bh(&pm->lock); if (update_subflows) - pm->subflows--; + __mptcp_pm_close_subflow(msk); /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. @@ -469,6 +468,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) { spin_lock_init(&msk->pm.lock); INIT_LIST_HEAD(&msk->pm.anno_list); + INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list); mptcp_pm_data_reset(msk); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 98b205c2c101..e099f2a12504 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -22,14 +22,6 @@ static struct genl_family mptcp_genl_family; static int pm_nl_pernet_id; -struct mptcp_pm_addr_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 flags; - int ifindex; - struct socket *lsk; -}; - struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; @@ -66,8 +58,8 @@ pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) return pm_nl_get_pernet(sock_net((struct sock *)msk)); } -static bool addresses_equal(const struct mptcp_addr_info *a, - const struct mptcp_addr_info *b, bool use_port) +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; @@ -131,7 +123,7 @@ static bool lookup_subflow_by_saddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); local_address(skc, &cur); - if (addresses_equal(&cur, saddr, saddr->port)) + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } @@ -149,7 +141,7 @@ static bool lookup_subflow_by_daddr(const struct list_head *list, skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); remote_address(skc, &cur); - if (addresses_equal(&cur, daddr, daddr->port)) + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } @@ -269,7 +261,7 @@ mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, addr, true)) + if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } @@ -286,7 +278,7 @@ bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (addresses_equal(&entry->addr, &saddr, true)) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } @@ -360,8 +352,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, return entry; } -static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_pm_addr_entry *entry) +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; @@ -421,7 +413,7 @@ static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int i; for (i = 0; i < nr; i++) { - if (addresses_equal(&addrs[i], addr, addr->port)) + if (mptcp_addresses_equal(&addrs[i], addr, addr->port)) return true; } @@ -457,7 +449,7 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); - if (deny_id0 && addresses_equal(&addrs[i], &remote, false)) + if (deny_id0 && mptcp_addresses_equal(&addrs[i], &remote, false)) continue; if (!lookup_address_in_vec(addrs, i, &addrs[i]) && @@ -490,7 +482,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { - if ((!lookup_by_id && addresses_equal(&entry->addr, info, true)) || + if ((!lookup_by_id && mptcp_addresses_equal(&entry->addr, info, true)) || (lookup_by_id && entry->addr.id == info->id)) return entry; } @@ -505,7 +497,7 @@ lookup_id_by_addr(const struct pm_nl_pernet *pernet, const struct mptcp_addr_inf rcu_read_lock(); list_for_each_entry(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, addr, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, addr, entry->addr.port)) { ret = entry->addr.id; break; } @@ -739,7 +731,7 @@ static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info local; local_address((struct sock_common *)ssk, &local); - if (!addresses_equal(&local, addr, addr->port)) + if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; if (subflow->backup != bkup) @@ -909,9 +901,9 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, * singled addresses */ list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (addresses_equal(&cur->addr, &entry->addr, - address_use_port(entry) && - address_use_port(cur))) { + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + address_use_port(entry) && + address_use_port(cur))) { /* allow replacing the exiting endpoint only if such * endpoint is an implicit one and the user-space * did not provide an endpoint id @@ -1038,14 +1030,17 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) */ local_address((struct sock_common *)msk, &msk_local); local_address((struct sock_common *)skc, &skc_local); - if (addresses_equal(&msk_local, &skc_local, false)) + if (mptcp_addresses_equal(&msk_local, &skc_local, false)) return 0; + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_local_id(msk, &skc_local); + pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { + if (mptcp_addresses_equal(&entry->addr, &skc_local, entry->addr.port)) { ret = entry->addr.id; break; } @@ -1099,6 +1094,10 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { NLA_POLICY_NESTED(mptcp_pm_addr_policy), [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ATTR_ADDR_REMOTE] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -1147,11 +1146,12 @@ static int mptcp_pm_family_to_addr(int family) return MPTCP_PM_ADDR_ATTR_ADDR4; } -static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, - bool require_family, - struct mptcp_pm_addr_entry *entry) +static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], + const struct nlattr *attr, + struct genl_info *info, + struct mptcp_addr_info *addr, + bool require_family) { - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err, addr_addr; if (!attr) { @@ -1165,27 +1165,29 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, if (err) return err; - memset(entry, 0, sizeof(*entry)); + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) - goto skip_family; + return err; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } - entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); - if (entry->addr.family != AF_INET + addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) - && entry->addr.family != AF_INET6 + && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } - addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); @@ -1193,22 +1195,47 @@ static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + if (addr->family == AF_INET6) + addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif - entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); + + if (tb[MPTCP_PM_ADDR_ATTR_PORT]) + addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); + + return err; +} + +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + memset(addr, 0, sizeof(*addr)); + + return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); +} + +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err; + + memset(entry, 0, sizeof(*entry)); + + err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); + if (err) + return err; -skip_family: if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } - if (tb[MPTCP_PM_ADDR_ATTR_ID]) - entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); - if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); @@ -1256,7 +1283,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry addr, *entry; int ret; - ret = mptcp_pm_parse_addr(attr, info, true, &addr); + ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; @@ -1305,15 +1332,23 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return 0; } -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry; + struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); *flags = 0; *ifindex = 0; if (id) { + if (mptcp_pm_is_userspace(msk)) + return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, + id, + flags, + ifindex); + rcu_read_lock(); entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); if (entry) { @@ -1416,7 +1451,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, goto next; local_address((struct sock_common *)msk, &msk_local); - if (!addresses_equal(&msk_local, addr, addr->port)) + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); @@ -1442,7 +1477,7 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) unsigned int addr_max; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1482,8 +1517,8 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) return ret; } -static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; @@ -1616,7 +1651,7 @@ static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) void *reply; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1827,7 +1862,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) u8 bkup = 0, lookup_by_id = 0; int ret; - ret = mptcp_pm_parse_addr(attr, info, false, &addr); + ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -2177,6 +2212,26 @@ static const struct genl_small_ops mptcp_pm_ops[] = { .doit = mptcp_nl_cmd_set_flags, .flags = GENL_ADMIN_PERM, }, + { + .cmd = MPTCP_PM_CMD_ANNOUNCE, + .doit = mptcp_nl_cmd_announce, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_REMOVE, + .doit = mptcp_nl_cmd_remove, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, + .doit = mptcp_nl_cmd_sf_create, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, + .doit = mptcp_nl_cmd_sf_destroy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family mptcp_genl_family __ro_after_init = { diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c new file mode 100644 index 000000000000..f56378e4f597 --- /dev/null +++ b/net/mptcp/pm_userspace.c @@ -0,0 +1,429 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, Intel Corporation. + */ + +#include "protocol.h" + +void mptcp_free_local_addr_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + if (!mptcp_pm_is_userspace(msk)) + return; + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sock_kfree_s(sk, entry, sizeof(*entry)); + } +} + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) +{ + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_pm_addr_entry *match = NULL; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *e; + bool addr_match = false; + bool id_match = false; + int ret = -EINVAL; + + bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); + if (addr_match && entry->addr.id == 0) + entry->addr.id = e->addr.id; + id_match = (e->addr.id == entry->addr.id); + if (addr_match && id_match) { + match = e; + break; + } else if (addr_match || id_match) { + break; + } + __set_bit(e->addr.id, id_bitmap); + } + + if (!match && !addr_match && !id_match) { + /* Memory for the entry is allocated from the + * sock option buffer. + */ + e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + if (!e) { + spin_unlock_bh(&msk->pm.lock); + return -ENOMEM; + } + + *e = *entry; + if (!e->addr.id) + e->addr.id = find_next_zero_bit(id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + 1); + list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); + ret = e->addr.id; + } else if (match) { + ret = entry->addr.id; + } + + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex) +{ + struct mptcp_pm_addr_entry *entry, *match = NULL; + + *flags = 0; + *ifindex = 0; + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (id == entry->addr.id) { + match = entry; + break; + } + } + spin_unlock_bh(&msk->pm.lock); + if (match) { + *flags = match->flags; + *ifindex = match->ifindex; + } + + return 0; +} + +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_addr_info *skc) +{ + struct mptcp_pm_addr_entry new_entry; + __be16 msk_sport = ((struct inet_sock *) + inet_sk((struct sock *)msk))->inet_sport; + + memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); + new_entry.addr = *skc; + new_entry.addr.id = 0; + new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + + if (new_entry.addr.port == msk_sport) + new_entry.addr.port = 0; + + return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry); +} + +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_pm_addr_entry addr_val; + struct mptcp_sock *msk; + int err = -EINVAL; + u32 token_val; + + if (!addr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto announce_err; + } + + err = mptcp_pm_parse_entry(addr, info, true, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "error parsing local address"); + goto announce_err; + } + + if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + goto announce_err; + } + + err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val); + if (err < 0) { + GENL_SET_ERR_MSG(info, "did not match address and id"); + goto announce_err; + } + + lock_sock((struct sock *)msk); + spin_lock_bh(&msk->pm.lock); + + if (mptcp_pm_alloc_anno_list(msk, &addr_val)) { + mptcp_pm_announce_addr(msk, &addr_val.addr, false); + mptcp_pm_nl_addr_send_ack(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock((struct sock *)msk); + + err = 0; + announce_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; + struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *entry; + struct mptcp_sock *msk; + LIST_HEAD(free_list); + int err = -EINVAL; + u32 token_val; + u8 id_val; + + if (!id || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + id_val = nla_get_u8(id); + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto remove_err; + } + + lock_sock((struct sock *)msk); + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id_val) { + match = entry; + break; + } + } + + if (!match) { + GENL_SET_ERR_MSG(info, "address with specified id not found"); + release_sock((struct sock *)msk); + goto remove_err; + } + + list_move(&match->list, &free_list); + + mptcp_pm_remove_addrs_and_subflows(msk, &free_list); + + release_sock((struct sock *)msk); + + list_for_each_entry_safe(match, entry, &free_list, list) { + sock_kfree_s((struct sock *)msk, match, sizeof(*match)); + } + + err = 0; + remove_err: + sock_put((struct sock *)msk); + return err; +} + +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_r; + struct mptcp_addr_info addr_l; + struct mptcp_sock *msk; + int err = -EINVAL; + struct sock *sk; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto create_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto create_err; + } + + if (addr_l.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + goto create_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto create_err; + } + + sk = &msk->sk.icsk_inet.sk; + lock_sock(sk); + + err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); + + release_sock(sk); + + create_err: + sock_put((struct sock *)msk); + return err; +} + +static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, + const struct mptcp_addr_info *local, + const struct mptcp_addr_info *remote) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + struct mptcp_subflow_context *subflow; + struct sock *found = NULL; + + if (local->family != remote->family) + return NULL; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + const struct inet_sock *issk; + struct sock *ssk; + + ssk = mptcp_subflow_tcp_sock(subflow); + + if (local->family != ssk->sk_family) + continue; + + issk = inet_sk(ssk); + + switch (ssk->sk_family) { + case AF_INET: + if (issk->inet_saddr != local->addr.s_addr || + issk->inet_daddr != remote->addr.s_addr) + continue; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *pinfo = inet6_sk(ssk); + + if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) + continue; + break; + } +#endif + default: + continue; + } + + if (issk->inet_sport == local->port && + issk->inet_dport == remote->port) { + found = ssk; + goto found; + } + } + +found: + release_sock(sk); + + return found; +} + +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info addr_l; + struct mptcp_addr_info addr_r; + struct mptcp_sock *msk; + struct sock *sk, *ssk; + int err = -EINVAL; + u32 token_val; + + if (!laddr || !raddr || !token) { + GENL_SET_ERR_MSG(info, "missing required inputs"); + return err; + } + + token_val = nla_get_u32(token); + + msk = mptcp_token_get_sock(genl_info_net(info), token_val); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return err; + } + + if (!mptcp_pm_is_userspace(msk)) { + GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(laddr, info, &addr_l); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + goto destroy_err; + } + + err = mptcp_pm_parse_addr(raddr, info, &addr_r); + if (err < 0) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + goto destroy_err; + } + + if (addr_l.family != addr_r.family) { + GENL_SET_ERR_MSG(info, "address families do not match"); + goto destroy_err; + } + + if (!addr_l.port || !addr_r.port) { + GENL_SET_ERR_MSG(info, "missing local or remote port"); + goto destroy_err; + } + + sk = &msk->sk.icsk_inet.sk; + ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); + if (ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, subflow); + err = 0; + } else { + err = -ESRCH; + } + + destroy_err: + sock_put((struct sock *)msk); + return err; +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5d529143ad77..921d67174e49 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -216,7 +216,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) seq = MPTCP_SKB_CB(skb)->map_seq; end_seq = MPTCP_SKB_CB(skb)->end_seq; - max_seq = READ_ONCE(msk->rcv_wnd_sent); + max_seq = atomic64_read(&msk->rcv_wnd_sent); pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq, RB_EMPTY_ROOT(&msk->out_of_order_queue)); @@ -225,7 +225,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb) mptcp_drop(sk, skb); pr_debug("oow by %lld, rcv_wnd_sent %llu\n", (unsigned long long)end_seq - (unsigned long)max_seq, - (unsigned long long)msk->rcv_wnd_sent); + (unsigned long long)atomic64_read(&msk->rcv_wnd_sent)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_NODSSWINDOW); return; } @@ -1141,18 +1141,21 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, - int avail_size) +static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, + u64 data_seq, int avail_size) { u64 window_end = mptcp_wnd_end(msk); + u64 mptcp_snd_wnd; if (__mptcp_check_fallback(msk)) return avail_size; - if (!before64(data_seq + avail_size, window_end)) { - u64 allowed_size = window_end - data_seq; + mptcp_snd_wnd = window_end - data_seq; + avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); - return min_t(unsigned int, allowed_size, avail_size); + if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { + tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_SNDWNDSHARED); } return avail_size; @@ -1305,7 +1308,7 @@ alloc_skb: } /* Zero window and all data acked? Probe. */ - copy = mptcp_check_allowed_size(msk, data_seq, copy); + copy = mptcp_check_allowed_size(msk, ssk, data_seq, copy); if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); @@ -1498,11 +1501,16 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) * to check that subflow has a non empty cwin. */ ssk = send_info[SSK_MODE_ACTIVE].ssk; - if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd) + if (!ssk || !sk_stream_memory_free(ssk)) return NULL; - burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd); + burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); + if (!burst) { + msk->last_snd = NULL; + return ssk; + } + subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, @@ -1605,10 +1613,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) out: /* ensure the rtx timer is running */ - mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - mptcp_data_unlock(sk); if (copied) __mptcp_check_send_data_fin(sk); } @@ -2521,10 +2527,8 @@ static void __mptcp_retrans(struct sock *sk) reset_timer: mptcp_check_and_set_pending(sk); - mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - mptcp_data_unlock(sk); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) @@ -2703,10 +2707,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); tcp_send_ack(ssk); - mptcp_data_lock(sk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - mptcp_data_unlock(sk); } break; } @@ -2807,10 +2809,8 @@ static void __mptcp_destroy_sock(struct sock *sk) /* join list will be eventually flushed (with rst) at sock lock release time*/ list_splice_init(&msk->conn_list, &conn_list); - mptcp_data_lock(sk); mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); - mptcp_data_unlock(sk); msk->pm.status = 0; /* clears msk->subflow, allowing the following loop to close @@ -2872,9 +2872,7 @@ cleanup: __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - mptcp_data_lock(sk); sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); - mptcp_data_unlock(sk); } release_sock(sk); if (do_cancel_work) @@ -2919,10 +2917,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) __mptcp_close_ssk(sk, ssk, subflow, MPTCP_CF_FASTCLOSE); } - mptcp_data_lock(sk); mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); - mptcp_data_unlock(sk); if (mptcp_sk(sk)->token) mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); @@ -2996,7 +2992,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); ack_seq++; WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); } sock_reset_flag(nsk, SOCK_RCU_FREE); @@ -3097,6 +3093,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk) msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); + mptcp_free_local_addr_list(msk); } static void mptcp_destroy(struct sock *sk) @@ -3288,9 +3285,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); + atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); @@ -3785,8 +3782,8 @@ void __init mptcp_proto_init(void) for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_tx_napi_add(&mptcp_napi_dev, &delegated->napi, mptcp_napi_poll, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + mptcp_napi_poll); napi_enable(&delegated->napi); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 187c932deef0..08b8015cb69f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -11,6 +11,7 @@ #include <net/tcp.h> #include <net/inet_connection_sock.h> #include <uapi/linux/mptcp.h> +#include <net/genetlink.h> #define MPTCP_SUPPORTED_VERSION 1 @@ -208,6 +209,7 @@ struct mptcp_pm_data { struct mptcp_addr_info local; struct mptcp_addr_info remote; struct list_head anno_list; + struct list_head userspace_pm_local_addr_list; spinlock_t lock; /*protects the whole PM data */ @@ -228,6 +230,14 @@ struct mptcp_pm_data { struct mptcp_rm_list rm_list_rx; }; +struct mptcp_pm_addr_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 flags; + int ifindex; + struct socket *lsk; +}; + struct mptcp_data_frag { struct list_head list; u64 data_seq; @@ -247,7 +257,7 @@ struct mptcp_sock { u64 write_seq; u64 snd_nxt; u64 ack_seq; - u64 rcv_wnd_sent; + atomic64_t rcv_wnd_sent; u64 rcv_data_fin_seq; int rmem_fwd_alloc; struct sock *last_snd; @@ -456,7 +466,8 @@ struct mptcp_subflow_context { can_ack : 1, /* only after processing the remote a key */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ - local_id_valid : 1; /* local_id is correctly initialized */ + local_id_valid : 1, /* local_id is correctly initialized */ + valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; bool mp_fail_response_expect; u32 remote_nonce; @@ -601,6 +612,9 @@ void mptcp_subflow_reset(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port); + /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *remote); @@ -738,11 +752,16 @@ void mptcp_token_destroy(struct mptcp_sock *msk); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); -u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); +__sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + struct mptcp_addr_info *addr); +int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); @@ -763,6 +782,8 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_pm_addr_entry *entry); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -771,14 +792,28 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, +int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, u8 *flags, int *ifindex); +int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, + unsigned int id, + u8 *flags, int *ifindex); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); +void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list); + +int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); +void mptcp_free_local_addr_list(struct mptcp_sock *msk); +int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info); +int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); @@ -847,6 +882,7 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); @@ -858,6 +894,20 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); +/* called under PM lock */ +static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + WRITE_ONCE(msk->pm.accept_subflow, true); +} + +static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) +{ + spin_lock_bh(&msk->pm.lock); + __mptcp_pm_close_subflow(msk); + spin_unlock_bh(&msk->pm.lock); +} + void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 826b0c1dae98..423d3826ca1e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -756,6 +756,18 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval, + unsigned int optlen) +{ + struct socket *listener; + + listener = __mptcp_nmpc_socket(msk); + if (!listener) + return 0; /* TCP_DEFER_ACCEPT does not fail */ + + return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); +} + static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, sockptr_t optval, unsigned int optlen) { @@ -782,6 +794,8 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return mptcp_setsockopt_sol_tcp_cork(msk, optval, optlen); case TCP_NODELAY: return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); + case TCP_DEFER_ACCEPT: + return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); } return -EOPNOTSUPP; @@ -1142,6 +1156,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_CONGESTION: case TCP_INFO: case TCP_CC_INFO: + case TCP_DEFER_ACCEPT: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a0e7af33fb26..aecd98ac5dfe 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -891,7 +891,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); u32 offset, seq, delta; - u16 csum; + __sum16 csum; int len; if (!csum_reqd) @@ -958,11 +958,14 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); + if (subflow->mp_join || subflow->valid_csum_seen) { + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); + } return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; } + subflow->valid_csum_seen = 1; return MAPPING_OK; } @@ -1153,6 +1156,18 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss } } +static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) +{ + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + if (subflow->mp_join) + return false; + else if (READ_ONCE(msk->csum_enabled)) + return !subflow->valid_csum_seen; + else + return !subflow->fully_established; +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1238,7 +1253,7 @@ fallback: return true; } - if ((subflow->mp_join || subflow->fully_established) && subflow->map_data_len) { + if (!subflow_can_fallback(subflow) && subflow->map_data_len) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1444,20 +1459,20 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, struct sockaddr_storage addr; int remote_id = remote->id; int local_id = loc->id; + int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; int ifindex; u8 flags; - int err; if (!mptcp_is_fully_established(sk)) - return -ENOTCONN; + goto err_out; err = mptcp_subflow_create_socket(sk, &sf); if (err) - return err; + goto err_out; ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); @@ -1468,7 +1483,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, if (local_id) subflow_set_local_id(subflow, local_id); - mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, + mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; @@ -1515,6 +1530,12 @@ failed_unlink: failed: subflow->disposable = 1; sock_release(sf); + +err_out: + /* we account subflows before the creation, and this failures will not + * be caught by sk_state_change() + */ + mptcp_pm_close_subflow(msk); return err; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0164e5f522e8..082a2fd8d85b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -525,50 +525,6 @@ clean_from_lists(struct nf_conn *ct) nf_ct_remove_expectations(ct); } -/* must be called with local_bh_disable */ -static void nf_ct_add_to_dying_list(struct nf_conn *ct) -{ - struct ct_pcpu *pcpu; - - /* add this conntrack to the (per cpu) dying list */ - ct->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->dying); - spin_unlock(&pcpu->lock); -} - -/* must be called with local_bh_disable */ -static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) -{ - struct ct_pcpu *pcpu; - - /* add this conntrack to the (per cpu) unconfirmed list */ - ct->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->unconfirmed); - spin_unlock(&pcpu->lock); -} - -/* must be called with local_bh_disable */ -static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) -{ - struct ct_pcpu *pcpu; - - /* We overload first tuple to link into unconfirmed or dying list.*/ - pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); - - spin_lock(&pcpu->lock); - BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); - hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - spin_unlock(&pcpu->lock); -} - #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) /* Released via nf_ct_destroy() */ @@ -640,7 +596,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) destroy_gre_conntrack(ct); - local_bh_disable(); /* Expectations will have been removed in clean_from_lists, * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, @@ -648,10 +603,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct) */ nf_ct_remove_expectations(ct); - nf_ct_del_from_dying_or_unconfirmed_list(ct); - - local_bh_enable(); - if (ct->master) nf_ct_put(ct->master); @@ -660,15 +611,12 @@ void nf_ct_destroy(struct nf_conntrack *nfct) } EXPORT_SYMBOL(nf_ct_destroy); -static void nf_ct_delete_from_lists(struct nf_conn *ct) +static void __nf_ct_delete_from_lists(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; - nf_ct_helper_destroy(ct); - - local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, @@ -681,12 +629,30 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) clean_from_lists(ct); nf_conntrack_double_unlock(hash, reply_hash); +} + +static void nf_ct_delete_from_lists(struct nf_conn *ct) +{ + nf_ct_helper_destroy(ct); + local_bh_disable(); - nf_ct_add_to_dying_list(ct); + __nf_ct_delete_from_lists(ct); local_bh_enable(); } +static void nf_ct_add_to_ecache_list(struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); + + spin_lock(&cnet->ecache.dying_lock); + hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &cnet->ecache.dying_list); + spin_unlock(&cnet->ecache.dying_lock); +#endif +} + bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; @@ -709,7 +675,12 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) /* destroy event was not delivered. nf_ct_put will * be done by event cache worker on redelivery. */ - nf_ct_delete_from_lists(ct); + nf_ct_helper_destroy(ct); + local_bh_disable(); + __nf_ct_delete_from_lists(ct); + nf_ct_add_to_ecache_list(ct); + local_bh_enable(); + nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } @@ -870,6 +841,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &nf_conntrack_hash[reply_hash]); } +static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) +{ + /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions + * may contain stale pointers to e.g. helper that has been removed. + * + * The helper can't clear this because the nf_conn object isn't in + * any hash and synchronize_rcu() isn't enough because associated skb + * might sit in a queue. + */ + return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); +} + +static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) +{ + if (!ext) + return true; + + if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) + return false; + + /* inserted into conntrack table, nf_ct_iterate_cleanup() + * will find it. Disable nf_ct_ext_find() id check. + */ + WRITE_ONCE(ext->gen_id, 0); + return true; +} + int nf_conntrack_hash_check_insert(struct nf_conn *ct) { @@ -885,6 +883,11 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) zone = nf_ct_zone(ct); + if (!nf_ct_ext_valid_pre(ct->ext)) { + NF_CT_STAT_INC(net, insert_failed); + return -ETIMEDOUT; + } + local_bh_disable(); do { sequence = read_seqcount_begin(&nf_conntrack_generation); @@ -925,6 +928,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); local_bh_enable(); + + if (!nf_ct_ext_valid_post(ct->ext)) { + nf_ct_kill(ct); + NF_CT_STAT_INC(net, drop); + return -ETIMEDOUT; + } + return 0; chaintoolong: NF_CT_STAT_INC(net, chaintoolong); @@ -972,7 +982,6 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) struct nf_conn_tstamp *tstamp; refcount_inc(&ct->ct_general.use); - ct->status |= IPS_CONFIRMED; /* set conntrack timestamp, if enabled. */ tstamp = nf_conn_tstamp_find(ct); @@ -1001,7 +1010,6 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, nf_conntrack_get(&ct->ct_general); nf_ct_acct_merge(ct, ctinfo, loser_ct); - nf_ct_add_to_dying_list(loser_ct); nf_ct_put(loser_ct); nf_ct_set(skb, ct, ctinfo); @@ -1134,7 +1142,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, return ret; drop: - nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); NF_CT_STAT_INC(net, insert_failed); return NF_DROP; @@ -1195,16 +1202,20 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_DROP; } + if (!nf_ct_ext_valid_pre(ct->ext)) { + NF_CT_STAT_INC(net, insert_failed); + goto dying; + } + pr_debug("Confirming conntrack %p\n", ct); /* We have to check the DYING flag after unlink to prevent * a race against nf_ct_get_next_corpse() possibly called from * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ - nf_ct_del_from_dying_or_unconfirmed_list(ct); + ct->status |= IPS_CONFIRMED; if (unlikely(nf_ct_is_dying(ct))) { - nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, insert_failed); goto dying; } @@ -1228,7 +1239,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto out; if (chainlen++ > max_chainlen) { chaintoolong: - nf_ct_add_to_dying_list(ct); NF_CT_STAT_INC(net, chaintoolong); NF_CT_STAT_INC(net, insert_failed); ret = NF_DROP; @@ -1252,6 +1262,16 @@ chaintoolong: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); + /* ext area is still valid (rcu read lock is held, + * but will go out of scope soon, we need to remove + * this conntrack again. + */ + if (!nf_ct_ext_valid_post(ct->ext)) { + nf_ct_kill(ct); + NF_CT_STAT_INC(net, drop); + return NF_DROP; + } + help = nfct_help(ct); if (help && help->helper) nf_conntrack_event_cache(IPCT_HELPER, ct); @@ -1678,7 +1698,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; +#ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_ecache *ecache; +#endif struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; @@ -1711,15 +1733,21 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_labels_ext_add(ct); +#ifdef CONFIG_NF_CONNTRACK_EVENTS ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; - nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, - ecache ? ecache->expmask : 0, - GFP_ATOMIC); - local_bh_disable(); + if ((ecache || net->ct.sysctl_events) && + !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC)) { + nf_conntrack_free(ct); + return ERR_PTR(-ENOMEM); + } +#endif + cnet = nf_ct_pernet(net); if (cnet->expect_count) { - spin_lock(&nf_conntrack_expect_lock); + spin_lock_bh(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { pr_debug("expectation arrives ct=%p exp=%p\n", @@ -1742,16 +1770,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, #endif NF_CT_STAT_INC(net, expect_new); } - spin_unlock(&nf_conntrack_expect_lock); + spin_unlock_bh(&nf_conntrack_expect_lock); } if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - /* Now it is inserted into the unconfirmed list, set refcount to 1. */ + /* Now it is going to be associated with an sk_buff, set refcount to 1. */ refcount_set(&ct->ct_general.use, 1); - nf_ct_add_to_unconfirmed_list(ct); - - local_bh_enable(); if (exp) { if (exp->expectfn) @@ -2319,7 +2344,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, /* Bring out ya dead! */ static struct nf_conn * get_next_corpse(int (*iter)(struct nf_conn *i, void *data), - void *data, unsigned int *bucket) + const struct nf_ct_iter_data *iter_data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -2350,7 +2375,12 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), * tuple while iterating. */ ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + + if (iter_data->net && + !net_eq(iter_data->net, nf_ct_net(ct))) + continue; + + if (iter(ct, iter_data->data)) goto found; } spin_unlock(lockp); @@ -2367,7 +2397,7 @@ found: } static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report) + const struct nf_ct_iter_data *iter_data) { unsigned int bucket = 0; struct nf_conn *ct; @@ -2375,91 +2405,28 @@ static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), might_sleep(); mutex_lock(&nf_conntrack_mutex); - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { /* Time to push up daises... */ - nf_ct_delete(ct, portid, report); + nf_ct_delete(ct, iter_data->portid, iter_data->report); nf_ct_put(ct); cond_resched(); } mutex_unlock(&nf_conntrack_mutex); } -struct iter_data { - int (*iter)(struct nf_conn *i, void *data); - void *data; - struct net *net; -}; - -static int iter_net_only(struct nf_conn *i, void *data) -{ - struct iter_data *d = data; - - if (!net_eq(d->net, nf_ct_net(i))) - return 0; - - return d->iter(i, d->data); -} - -static void -__nf_ct_unconfirmed_destroy(struct net *net) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct nf_conntrack_tuple_hash *h; - struct hlist_nulls_node *n; - struct ct_pcpu *pcpu; - - pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_bh(&pcpu->lock); - hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { - struct nf_conn *ct; - - ct = nf_ct_tuplehash_to_ctrack(h); - - /* we cannot call iter() on unconfirmed list, the - * owning cpu can reallocate ct->ext at any time. - */ - set_bit(IPS_DYING_BIT, &ct->status); - } - spin_unlock_bh(&pcpu->lock); - cond_resched(); - } -} - -void nf_ct_unconfirmed_destroy(struct net *net) +void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), + const struct nf_ct_iter_data *iter_data) { + struct net *net = iter_data->net; struct nf_conntrack_net *cnet = nf_ct_pernet(net); might_sleep(); - if (atomic_read(&cnet->count) > 0) { - __nf_ct_unconfirmed_destroy(net); - nf_queue_nf_hook_drop(net); - synchronize_net(); - } -} -EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); - -void nf_ct_iterate_cleanup_net(struct net *net, - int (*iter)(struct nf_conn *i, void *data), - void *data, u32 portid, int report) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - struct iter_data d; - - might_sleep(); - if (atomic_read(&cnet->count) == 0) return; - d.iter = iter; - d.data = data; - d.net = net; - - nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); + nf_ct_iterate_cleanup(iter, iter_data); } EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); @@ -2477,6 +2444,7 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { + struct nf_ct_iter_data iter_data = {}; struct net *net; down_read(&net_rwsem); @@ -2485,31 +2453,41 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) if (atomic_read(&cnet->count) == 0) continue; - __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from - * the global list, so our __nf_ct_unconfirmed_destroy() might - * not have affected all namespaces. + * the global list, so hook drop above might not have + * affected all namespaces. */ net_ns_barrier(); - /* a conntrack could have been unlinked from unconfirmed list - * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). + /* a skb w. unconfirmed conntrack could have been reinjected just + * before we called nf_queue_nf_hook_drop(). + * * This makes sure its inserted into conntrack table. */ synchronize_net(); - nf_ct_iterate_cleanup(iter, data, 0, 0); + nf_ct_ext_bump_genid(); + iter_data.data = data; + nf_ct_iterate_cleanup(iter, &iter_data); + + /* Another cpu might be in a rcu read section with + * rcu protected pointer cleared in iter callback + * or hidden via nf_ct_ext_bump_genid() above. + * + * Wait until those are done. + */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); static int kill_all(struct nf_conn *i, void *data) { - return net_eq(nf_ct_net(i), data); + return 1; } void nf_conntrack_cleanup_start(void) @@ -2544,8 +2522,9 @@ void nf_conntrack_cleanup_net(struct net *net) void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) { - int busy; + struct nf_ct_iter_data iter_data = {}; struct net *net; + int busy; /* * This makes sure all current packets have passed through @@ -2558,7 +2537,8 @@ i_see_dead_people: list_for_each_entry(net, net_exit_list, exit_list) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); - nf_ct_iterate_cleanup(kill_all, net, 0, 0); + iter_data.net = net; + nf_ct_iterate_cleanup_net(kill_all, &iter_data); if (atomic_read(&cnet->count) != 0) busy = 1; } @@ -2571,7 +2551,6 @@ i_see_dead_people: nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); - free_percpu(net->ct.pcpu_lists); } } @@ -2777,33 +2756,19 @@ void nf_conntrack_init_end(void) * We need to use special "null" values, not used in hash table */ #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) -#define DYING_NULLS_VAL ((1<<30)+1) int nf_conntrack_init_net(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); int ret = -ENOMEM; - int cpu; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); atomic_set(&cnet->count, 0); - net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); - if (!net->ct.pcpu_lists) - goto err_stat; - - for_each_possible_cpu(cpu) { - struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - - spin_lock_init(&pcpu->lock); - INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - } - net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) - goto err_pcpu_lists; + return ret; ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) @@ -2819,8 +2784,5 @@ int nf_conntrack_init_net(struct net *net) err_expect: free_percpu(net->ct.stat); -err_pcpu_lists: - free_percpu(net->ct.pcpu_lists); -err_stat: return ret; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 0cb2da0a759a..8698b3424646 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -16,7 +16,6 @@ #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> -#include <linux/percpu.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> @@ -29,8 +28,9 @@ static DEFINE_MUTEX(nf_ct_ecache_mutex); -#define ECACHE_RETRY_WAIT (HZ/10) -#define ECACHE_STACK_ALLOC (256 / sizeof(void *)) +#define DYING_NULLS_VAL ((1 << 30) + 1) +#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10) +#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10) enum retry_state { STATE_CONGESTED, @@ -38,58 +38,67 @@ enum retry_state { STATE_DONE, }; -static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) +struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net) { - struct nf_conn *refs[ECACHE_STACK_ALLOC]; + struct nf_conntrack_net *cnet = nf_ct_pernet(net); + + return &cnet->ecache; +} +#if IS_MODULE(CONFIG_NF_CT_NETLINK) +EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache); +#endif + +static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet) +{ + unsigned long stop = jiffies + ECACHE_MAX_JIFFIES; + struct hlist_nulls_head evicted_list; enum retry_state ret = STATE_DONE; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int evicted = 0; + unsigned int sent; - spin_lock(&pcpu->lock); + INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL); - hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) { +next: + sent = 0; + spin_lock_bh(&cnet->ecache.dying_lock); + + hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - struct nf_conntrack_ecache *e; - - if (!nf_ct_is_confirmed(ct)) - continue; - - /* This ecache access is safe because the ct is on the - * pcpu dying list and we hold the spinlock -- the entry - * cannot be free'd until after the lock is released. - * - * This is true even if ct has a refcount of 0: the - * cpu that is about to free the entry must remove it - * from the dying list and needs the lock to do so. - */ - e = nf_ct_ecache_find(ct); - if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL) - continue; - /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means - * the worker owns this entry: the ct will remain valid - * until the worker puts its ct reference. + /* The worker owns all entries, ct remains valid until nf_ct_put + * in the loop below. */ if (nf_conntrack_event(IPCT_DESTROY, ct)) { ret = STATE_CONGESTED; break; } - e->state = NFCT_ECACHE_DESTROY_SENT; - refs[evicted] = ct; + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list); - if (++evicted >= ARRAY_SIZE(refs)) { + if (time_after(stop, jiffies)) { ret = STATE_RESTART; break; } + + if (sent++ > 16) { + spin_unlock_bh(&cnet->ecache.dying_lock); + cond_resched(); + goto next; + } } - spin_unlock(&pcpu->lock); + spin_unlock_bh(&cnet->ecache.dying_lock); - /* can't _put while holding lock */ - while (evicted) - nf_ct_put(refs[--evicted]); + hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); + nf_ct_put(ct); + + cond_resched(); + } return ret; } @@ -97,35 +106,20 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) static void ecache_work(struct work_struct *work) { struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work); - struct netns_ct *ctnet = cnet->ecache.ct_net; - int cpu, delay = -1; - struct ct_pcpu *pcpu; - - local_bh_disable(); - - for_each_possible_cpu(cpu) { - enum retry_state ret; - - pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu); - - ret = ecache_work_evict_list(pcpu); - - switch (ret) { - case STATE_CONGESTED: - delay = ECACHE_RETRY_WAIT; - goto out; - case STATE_RESTART: - delay = 0; - break; - case STATE_DONE: - break; - } + int ret, delay = -1; + + ret = ecache_work_evict_list(cnet); + switch (ret) { + case STATE_CONGESTED: + delay = ECACHE_RETRY_JIFFIES; + break; + case STATE_RESTART: + delay = 0; + break; + case STATE_DONE: + break; } - out: - local_bh_enable(); - - ctnet->ecache_dwork_pending = delay > 0; if (delay >= 0) schedule_delayed_work(&cnet->ecache.dwork, delay); } @@ -199,7 +193,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, */ if (e->portid == 0 && portid != 0) e->portid = portid; - e->state = NFCT_ECACHE_DESTROY_FAIL; } return ret; @@ -297,12 +290,51 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) schedule_delayed_work(&cnet->ecache.dwork, HZ); net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { - net->ct.ecache_dwork_pending = false; - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + if (!hlist_nulls_empty(&cnet->ecache.dying_list)) + mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + else + net->ct.ecache_dwork_pending = false; } } -#define NF_CT_EVENTS_DEFAULT 1 +bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) +{ + struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; + + switch (net->ct.sysctl_events) { + case 0: + /* assignment via template / ruleset? ignore sysctl. */ + if (ctmask || expmask) + break; + return true; + case 2: /* autodetect: no event listener, don't allocate extension. */ + if (!READ_ONCE(net->ct.ctnetlink_has_listener)) + return true; + fallthrough; + case 1: + /* always allocate an extension. */ + if (!ctmask && !expmask) { + ctmask = ~0; + expmask = ~0; + } + break; + default: + WARN_ON_ONCE(1); + return true; + } + + e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); + if (e) { + e->ctmask = ctmask; + e->expmask = expmask; + } + + return e != NULL; +} +EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add); + +#define NF_CT_EVENTS_DEFAULT 2 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; void nf_conntrack_ecache_pernet_init(struct net *net) @@ -311,8 +343,9 @@ void nf_conntrack_ecache_pernet_init(struct net *net) net->ct.sysctl_events = nf_ct_events; - cnet->ecache.ct_net = &net->ct; INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work); + INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL); + spin_lock_init(&cnet->ecache.dying_lock); BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */ } diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 1296fda54ac6..0b513f7bf9f3 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -27,6 +27,8 @@ #define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */ +atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1); + static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = { [NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help), #if IS_ENABLED(CONFIG_NF_NAT) @@ -116,8 +118,10 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) if (!new) return NULL; - if (!ct->ext) + if (!ct->ext) { memset(new->offset, 0, sizeof(new->offset)); + new->gen_id = atomic_read(&nf_conntrack_ext_genid); + } new->offset[id] = newoff; new->len = newlen; @@ -127,3 +131,29 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) return (void *)new + newoff; } EXPORT_SYMBOL(nf_ct_ext_add); + +/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */ +void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id) +{ + unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid); + unsigned int this_id = READ_ONCE(ext->gen_id); + + if (!__nf_ct_ext_exist(ext, id)) + return NULL; + + if (this_id == 0 || ext->gen_id == gen_id) + return (void *)ext + ext->offset[id]; + + return NULL; +} +EXPORT_SYMBOL(__nf_ct_ext_find); + +void nf_ct_ext_bump_genid(void) +{ + unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid); + + if (value == UINT_MAX) + atomic_set(&nf_conntrack_ext_genid, 1); + + msleep(HZ); +} diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8dec42ec603e..c12a87ebc3ee 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -468,11 +468,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); - - /* Maybe someone has gotten the helper already when unhelp above. - * So need to wait it. - */ - synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 924d766e6c53..e768f59741a6 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1559,6 +1559,11 @@ static int ctnetlink_flush_conntrack(struct net *net, u32 portid, int report, u8 family) { struct ctnetlink_filter *filter = NULL; + struct nf_ct_iter_data iter = { + .net = net, + .portid = portid, + .report = report, + }; if (ctnetlink_needs_filter(family, cda)) { if (cda[CTA_FILTER]) @@ -1567,10 +1572,11 @@ static int ctnetlink_flush_conntrack(struct net *net, filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); + + iter.data = filter; } - nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter, - portid, report); + nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter); kfree(filter); return 0; @@ -1750,61 +1756,59 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, } static int -ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying) +ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) +{ + return 0; +} + +static int +ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - struct nf_conn *ct, *last; + struct nf_conn *last = ctx->last; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + const struct net *net = sock_net(skb->sk); + struct nf_conntrack_net_ecache *ecache_net; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - struct hlist_nulls_head *list; - struct net *net = sock_net(skb->sk); - int res, cpu; +#endif if (ctx->done) return 0; - last = ctx->last; + ctx->last = NULL; - for (cpu = ctx->cpu; cpu < nr_cpu_ids; cpu++) { - struct ct_pcpu *pcpu; +#ifdef CONFIG_NF_CONNTRACK_EVENTS + ecache_net = nf_conn_pernet_ecache(net); + spin_lock_bh(&ecache_net->dying_lock); - if (!cpu_possible(cpu)) - continue; + hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) { + struct nf_conn *ct; + int res; - pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); - spin_lock_bh(&pcpu->lock); - list = dying ? &pcpu->dying : &pcpu->unconfirmed; -restart: - hlist_nulls_for_each_entry(h, n, list, hnnode) { - ct = nf_ct_tuplehash_to_ctrack(h); + ct = nf_ct_tuplehash_to_ctrack(h); + if (last && last != ct) + continue; - res = ctnetlink_dump_one_entry(skb, cb, ct, dying); - if (res < 0) { - ctx->cpu = cpu; - spin_unlock_bh(&pcpu->lock); - goto out; - } - } - if (ctx->last) { - ctx->last = NULL; - goto restart; + res = ctnetlink_dump_one_entry(skb, cb, ct, true); + if (res < 0) { + spin_unlock_bh(&ecache_net->dying_lock); + nf_ct_put(last); + return skb->len; } - spin_unlock_bh(&pcpu->lock); + + nf_ct_put(last); + last = NULL; } + + spin_unlock_bh(&ecache_net->dying_lock); +#endif ctx->done = true; -out: - if (last) - nf_ct_put(last); + nf_ct_put(last); return skb->len; } -static int -ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) -{ - return ctnetlink_dump_list(skb, cb, true); -} - static int ctnetlink_get_ct_dying(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) @@ -1820,12 +1824,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb, return -EOPNOTSUPP; } -static int -ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) -{ - return ctnetlink_dump_list(skb, cb, false); -} - static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index d1f2d3c8d2b1..895b09cbd7cf 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -538,9 +538,13 @@ retry: out_unlock: mutex_unlock(&nf_ct_proto_mutex); - if (fixup_needed) - nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup, - (void *)(unsigned long)nfproto, 0, 0); + if (fixup_needed) { + struct nf_ct_iter_data iter_data = { + .net = net, + .data = (void *)(unsigned long)nfproto, + }; + nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data); + } return err; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 55aa55b252b2..6ad7bbc90d38 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -693,7 +693,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_TWO, }, #endif #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index cec166ecba77..0f828d05ea60 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -38,7 +38,12 @@ static int untimeout(struct nf_conn *ct, void *timeout) void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) { - nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0); + struct nf_ct_iter_data iter_data = { + .net = net, + .data = timeout, + }; + + nf_ct_iterate_cleanup_net(untimeout, &iter_data); } EXPORT_SYMBOL_GPL(nf_ct_untimeout); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 3db256da919b..f2def06d1070 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -179,12 +179,11 @@ EXPORT_SYMBOL_GPL(flow_offload_route_init); static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) { - tcp->state = TCP_CONNTRACK_ESTABLISHED; tcp->seen[0].td_maxwin = 0; tcp->seen[1].td_maxwin = 0; } -static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); @@ -193,7 +192,9 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); - timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + flow_offload_fixup_tcp(&ct->proto.tcp); + + timeout = tn->timeouts[ct->proto.tcp.state]; timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); @@ -211,18 +212,6 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); } -static void flow_offload_fixup_ct_state(struct nf_conn *ct) -{ - if (nf_ct_protonum(ct) == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); -} - -static void flow_offload_fixup_ct(struct nf_conn *ct) -{ - flow_offload_fixup_ct_state(ct); - flow_offload_fixup_ct_timeout(ct); -} - static void flow_offload_route_release(struct flow_offload *flow) { nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); @@ -335,8 +324,10 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, u32 timeout; timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); - if (READ_ONCE(flow->timeout) != timeout) + if (timeout - READ_ONCE(flow->timeout) > HZ) WRITE_ONCE(flow->timeout, timeout); + else + return; if (likely(!nf_flowtable_hw_offload(flow_table))) return; @@ -359,22 +350,14 @@ static void flow_offload_del(struct nf_flowtable *flow_table, rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, nf_flow_offload_rhash_params); - - clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - - if (nf_flow_has_expired(flow)) - flow_offload_fixup_ct(flow->ct); - else - flow_offload_fixup_ct_timeout(flow->ct); - flow_offload_free(flow); } void flow_offload_teardown(struct flow_offload *flow) { + clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); set_bit(NF_FLOW_TEARDOWN, &flow->flags); - - flow_offload_fixup_ct_state(flow->ct); + flow_offload_fixup_ct(flow->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -438,33 +421,12 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple) -{ - struct dst_entry *dst; - - if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || - tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { - dst = tuple->dst_cache; - if (!dst_check(dst, tuple->dst_cookie)) - return true; - } - - return false; -} - -static bool nf_flow_has_stale_dst(struct flow_offload *flow) -{ - return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) || - flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple); -} - static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { if (nf_flow_has_expired(flow) || - nf_ct_is_dying(flow->ct) || - nf_flow_has_stale_dst(flow)) - set_bit(NF_FLOW_TEARDOWN, &flow->flags); + nf_ct_is_dying(flow->ct)) + flow_offload_teardown(flow); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { if (test_bit(NF_FLOW_HW, &flow->flags)) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 32c0eb1b4821..b350fe9d00b0 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -248,6 +248,15 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } +static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple) +{ + if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH && + tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM) + return true; + + return dst_check(tuple->dst_cache, tuple->dst_cookie); +} + static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, const struct nf_hook_state *state, struct dst_entry *dst) @@ -367,6 +376,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; + if (!nf_flow_dst_check(&tuplehash->tuple)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; @@ -624,6 +638,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return NF_ACCEPT; + if (!nf_flow_dst_check(&tuplehash->tuple)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index e32fac374608..1a506b0c6511 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -77,11 +77,14 @@ EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static void iterate_cleanup_work(struct work_struct *work) { + struct nf_ct_iter_data iter_data = {}; struct masq_dev_work *w; w = container_of(work, struct masq_dev_work, work); - nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + iter_data.net = w->net; + iter_data.data = (void *)w; + nf_ct_iterate_cleanup_net(w->iter, &iter_data); put_net_track(w->net, &w->ns_tracker); kfree(w); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f3ad02a399f8..12fc9cda4a2c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8342,16 +8342,7 @@ EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); static bool nft_expr_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { - if (!expr->ops->reduce) { - pr_warn_once("missing reduce for expression %s ", - expr->ops->type->name); - return false; - } - - if (nft_reduce_is_readonly(expr)) - return false; - - return expr->ops->reduce(track, expr); + return false; } static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 7e2c8dd01408..ad3bbe34ca88 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -45,6 +45,7 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket"); static unsigned int nfnetlink_pernet_id __read_mostly; struct nfnl_net { + unsigned int ctnetlink_listeners; struct sock *nfnl; }; @@ -654,7 +655,6 @@ static void nfnetlink_rcv(struct sk_buff *skb) netlink_rcv_skb(skb, nfnetlink_rcv_msg); } -#ifdef CONFIG_MODULES static int nfnetlink_bind(struct net *net, int group) { const struct nfnetlink_subsystem *ss; @@ -670,9 +670,44 @@ static int nfnetlink_bind(struct net *net, int group) rcu_read_unlock(); if (!ss) request_module_nowait("nfnetlink-subsys-%d", type); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + if (type == NFNL_SUBSYS_CTNETLINK) { + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + + if (WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == UINT_MAX)) { + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + return -EOVERFLOW; + } + + nfnlnet->ctnetlink_listeners++; + if (nfnlnet->ctnetlink_listeners == 1) + WRITE_ONCE(net->ct.ctnetlink_has_listener, true); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + } +#endif return 0; } + +static void nfnetlink_unbind(struct net *net, int group) +{ +#ifdef CONFIG_NF_CONNTRACK_EVENTS + int type = nfnl_group2type[group]; + + if (type == NFNL_SUBSYS_CTNETLINK) { + struct nfnl_net *nfnlnet = nfnl_pernet(net); + + nfnl_lock(NFNL_SUBSYS_CTNETLINK); + WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == 0); + nfnlnet->ctnetlink_listeners--; + if (nfnlnet->ctnetlink_listeners == 0) + WRITE_ONCE(net->ct.ctnetlink_has_listener, false); + nfnl_unlock(NFNL_SUBSYS_CTNETLINK); + } #endif +} static int __net_init nfnetlink_net_init(struct net *net) { @@ -680,9 +715,8 @@ static int __net_init nfnetlink_net_init(struct net *net) struct netlink_kernel_cfg cfg = { .groups = NFNLGRP_MAX, .input = nfnetlink_rcv, -#ifdef CONFIG_MODULES .bind = nfnetlink_bind, -#endif + .unbind = nfnetlink_unbind, }; nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index eea486f32971..f069c24c6146 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -33,8 +33,19 @@ static unsigned int nfct_timeout_id __read_mostly; +struct ctnl_timeout { + struct list_head head; + struct rcu_head rcu_head; + refcount_t refcnt; + char name[CTNL_TIMEOUT_NAME_MAX]; + struct nf_ct_timeout timeout; + + struct list_head free_head; +}; + struct nfct_timeout_pernet { struct list_head nfct_timeout_list; + struct list_head nfct_timeout_freelist; }; MODULE_LICENSE("GPL"); @@ -574,20 +585,36 @@ static int __net_init cttimeout_net_init(struct net *net) struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); INIT_LIST_HEAD(&pernet->nfct_timeout_list); + INIT_LIST_HEAD(&pernet->nfct_timeout_freelist); return 0; } +static void __net_exit cttimeout_net_pre_exit(struct net *net) +{ + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct ctnl_timeout *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { + list_del_rcu(&cur->head); + list_add(&cur->free_head, &pernet->nfct_timeout_freelist); + } + + /* core calls synchronize_rcu() after this */ +} + static void __net_exit cttimeout_net_exit(struct net *net) { struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); struct ctnl_timeout *cur, *tmp; - nf_ct_unconfirmed_destroy(net); + if (list_empty(&pernet->nfct_timeout_freelist)) + return; + nf_ct_untimeout(net, NULL); - list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) { - list_del_rcu(&cur->head); + list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) { + list_del(&cur->free_head); if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); @@ -596,6 +623,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, + .pre_exit = cttimeout_net_pre_exit, .exit = cttimeout_net_exit, .id = &nfct_timeout_id, .size = sizeof(struct nfct_timeout_pernet), @@ -628,13 +656,24 @@ err_out: return ret; } +static int untimeout(struct nf_conn *ct, void *timeout) +{ + struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); + + if (timeout_ext) + RCU_INIT_POINTER(timeout_ext->timeout, NULL); + + return 0; +} + static void __exit cttimeout_exit(void) { nfnetlink_subsys_unregister(&cttimeout_subsys); unregister_pernet_subsys(&cttimeout_ops); RCU_INIT_POINTER(nf_ct_timeout_hook, NULL); - synchronize_rcu(); + + nf_ct_iterate_destroy(untimeout, NULL); } module_init(cttimeout_init); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 900d48c810a1..a16cf47199b7 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -36,6 +36,15 @@ static void nft_default_forward_path(struct nf_flow_route *route, route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, @@ -47,6 +56,9 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct neighbour *n; u8 nud_state; + if (!nft_is_valid_ether_device(dev)) + goto out; + n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -1; @@ -60,6 +72,7 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, if (!(nud_state & NUD_VALID)) return -1; +out: return dev_fill_forward_path(dev, ha, stack); } @@ -78,15 +91,6 @@ struct nft_forward_info { enum flow_offload_xmit_type xmit_type; }; -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, unsigned char *ha, struct nf_flowtable *flowtable) @@ -119,7 +123,8 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, info->indev = NULL; break; } - info->outdev = path->dev; + if (!info->outdev) + info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; @@ -227,11 +232,19 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[dir].tuple.dst.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos); + fl.u.ip4.flowi4_mark = pkt->skb->mark; break; case NFPROTO_IPV6: fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[dir].tuple.dst.u3.in6; fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; break; } @@ -293,7 +306,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); - if (unlikely(!tcph || tcph->fin || tcph->rst)) + if (unlikely(!tcph || tcph->fin || tcph->rst || + !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1b5a9c2e1c29..0cd91f813a3b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1974,7 +1974,6 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, copied = len; } - skb_reset_transport_header(data_skb); err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { diff --git a/net/nfc/core.c b/net/nfc/core.c index 67524982b89b..6ff3e10ff8e3 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -38,7 +38,7 @@ int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -94,7 +94,7 @@ int nfc_dev_up(struct nfc_dev *dev) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -142,7 +142,7 @@ int nfc_dev_down(struct nfc_dev *dev) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -207,7 +207,7 @@ int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -246,7 +246,7 @@ int nfc_stop_poll(struct nfc_dev *dev) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -291,7 +291,7 @@ int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -335,7 +335,7 @@ int nfc_dep_link_down(struct nfc_dev *dev) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -401,7 +401,7 @@ int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -448,7 +448,7 @@ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -495,7 +495,7 @@ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; kfree_skb(skb); goto error; @@ -552,7 +552,7 @@ int nfc_enable_se(struct nfc_dev *dev, u32 se_idx) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -601,7 +601,7 @@ int nfc_disable_se(struct nfc_dev *dev, u32 se_idx) device_lock(&dev->dev); - if (!device_is_registered(&dev->dev)) { + if (dev->shutting_down) { rc = -ENODEV; goto error; } @@ -1134,6 +1134,7 @@ int nfc_register_device(struct nfc_dev *dev) dev->rfkill = NULL; } } + dev->shutting_down = false; device_unlock(&dev->dev); rc = nfc_genl_device_added(dev); @@ -1167,12 +1168,10 @@ void nfc_unregister_device(struct nfc_dev *dev) rfkill_destroy(dev->rfkill); dev->rfkill = NULL; } + dev->shutting_down = true; device_unlock(&dev->dev); if (dev->ops->check_presence) { - device_lock(&dev->dev); - dev->shutting_down = true; - device_unlock(&dev->dev); del_timer_sync(&dev->check_pres_timer); cancel_work_sync(&dev->check_pres_work); } diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 6055dc9a82aa..aa5e712adf07 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -118,7 +118,7 @@ static int nci_queue_tx_data_frags(struct nci_dev *ndev, skb_frag = nci_skb_alloc(ndev, (NCI_DATA_HDR_SIZE + frag_len), - GFP_KERNEL); + GFP_ATOMIC); if (skb_frag == NULL) { rc = -ENOMEM; goto free_exit; diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 19703a649b5a..78c4b6addf15 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -153,7 +153,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, i = 0; skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + - NCI_DATA_HDR_SIZE, GFP_KERNEL); + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; @@ -184,7 +184,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, if (i < data_len) { skb = nci_skb_alloc(ndev, conn_info->max_pkt_payload_len + - NCI_DATA_HDR_SIZE, GFP_KERNEL); + NCI_DATA_HDR_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f184b0db79d4..7c62417ccfd7 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1244,7 +1244,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; @@ -1260,7 +1260,7 @@ int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, genlmsg_end(msg, hdr); - genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); + genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 5327d130c4b5..73ee2771093d 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -487,14 +487,27 @@ struct rds_tcp_net { /* All module specific customizations to the RDS-TCP socket should be done in * rds_tcp_tune() and applied after socket creation. */ -void rds_tcp_tune(struct socket *sock) +bool rds_tcp_tune(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct rds_tcp_net *rtn; tcp_sock_set_nodelay(sock->sk); lock_sock(sk); + /* TCP timer functions might access net namespace even after + * a process which created this net namespace terminated. + */ + if (!sk->sk_net_refcnt) { + if (!maybe_get_net(net)) { + release_sock(sk); + return false; + } + sk->sk_net_refcnt = 1; + netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); + } + rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; sk->sk_userlocks |= SOCK_SNDBUF_LOCK; @@ -504,6 +517,7 @@ void rds_tcp_tune(struct socket *sock) sk->sk_userlocks |= SOCK_RCVBUF_LOCK; } release_sock(sk); + return true; } static void rds_tcp_accept_worker(struct work_struct *work) diff --git a/net/rds/tcp.h b/net/rds/tcp.h index dc8d745d6857..f8b5930d7b34 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -49,7 +49,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ -void rds_tcp_tune(struct socket *sock); +bool rds_tcp_tune(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 5461d77fff4f..f0c477c5d1db 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -124,7 +124,10 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) if (ret < 0) goto out; - rds_tcp_tune(sock); + if (!rds_tcp_tune(sock)) { + ret = -EINVAL; + goto out; + } if (isv6) { sin6.sin6_family = AF_INET6; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 09cadd556d1e..7edf2e69d3fe 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -133,7 +133,10 @@ int rds_tcp_accept_one(struct socket *sock) __module_get(new_sock->ops->owner); rds_tcp_keepalive(new_sock); - rds_tcp_tune(new_sock); + if (!rds_tcp_tune(new_sock)) { + ret = -EINVAL; + goto out; + } inet = inet_sk(new_sock->sk); diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index e2e6b6b78578..fee6409c2bb3 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -1128,22 +1128,15 @@ static int rose_node_show(struct seq_file *seq, void *v) seq_puts(seq, "address mask n neigh neigh neigh\n"); else { const struct rose_node *rose_node = v; - /* if (rose_node->loopback) { - seq_printf(seq, "%-10s %04d 1 loopback\n", - rose2asc(rsbuf, &rose_node->address), - rose_node->mask); - } else { */ - seq_printf(seq, "%-10s %04d %d", - rose2asc(rsbuf, &rose_node->address), - rose_node->mask, - rose_node->count); - - for (i = 0; i < rose_node->count; i++) - seq_printf(seq, " %05d", - rose_node->neighbour[i]->number); - - seq_puts(seq, "\n"); - /* } */ + seq_printf(seq, "%-10s %04d %d", + rose2asc(rsbuf, &rose_node->address), + rose_node->mask, + rose_node->count); + + for (i = 0; i < rose_node->count; i++) + seq_printf(seq, " %05d", rose_node->neighbour[i]->number); + + seq_puts(seq, "\n"); } return 0; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a4111408ffd0..6a1611b0e303 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -117,6 +117,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) local, srx->transport_type, srx->transport.family); udp_conf.family = srx->transport.family; + udp_conf.use_udp_checksums = true; if (udp_conf.family == AF_INET) { udp_conf.local_ip = srx->transport.sin.sin_addr; udp_conf.local_udp_port = srx->transport.sin.sin_port; @@ -124,6 +125,8 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } else { udp_conf.local_ip6 = srx->transport.sin6.sin6_addr; udp_conf.local_udp_port = srx->transport.sin6.sin6_port; + udp_conf.use_udp6_tx_checksums = true; + udp_conf.use_udp6_rx_checksums = true; #endif } ret = udp_sock_create(net, &udp_conf, &local->socket); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e01ef7f109f4..823ee643371c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -149,7 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; - int ksize; + int i, ksize; u32 index; if (!nla) { @@ -228,6 +228,22 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_off_max_hint = 0; + for (i = 0; i < p->tcfp_nkeys; ++i) { + u32 cur = p->tcfp_keys[i].off; + + /* sanitize the shift value for any later use */ + p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, + p->tcfp_keys[i].shift); + + /* The AT option can read a single byte, we can bound the actual + * value with uchar max. + */ + cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift; + + /* Each key touches 4 bytes starting from the computed offset */ + p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4); + } p->tcfp_flags = parm->flags; goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -308,13 +324,18 @@ static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); + u32 max_offset; int i; - if (skb_unclone(skb, GFP_ATOMIC)) - return p->tcf_action; - spin_lock(&p->tcf_lock); + max_offset = (skb_transport_header_was_set(skb) ? + skb_transport_offset(skb) : + skb_network_offset(skb)) + + p->tcfp_off_max_hint; + if (skb_ensure_writable(skb, min(skb->len, max_offset))) + goto unlock; + tcf_lastuse_update(&p->tcf_tm); if (p->tcfp_nkeys > 0) { @@ -403,6 +424,7 @@ bad: p->tcf_qstats.overlimits++; done: bstats_update(&p->tcf_bstats, skb); +unlock: spin_unlock(&p->tcf_lock); return p->tcf_action; } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 0a04468b7314..49bae3d5006b 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -311,12 +311,15 @@ META_COLLECTOR(int_sk_bound_if) META_COLLECTOR(var_sk_bound_if) { + int bound_dev_if; + if (skip_nonlocal(skb)) { *err = -1; return; } - if (skb->sk->sk_bound_dev_if == 0) { + bound_dev_if = READ_ONCE(skb->sk->sk_bound_dev_if); + if (bound_dev_if == 0) { dst->value = (unsigned long) "any"; dst->len = 3; } else { @@ -324,7 +327,7 @@ META_COLLECTOR(var_sk_bound_if) rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(skb->sk), - skb->sk->sk_bound_dev_if); + bound_dev_if); *err = var_dev(dev, dst); rcu_read_unlock(); } diff --git a/net/sctp/input.c b/net/sctp/input.c index 90e12bafdd48..4f43afa8678f 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -92,6 +92,7 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; + int bound_dev_if; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); @@ -169,7 +170,8 @@ int sctp_rcv(struct sk_buff *skb) * If a frame arrives on an interface and the receiving socket is * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB */ - if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) { + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if && (bound_dev_if != af->skb_iif(skb))) { if (transport) { sctp_transport_put(transport); asoc = NULL; diff --git a/net/sctp/output.c b/net/sctp/output.c index 72fe6669c50d..a63df055ac57 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -134,7 +134,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } - packet->max_size = sk_can_gso(sk) ? READ_ONCE(tp->dst->dev->gso_max_size) + packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), + GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index fce16b9d6e1a..5f70642a8044 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1544,9 +1544,29 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + switch (sk->sk_state) { default: goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; @@ -1565,20 +1585,24 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out; sock_hold(&smc->sk); /* sock put in passive closing */ - if (smc->use_fallback) + if (smc->use_fallback) { + sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; + } if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; + goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; - else - rc = 0; /* success cases including fallback */ } +connected: + rc = 0; + sock->state = SS_CONNECTED; out: release_sock(sk); out_err: @@ -1693,6 +1717,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, } if (new_sock) { sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; @@ -2424,7 +2449,7 @@ static int smc_listen(struct socket *sock, int backlog) rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || - smc->connect_nonblock) + smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; @@ -2716,6 +2741,17 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && @@ -2729,6 +2765,7 @@ static int smc_shutdown(struct socket *sock, int how) sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; @@ -2754,6 +2791,10 @@ static int smc_shutdown(struct socket *sock, int how) /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; + if (sk->sk_state == SMC_CLOSED) + sock->state = SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; @@ -3139,6 +3180,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, rc = -ENOBUFS; sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index a3e2d3b89568..dcda4165d107 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = sges_per_buf, + .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 51e8eb2933ff..338b9ef806e8 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -355,12 +355,12 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } break; } + if (!timeo) + return -EAGAIN; if (signal_pending(current)) { read_done = sock_intr_errno(timeo); break; } - if (!timeo) - return -EAGAIN; } if (!smc_rx_data_available(conn)) { diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 98ca9229fe87..805a546e8c04 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -391,12 +391,20 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { - struct ib_sge *sge = - wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list; + struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; + struct ib_sge *sge = wr->wr.sg_list; + u64 base_addr = dma_addr; + + if (dst_len < link->qp_attr.cap.max_inline_data) { + base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; + wr->wr.send_flags |= IB_SEND_INLINE; + } else { + wr->wr.send_flags &= ~IB_SEND_INLINE; + } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { - sge[srcchunk].addr = dma_addr + src_off; + sge[srcchunk].addr = base_addr + src_off; sge[srcchunk].length = src_len; num_sges++; @@ -410,8 +418,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } - rc = smc_tx_rdma_write(conn, dst_off, num_sges, - &wr_rdma_buf->wr_tx_rdma[dstchunk]); + rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); if (rc) return rc; if (dst_len_sum == len) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 24be1d03fef9..26f8f240d9e8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; + bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { - lnk->wr_tx_sges[i].addr = + lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) : lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; @@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; lnk->wr_tx_ibs[i].send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send_inline) + lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = diff --git a/net/socket.c b/net/socket.c index f0c39c874665..6ee634c01eca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -683,9 +683,18 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { flags |= SKBTX_HW_TSTAMP; + /* PTP hardware clocks can provide a free running cycle counter + * as a time base for virtual clocks. Tell driver to use the + * free running cycle counter for timestamp if socket is bound + * to virtual clock. + */ + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + flags |= SKBTX_HW_TSTAMP_USE_CYCLES; + } + if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; @@ -796,7 +805,28 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) return skb->tstamp && !false_tstamp && skb_is_err_queue(skb); } -static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) +static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) +{ + bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC; + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + struct net_device *orig_dev; + ktime_t hwtstamp; + + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) { + *if_index = orig_dev->ifindex; + hwtstamp = netdev_get_tstamp(orig_dev, shhwtstamps, cycles); + } else { + hwtstamp = shhwtstamps->hwtstamp; + } + rcu_read_unlock(); + + return hwtstamp; +} + +static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, + int if_index) { struct scm_ts_pktinfo ts_pktinfo; struct net_device *orig_dev; @@ -806,11 +836,14 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb) memset(&ts_pktinfo, 0, sizeof(ts_pktinfo)); - rcu_read_lock(); - orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); - if (orig_dev) - ts_pktinfo.if_index = orig_dev->ifindex; - rcu_read_unlock(); + if (!if_index) { + rcu_read_lock(); + orig_dev = dev_get_by_napi_id(skb_napi_id(skb)); + if (orig_dev) + if_index = orig_dev->ifindex; + rcu_read_unlock(); + } + ts_pktinfo.if_index = if_index; ts_pktinfo.pkt_length = skb->len - skb_mac_offset(skb); put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_PKTINFO, @@ -830,6 +863,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + int if_index; ktime_t hwtstamp; /* Race occurred between timestamp enabling and packet @@ -878,18 +912,22 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp)) { - if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) - hwtstamp = ptp_convert_timestamp(shhwtstamps, - sk->sk_bind_phc); + if_index = 0; + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) + hwtstamp = get_timestamp(sk, skb, &if_index); else hwtstamp = shhwtstamps->hwtstamp; + if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + sk->sk_bind_phc); + if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) - put_ts_pktinfo(msg, skb); + put_ts_pktinfo(msg, skb, if_index); } } if (!empty) { diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 61c276bddaf2..f549e4c05def 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -98,6 +98,7 @@ static int gssp_rpc_create(struct net *net, struct rpc_clnt **_clnt) * done without the correct namespace: */ .flags = RPC_CLNT_CREATE_NOPING | + RPC_CLNT_CREATE_CONNECTED | RPC_CLNT_CREATE_NO_IDLE_TIMEOUT }; struct rpc_clnt *clnt; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index af0174d7ce5a..e2c6eca0271b 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -76,6 +76,7 @@ static int rpc_encode_header(struct rpc_task *task, static int rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr); static int rpc_ping(struct rpc_clnt *clnt); +static int rpc_ping_noreply(struct rpc_clnt *clnt); static void rpc_check_timeout(struct rpc_task *task); static void rpc_register_client(struct rpc_clnt *clnt) @@ -483,6 +484,12 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, rpc_shutdown_client(clnt); return ERR_PTR(err); } + } else if (args->flags & RPC_CLNT_CREATE_CONNECTED) { + int err = rpc_ping_noreply(clnt); + if (err != 0) { + rpc_shutdown_client(clnt); + return ERR_PTR(err); + } } clnt->cl_softrtry = 1; @@ -1065,10 +1072,13 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt) static void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt) { - if (task->tk_xprt && - !(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && - (task->tk_flags & RPC_TASK_MOVEABLE))) - return; + if (task->tk_xprt) { + if (!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) && + (task->tk_flags & RPC_TASK_MOVEABLE))) + return; + xprt_release(task); + xprt_put(task->tk_xprt); + } if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) task->tk_xprt = rpc_task_get_first_xprt(clnt); else @@ -2706,6 +2716,10 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; +static const struct rpc_procinfo rpcproc_null_noreply = { + .p_encode = rpcproc_encode_null, +}; + static void rpc_null_call_prepare(struct rpc_task *task, void *data) { @@ -2759,6 +2773,28 @@ static int rpc_ping(struct rpc_clnt *clnt) return status; } +static int rpc_ping_noreply(struct rpc_clnt *clnt) +{ + struct rpc_message msg = { + .rpc_proc = &rpcproc_null_noreply, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = &rpc_null_ops, + .flags = RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, + }; + struct rpc_task *task; + int status; + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5c91c5457197..fcdd0fca408e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1419,6 +1419,26 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** + * xs_local_state_change - callback to handle AF_LOCAL socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_local_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct sock_xprt *transport; + + if (!(xprt = xprt_from_sock(sk))) + return; + transport = container_of(xprt, struct sock_xprt, xprt); + if (sk->sk_shutdown & SHUTDOWN_MASK) { + clear_bit(XPRT_CONNECTED, &xprt->state); + /* Trigger the socket release */ + xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT); + } +} + +/** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed * @@ -1866,6 +1886,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_state_change = xs_local_state_change; sk->sk_error_report = xs_error_report; xprt_clear_connected(xprt); @@ -1950,6 +1971,9 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; + if (transport->file) + goto force_disconnect; + if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the @@ -1962,11 +1986,17 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) */ task->tk_rpc_status = -ENOTCONN; rpc_exit(task, -ENOTCONN); - return; + goto out_wake; } ret = xs_local_setup_socket(transport); if (ret && !RPC_IS_SOFTCONN(task)) msleep_interruptible(15000); + return; +force_disconnect: + xprt_force_disconnect(xprt); +out_wake: + xprt_clear_connecting(xprt); + xprt_wake_pending_tasks(xprt, -ENOTCONN); } #if IS_ENABLED(CONFIG_SUNRPC_SWAP) @@ -2845,9 +2875,6 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) } xprt_set_bound(xprt); xs_format_peer_addresses(xprt, "local", RPCBIND_NETID_LOCAL); - ret = ERR_PTR(xs_local_setup_socket(transport)); - if (ret) - goto out_err; break; default: ret = ERR_PTR(-EAFNOSUPPORT); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b12f81a2b44c..ec6f4b699a2b 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -411,10 +411,16 @@ static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i) return 0; } +union tls_iter_offset { + struct iov_iter *msg_iter; + int offset; +}; + static int tls_push_data(struct sock *sk, - struct iov_iter *msg_iter, + union tls_iter_offset iter_offset, size_t size, int flags, - unsigned char record_type) + unsigned char record_type, + struct page *zc_page) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; @@ -480,12 +486,21 @@ handle_error: } record = ctx->open_record; - copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); - copy = min_t(size_t, copy, (max_open_record_len - record->len)); - if (copy) { + copy = min_t(size_t, size, max_open_record_len - record->len); + if (copy && zc_page) { + struct page_frag zc_pfrag; + + zc_pfrag.page = zc_page; + zc_pfrag.offset = iter_offset.offset; + zc_pfrag.size = copy; + tls_append_frag(record, &zc_pfrag, copy); + } else if (copy) { + copy = min_t(size_t, copy, pfrag->size - pfrag->offset); + rc = tls_device_copy_data(page_address(pfrag->page) + - pfrag->offset, copy, msg_iter); + pfrag->offset, copy, + iter_offset.msg_iter); if (rc) goto handle_error; tls_append_frag(record, pfrag, copy); @@ -540,6 +555,7 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; struct tls_context *tls_ctx = tls_get_ctx(sk); + union tls_iter_offset iter; int rc; mutex_lock(&tls_ctx->tx_lock); @@ -551,8 +567,8 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto out; } - rc = tls_push_data(sk, &msg->msg_iter, size, - msg->msg_flags, record_type); + iter.msg_iter = &msg->msg_iter; + rc = tls_push_data(sk, iter, size, msg->msg_flags, record_type, NULL); out: release_sock(sk); @@ -564,7 +580,8 @@ int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct iov_iter msg_iter; + union tls_iter_offset iter_offset; + struct iov_iter msg_iter; char *kaddr; struct kvec iov; int rc; @@ -580,12 +597,20 @@ int tls_device_sendpage(struct sock *sk, struct page *page, goto out; } + if (tls_ctx->zerocopy_sendfile) { + iter_offset.offset = offset; + rc = tls_push_data(sk, iter_offset, size, + flags, TLS_RECORD_TYPE_DATA, page); + goto out; + } + kaddr = kmap(page); iov.iov_base = kaddr + offset; iov.iov_len = size; iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size); - rc = tls_push_data(sk, &msg_iter, size, - flags, TLS_RECORD_TYPE_DATA); + iter_offset.msg_iter = &msg_iter; + rc = tls_push_data(sk, iter_offset, size, flags, TLS_RECORD_TYPE_DATA, + NULL); kunmap(page); out: @@ -656,10 +681,12 @@ EXPORT_SYMBOL(tls_get_record); static int tls_device_push_pending_record(struct sock *sk, int flags) { - struct iov_iter msg_iter; + union tls_iter_offset iter; + struct iov_iter msg_iter; iov_iter_kvec(&msg_iter, WRITE, NULL, 0, 0); - return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); + iter.msg_iter = &msg_iter; + return tls_push_data(sk, iter, 0, flags, TLS_RECORD_TYPE_DATA, NULL); } void tls_device_write_space(struct sock *sk, struct tls_context *ctx) @@ -1345,7 +1372,10 @@ static int tls_device_down(struct net_device *netdev) /* Device contexts for RX and TX will be freed in on sk_destruct * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW. + * Now release the ref taken above. */ + if (refcount_dec_and_test(&ctx->refcount)) + tls_device_free_ctx(ctx); } up_write(&device_offload_lock); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 7b2b0e7ffee4..b91ddc110786 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -513,6 +513,26 @@ out: return rc; } +static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len != sizeof(value)) + return -EINVAL; + + value = ctx->zerocopy_sendfile; + if (copy_to_user(optval, &value, sizeof(value))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -524,6 +544,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, rc = do_tls_getsockopt_conf(sk, optval, optlen, optname == TLS_TX); break; + case TLS_TX_ZEROCOPY_SENDFILE: + rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -675,6 +698,26 @@ err_crypto_info: return rc; } +static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + unsigned int value; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value > 1) + return -EINVAL; + + ctx->zerocopy_sendfile = value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -688,6 +731,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, optname == TLS_TX); release_sock(sk); break; + case TLS_TX_ZEROCOPY_SENDFILE: + lock_sock(sk); + rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break; @@ -921,6 +969,12 @@ static int tls_get_info(const struct sock *sk, struct sk_buff *skb) if (err) goto nla_failure; + if (ctx->tx_conf == TLS_HW && ctx->zerocopy_sendfile) { + err = nla_put_flag(skb, TLS_INFO_ZC_SENDFILE); + if (err) + goto nla_failure; + } + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -940,6 +994,7 @@ static size_t tls_get_info_size(const struct sock *sk) nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ + nla_total_size(0) + /* TLS_INFO_ZC_SENDFILE */ 0; return size; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 842f82fb3ced..740b29481bc6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3174,6 +3174,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); + if (chandef->chan->band == NL80211_BAND_S1GHZ) { + /* User input error for channel width doesn't match channel */ + if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { + NL_SET_ERR_MSG_ATTR(extack, + attrs[NL80211_ATTR_CHANNEL_WIDTH], + "bad channel width"); + return -EINVAL; + } + } if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); @@ -11665,18 +11674,23 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct cfg80211_bitrate_mask mask; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int err; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; + wdev_lock(wdev); err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, &mask, dev, true); if (err) - return err; + goto out; - return rdev_set_bitrate_mask(rdev, dev, NULL, &mask); + err = rdev_set_bitrate_mask(rdev, dev, NULL, &mask); +out: + wdev_unlock(wdev); + return err; } static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4a6d86432910..6d82bd9eaf8c 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1829,7 +1829,7 @@ int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, if (tmp && tmp->datalen >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void *)tmp->data; - return s1gop->primary_ch; + return s1gop->oper_ch; } } else { tmp = cfg80211_find_elem(WLAN_EID_DS_PARAMS, ie, ielen); diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 3bddcbdf2e40..0412814a2295 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -79,7 +79,6 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) { struct sock *s; struct x25_sock *x25; - struct net_device *dev; const char *devname; if (v == SEQ_START_TOKEN) { @@ -91,7 +90,7 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) s = sk_entry(v); x25 = x25_sk(s); - if (!x25->neighbour || (dev = x25->neighbour->dev) == NULL) + if (!x25->neighbour || !x25->neighbour->dev) devname = "???"; else devname = x25->neighbour->dev->name; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 36aa01d92b65..35c7e89b2e7d 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -117,7 +117,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; - if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) + if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; /* This skb was already validated on the upper/virtual dev */ @@ -212,7 +212,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, int err; struct dst_entry *dst; struct net_device *dev; - struct xfrm_state_offload *xso = &x->xso; + struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; @@ -264,15 +264,16 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); xso->real_dev = dev; - xso->num_exthdrs = 1; - /* Don't forward bit that is not implemented */ - xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; + + if (xuo->flags & XFRM_OFFLOAD_INBOUND) + xso->dir = XFRM_DEV_OFFLOAD_IN; + else + xso->dir = XFRM_DEV_OFFLOAD_OUT; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { - xso->num_exthdrs = 0; - xso->flags = 0; xso->dev = NULL; + xso->dir = 0; xso->real_dev = NULL; dev_put_track(dev, &xso->dev_tracker); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 00bd0ecff5a1..f1876ea61fdc 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3744,7 +3744,7 @@ static int stale_bundle(struct dst_entry *dst) void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { - dst->dev = dev_net(dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b749935152ba..08564e0eef20 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -751,7 +751,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; - struct xfrm_state_offload *xso; + struct xfrm_dev_offload *xso; hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; @@ -835,7 +835,7 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; - struct xfrm_state_offload *xso; + struct xfrm_dev_offload *xso; restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 64fa8fdd6bbd..6a58fec6a1fb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -840,7 +840,7 @@ static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) return 0; } -static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb) +static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) { struct xfrm_user_offload *xuo; struct nlattr *attr; @@ -852,7 +852,8 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb xuo = nla_data(attr); memset(xuo, 0, sizeof(*xuo)); xuo->ifindex = xso->dev->ifindex; - xuo->flags = xso->flags; + if (xso->dir == XFRM_DEV_OFFLOAD_IN) + xuo->flags = XFRM_OFFLOAD_INBOUND; return 0; } |