Diffstat (limited to 'net')
310 files changed, 6167 insertions, 2549 deletions
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index 16be8f8b2f8c..c40b98f7743c 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -11,11 +11,6 @@ #include "6lowpan_i.h" -static int lowpan_ndisc_is_useropt(u8 nd_opt_type) -{ - return nd_opt_type == ND_OPT_6CO; -} - #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) #define NDISC_802154_SHORT_ADDR_LENGTH 1 static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, @@ -222,7 +217,6 @@ static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, #endif const struct ndisc_ops lowpan_ndisc_ops = { - .is_useropt = lowpan_ndisc_is_useropt, #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN) .parse_options = lowpan_ndisc_parse_options, .update = lowpan_ndisc_update, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 217be32426b5..458040e8a0e0 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -564,17 +564,20 @@ static int vlan_dev_init(struct net_device *dev) NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | - NETIF_F_ALL_FCOE; + NETIF_F_FCOE_CRC | NETIF_F_FSO; if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; - dev->features |= dev->hw_features | NETIF_F_LLTX; + dev->features |= dev->hw_features; + dev->lltx = true; + dev->fcoe_mtu = true; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); - dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; + dev->vlan_features = real_dev->vlan_features & + ~(NETIF_F_FCOE_CRC | NETIF_F_FSO); dev->hw_enc_features = vlan_tnl_features(real_dev); dev->mpls_features = real_dev->mpls_features; @@ -655,7 +658,6 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); - features |= NETIF_F_LLTX; return features; } diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 87b959da00cd..fa67374bda49 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -238,9 +238,9 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset) stats = dev_get_stats(vlandev, &temp); seq_printf(seq, - "%s VID: %d REORDER_HDR: %i dev->priv_flags: %llx\n", + "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n", vlandev->name, vlan->vlan_id, - (int)(vlan->flags & 1), vlandev->priv_flags); + (int)(vlan->flags & 1), (u32)vlandev->priv_flags); seq_printf(seq, fmt64, "total frames received", stats->rx_packets); seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes); diff --git a/net/Kconfig b/net/Kconfig index d27d0deac0bf..a629f92dc86b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -66,6 +66,12 @@ config SKB_DECRYPTED config SKB_EXTENSIONS bool +config NET_DEVMEM + def_bool y + depends on DMA_SHARED_BUFFER + depends on GENERIC_ALLOCATOR + depends on PAGE_POOL + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 30ecbc2ef1fd..2758aba47a2f 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1020,9 +1020,10 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | 
NETIF_F_NETNS_LOCAL; - dev->features |= NETIF_F_LLTX; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; + dev->netns_local = true; /* can't call min_mtu, because the needed variables * have not been initialized yet diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 00840d5784fe..04f6398b3a40 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -287,7 +287,7 @@ struct batadv_frag_table_entry { /** @lock: lock to protect the list of fragments */ spinlock_t lock; - /** @timestamp: time (jiffie) of last received fragment */ + /** @timestamp: time (jiffy) of last received fragment */ unsigned long timestamp; /** @seqno: sequence number of the fragments in the list */ diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig index c8337786da6b..34e923466236 100644 --- a/net/bluetooth/cmtp/Kconfig +++ b/net/bluetooth/cmtp/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config BT_CMTP - tristate "CMTP protocol support" - depends on BT_BREDR && ISDN_CAPI + tristate "CMTP protocol support (DEPRECATED)" + depends on BT_BREDR && ISDN_CAPI && DEPRECATED help CMTP (CAPI Message Transport Protocol) is a transport layer for CAPI messages. CMTP is required for the Bluetooth Common diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index f3bedc3b613a..884703fda979 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -248,18 +248,10 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s break; case CAPI_FUNCTION_GET_MANUFACTURER: - if (skb->len < CAPI_MSG_BASELEN + 15) - break; - - if (!info && ctrl) { - int len = min_t(uint, CAPI_MANUFACTURER_LEN, - skb->data[CAPI_MSG_BASELEN + 14]); - - memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN); - strncpy(ctrl->manu, - skb->data + CAPI_MSG_BASELEN + 15, len); - } - + if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 14) + strscpy_pad(ctrl->manu, + skb->data + CAPI_MSG_BASELEN + 15, + skb->data[CAPI_MSG_BASELEN + 14]); break; case CAPI_FUNCTION_GET_VERSION: @@ -276,18 +268,10 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s break; case CAPI_FUNCTION_GET_SERIAL_NUMBER: - if (skb->len < CAPI_MSG_BASELEN + 17) - break; - - if (!info && ctrl) { - int len = min_t(uint, CAPI_SERIAL_LEN, - skb->data[CAPI_MSG_BASELEN + 16]); - - memset(ctrl->serial, 0, CAPI_SERIAL_LEN); - strncpy(ctrl->serial, - skb->data + CAPI_MSG_BASELEN + 17, len); - } - + if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 16) + strscpy_pad(ctrl->serial, + skb->data + CAPI_MSG_BASELEN + 17, + skb->data[CAPI_MSG_BASELEN + 16]); break; } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 8e48ccd2af30..d083117ee36c 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -106,8 +106,7 @@ void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) * where a timeout + cancel does indicate an actual failure. */ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); + mgmt_connect_failed(hdev, conn, status); /* The connection attempt was doing scan for new RPA, and is * in scan phase. 
If params are not associated with any other @@ -778,7 +777,6 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c if (!d) return -ENOMEM; - memset(d, 0, sizeof(*d)); d->big = big; d->sync_handle = conn->sync_handle; @@ -1250,8 +1248,7 @@ void hci_conn_failed(struct hci_conn *conn, u8 status) hci_le_conn_failed(conn, status); break; case ACL_LINK: - mgmt_connect_failed(hdev, &conn->dst, conn->type, - conn->dst_type, status); + mgmt_connect_failed(hdev, conn, status); break; } @@ -2952,5 +2949,9 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) return 0; } - return hci_cmd_sync_queue_once(hdev, abort_conn_sync, conn, NULL); + /* Run immediately if on cmd_sync_work since this may be called + * as a result to MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which does + * already queue its callback on cmd_sync_work. + */ + return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 06da8ac13dca..d6976db02c06 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2406,10 +2406,16 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); - if (action == PM_SUSPEND_PREPARE) + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: ret = hci_suspend_dev(hdev); - else if (action == PM_POST_SUSPEND) + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: ret = hci_resume_dev(hdev); + break; + } if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", @@ -3664,19 +3670,19 @@ static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; - int quote, cnt, tmp; + int quote, *cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; - cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; + cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; - __check_timeout(hdev, cnt, LE_LINK); + __check_timeout(hdev, *cnt, LE_LINK); - tmp = cnt; - while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { + tmp = *cnt; + while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, @@ -3691,7 +3697,7 @@ static void hci_sched_le(struct hci_dev *hdev) hci_send_frame(hdev, skb); hdev->le_last_tx = jiffies; - cnt--; + (*cnt)--; chan->sent++; chan->conn->sent++; @@ -3701,12 +3707,7 @@ static void hci_sched_le(struct hci_dev *hdev) } } - if (hdev->le_pkts) - hdev->le_cnt = cnt; - else - hdev->acl_cnt = cnt; - - if (cnt != tmp) + if (*cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index d0c118c47f6c..1c82dcdf6e8f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5920,7 +5920,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, * while we have an existing one in peripheral role. 
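/*
 * Aside (illustrative sketch, not part of the patch): the hci_sched_le()
 * refactor above replaces the local "int cnt" copy with a pointer into
 * struct hci_dev, so each "(*cnt)--" lands directly in hdev->le_cnt or
 * hdev->acl_cnt and the copy-back branch at the end of the function can
 * be dropped. The same pattern, standalone, with made-up names:
 */
struct budget { int le_pkts, le_cnt, acl_cnt; };

static void consume(struct budget *b, int used)
{
	int *cnt = b->le_pkts ? &b->le_cnt : &b->acl_cnt;

	while (used-- > 0 && *cnt > 0)
		(*cnt)--;	/* persists in *b, no write-back needed */
}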
*/ if (hdev->conn_hash.le_num_peripheral > 0 && - (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + (test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) || !(hdev->le_states[3] & 0x10))) return NULL; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e79cd40bd079..40ccdef168d7 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -112,7 +112,7 @@ static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen, skb_queue_tail(&req->cmd_q, skb); } -static int hci_cmd_sync_run(struct hci_request *req) +static int hci_req_sync_run(struct hci_request *req) { struct hci_dev *hdev = req->hdev; struct sk_buff *skb; @@ -169,7 +169,7 @@ struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen, hdev->req_status = HCI_REQ_PEND; - err = hci_cmd_sync_run(&req); + err = hci_req_sync_run(&req); if (err < 0) return ERR_PTR(err); @@ -782,6 +782,44 @@ int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, } EXPORT_SYMBOL(hci_cmd_sync_queue_once); +/* Run HCI command: + * + * - hdev must be running + * - if on cmd_sync_work then run immediately otherwise queue + */ +int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + /* Only queue command if hdev is running which means it had been opened + * and is either on init phase or is already up. + */ + if (!test_bit(HCI_RUNNING, &hdev->flags)) + return -ENETDOWN; + + /* If on cmd_sync_work then run immediately otherwise queue */ + if (current_work() == &hdev->cmd_sync_work) + return func(hdev, data); + + return hci_cmd_sync_submit(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_run); + +/* Run HCI command entry once: + * + * - Lookup if an entry already exist and only if it doesn't creates a new entry + * and run it. + * - if on cmd_sync_work then run immediately otherwise queue + */ +int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func, + void *data, hci_cmd_sync_work_destroy_t destroy) +{ + if (hci_cmd_sync_lookup_entry(hdev, func, data, destroy)) + return 0; + + return hci_cmd_sync_run(hdev, func, data, destroy); +} +EXPORT_SYMBOL(hci_cmd_sync_run_once); + /* Lookup HCI command entry: * * - Return first entry that matches by function callback or data or @@ -5342,7 +5380,10 @@ int hci_stop_discovery_sync(struct hci_dev *hdev) if (!e) return 0; - return hci_remote_name_cancel_sync(hdev, &e->data.bdaddr); + /* Ignore cancel errors since it should interfere with stopping + * of the discovery. + */ + hci_remote_name_cancel_sync(hdev, &e->data.bdaddr); } return 0; diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c index f46847632ffa..6e349704efe4 100644 --- a/net/bluetooth/leds.c +++ b/net/bluetooth/leds.c @@ -48,7 +48,7 @@ static int power_activate(struct led_classdev *led_cdev) htrig = to_hci_basic_led_trigger(led_cdev->trigger); powered = test_bit(HCI_UP, &htrig->hdev->flags); - led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF); + led_set_brightness(led_cdev, powered ? 
LED_FULL : LED_OFF); return 0; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 40d4887c7f79..e4f564d6f6fb 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -2830,16 +2830,6 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys, key_count); - for (i = 0; i < key_count; i++) { - struct mgmt_link_key_info *key = &cp->keys[i]; - - /* Considering SMP over BREDR/LE, there is no need to check addr_type */ - if (key->type > 0x08) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_LOAD_LINK_KEYS, - MGMT_STATUS_INVALID_PARAMS); - } - hci_dev_lock(hdev); hci_link_keys_clear(hdev); @@ -2864,6 +2854,19 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, continue; } + if (key->addr.type != BDADDR_BREDR) { + bt_dev_warn(hdev, + "Invalid link address type %u for %pMR", + key->addr.type, &key->addr.bdaddr); + continue; + } + + if (key->type > 0x08) { + bt_dev_warn(hdev, "Invalid link key type %u for %pMR", + key->type, &key->addr.bdaddr); + continue; + } + /* Always ignore debug keys and require a new pairing if * the user wants to use them. */ @@ -2921,7 +2924,12 @@ static int unpair_device_sync(struct hci_dev *hdev, void *data) if (!conn) return 0; - return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM); + /* Disregard any possible error since the likes of hci_abort_conn_sync + * will clean up the connection no matter the error. + */ + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + + return 0; } static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data, @@ -3053,13 +3061,44 @@ unlock: return err; } +static void disconnect_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + + cmd->cmd_complete(cmd, mgmt_status(err)); + mgmt_pending_free(cmd); +} + +static int disconnect_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_disconnect *cp = cmd->param; + struct hci_conn *conn; + + if (cp->addr.type == BDADDR_BREDR) + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, + &cp->addr.bdaddr); + else + conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + + if (!conn) + return -ENOTCONN; + + /* Disregard any possible error since the likes of hci_abort_conn_sync + * will clean up the connection no matter the error. 
+ */ + hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + + return 0; +} + static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_cp_disconnect *cp = data; struct mgmt_rp_disconnect rp; struct mgmt_pending_cmd *cmd; - struct hci_conn *conn; int err; bt_dev_dbg(hdev, "sock %p", sk); @@ -3082,27 +3121,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, goto failed; } - if (pending_find(MGMT_OP_DISCONNECT, hdev)) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_BUSY, &rp, sizeof(rp)); - goto failed; - } - - if (cp->addr.type == BDADDR_BREDR) - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, - &cp->addr.bdaddr); - else - conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr, - le_addr_type(cp->addr.type)); - - if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) { - err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT, - MGMT_STATUS_NOT_CONNECTED, &rp, - sizeof(rp)); - goto failed; - } - - cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, hdev, data, len); + cmd = mgmt_pending_new(sk, MGMT_OP_DISCONNECT, hdev, data, len); if (!cmd) { err = -ENOMEM; goto failed; @@ -3110,9 +3129,10 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, cmd->cmd_complete = generic_cmd_complete; - err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM); + err = hci_cmd_sync_queue(hdev, disconnect_sync, cmd, + disconnect_complete); if (err < 0) - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); failed: hci_dev_unlock(hdev); @@ -3456,6 +3476,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, * will be kept and this function does nothing. */ p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + if (!p) { + err = -EIO; + goto unlock; + } if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) p->auto_connect = HCI_AUTO_CONN_DISABLED; @@ -7068,7 +7092,6 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *irk = &cp->irks[i]; - u8 addr_type = le_addr_type(irk->addr.type); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, @@ -7078,12 +7101,8 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, continue; } - /* When using SMP over BR/EDR, the addr type should be set to BREDR */ - if (irk->addr.type == BDADDR_BREDR) - addr_type = BDADDR_BREDR; - hci_add_irk(hdev, &irk->addr.bdaddr, - addr_type, irk->val, + le_addr_type(irk->addr.type), irk->val, BDADDR_ANY); } @@ -7148,15 +7167,6 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "key_count %u", key_count); - for (i = 0; i < key_count; i++) { - struct mgmt_ltk_info *key = &cp->keys[i]; - - if (!ltk_is_valid(key)) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_LOAD_LONG_TERM_KEYS, - MGMT_STATUS_INVALID_PARAMS); - } - hci_dev_lock(hdev); hci_smp_ltks_clear(hdev); @@ -7164,7 +7174,6 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; u8 type, authenticated; - u8 addr_type = le_addr_type(key->addr.type); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, @@ -7174,6 +7183,12 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, continue; } + if (!ltk_is_valid(key)) { + bt_dev_warn(hdev, "Invalid LTK for %pMR", + &key->addr.bdaddr); + continue; + } + switch (key->type) { case MGMT_LTK_UNAUTHENTICATED: authenticated = 0x00; @@ -7199,12 +7214,8 @@ static int 
load_long_term_keys(struct sock *sk, struct hci_dev *hdev, continue; } - /* When using SMP over BR/EDR, the addr type should be set to BREDR */ - if (key->addr.type == BDADDR_BREDR) - addr_type = BDADDR_BREDR; - hci_add_ltk(hdev, &key->addr.bdaddr, - addr_type, type, authenticated, + le_addr_type(key->addr.type), type, authenticated, key->val, key->enc_size, key->ediv, key->rand); } @@ -9498,7 +9509,7 @@ void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key, ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &key->bdaddr); - ev.key.addr.type = link_to_bdaddr(key->link_type, key->bdaddr_type); + ev.key.addr.type = BDADDR_BREDR; ev.key.type = key->type; memcpy(ev.key.val, key->val, HCI_LINK_KEY_SIZE); ev.key.pin_len = key->pin_len; @@ -9549,7 +9560,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent) ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &key->bdaddr); - ev.key.addr.type = link_to_bdaddr(key->link_type, key->bdaddr_type); + ev.key.addr.type = link_to_bdaddr(LE_LINK, key->bdaddr_type); ev.key.type = mgmt_ltk_type(key); ev.key.enc_size = key->enc_size; ev.key.ediv = key->ediv; @@ -9578,7 +9589,7 @@ void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent) bacpy(&ev.rpa, &irk->rpa); bacpy(&ev.irk.addr.bdaddr, &irk->bdaddr); - ev.irk.addr.type = link_to_bdaddr(irk->link_type, irk->addr_type); + ev.irk.addr.type = link_to_bdaddr(LE_LINK, irk->addr_type); memcpy(ev.irk.val, irk->val, sizeof(irk->val)); mgmt_event(MGMT_EV_NEW_IRK, hdev, &ev, sizeof(ev), NULL); @@ -9607,7 +9618,7 @@ void mgmt_new_csrk(struct hci_dev *hdev, struct smp_csrk *csrk, ev.store_hint = persistent; bacpy(&ev.key.addr.bdaddr, &csrk->bdaddr); - ev.key.addr.type = link_to_bdaddr(csrk->link_type, csrk->bdaddr_type); + ev.key.addr.type = link_to_bdaddr(LE_LINK, csrk->bdaddr_type); ev.key.type = csrk->type; memcpy(ev.key.val, csrk->val, sizeof(csrk->val)); @@ -9685,18 +9696,6 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, mgmt_event_skb(skb, NULL); } -static void disconnect_rsp(struct mgmt_pending_cmd *cmd, void *data) -{ - struct sock **sk = data; - - cmd->cmd_complete(cmd, 0); - - *sk = cmd->sk; - sock_hold(*sk); - - mgmt_pending_remove(cmd); -} - static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) { struct hci_dev *hdev = data; @@ -9740,8 +9739,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (link_type != ACL_LINK && link_type != LE_LINK) return; - mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk); - bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); ev.reason = reason; @@ -9754,9 +9751,6 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (sk) sock_put(sk); - - mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, - hdev); } void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, @@ -9785,13 +9779,18 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, mgmt_pending_remove(cmd); } -void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, - u8 addr_type, u8 status) +void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, u8 status) { struct mgmt_ev_connect_failed ev; - bacpy(&ev.addr.bdaddr, bdaddr); - ev.addr.type = link_to_bdaddr(link_type, addr_type); + if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) { + mgmt_device_disconnected(hdev, &conn->dst, conn->type, + conn->dst_type, status, true); + 
return; + } + + bacpy(&ev.addr.bdaddr, &conn->dst); + ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type); ev.status = mgmt_status(status); mgmt_event(MGMT_EV_CONNECT_FAILED, hdev, &ev, sizeof(ev), NULL); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 1e7ea3a4b7ef..8b9724fd752a 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -914,7 +914,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, * Confirms and the responder Enters the passkey. */ if (smp->method == OVERLAP) { - if (hcon->role == HCI_ROLE_MASTER) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp->method = CFM_PASSKEY; else smp->method = REQ_PASSKEY; @@ -964,7 +964,7 @@ static u8 smp_confirm(struct smp_chan *smp) smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp); - if (conn->hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); else SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -980,7 +980,8 @@ static u8 smp_random(struct smp_chan *smp) int ret; bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn, - conn->hcon->out ? "initiator" : "responder"); + test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" : + "responder"); ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp, hcon->init_addr_type, &hcon->init_addr, @@ -994,7 +995,7 @@ static u8 smp_random(struct smp_chan *smp) return SMP_CONFIRM_FAILED; } - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 stk[16]; __le64 rand = 0; __le16 ediv = 0; @@ -1059,7 +1060,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) } if (smp->remote_irk) { - smp->remote_irk->link_type = hcon->type; mgmt_new_irk(hdev, smp->remote_irk, persistent); /* Now that user space can be considered to know the @@ -1079,28 +1079,24 @@ static void smp_notify_keys(struct l2cap_conn *conn) } if (smp->csrk) { - smp->csrk->link_type = hcon->type; smp->csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->csrk, persistent); } if (smp->responder_csrk) { - smp->responder_csrk->link_type = hcon->type; smp->responder_csrk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_csrk->bdaddr, &hcon->dst); mgmt_new_csrk(hdev, smp->responder_csrk, persistent); } if (smp->ltk) { - smp->ltk->link_type = hcon->type; smp->ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->ltk, persistent); } if (smp->responder_ltk) { - smp->responder_ltk->link_type = hcon->type; smp->responder_ltk->bdaddr_type = hcon->dst_type; bacpy(&smp->responder_ltk->bdaddr, &hcon->dst); mgmt_new_ltk(hdev, smp->responder_ltk, persistent); @@ -1120,8 +1116,6 @@ static void smp_notify_keys(struct l2cap_conn *conn) key = hci_add_link_key(hdev, smp->conn->hcon, &hcon->dst, smp->link_key, type, 0, &persistent); if (key) { - key->link_type = hcon->type; - key->bdaddr_type = hcon->dst_type; mgmt_new_link_key(hdev, key, persistent); /* Don't keep debug keys around if the relevant @@ -1256,14 +1250,15 @@ static void smp_distribute_keys(struct smp_chan *smp) rsp = (void *) &smp->prsp[1]; /* The responder sends its keys first */ - if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) && + (smp->remote_key_dist & KEY_DIST_MASK)) { smp_allow_key_dist(smp); return; } req = (void *) &smp->preq[1]; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { keydist = &rsp->init_key_dist; *keydist &= req->init_key_dist; } else { @@ -1432,7 +1427,7 @@ static int 
sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16]) struct hci_conn *hcon = smp->conn->hcon; u8 *na, *nb, a[7], b[7]; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { na = smp->prnd; nb = smp->rrnd; } else { @@ -1460,7 +1455,7 @@ static void sc_dhkey_check(struct smp_chan *smp) a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->preq[1], 3); @@ -1539,7 +1534,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) /* The round is only complete when the initiator * receives pairing random. */ - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); if (smp->passkey_round == 20) @@ -1567,7 +1562,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); return 0; @@ -1578,7 +1573,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op) case SMP_CMD_PUBLIC_KEY: default: /* Initiating device starts the round */ - if (!hcon->out) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; bt_dev_dbg(hdev, "Starting passkey round %u", @@ -1623,7 +1618,7 @@ static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey) } /* Initiator sends DHKey check first */ - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) { @@ -1746,7 +1741,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_cmd_pairing rsp, *req = (void *) skb->data; struct l2cap_chan *chan = conn->smp; struct hci_dev *hdev = conn->hcon->hdev; - struct smp_chan *smp; + struct smp_chan *smp = chan->data; u8 key_size, auth, sec_level; int ret; @@ -1755,16 +1750,14 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*req)) return SMP_INVALID_PARAMS; - if (conn->hcon->role != HCI_ROLE_SLAVE) + if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; - if (!chan->data) + if (!smp) { smp = smp_chan_create(conn); - else - smp = chan->data; - - if (!smp) - return SMP_UNSPECIFIED; + if (!smp) + return SMP_UNSPECIFIED; + } /* We didn't start the pairing, so match remote */ auth = req->auth_req & AUTH_REQ_MASK(hdev); @@ -1946,7 +1939,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb) if (skb->len < sizeof(*rsp)) return SMP_INVALID_PARAMS; - if (conn->hcon->role != HCI_ROLE_MASTER) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_CMD_NOTSUPP; skb_pull(skb, sizeof(*rsp)); @@ -2041,7 +2034,7 @@ static u8 sc_check_confirm(struct smp_chan *smp) if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM); - if (conn->hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -2063,7 +2056,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp) u8 auth; /* The issue is only observed when we're in responder role */ - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return SMP_UNSPECIFIED; if 
(hci_dev_test_flag(hdev, HCI_SC_ONLY)) { @@ -2099,7 +2092,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) struct hci_dev *hdev = hcon->hdev; bt_dev_dbg(hdev, "conn %p %s", conn, - hcon->out ? "initiator" : "responder"); + test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" : + "responder"); if (skb->len < sizeof(smp->pcnf)) return SMP_INVALID_PARAMS; @@ -2121,7 +2115,7 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb) return ret; } - if (conn->hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM); @@ -2156,7 +2150,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_bit(SMP_FLAG_SC, &smp->flags)) return smp_random(smp); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { pkax = smp->local_pk; pkbx = smp->remote_pk; na = smp->prnd; @@ -2169,7 +2163,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) } if (smp->method == REQ_OOB) { - if (!hcon->out) + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2180,7 +2174,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY) return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { u8 cfm[16]; err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk, @@ -2221,7 +2215,7 @@ mackey_and_ltk: return SMP_UNSPECIFIED; if (smp->method == REQ_OOB) { - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); } @@ -2295,10 +2289,27 @@ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level, return false; } +static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth) +{ + struct smp_cmd_pairing cp; + + if (smp->conn->hcon->type == ACL_LINK) + build_bredr_pairing_cmd(smp, &cp, NULL); + else + build_pairing_cmd(smp->conn, &cp, NULL, auth); + + smp->preq[0] = SMP_CMD_PAIRING_REQ; + memcpy(&smp->preq[1], &cp, sizeof(cp)); + + smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); + + set_bit(SMP_FLAG_INITIATOR, &smp->flags); +} + static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) { struct smp_cmd_security_req *rp = (void *) skb->data; - struct smp_cmd_pairing cp; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; struct smp_chan *smp; @@ -2347,16 +2358,20 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) skb_pull(skb, sizeof(*rp)); - memset(&cp, 0, sizeof(cp)); - build_pairing_cmd(conn, &cp, NULL, auth); + smp_send_pairing_req(smp, auth); - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &cp, sizeof(cp)); + return 0; +} - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); +static void smp_send_security_req(struct smp_chan *smp, __u8 auth) +{ + struct smp_cmd_security_req cp; - return 0; + cp.auth_req = auth; + smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); + SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); + + clear_bit(SMP_FLAG_INITIATOR, &smp->flags); } int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) @@ -2427,23 
+2442,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_MITM; } - if (hcon->role == HCI_ROLE_MASTER) { - struct smp_cmd_pairing cp; - - build_pairing_cmd(conn, &cp, NULL, authreq); - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &cp, sizeof(cp)); - - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); - } else { - struct smp_cmd_security_req cp; - cp.auth_req = authreq; - smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ); - } + if (hcon->role == HCI_ROLE_MASTER) + smp_send_pairing_req(smp, authreq); + else + smp_send_security_req(smp, authreq); - set_bit(SMP_FLAG_INITIATOR, &smp->flags); ret = 0; unlock: @@ -2694,8 +2697,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb) static u8 sc_select_method(struct smp_chan *smp) { - struct l2cap_conn *conn = smp->conn; - struct hci_conn *hcon = conn->hcon; struct smp_cmd_pairing *local, *remote; u8 local_mitm, remote_mitm, local_io, remote_io, method; @@ -2708,7 +2709,7 @@ static u8 sc_select_method(struct smp_chan *smp) * the "struct smp_cmd_pairing" from them we need to skip the * first byte which contains the opcode. */ - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local = (void *) &smp->preq[1]; remote = (void *) &smp->prsp[1]; } else { @@ -2777,7 +2778,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) /* Non-initiating device sends its public key after receiving * the key from the initiating device. */ - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { err = sc_send_public_key(smp); if (err) return err; @@ -2839,7 +2840,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) } if (smp->method == REQ_OOB) { - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); @@ -2848,7 +2849,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } - if (hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM); if (smp->method == REQ_PASSKEY) { @@ -2863,7 +2864,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) /* The Initiating device waits for the non-initiating device to * send the confirm value. 
*/ - if (conn->hcon->out) + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) return 0; err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd, @@ -2897,7 +2898,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) a[6] = hcon->init_addr_type; b[6] = hcon->resp_addr_type; - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { local_addr = a; remote_addr = b; memcpy(io_cap, &smp->prsp[1], 3); @@ -2922,7 +2923,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) if (crypto_memneq(check->e, e, 16)) return SMP_DHKEY_CHECK_FAILED; - if (!hcon->out) { + if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) { set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags); return 0; @@ -2934,7 +2935,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb) sc_add_ltk(smp); - if (hcon->out) { + if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) { hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size); hcon->enc_key_size = smp->enc_key_size; } @@ -3083,7 +3084,6 @@ static void bredr_pairing(struct l2cap_chan *chan) struct l2cap_conn *conn = chan->conn; struct hci_conn *hcon = conn->hcon; struct hci_dev *hdev = hcon->hdev; - struct smp_cmd_pairing req; struct smp_chan *smp; bt_dev_dbg(hdev, "chan %p", chan); @@ -3135,14 +3135,7 @@ static void bredr_pairing(struct l2cap_chan *chan) bt_dev_dbg(hdev, "starting SMP over BR/EDR"); - /* Prepare and send the BR/EDR SMP Pairing Request */ - build_bredr_pairing_cmd(smp, &req, NULL); - - smp->preq[0] = SMP_CMD_PAIRING_REQ; - memcpy(&smp->preq[1], &req, sizeof(req)); - - smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(req), &req); - SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP); + smp_send_pairing_req(smp, 0x00); } static void smp_resume_cb(struct l2cap_chan *chan) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index fb1115857e49..26b79feb385d 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -487,9 +487,11 @@ void br_dev_setup(struct net_device *dev) dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; + dev->lltx = true; + dev->netns_local = true; - dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | - NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c77591e63841..ad7a42b505ef 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1469,12 +1469,10 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } - if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { + if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) { /* Refresh entry */ fdb->used = jiffies; - } else if (!test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags)) { - /* Take over SW learned entry */ - set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); + } else { modified = true; } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8f9c19d992ac..0e8bc0ea6175 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -36,6 +36,7 @@ #include <net/route.h> #include <net/netfilter/br_netfilter.h> #include <net/netns/generic.h> +#include <net/inet_dscp.h> #include <linux/uaccess.h> 
#include "br_private.h" @@ -402,7 +403,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ goto free_skb; rt = ip_route_output(net, iph->daddr, 0, - RT_TOS(iph->tos), 0, + iph->tos & INET_DSCP_MASK, 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { /* - Bridged-and-DNAT'ed traffic doesn't diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index cbd0e3586c3f..3e67d4aff419 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1256,7 +1256,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, goto free_unlock; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; if (newinfo->nentries) diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index bd4d1b4d745f..d12a221366d6 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -142,7 +142,7 @@ static int nft_meta_bridge_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len); if (err < 0) return err; @@ -168,8 +168,7 @@ static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track, } static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -179,7 +178,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, hooks = 1 << NF_BR_PRE_ROUTING; break; default: - return nft_meta_set_validate(ctx, expr, data); + return nft_meta_set_validate(ctx, expr); } return nft_chain_validate_hooks(ctx->chain, hooks); diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 71b54fed7263..1cb5c16e97b7 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -170,8 +170,7 @@ out: } static int nft_reject_bridge_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_IN)); diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index 2ae8cfa3df88..96236d21b18e 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -298,10 +298,8 @@ struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) { return dstpkt; } - if (expectlen > addlen) - neededtailspace = expectlen; - else - neededtailspace = addlen; + + neededtailspace = max(expectlen, addlen); if (dst->tail + neededtailspace > dst->end) { /* Create a dumplicate of 'dst' with more tail space */ diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 47901bd4def1..94ad09e36df2 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -47,7 +47,6 @@ struct chnl_net { struct caif_connect_request conn_req; struct list_head list_field; struct net_device *netdev; - char name[256]; wait_queue_head_t netmgmt_wq; /* Flow status to remember and control the transmission. 
*/ bool flowenabled; @@ -347,7 +346,6 @@ static int chnl_net_init(struct net_device *dev) struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); - strncpy(priv->name, dev->name, sizeof(priv->name)); INIT_LIST_HEAD(&priv->list_field); return 0; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 27d5fcf0eac9..217049fa496e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1470,6 +1470,12 @@ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { +#if IS_ENABLED(CONFIG_PROC_FS) + if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) { + remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir); + bo->bcm_proc_read = NULL; + } +#endif bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 4be73de5033c..319f47df3330 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1179,10 +1179,10 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) break; case -ENETDOWN: /* In this case we should get a netdev_event(), all active - * sessions will be cleared by - * j1939_cancel_all_active_sessions(). So handle this as an - * error, but let j1939_cancel_all_active_sessions() do the - * cleanup including propagation of the error to user space. + * sessions will be cleared by j1939_cancel_active_session(). + * So handle this as an error, but let + * j1939_cancel_active_session() do the cleanup including + * propagation of the error to user space. */ break; case -EOVERFLOW: diff --git a/net/core/Makefile b/net/core/Makefile index 62be9aef2528..c3ebbaf9c81e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_rx_queue.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o @@ -43,3 +44,4 @@ obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o +obj-$(CONFIG_NET_DEVMEM) += devmem.o diff --git a/net/core/datagram.c b/net/core/datagram.c index a40f733b37d7..f0693707aece 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -407,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, return 0; } + if (!skb_frags_readable(skb)) + goto short_copy; + /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -623,6 +626,9 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, { int frag = skb_shinfo(skb)->nr_frags; + if (!skb_frags_readable(skb)) + return -EFAULT; + while (length && iov_iter_count(from)) { struct page *head, *last_head = NULL; struct page *pages[MAX_SKB_FRAGS]; diff --git a/net/core/dev.c b/net/core/dev.c index f66e61407883..cd479f5f22f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -158,8 +158,10 @@ #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> #include <net/rps.h> +#include <linux/phy_link_topology.h> #include "dev.h" +#include "devmem.h" #include "net-sysfs.h" static DEFINE_SPINLOCK(ptype_lock); @@ -3310,6 +3312,10 @@ int skb_checksum_help(struct sk_buff *skb) return -EINVAL; } + if (!skb_frags_readable(skb)) { + return -EFAULT; + } + /* Before computing a checksum, we should make sure no frag could * be modified by an external entity : checksum could be wrong. */ @@ -3386,6 +3392,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL(skb_crc32c_csum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3431,8 +3438,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = skb_frag_page(frag); - if (PageHighMem(skb_frag_page(frag))) + if (page && PageHighMem(page)) return 1; } } @@ -3705,7 +3713,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d next = skb->next; skb_mark_not_on_list(skb); - /* in case skb wont be segmented, point to itself */ + /* in case skb won't be segmented, point to itself */ skb->prev = skb; skb = validate_xmit_skb(skb, dev, again); @@ -4245,13 +4253,6 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); -u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev) -{ - return (u16)raw_smp_processor_id() % dev->real_num_tx_queues; -} -EXPORT_SYMBOL(dev_pick_tx_cpu_id); - u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -5248,7 +5249,7 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); -static __latent_entropy void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -5725,10 +5726,9 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo struct packet_type *pt_curr = NULL; /* Current (common) orig_dev of sublist */ struct net_device *od_curr = NULL; - struct list_head sublist; struct sk_buff *skb, *next; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *orig_dev = skb->dev; struct packet_type *pt_prev = NULL; @@ -5866,9 +5866,8 @@ static int netif_receive_skb_internal(struct sk_buff *skb) void netif_receive_skb_list_internal(struct list_head *head) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb); @@ -6921,7 +6920,7 @@ static int napi_threaded_poll(void *data) return 0; } -static __latent_entropy void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(void) { 
struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + @@ -9272,7 +9271,7 @@ EXPORT_SYMBOL(netdev_port_same_parent_id); */ int dev_change_proto_down(struct net_device *dev, bool proto_down) { - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) + if (!dev->change_proto_down) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -9369,6 +9368,20 @@ u8 dev_xdp_prog_count(struct net_device *dev) } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + if (!dev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); + return -EBUSY; + } + + return dev->netdev_ops->ndo_bpf(dev, bpf); +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { struct bpf_prog *prog = dev_xdp_prog(dev, mode); @@ -9397,6 +9410,11 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + if (dev_get_min_mp_channel_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); + return -EBUSY; + } + memset(&xdp, 0, sizeof(xdp)); xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; @@ -9821,6 +9839,20 @@ err_out: return err; } +u32 dev_get_min_mp_channel_count(const struct net_device *dev) +{ + int i; + + ASSERT_RTNL(); + + for (i = dev->real_num_rx_queues - 1; i >= 0; i--) + if (dev->_rx[i].mp_params.mp_priv) + /* The channel count is the idx plus 1. */ + return i + 1; + + return 0; +} + /** * dev_index_reserve() - allocate an ifindex in a namespace * @net: the applicable net namespace @@ -10321,6 +10353,17 @@ static void netdev_do_free_pcpu_stats(struct net_device *dev) } } +static void netdev_free_phy_link_topology(struct net_device *dev) +{ + struct phy_link_topology *topo = dev->link_topo; + + if (IS_ENABLED(CONFIG_PHYLIB) && topo) { + xa_destroy(&topo->phys); + kfree(topo); + dev->link_topo = NULL; + } +} + /** * register_netdevice() - register a network device * @dev: device to register @@ -10868,7 +10911,7 @@ noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset) return; } - field = (__force unsigned long __percpu *)((__force void *)p + offset); + field = (unsigned long __percpu *)((void __percpu *)p + offset); this_cpu_inc(*field); } EXPORT_SYMBOL_GPL(netdev_core_stats_inc); @@ -11099,6 +11142,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, #ifdef CONFIG_NET_SCHED hash_init(dev->qdisc_hash); #endif + dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); @@ -11120,7 +11164,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; - strcpy(dev->name, name); + strscpy(dev->name, name); dev->name_assign_type = name_assign_type; dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) @@ -11191,6 +11235,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; + netdev_free_phy_link_topology(dev); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { @@ -11343,6 +11389,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); + dev_dmabuf_uninstall(dev); 
netdev_offload_xstats_disable_all(dev); @@ -11407,7 +11454,7 @@ void unregister_netdevice_many_notify(struct list_head *head, * @head: list of devices * * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. + * we force a list_del() to make sure stack won't be corrupted later. */ void unregister_netdevice_many(struct list_head *head) { @@ -11462,10 +11509,10 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) goto out; - /* Ensure the device has been registrered */ + /* Ensure the device has been registered */ if (dev->reg_state != NETREG_REGISTERED) goto out; @@ -11844,7 +11891,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->features & NETIF_F_NETNS_LOCAL) + if (dev->netns_local) continue; /* Leave virtual devices for the generic cleanup */ @@ -11905,7 +11952,7 @@ static struct pernet_operations __net_initdata default_device_ops = { static void __init net_dev_struct_check(void) { /* TX read-mostly hotpath */ - CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags); + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx); diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index baa63dee2829..166e404f7c03 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -262,7 +262,7 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } /* This function only works where there is a strict 1-1 relationship - * between source and destionation of they synch. If you ever need to + * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ @@ -299,8 +299,8 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, EXPORT_SYMBOL(__hw_addr_unsync); /** - * __hw_addr_sync_dev - Synchonize device's multicast list - * @list: address list to syncronize + * __hw_addr_sync_dev - Synchronize device's multicast list + * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 8592c052c0f4..473c437b6b53 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -317,8 +317,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) * should take precedence in front of hardware timestamping provided by the * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped - * frames and not forward them), it must set IFF_SEE_ALL_HWTSTAMP_REQUESTS in - * dev->priv_flags. + * frames and not forward them), it must set dev->see_all_hwtstamp_requests. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, @@ -332,13 +331,13 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, cfg->source = phy_ts ? 
HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (phy_ts && dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } - if (!phy_ts || (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) { + if (!phy_ts || dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) @@ -347,7 +346,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, } } - if (phy_ts && (dev->priv_flags & IFF_SEE_ALL_HWTSTAMP_REQUESTS)) + if (phy_ts && dev->see_all_hwtstamp_requests) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { diff --git a/net/core/devmem.c b/net/core/devmem.c new file mode 100644 index 000000000000..11b91c12ee11 --- /dev/null +++ b/net/core/devmem.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Devmem TCP + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemdebruijn.kernel@gmail.com> + * Kaiyuan Zhang <kaiyuanz@google.com + */ + +#include <linux/dma-buf.h> +#include <linux/genalloc.h> +#include <linux/mm.h> +#include <linux/netdevice.h> +#include <linux/types.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> +#include <net/page_pool/helpers.h> +#include <trace/events/page_pool.h> + +#include "devmem.h" +#include "mp_dmabuf_devmem.h" +#include "page_pool_priv.h" + +/* Device memory support */ + +/* Protected by rtnl_lock() */ +static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); + +static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, + struct gen_pool_chunk *chunk, + void *not_used) +{ + struct dmabuf_genpool_chunk_owner *owner = chunk->owner; + + kvfree(owner->niovs); + kfree(owner); +} + +static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + + return owner->base_dma_addr + + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); +} + +void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +{ + size_t size, avail; + + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + + size = gen_pool_size(binding->chunk_pool); + avail = gen_pool_avail(binding->chunk_pool); + + if (!WARN(size != avail, "can't destroy genpool. 
size=%zu, avail=%zu", + size, avail)) + gen_pool_destroy(binding->chunk_pool); + + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); + dma_buf_detach(binding->dmabuf, binding->attachment); + dma_buf_put(binding->dmabuf); + xa_destroy(&binding->bound_rxqs); + kfree(binding); +} + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct dmabuf_genpool_chunk_owner *owner; + unsigned long dma_addr; + struct net_iov *niov; + ssize_t offset; + ssize_t index; + + dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE, + (void **)&owner); + if (!dma_addr) + return NULL; + + offset = dma_addr - owner->base_dma_addr; + index = offset / PAGE_SIZE; + niov = &owner->niovs[index]; + + niov->pp_magic = 0; + niov->pp = NULL; + atomic_long_set(&niov->pp_ref_count, 0); + + return niov; +} + +void net_devmem_free_dmabuf(struct net_iov *niov) +{ + struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + unsigned long dma_addr = net_devmem_get_dma_addr(niov); + + if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, + PAGE_SIZE))) + return; + + gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE); +} + +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int rxq_idx; + + if (binding->list.next) + list_del(&binding->list); + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { + WARN_ON(rxq->mp_params.mp_priv != binding); + + rxq->mp_params.mp_priv = NULL; + + rxq_idx = get_netdev_rx_queue_index(rxq); + + WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); + } + + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + net_devmem_dmabuf_binding_put(binding); +} + +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + u32 xa_idx; + int err; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_priv) { + NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); + return -EEXIST; + } + +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, + GFP_KERNEL); + if (err) + return err; + + rxq->mp_params.mp_priv = binding; + + err = netdev_rx_queue_restart(dev, rxq_idx); + if (err) + goto err_xa_erase; + + return 0; + +err_xa_erase: + rxq->mp_params.mp_priv = NULL; + xa_erase(&binding->bound_rxqs, xa_idx); + + return err; +} + +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netlink_ext_ack *extack) +{ + struct net_devmem_dmabuf_binding *binding; + static u32 id_alloc_next; + struct scatterlist *sg; + struct dma_buf *dmabuf; + unsigned int sg_idx, i; + unsigned long virtual; + int err; + + dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(dmabuf)) + return ERR_CAST(dmabuf); + + binding = kzalloc_node(sizeof(*binding), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!binding) { + err = -ENOMEM; + goto err_put_dmabuf; + } + + binding->dev = dev; + + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_binding; + + 
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); + + refcount_set(&binding->ref, 1); + + binding->dmabuf = dmabuf; + + binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + if (IS_ERR(binding->attachment)) { + err = PTR_ERR(binding->attachment); + NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); + goto err_free_id; + } + + binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, + DMA_FROM_DEVICE); + if (IS_ERR(binding->sgt)) { + err = PTR_ERR(binding->sgt); + NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); + goto err_detach; + } + + /* For simplicity we expect to make PAGE_SIZE allocations, but the + * binding can be much more flexible than that. We may be able to + * allocate MTU sized chunks here. Leave that for future work... + */ + binding->chunk_pool = + gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { + err = -ENOMEM; + goto err_unmap; + } + + virtual = 0; + for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) { + dma_addr_t dma_addr = sg_dma_address(sg); + struct dmabuf_genpool_chunk_owner *owner; + size_t len = sg_dma_len(sg); + struct net_iov *niov; + + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, + dev_to_node(&dev->dev)); + if (!owner) { + err = -ENOMEM; + goto err_free_chunks; + } + + owner->base_virtual = virtual; + owner->base_dma_addr = dma_addr; + owner->num_niovs = len / PAGE_SIZE; + owner->binding = binding; + + err = gen_pool_add_owner(binding->chunk_pool, dma_addr, + dma_addr, len, dev_to_node(&dev->dev), + owner); + if (err) { + kfree(owner); + err = -EINVAL; + goto err_free_chunks; + } + + owner->niovs = kvmalloc_array(owner->num_niovs, + sizeof(*owner->niovs), + GFP_KERNEL); + if (!owner->niovs) { + err = -ENOMEM; + goto err_free_chunks; + } + + for (i = 0; i < owner->num_niovs; i++) { + niov = &owner->niovs[i]; + niov->owner = owner; + page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), + net_devmem_get_dma_addr(niov)); + } + + virtual += len; + } + + return binding; + +err_free_chunks: + gen_pool_for_each_chunk(binding->chunk_pool, + net_devmem_dmabuf_free_chunk_owner, NULL); + gen_pool_destroy(binding->chunk_pool); +err_unmap: + dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, + DMA_FROM_DEVICE); +err_detach: + dma_buf_detach(dmabuf, binding->attachment); +err_free_id: + xa_erase(&net_devmem_dmabuf_bindings, binding->id); +err_free_binding: + kfree(binding); +err_put_dmabuf: + dma_buf_put(dmabuf); + return ERR_PTR(err); +} + +void dev_dmabuf_uninstall(struct net_device *dev) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_rx_queue *rxq; + unsigned long xa_idx; + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + binding = dev->_rx[i].mp_params.mp_priv; + if (!binding) + continue; + + xa_for_each(&binding->bound_rxqs, xa_idx, rxq) + if (rxq == &dev->_rx[i]) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + +/*** "Dmabuf devmem memory provider" ***/ + +int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + if (!binding) + return -EINVAL; + + if (!pool->dma_map) + return -EOPNOTSUPP; + + if (pool->dma_sync) + return -EOPNOTSUPP; + + if (pool->p.order != 0) + return -E2BIG; + + net_devmem_dmabuf_binding_get(binding); + return 0; +} + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + struct net_iov *niov; + netmem_ref netmem; + + niov = 
net_devmem_alloc_dmabuf(binding); + if (!niov) + return 0; + + netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); + return netmem; +} + +void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; + + net_devmem_dmabuf_binding_put(binding); +} + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem)); + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + if (WARN_ON_ONCE(refcount != 1)) + return false; + + page_pool_clear_pp_info(netmem); + + net_devmem_free_dmabuf(netmem_to_net_iov(netmem)); + + /* We don't want the page pool put_page()ing our net_iovs. */ + return false; +} diff --git a/net/core/devmem.h b/net/core/devmem.h new file mode 100644 index 000000000000..76099ef9c482 --- /dev/null +++ b/net/core/devmem.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Device memory TCP support + * + * Authors: Mina Almasry <almasrymina@google.com> + * Willem de Bruijn <willemb@google.com> + * Kaiyuan Zhang <kaiyuanz@google.com> + * + */ +#ifndef _NET_DEVMEM_H +#define _NET_DEVMEM_H + +struct netlink_ext_ack; + +struct net_devmem_dmabuf_binding { + struct dma_buf *dmabuf; + struct dma_buf_attachment *attachment; + struct sg_table *sgt; + struct net_device *dev; + struct gen_pool *chunk_pool; + + /* The user holds a ref (via the netlink API) for as long as they want + * the binding to remain alive. Each page pool using this binding holds + * a ref to keep the binding alive. Each allocated net_iov holds a + * ref. + * + * The binding undoes itself and unmaps the underlying dmabuf once all + * those refs are dropped and the binding is no longer desired or in + * use. + */ + refcount_t ref; + + /* The list of bindings currently active. Used for netlink to notify us + * of the user dropping the bind. + */ + struct list_head list; + + /* rxq's this binding is active on. */ + struct xarray bound_rxqs; + + /* ID of this binding. Globally unique to all bindings currently + * active. + */ + u32 id; +}; + +#if defined(CONFIG_NET_DEVMEM) +/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist + * entry from the dmabuf is inserted into the genpool as a chunk, and needs + * this owner struct to keep track of some metadata necessary to create + * allocations from this chunk. + */ +struct dmabuf_genpool_chunk_owner { + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; + + /* dma_addr of the start of the chunk. */ + dma_addr_t base_dma_addr; + + /* Array of net_iovs for this chunk. 
*/ + struct net_iov *niovs; + size_t num_niovs; + + struct net_devmem_dmabuf_binding *binding; +}; + +void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netlink_ext_ack *extack); +void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); +int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack); +void dev_dmabuf_uninstall(struct net_device *dev); + +static inline struct dmabuf_genpool_chunk_owner * +net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + +static inline struct net_devmem_dmabuf_binding * +net_iov_binding(const struct net_iov *niov) +{ + return net_iov_owner(niov)->binding; +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + + return owner->base_virtual + + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); +} + +static inline u32 net_iov_binding_id(const struct net_iov *niov) +{ + return net_iov_owner(niov)->binding->id; +} + +static inline void +net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) +{ + refcount_inc(&binding->ref); +} + +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ + if (!refcount_dec_and_test(&binding->ref)) + return; + + __net_devmem_dmabuf_binding_free(binding); +} + +struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); +void net_devmem_free_dmabuf(struct net_iov *ppiov); + +#else +struct net_devmem_dmabuf_binding; + +static inline void +__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + struct netlink_ext_ack *extack) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void +net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline int +net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, + struct net_devmem_dmabuf_binding *binding, + struct netlink_ext_ack *extack) + +{ + return -EOPNOTSUPP; +} + +static inline void dev_dmabuf_uninstall(struct net_device *dev) +{ +} + +static inline struct net_iov * +net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) +{ + return NULL; +} + +static inline void net_devmem_free_dmabuf(struct net_iov *ppiov) +{ +} + +static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) +{ + return 0; +} + +static inline u32 net_iov_binding_id(const struct net_iov *niov) +{ + return 0; +} +#endif + +#endif /* _NET_DEVMEM_H */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 6ebffbc63236..154a2681f55c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -11,6 +11,7 @@ #include <linux/list.h> #include <linux/module.h> #include <net/net_namespace.h> +#include <net/inet_dscp.h> #include <net/sock.h> #include <net/fib_rules.h> #include <net/ip_tunnels.h> @@ -72,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->suppress_prefixlen = -1; r->suppress_ifgroup = -1; - /* The lock is not required here, the list in unreacheable + /* The lock is not required here, the list is 
unreachable * at the moment this function is called */ list_add_tail(&r->list, &ops->rules_list); return 0; @@ -766,7 +767,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_PROTOCOL] = { .type = NLA_U8 }, [FRA_IP_PROTO] = { .type = NLA_U8 }, [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, - [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } + [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, + [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), }; int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -1205,8 +1207,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, ops->nlgroup, err); + rtnl_set_sk_err(net, ops->nlgroup, err); } static void attach_rules(struct list_head *rules, struct net_device *dev) diff --git a/net/core/filter.c b/net/core/filter.c index e4a4454df5f9..cd3524cb326b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -84,6 +84,7 @@ #include <net/netkit.h> #include <linux/un.h> #include <net/xdp_sock_drv.h> +#include <net/inet_dscp.h> #include "dev.h" @@ -2371,7 +2372,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_tos = ip4h->tos & INET_DSCP_MASK, .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -3189,6 +3190,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); + skb_reset_mac_len(skb); bpf_compute_data_pointers(skb); return ret; @@ -5278,6 +5280,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, return -EINVAL; inet_csk(sk)->icsk_rto_min = timeout; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS)) + return -EINVAL; + tp->bpf_sock_ops_cb_flags = val; + break; default: return -EINVAL; } @@ -5366,6 +5373,17 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, if (*optlen < 1) return -EINVAL; break; + case TCP_BPF_SOCK_OPS_CB_FLAGS: + if (*optlen != sizeof(int)) + return -EINVAL; + if (getopt) { + struct tcp_sock *tp = tcp_sk(sk); + int cb_flags = tp->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, *optlen); + return 0; + } + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); default: if (getopt) return -EINVAL; @@ -5899,7 +5917,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_tos = params->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; @@ -12060,7 +12078,7 @@ int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, } BTF_KFUNCS_START(bpf_kfunc_check_set_skb) -BTF_ID_FLAGS(func, bpf_dynptr_from_skb) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) @@ -12109,6 +12127,7 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); + ret = ret ?: 
register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); diff --git a/net/core/gro.c b/net/core/gro.c index b3b43de1a650..802b4a062400 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -374,7 +374,7 @@ static void gro_list_prepare(const struct list_head *head, skb_mac_header(skb), maclen); - /* in most common scenarions 'slow_gro' is 0 + /* in most common scenarios 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually @@ -408,7 +408,8 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) pinfo = skb_shinfo(skb); frag0 = &pinfo->frags[0]; - if (pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && + if (pinfo->nr_frags && skb_frag_page(frag0) && + !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index afb05f58b64c..1a14f915b7a4 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -12,6 +12,7 @@ #include <net/gre.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> +#include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; @@ -205,7 +206,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h new file mode 100644 index 000000000000..67cd0dd7319c --- /dev/null +++ b/net/core/mp_dmabuf_devmem.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Dmabuf device memory provider. 
+ * + * Authors: Mina Almasry <almasrymina@google.com> + * + */ +#ifndef _NET_MP_DMABUF_DEVMEM_H +#define _NET_MP_DMABUF_DEVMEM_H + +#include <net/netmem.h> + +#if defined(CONFIG_NET_DEVMEM) +int mp_dmabuf_devmem_init(struct page_pool *pool); + +netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp); + +void mp_dmabuf_devmem_destroy(struct page_pool *pool); + +bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem); +#else +static inline int mp_dmabuf_devmem_init(struct page_pool *pool) +{ + return -EOPNOTSUPP; +} + +static inline netmem_ref +mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp) +{ + return 0; +} + +static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool) +{ +} + +static inline bool +mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) +{ + return false; +} +#endif + +#endif /* _NET_MP_DMABUF_DEVMEM_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a6fe88eca939..77b819cd995b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3530,8 +3530,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } void neigh_app_ns(struct neighbour *n) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0e2084ce7b75..05cf5347f25e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -32,6 +32,7 @@ #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; +static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; @@ -235,7 +236,7 @@ static ssize_t speed_show(struct device *dev, if (!rtnl_trylock()) return restart_syscall(); - if (netif_running(netdev) && netif_device_present(netdev)) { + if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) @@ -425,6 +426,9 @@ NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { + if (val > S32_MAX) + return -ERANGE; + WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } @@ -438,7 +442,7 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev, return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); } -NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -1056,7 +1060,7 @@ static const void *rx_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1524,7 +1528,7 @@ static const struct attribute_group dql_group = { }; #else /* Fake declaration, all the code using it should be dead */ -extern const struct attribute_group dql_group; +static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS @@ -1740,7 +1744,7 @@ static const void *netdev_queue_namespace(const struct kobject *kobj) struct device *dev = &queue->dev->dev; const void *ns = NULL; - if (dev->class && dev->class->ns_type) + if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; @@ -1764,8 +1768,7 @@ static 
const struct kobj_type netdev_queue_ktype = { static bool netdev_uses_bql(const struct net_device *dev) { - if (dev->features & NETIF_F_LLTX || - dev->priv_flags & IFF_NO_QUEUE) + if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) return false; return IS_ENABLED(CONFIG_BQL); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6a823ba906c6..11e4dd4f09ed 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -125,7 +125,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) int err = -ENOMEM; void *data = NULL; - if (ops->id && ops->size) { + if (ops->id) { data = kzalloc(ops->size, GFP_KERNEL); if (!data) goto out; @@ -140,7 +140,7 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; - if (ops->id && ops->size) { + if (ops->id) { ng = rcu_dereference_protected(net->gen, lockdep_is_held(&pernet_ops_rwsem)); ng->ptr[*ops->id] = NULL; @@ -182,7 +182,8 @@ static void ops_free_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { struct net *net; - if (ops->size && ops->id) { + + if (ops->id) { list_for_each_entry(net, net_exit_list, exit_list) kfree(net_generic(net, *ops->id)); } @@ -308,16 +309,38 @@ struct net *get_net_ns_by_id(const struct net *net, int id) } EXPORT_SYMBOL_GPL(get_net_ns_by_id); +static __net_init void preinit_net_sysctl(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + /* Limits per socket sk_omem_alloc usage. + * TCP zerocopy regular usage needs 128 KB. + */ + net->core.sysctl_optmem_max = 128 * 1024; + net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; +} + /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net) +static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) { + refcount_set(&net->passive, 1); + refcount_set(&net->ns.count, 1); + ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt"); + + get_random_bytes(&net->hash_mix, sizeof(u32)); + net->dev_base_seq = 1; + net->user_ns = user_ns; + + idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); + preinit_net_sysctl(net); } /* * setup_net runs the initializers for the network namespace object. */ -static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) +static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; @@ -325,19 +348,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(dev_kill_list); int error = 0; - refcount_set(&net->ns.count, 1); - ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); - - refcount_set(&net->passive, 1); - get_random_bytes(&net->hash_mix, sizeof(u32)); preempt_disable(); net->net_cookie = gen_cookie_next(&net_cookie); preempt_enable(); - net->dev_base_seq = 1; - net->user_ns = user_ns; - idr_init(&net->netns_ids); - spin_lock_init(&net->nsid_lock); - mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -382,32 +395,6 @@ out_undo: goto out; } -static int __net_init net_defaults_init_net(struct net *net) -{ - net->core.sysctl_somaxconn = SOMAXCONN; - /* Limits per socket sk_omem_alloc usage. - * TCP zerocopy regular usage needs 128 KB. 
- */ - net->core.sysctl_optmem_max = 128 * 1024; - net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; - - return 0; -} - -static struct pernet_operations net_defaults_ops = { - .init = net_defaults_init_net, -}; - -static __init int net_defaults_init(void) -{ - if (register_pernet_subsys(&net_defaults_ops)) - panic("Cannot initialize net default settings"); - - return 0; -} - -core_initcall(net_defaults_init); - #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) { @@ -496,8 +483,7 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net); - refcount_set(&net->passive, 1); + preinit_net(net, user_ns); net->ucounts = ucounts; get_user_ns(user_ns); @@ -505,7 +491,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) goto put_userns; - rv = setup_net(net, user_ns); + rv = setup_net(net); up_read(&pernet_ops_rwsem); @@ -1199,9 +1185,10 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif + preinit_net(&init_net, &init_user_ns); + down_write(&pernet_ops_rwsem); - preinit_net(&init_net); - if (setup_net(&init_net, &init_user_ns)) + if (setup_net(&init_net)) panic("Could not setup the initial network namespace"); init_net_initialized = true; @@ -1244,7 +1231,7 @@ static int __register_pernet_operations(struct list_head *list, LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); - if (ops->init || (ops->id && ops->size)) { + if (ops->init || ops->id) { /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ @@ -1310,6 +1297,9 @@ static int register_pernet_operations(struct list_head *list, { int error; + if (WARN_ON(!!ops->id ^ !!ops->size)) + return -EINVAL; + if (ops->id) { error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID, GFP_KERNEL); diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8350a0afa9ec..b28424ae06d5 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -9,6 +9,7 @@ #include "netdev-genl-gen.h" #include <uapi/linux/netdev.h> +#include <linux/list.h> /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { @@ -27,6 +28,11 @@ const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFIND [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), }; +const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = { + [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, }, + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), +}; + /* NETDEV_CMD_DEV_GET - do */ static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = { [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), @@ -74,6 +80,13 @@ static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1), }; +/* NETDEV_CMD_BIND_RX - do */ +static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, + [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -151,6 +164,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_QSTATS_SCOPE, .flags = GENL_CMD_CAP_DUMP, }, + { + .cmd = NETDEV_CMD_BIND_RX, + .doit = netdev_nl_bind_rx_doit, + .policy = 
netdev_bind_rx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { @@ -168,4 +188,7 @@ struct genl_family netdev_nl_family __ro_after_init = { .n_split_ops = ARRAY_SIZE(netdev_nl_ops), .mcgrps = netdev_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), + .sock_priv_size = sizeof(struct list_head), + .sock_priv_init = (void *)netdev_nl_sock_priv_init, + .sock_priv_destroy = (void *)netdev_nl_sock_priv_destroy, }; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 4db40fd5b4a9..8cda334fd042 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -10,9 +10,11 @@ #include <net/genetlink.h> #include <uapi/linux/netdev.h> +#include <linux/list.h> /* Common nested types */ extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; +extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); @@ -30,6 +32,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, @@ -38,4 +41,7 @@ enum { extern struct genl_family netdev_nl_family; +void netdev_nl_sock_priv_init(struct list_head *priv); +void netdev_nl_sock_priv_destroy(struct list_head *priv); + #endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 05f9515d2c05..1cb954f2d39e 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -3,16 +3,17 @@ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> +#include <net/busy_poll.h> #include <net/net_namespace.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> -#include <net/netdev_rx_queue.h> -#include <net/netdev_queues.h> -#include <net/busy_poll.h> -#include "netdev-genl-gen.h" #include "dev.h" +#include "devmem.h" +#include "netdev-genl-gen.h" struct netdev_nl_dump_ctx { unsigned long ifindex; @@ -216,10 +217,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); napi = napi_by_id(napi_id); - if (napi) + if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); - else - err = -EINVAL; + } else { + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); + err = -ENOENT; + } rtnl_unlock(); @@ -292,6 +295,7 @@ static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { + struct net_devmem_dmabuf_binding *binding; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -311,6 +315,12 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, rxq->napi->napi_id)) goto nla_put_failure; + + binding = rxq->mp_params.mp_priv; + if (binding && + nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + goto nla_put_failure; + break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); @@ -721,6 +731,129 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, return err; } 
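
The NETDEV_CMD_BIND_RX handler that follows takes the target ifindex, the dma-buf fd, and one NETDEV_A_DMABUF_QUEUES nest per RX queue at the top level of the message. A minimal user-space sketch of building such a request with libmnl; this is a sketch only: family-id resolution and error handling are elided, and it assumes the NETDEV_A_DMABUF_* / NETDEV_A_QUEUE_* constants from the uapi header generated for this series.

#include <stdint.h>
#include <libmnl/libmnl.h>
#include <linux/genetlink.h>
#include <linux/netdev.h>

/* Build a NETDEV_CMD_BIND_RX request into 'buf' (MNL_SOCKET_BUFFER_SIZE
 * bytes). 'family_id' is the resolved "netdev" genetlink family id.
 */
static struct nlmsghdr *build_bind_rx(char *buf, uint16_t family_id,
				      uint32_t ifindex, uint32_t dmabuf_fd,
				      const uint32_t *rxqs, int n_rxqs)
{
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct genlmsghdr *genl;
	int i;

	nlh->nlmsg_type = family_id;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	genl = mnl_nlmsg_put_extra_header(nlh, sizeof(*genl));
	genl->cmd = NETDEV_CMD_BIND_RX;
	genl->version = NETDEV_FAMILY_VERSION;

	mnl_attr_put_u32(nlh, NETDEV_A_DMABUF_IFINDEX, ifindex);
	mnl_attr_put_u32(nlh, NETDEV_A_DMABUF_FD, dmabuf_fd);

	/* One nest per queue; the doit below iterates all top-level
	 * attributes of this type via nla_for_each_attr_type().
	 */
	for (i = 0; i < n_rxqs; i++) {
		struct nlattr *q;

		q = mnl_attr_nest_start(nlh, NETDEV_A_DMABUF_QUEUES);
		mnl_attr_put_u32(nlh, NETDEV_A_QUEUE_ID, rxqs[i]);
		mnl_attr_put_u32(nlh, NETDEV_A_QUEUE_TYPE, NETDEV_QUEUE_TYPE_RX);
		mnl_attr_nest_end(nlh, q);
	}

	return nlh;
}

The reply carries NETDEV_A_DMABUF_ID, and closing the netlink socket drops the binding through the sock_priv destroy hook registered in the family definition above.
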
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct net_devmem_dmabuf_binding *binding; + struct list_head *sock_binding_list; + u32 ifindex, dmabuf_fd, rxq_idx; + struct net_device *netdev; + struct sk_buff *rsp; + struct nlattr *attr; + int rem, err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + sock_binding_list = genl_sk_priv_get(&netdev_nl_family, + NETLINK_CB(skb).sk); + if (IS_ERR(sock_binding_list)) + return PTR_ERR(sock_binding_list); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (!netdev || !netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock; + } + + if (dev_xdp_prog_count(netdev)) { + NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); + err = -EEXIST; + goto err_unlock; + } + + binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock; + } + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested( + tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + goto err_unbind; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { + err = -EINVAL; + goto err_unbind; + } + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + err = -EINVAL; + goto err_unbind; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + + err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, + info->extack); + if (err) + goto err_unbind; + } + + list_add(&binding->list, sock_binding_list); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + err = genlmsg_reply(rsp, info); + if (err) + goto err_unbind; + + rtnl_unlock(); + + return 0; + +err_unbind: + net_devmem_unbind_dmabuf(binding); +err_unlock: + rtnl_unlock(); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +void netdev_nl_sock_priv_init(struct list_head *priv) +{ + INIT_LIST_HEAD(priv); +} + +void netdev_nl_sock_priv_destroy(struct list_head *priv) +{ + struct net_devmem_dmabuf_binding *binding; + struct net_devmem_dmabuf_binding *temp; + + list_for_each_entry_safe(binding, temp, priv, list) { + rtnl_lock(); + net_devmem_unbind_dmabuf(binding); + rtnl_unlock(); + } +} + static int netdev_genl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c new file mode 100644 index 000000000000..e217a5838c87 --- /dev/null +++ b/net/core/netdev_rx_queue.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/netdevice.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> + +#include "page_pool_priv.h" + +int netdev_rx_queue_restart(struct net_device *dev, 
unsigned int rxq_idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + void *new_mem, *old_mem; + int err; + + if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop || + !dev->queue_mgmt_ops->ndo_queue_mem_free || + !dev->queue_mgmt_ops->ndo_queue_mem_alloc || + !dev->queue_mgmt_ops->ndo_queue_start) + return -EOPNOTSUPP; + + ASSERT_RTNL(); + + new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + if (!new_mem) + return -ENOMEM; + + old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + if (!old_mem) { + err = -ENOMEM; + goto err_free_new_mem; + } + + err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + if (err) + goto err_free_old_mem; + + err = page_pool_check_memory_provider(dev, rxq); + if (err) + goto err_free_new_queue_mem; + + err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; + + err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + + kvfree(old_mem); + kvfree(new_mem); + + return 0; + +err_start_queue: + /* Restarting the queue with old_mem should be successful as we haven't + * changed any of the queue configuration, and there is not much we can + * do to recover from a failure here. + * + * WARN if we fail to recover the old rx queue, and at least free + * old_mem so we don't also leak that. + */ + if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) { + WARN(1, + "Failed to restart old queue in error path. RX queue %d may be unhealthy.", + rxq_idx); + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + } + +err_free_new_queue_mem: + dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem); + +err_free_old_mem: + kvfree(old_mem); + +err_free_new_mem: + kvfree(new_mem); + + return err; +} diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h new file mode 100644 index 000000000000..7eadb8393e00 --- /dev/null +++ b/net/core/netmem_priv.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NETMEM_PRIV_H +#define __NETMEM_PRIV_H + +static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) +{ + return __netmem_clear_lsb(netmem)->pp_magic; +} + +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) +{ + __netmem_clear_lsb(netmem)->pp_magic = 0; +} + +static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) +{ + __netmem_clear_lsb(netmem)->pp = pool; +} + +static inline void netmem_set_dma_addr(netmem_ref netmem, + unsigned long dma_addr) +{ + __netmem_clear_lsb(netmem)->dma_addr = dma_addr; +} +#endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 55bcacf67df3..ca52cbe0f63c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -48,8 +48,6 @@ static struct sk_buff_head skb_pool; -DEFINE_STATIC_SRCU(netpoll_srcu); - #define USEC_PER_POLL 50 #define MAX_SKB_SIZE \ @@ -162,7 +160,7 @@ static void poll_one_napi(struct napi_struct *napi) if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) return; - /* We explicilty pass the polling call a budget of 0 to + /* We explicitly pass the polling call a budget of 0 to * indicate that we are clearing the Tx path only. 
*/ work = napi->poll(napi, 0); @@ -220,26 +218,21 @@ EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { struct netpoll_info *ni; - int idx; + might_sleep(); - idx = srcu_read_lock(&netpoll_srcu); - ni = srcu_dereference(dev->npinfo, &netpoll_srcu); + ni = rtnl_dereference(dev->npinfo); if (ni) down(&ni->dev_lock); - srcu_read_unlock(&netpoll_srcu, idx); } -EXPORT_SYMBOL(netpoll_poll_disable); void netpoll_poll_enable(struct net_device *dev) { struct netpoll_info *ni; - rcu_read_lock(); - ni = rcu_dereference(dev->npinfo); + + ni = rtnl_dereference(dev->npinfo); if (ni) up(&ni->dev_lock); - rcu_read_unlock(); } -EXPORT_SYMBOL(netpoll_poll_enable); static void refill_skbs(void) { @@ -626,12 +619,9 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) const struct net_device_ops *ops; int err; - np->dev = ndev; - strscpy(np->dev_name, ndev->name, IFNAMSIZ); - if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", - np->dev_name); + ndev->name); err = -ENOTSUPP; goto out; } @@ -649,7 +639,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) refcount_set(&npinfo->refcnt, 1); - ops = np->dev->netdev_ops; + ops = ndev->netdev_ops; if (ops->ndo_netpoll_setup) { err = ops->ndo_netpoll_setup(ndev, npinfo); if (err) @@ -660,6 +650,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) refcount_inc(&npinfo->refcnt); } + np->dev = ndev; + strscpy(np->dev_name, ndev->name, IFNAMSIZ); npinfo->netpoll = np; /* last thing to do is link it to the net device structure */ @@ -677,6 +669,7 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); int netpoll_setup(struct netpoll *np) { struct net_device *ndev = NULL; + bool ip_overwritten = false; struct in_device *in_dev; int err; @@ -741,6 +734,7 @@ put_noaddr: } np->local_ip.ip = ifa->ifa_local; + ip_overwritten = true; np_info(np, "local IP %pI4\n", &np->local_ip.ip); } else { #if IS_ENABLED(CONFIG_IPV6) @@ -757,6 +751,7 @@ put_noaddr: !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL)) continue; np->local_ip.in6 = ifp->addr; + ip_overwritten = true; err = 0; break; } @@ -787,6 +782,9 @@ put_noaddr: return 0; put: + DEBUG_NET_WARN_ON_ONCE(np->dev); + if (ip_overwritten) + memset(&np->local_ip, 0, sizeof(np->local_ip)); netdev_put(ndev, &np->dev_tracker); unlock: rtnl_unlock(); @@ -826,8 +824,6 @@ void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; - synchronize_srcu(&netpoll_srcu); - if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -853,14 +849,20 @@ void __netpoll_free(struct netpoll *np) } EXPORT_SYMBOL_GPL(__netpoll_free); +void do_netpoll_cleanup(struct netpoll *np) +{ + __netpoll_cleanup(np); + netdev_put(np->dev, &np->dev_tracker); + np->dev = NULL; +} +EXPORT_SYMBOL(do_netpoll_cleanup); + void netpoll_cleanup(struct netpoll *np) { rtnl_lock(); if (!np->dev) goto out; - __netpoll_cleanup(np); - netdev_put(np->dev, &np->dev_tracker); - np->dev = NULL; + do_netpoll_cleanup(np); out: rtnl_unlock(); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 2abe6e919224..a813d30d2135 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/device.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/xdp.h> @@ -24,8 +25,12 @@ #include <trace/events/page_pool.h> +#include "mp_dmabuf_devmem.h" +#include "netmem_priv.h" #include "page_pool_priv.h" +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); + 
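
page_pool_mem_providers, added above, keeps the memory-provider hooks out of the allocator fast path: the branch compiles to a patched-out jump until the first dmabuf-backed pool increments the key (see mp_dmabuf_devmem_init() and __page_pool_destroy() further down). A minimal sketch of the static-branch idiom as used here, with purely illustrative names:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(feature_key);

struct thing { void *provider_priv; };

int provider_alloc(struct thing *t);	/* rare, feature-specific path */
int default_alloc(struct thing *t);	/* common fast path */

/* Bump the key while at least one provider-backed user exists. */
void feature_attach(void) { static_branch_inc(&feature_key); }
void feature_detach(void) { static_branch_dec(&feature_key); }

int hot_alloc(struct thing *t)
{
	/* A NOP until feature_attach() flips the key, so callers that
	 * never enable the feature pay nothing for the extra test.
	 */
	if (static_branch_unlikely(&feature_key) && t->provider_priv)
		return provider_alloc(t);
	return default_alloc(t);
}
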
#define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) @@ -187,6 +192,8 @@ static int page_pool_init(struct page_pool *pool, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ + struct netdev_rx_queue *rxq; + int err; page_pool_struct_check(); @@ -268,7 +275,37 @@ static int page_pool_init(struct page_pool *pool, if (pool->dma_map) get_device(pool->p.dev); + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { + /* We rely on rtnl_lock()ing to make sure netdev_rx_queue + * configuration doesn't change while we're initializing + * the page_pool. + */ + ASSERT_RTNL(); + rxq = __netif_get_rx_queue(pool->slow.netdev, + pool->slow.queue_idx); + pool->mp_priv = rxq->mp_params.mp_priv; + } + + if (pool->mp_priv) { + err = mp_dmabuf_devmem_init(pool); + if (err) { + pr_warn("%s() mem-provider init failed %d\n", __func__, + err); + goto free_ptr_ring; + } + + static_branch_inc(&page_pool_mem_providers); + } + return 0; + +free_ptr_ring: + ptr_ring_cleanup(&pool->ring, NULL); +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return err; } static void page_pool_uninit(struct page_pool *pool) @@ -358,7 +395,7 @@ static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) if (unlikely(!netmem)) break; - if (likely(page_to_nid(netmem_to_page(netmem)) == pref_nid)) { + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; @@ -452,32 +489,6 @@ unmap_failed: return false; } -static void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) -{ - struct page *page = netmem_to_page(netmem); - - page->pp = pool; - page->pp_magic |= PP_SIGNATURE; - - /* Ensuring all pages have been split into one fragment initially: - * page_pool_set_pp_info() is only called once for every page when it - * is allocated from the page allocator and page_pool_fragment_page() - * is dirtying the same cache line as the page->pp_magic above, so - * the overhead is negligible. - */ - page_pool_fragment_netmem(netmem, 1); - if (pool->has_init_callback) - pool->slow.init_callback(netmem, pool->slow.init_arg); -} - -static void page_pool_clear_pp_info(netmem_ref netmem) -{ - struct page *page = netmem_to_page(netmem); - - page->pp_magic = 0; - page->pp = NULL; -} - static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -573,7 +584,10 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - netmem = __page_pool_alloc_pages_slow(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) + netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + else + netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmem); @@ -609,6 +623,28 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) return inflight; } +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) +{ + netmem_set_pp(netmem, pool); + netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. 
+ */ + page_pool_fragment_netmem(netmem, 1); + if (pool->has_init_callback) + pool->slow.init_callback(netmem, pool->slow.init_arg); +} + +void page_pool_clear_pp_info(netmem_ref netmem) +{ + netmem_clear_pp_magic(netmem); + netmem_set_pp(netmem, NULL); +} + static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { @@ -637,8 +673,13 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) { int count; + bool put; - __page_pool_release_page_dma(pool, netmem); + put = true; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) + put = mp_dmabuf_devmem_release_page(pool, netmem); + else + __page_pool_release_page_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. @@ -646,8 +687,10 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, netmem, count); - page_pool_clear_pp_info(netmem); - put_page(netmem_to_page(netmem)); + if (put) { + page_pool_clear_pp_info(netmem); + put_page(netmem_to_page(netmem)); + } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). @@ -692,8 +735,9 @@ static bool page_pool_recycle_in_cache(netmem_ref netmem, static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { - return page_ref_count(netmem_to_page(netmem)) == 1 && - !page_is_pfmemalloc(netmem_to_page(netmem)); + return netmem_is_net_iov(netmem) || + (page_ref_count(netmem_to_page(netmem)) == 1 && + !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. @@ -728,6 +772,7 @@ __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, /* Page found as candidate for recycling */ return netmem; } + /* Fallback/non-XDP mode: API users have elevated refcnt. 
* * Many drivers split up the page into fragments, and some @@ -949,7 +994,7 @@ static void page_pool_empty_ring(struct page_pool *pool) /* Empty recycle ring */ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(netmem_to_page(netmem)) == 1)) + if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); @@ -964,6 +1009,12 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); + + if (pool->mp_priv) { + mp_dmabuf_devmem_destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } + kfree(pool); } diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 90665d40f1eb..57439787b9c2 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -3,10 +3,56 @@ #ifndef __PAGE_POOL_PRIV_H #define __PAGE_POOL_PRIV_H +#include <net/page_pool/helpers.h> + +#include "netmem_priv.h" + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); void page_pool_detached(struct page_pool *pool); void page_pool_unlist(struct page_pool *pool); +static inline bool +page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr) +{ + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { + netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT); + + /* We assume page alignment to shave off bottom bits, + * if this "compression" doesn't work we need to drop. + */ + return addr != (dma_addr_t)netmem_get_dma_addr(netmem) + << PAGE_SHIFT; + } + + netmem_set_dma_addr(netmem, addr); + return false; +} + +static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr); +} + +#if defined(CONFIG_PAGE_POOL) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem); +void page_pool_clear_pp_info(netmem_ref netmem); +int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq); +#else +static inline void page_pool_set_pp_info(struct page_pool *pool, + netmem_ref netmem) +{ +} +static inline void page_pool_clear_pp_info(netmem_ref netmem) +{ +} +static inline int page_pool_check_memory_provider(struct net_device *dev, + struct netdev_rx_queue *rxq) +{ + return 0; +} +#endif + #endif diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 3a3277ba167b..48335766c1bf 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -4,10 +4,12 @@ #include <linux/netdevice.h> #include <linux/xarray.h> #include <net/net_debug.h> -#include <net/page_pool/types.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/types.h> #include <net/sock.h> +#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" @@ -212,6 +214,7 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { + struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; void *hdr; @@ -241,6 +244,9 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; + if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + goto err_cancel; + genlmsg_end(rsp, hdr); return 0; @@ -344,6 +350,30 @@ void page_pool_unlist(struct page_pool *pool) mutex_unlock(&page_pools_lock); } +int page_pool_check_memory_provider(struct net_device *dev, + struct 
netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + struct page_pool *pool; + struct hlist_node *n; + + if (!binding) + return 0; + + mutex_lock(&page_pools_lock); + hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) { + if (pool->mp_priv != binding) + continue; + + if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) { + mutex_unlock(&page_pools_lock); + return 0; + } + } + mutex_unlock(&page_pools_lock); + return -ENODATA; +} + static void page_pool_unreg_netdev_wipe(struct net_device *netdev) { struct page_pool *pool; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index ea55a758a475..34f68ef74b8f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -69,7 +69,7 @@ * * By design there should only be *one* "controlling" process. In practice * multiple write accesses gives unpredictable result. Understood by "write" - * to /proc gives result code thats should be read be the "writer". + * to /proc gives result code that should be read by the "writer". * For practical use this should be no problem. * * Note when adding devices to a specific CPU it is a good idea to also assign @@ -2371,11 +2371,11 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) if (pkt_dev->spi) { /* We need to find the right SA as quickly as possible - * Searching with minimum criteria to archieve this. + * Searching with minimum criteria to achieve this. */ x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET); } else { - /* slow path: we dont already have xfrm_state */ + /* slow path: we don't already have xfrm_state */ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0, (xfrm_address_t *)&pkt_dev->cur_daddr, (xfrm_address_t *)&pkt_dev->cur_saddr, @@ -3654,7 +3654,7 @@ static int pktgen_thread_worker(void *arg) struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; - WARN_ON(smp_processor_id() != cpu); + WARN_ON_ONCE(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); complete(&t->start_done); @@ -3838,8 +3838,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) pkt_dev->ipsmode = XFRM_MODE_TRANSPORT; pkt_dev->ipsproto = IPPROTO_ESP; - /* xfrm tunnel mode needs additional dst to extract outter - * ip header protocol/ttl/id field, here creat a phony one. + /* xfrm tunnel mode needs additional dst to extract outer + * ip header protocol/ttl/id field, here create a phony one. * instead of looking for a valid rt, which would definitely hurt * performance under such circumstances. 
*/ @@ -3989,6 +3989,7 @@ static int __net_init pg_net_init(struct net *net) goto remove; } + cpus_read_lock(); for_each_online_cpu(cpu) { int err; @@ -3997,6 +3998,7 @@ static int __net_init pg_net_init(struct net *net) pr_warn("Cannot create thread for cpu %d (%d)\n", cpu, err); } + cpus_read_unlock(); if (list_empty(&pn->pktgen_threads)) { pr_err("Initialization failed for all threads\n"); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 73fd7f543fd0..f0a520987085 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2724,7 +2724,7 @@ static int do_set_proto_down(struct net_device *dev, bool proto_down; int err; - if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN)) { + if (!dev->change_proto_down) { NL_SET_ERR_MSG(extack, "Protodown not supported by device"); return -EOPNOTSUPP; } @@ -4087,8 +4087,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, } return skb; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83f8cd8aa2d1..74149dc4ee31 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -88,6 +88,7 @@ #include <linux/textsearch.h> #include "dev.h" +#include "netmem_priv.h" #include "sock_destructor.h" #ifdef CONFIG_SKB_EXTENSIONS @@ -314,8 +315,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) fragsz = SKB_DATA_ALIGN(fragsz); local_lock_nested_bh(&napi_alloc_cache.bh_lock); - data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, - align_mask); + data = __page_frag_alloc_align(&nc->page, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return data; @@ -330,7 +331,8 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); fragsz = SKB_DATA_ALIGN(fragsz); - data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, + data = __page_frag_alloc_align(nc, fragsz, + GFP_ATOMIC | __GFP_NOWARN, align_mask); } else { local_bh_disable(); @@ -349,7 +351,7 @@ static struct sk_buff *napi_skb_cache_get(void) local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - GFP_ATOMIC, + GFP_ATOMIC | __GFP_NOWARN, NAPI_SKB_CACHE_BULK, nc->skb_cache); if (unlikely(!nc->skb_count)) { @@ -418,7 +420,8 @@ struct sk_buff *slab_build_skb(void *data) struct sk_buff *skb; unsigned int size; - skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; @@ -469,7 +472,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); + skb = kmem_cache_alloc(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN); if (unlikely(!skb)) return NULL; @@ -917,9 +921,9 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_page(struct page *page) +static bool is_pp_netmem(netmem_ref netmem) { - return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; + return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; } int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, @@ -1017,9 +1021,7 @@ EXPORT_SYMBOL(skb_cow_data_for_xdp); #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(netmem_ref netmem) { - struct page *page = 
netmem_to_page(netmem); - - page = compound_head(page); + netmem = netmem_compound_head(netmem); /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation * in order to preserve any existing bits, such as bit 0 for the @@ -1028,10 +1030,10 @@ bool napi_pp_put_page(netmem_ref netmem) * and page_is_pfmemalloc() is checked in __page_pool_put_page() * to avoid recycling the pfmemalloc page. */ - if (unlikely(!is_pp_page(page))) + if (unlikely(!is_pp_netmem(netmem))) return false; - page_pool_put_full_netmem(page->pp, page_to_netmem(page), false); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); return true; } @@ -1058,7 +1060,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data) static int skb_pp_frag_ref(struct sk_buff *skb) { struct skb_shared_info *shinfo; - struct page *head_page; + netmem_ref head_netmem; int i; if (!skb->pp_recycle) @@ -1067,11 +1069,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb) shinfo = skb_shinfo(skb); for (i = 0; i < shinfo->nr_frags; i++) { - head_page = compound_head(skb_frag_page(&shinfo->frags[i])); - if (likely(is_pp_page(head_page))) - page_pool_ref_page(head_page); + head_netmem = netmem_compound_head(shinfo->frags[i].netmem); + if (likely(is_pp_netmem(head_netmem))) + page_pool_ref_netmem(head_netmem); else - page_ref_inc(head_page); + page_ref_inc(netmem_to_page(head_netmem)); } return 0; } @@ -1369,6 +1371,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) struct page *p; u8 *vaddr; + if (skb_frag_is_net_iov(frag)) { + printk("%sskb frag %d: not readable\n", level, i); + len -= skb_frag_size(frag); + if (!len) + break; + continue; + } + skb_frag_foreach_page(frag, skb_frag_off(frag), skb_frag_size(frag), p, p_off, p_len, copied) { @@ -1962,6 +1972,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) return -EINVAL; + if (!skb_frags_readable(skb)) + return -EFAULT; + if (!num_frags) goto release; @@ -2135,6 +2148,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) unsigned int size; int headerlen; + if (!skb_frags_readable(skb)) + return NULL; + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; @@ -2473,6 +2489,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, struct sk_buff *n; int oldheadroom; + if (!skb_frags_readable(skb)) + return NULL; + if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST)) return NULL; @@ -2817,6 +2836,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) */ int i, k, eat = (skb->tail + delta) - skb->end; + if (!skb_frags_readable(skb)) + return NULL; + if (eat > 0 || skb_cloned(skb)) { if (pskb_expand_head(skb, 0, eat > 0 ? 
eat + 128 : 0, GFP_ATOMIC)) @@ -2970,6 +2992,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) to += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; @@ -3158,9 +3183,15 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, /* * then map the fragments */ + if (!skb_frags_readable(skb)) + return false; + for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; + if (WARN_ON_ONCE(!skb_frag_page(f))) + return false; + if (__splice_segment(skb_frag_page(f), skb_frag_off(f), skb_frag_size(f), offset, len, spd, false, sk, pipe)) @@ -3378,6 +3409,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) from += copy; } + if (!skb_frags_readable(skb)) + goto fault; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; int end; @@ -3457,6 +3491,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, pos = copy; } + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; @@ -3557,6 +3594,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, pos = copy; } + if (!skb_frags_readable(skb)) + return 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { int end; @@ -4048,6 +4088,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb, skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb1->unreadable = skb->unreadable; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; skb1->len += skb1->data_len; @@ -4095,6 +4136,8 @@ static inline void skb_split_no_header(struct sk_buff *skb, pos += size; } skb_shinfo(skb1)->nr_frags = k; + + skb1->unreadable = skb->unreadable; } /** @@ -4169,8 +4212,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) /* Actual merge is delayed until the point when we know we can * commit all, so that we don't have to undo partial changes */ - if (!to || - !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), + if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom), skb_frag_off(fragfrom))) { merge = -1; } else { @@ -4333,6 +4375,9 @@ next_skb: return block_limit - abs_offset; } + if (!skb_frags_readable(st->cur_skb)) + return 0; + if (st->frag_idx == 0 && !st->frag_data) st->stepped_offset += skb_headlen(st->cur_skb); @@ -4409,6 +4454,41 @@ void skb_abort_seq_read(struct skb_seq_state *st) } EXPORT_SYMBOL(skb_abort_seq_read); +/** + * skb_copy_seq_read() - copy from a skb_seq_state to a buffer + * @st: source skb_seq_state + * @offset: offset in source + * @to: destination buffer + * @len: number of bytes to copy + * + * Copy @len bytes from @offset bytes into the source @st to the destination + * buffer @to. `offset` should increase (or be unchanged) with each subsequent + * call to this function. If offset needs to decrease from the previous use, `st` + * should be reset first.
+ * + * Return: 0 on success or -EINVAL if the copy ended early + */ +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len) +{ + const u8 *data; + u32 sqlen; + + for (;;) { + sqlen = skb_seq_read(offset, &data, st); + if (sqlen == 0) + return -EINVAL; + if (sqlen >= len) { + memcpy(to, data, len); + return 0; + } + memcpy(to, data, sqlen); + to += sqlen; + offset += sqlen; + len -= sqlen; + } +} +EXPORT_SYMBOL(skb_copy_seq_read); + #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, @@ -5161,7 +5241,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec); * 3. sg_unmark_end * 4. skb_to_sgvec(payload2) * - * When mapping mutilple payload conditionally, skb_to_sgvec_nomark + * When mapping multiple payloads conditionally, skb_to_sgvec_nomark * is more preferable. */ int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, @@ -5945,7 +6025,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (to->pp_recycle != from->pp_recycle) return false; - if (len <= skb_tailroom(to)) { + if (skb_frags_readable(from) != skb_frags_readable(to)) + return false; + + if (len <= skb_tailroom(to) && skb_frags_readable(from)) { if (len) BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); *delta_truesize = 0; @@ -6019,7 +6102,7 @@ EXPORT_SYMBOL(skb_try_coalesce); * @skb: buffer to clean * @xnet: packet is crossing netns * - * skb_scrub_packet can be used after encapsulating or decapsulting a packet + * skb_scrub_packet can be used after encapsulating or decapsulating a packet * into/from a tunnel. Some information have to be cleared during these * operations. * skb_scrub_packet can also be used to clean a skb before injecting it in @@ -6122,6 +6205,9 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) if (!pskb_may_pull(skb, write_len)) return -ENOMEM; + if (!skb_frags_readable(skb)) + return -EFAULT; + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; @@ -6241,7 +6327,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) return err; skb->protocol = skb->vlan_proto; - skb->mac_len += VLAN_HLEN; + skb->network_header -= VLAN_HLEN; skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); } @@ -6801,7 +6887,7 @@ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb)) + skb_cloned(skb) || !skb_frags_readable(skb)) return; /* Nice, we can free page frag(s) right now */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bbf40b999713..b1dcbd3be89e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -293,7 +293,7 @@ out: /* If we trim data a full sg elem before curr pointer update * copybreak and current so that any future copy operations * start at new copy location. - * However trimed data that has not yet been used in a copy op + * However trimmed data that has not yet been used in a copy op * does not require an update.
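For reference, a caller of the skb_copy_seq_read() helper added above might look like the following minimal sketch. my_hdr and my_parse() are hypothetical; skb_prepare_seq_read() and skb_abort_seq_read() are the existing sequential-reader entry points.

/* Copy a fixed-size header that may straddle skb fragments, using the
 * sequential reader. Illustrative only.
 */
struct my_hdr {
	__be32 magic;
	__be16 len;
} __packed;

static int my_parse(struct sk_buff *skb, unsigned int off)
{
	struct skb_seq_state st;
	struct my_hdr hdr;
	int err;

	skb_prepare_seq_read(skb, off, skb->len, &st);
	/* The copy offset is relative to 'off' and must not move
	 * backwards between calls unless the state is reset.
	 */
	err = skb_copy_seq_read(&st, 0, &hdr, sizeof(hdr));
	skb_abort_seq_read(&st);	/* release any mapped fragment */
	return err;	/* 0 on success, -EINVAL if the window was too short */
}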
*/ if (!msg->sg.size) { diff --git a/net/core/sock.c b/net/core/sock.c index 9abc4fe25953..fe87f9bd8f16 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -124,6 +124,7 @@ #include <linux/netdevice.h> #include <net/protocol.h> #include <linux/skbuff.h> +#include <linux/skbuff_ref.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/sock.h> @@ -1049,6 +1050,69 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +#ifdef CONFIG_PAGE_POOL + +/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in + * 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens. + */ +#define MAX_DONTNEED_TOKENS 128 + +static noinline_for_stack int +sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) +{ + unsigned int num_tokens, i, j, k, netmem_num = 0; + struct dmabuf_token *tokens; + netmem_ref netmems[16]; + int ret = 0; + + if (!sk_is_tcp(sk)) + return -EBADF; + + if (optlen % sizeof(struct dmabuf_token) || + optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) + return -EINVAL; + + tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); + if (!tokens) + return -ENOMEM; + + num_tokens = optlen / sizeof(struct dmabuf_token); + if (copy_from_sockptr(tokens, optval, optlen)) { + kvfree(tokens); + return -EFAULT; + } + + xa_lock_bh(&sk->sk_user_frags); + for (i = 0; i < num_tokens; i++) { + for (j = 0; j < tokens[i].token_count; j++) { + netmem_ref netmem = (__force netmem_ref)__xa_erase( + &sk->sk_user_frags, tokens[i].token_start + j); + + if (netmem && + !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); + } + ret++; + } + } + } + + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + + kvfree(tokens); + return ret; +} +#endif + void sockopt_lock_sock(struct sock *sk) { /* When current->bpf_ctx is set, the setsockopt is called from @@ -1211,6 +1275,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, ret = -EOPNOTSUPP; return ret; } +#ifdef CONFIG_PAGE_POOL + case SO_DEVMEM_DONTNEED: + return sock_devmem_dontneed(sk, optval, optlen); +#endif } sockopt_lock_sock(sk); @@ -2048,7 +2116,7 @@ static inline void sock_lock_init(struct sock *sk) /* * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, - * even temporarly, because of RCU lookups. sk_node should also be left as is. + * even temporarily, because of RCU lookups. sk_node should also be left as is. 
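From user space, the SO_DEVMEM_DONTNEED option added above is exercised roughly as in this sketch. Header locations are assumptions; struct dmabuf_token and the option constant come from the UAPI headers this series extends, and the token values are assumed to have arrived earlier via devmem control messages.

/* Return previously received devmem fragments to the kernel in one
 * setsockopt() call. The kernel caps a single call at 128 tokens
 * (MAX_DONTNEED_TOKENS) and returns how many fragments it released.
 */
#include <sys/socket.h>
#include <linux/uio.h>		/* struct dmabuf_token (assumed location) */

static int devmem_release(int fd, const struct dmabuf_token *toks,
			  unsigned int n)
{
	return setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED,
			  toks, n * sizeof(*toks));
}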
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end */ static void sock_copy(struct sock *nsk, const struct sock *osk) @@ -2538,7 +2606,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) skb_set_hash_from_sk(skb, sk); /* * We used to take a refcount on sk, but following operation - * is enough to guarantee sk_free() wont free this sock until + * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ refcount_add(skb->truesize, &sk->sk_wmem_alloc); @@ -3429,7 +3497,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); @@ -3697,7 +3765,7 @@ EXPORT_SYMBOL(sock_recv_errqueue); * * FIX: POSIX 1003.1g is very ambiguous here. It states that * asynchronous errors should be reported by getsockopt. We assume - * this means if you specify SO_ERROR (otherwise whats the point of it). + * this means if you specify SO_ERROR (otherwise what is the point of it). */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d3dbb92153f2..724b6856fcc3 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1183,6 +1183,7 @@ static void sock_hash_free(struct bpf_map *map) sock_put(elem->sk); sock_hash_free_elem(htab, elem); } + cond_resched(); } /* wait for psock readers accessing its map link */ diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5a165286e4d8..4211710393a8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -173,10 +173,9 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { - unsigned int size = sizeof(struct sock_reuseport) + - sizeof(struct sock *) * max_socks; - struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); + struct sock_reuseport *reuse; + reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC); if (!reuse) return NULL; diff --git a/net/core/utils.c b/net/core/utils.c index c994e95172ac..27f4cffaae05 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Generic address resultion entity + * Generic address resolution entity * * Authors: * net_random Alan Cox diff --git a/net/dsa/tag.c b/net/dsa/tag.c index 6e402d49afd3..79ad105902d9 100644 --- a/net/dsa/tag.c +++ b/net/dsa/tag.c @@ -105,8 +105,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, p = netdev_priv(skb->dev); - if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { - nskb = dsa_untag_bridge_pvid(skb); + if (unlikely(cpu_dp->ds->untag_bridge_pvid || + cpu_dp->ds->untag_vlan_aware_bridge_pvid)) { + nskb = dsa_software_vlan_untag(skb); if (!nskb) { kfree_skb(skb); return 0; diff --git a/net/dsa/tag.h b/net/dsa/tag.h index f6b9c73718df..d5707870906b 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -44,46 +44,81 @@ static inline struct net_device *dsa_conduit_find_user(struct net_device *dev, return NULL; } -/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged - * frames as untagged, since the bridge will not untag them. 
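The sock_reuseport hunk above is the usual conversion from open-coded size arithmetic to struct_size(); the general pattern, shown here with a hypothetical struct, is:

/* struct_size(p, member, n) == sizeof(*p) + n * sizeof(p->member[0]),
 * with overflow checking (from <linux/overflow.h>).
 */
struct sock_table {
	unsigned int max;
	struct sock *socks[];	/* flexible array member */
};

static struct sock_table *sock_table_alloc(unsigned int max)
{
	struct sock_table *t;

	t = kzalloc(struct_size(t, socks, max), GFP_ATOMIC);
	if (t)
		t->max = max;
	return t;
}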
+/** + * dsa_software_untag_vlan_aware_bridge: Software untagging for VLAN-aware bridge + * @skb: Pointer to received socket buffer (packet) + * @br: Pointer to bridge upper interface of ingress port + * @vid: Parsed VID from packet + * + * The bridge can process tagged packets. Software like STP/PTP may not. The + * bridge can also process untagged packets, to the same effect as if they were + * tagged with the PVID of the ingress port. So packets tagged with the PVID of + * the bridge port must be software-untagged, to support both use cases. */ -static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) +static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb, + struct net_device *br, + u16 vid) { - struct dsa_port *dp = dsa_user_to_port(skb->dev); - struct net_device *br = dsa_port_bridge_dev_get(dp); - struct net_device *dev = skb->dev; - struct net_device *upper_dev; - u16 vid, pvid, proto; + u16 pvid, proto; int err; - if (!br || br_vlan_enabled(br)) - return skb; - err = br_vlan_get_proto(br, &proto); if (err) - return skb; + return; - /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { - skb = skb_vlan_untag(skb); - if (!skb) - return NULL; - } + err = br_vlan_get_pvid_rcu(skb->dev, &pvid); + if (err) + return; - if (!skb_vlan_tag_present(skb)) - return skb; + if (vid == pvid && skb->vlan_proto == htons(proto)) + __vlan_hwaccel_clear_tag(skb); +} - vid = skb_vlan_tag_get_id(skb); +/** + * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge + * @skb: Pointer to received socket buffer (packet) + * @br: Pointer to bridge upper interface of ingress port + * @vid: Parsed VID from packet + * + * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run + * on the plain port, or on a VLAN upper interface). Maybe packets are coming + * to software as tagged with a driver-defined VID which is NOT equal to the + * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration + * should NOT be committed to hardware). DSA needs a method for this private + * VID to be communicated by software to it, and if packets are tagged with it, + * software-untag them. Note: the private VID may be different per bridge, to + * support the FDB isolation use case. + * + * FIXME: this is currently implemented based on the broken assumption that + * the "private VID" used by the driver in VLAN-unaware mode is equal to the + * bridge PVID. It should not be, except for a coincidence; the bridge PVID is + * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that + * this function removes is wrong. + * + * All users of ds->untag_bridge_pvid should fix their drivers, if necessary, + * to make the two independent. Only then, if there still remains a need to + * strip the private VID from packets, then a new ds->ops->get_private_vid() + * API shall be introduced to communicate to DSA what this VID is, which needs + * to be stripped here. + */ +static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, + struct net_device *br, + u16 vid) +{ + struct net_device *upper_dev; + u16 pvid, proto; + int err; - /* We already run under an RCU read-side critical section since - * we are called from netif_receive_skb_list_internal(). 
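Condensed, the rule implemented by dsa_software_untag_vlan_aware_bridge() is: strip the tag only when the VID equals the ingress port's PVID and the tag protocol equals the bridge's VLAN protocol. A hypothetical restatement with example values:

/* With pvid = 10 and br_proto = ETH_P_8021Q:
 *   VID 10, proto 0x8100 -> untag (PVID traffic)
 *   VID 20, proto 0x8100 -> keep
 *   VID 10, proto 0x88a8 -> keep (wrong protocol)
 */
static bool should_sw_untag(u16 vid, u16 pvid, __be16 tag_proto, u16 br_proto)
{
	return vid == pvid && tag_proto == htons(br_proto);
}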
- */ - err = br_vlan_get_pvid_rcu(dev, &pvid); + err = br_vlan_get_proto(br, &proto); if (err) - return skb; + return; - if (vid != pvid) - return skb; + err = br_vlan_get_pvid_rcu(skb->dev, &pvid); + if (err) + return; + + if (vid != pvid || skb->vlan_proto != htons(proto)) + return; /* The sad part about attempting to untag from DSA is that we * don't know, unless we check, if the skb will end up in @@ -95,10 +130,50 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) * definitely keep the tag, to make sure it keeps working. */ upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); - if (upper_dev) + if (!upper_dev) + __vlan_hwaccel_clear_tag(skb); +} + +/** + * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path + * @skb: Pointer to socket buffer (packet) + * + * Receive path method for switches which cannot avoid tagging all packets + * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or + * ds->untag_vlan_aware_bridge_pvid is set to true. + * + * As a side effect of this method, any VLAN tag from the skb head is moved + * to hwaccel. + */ +static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_user_to_port(skb->dev); + struct net_device *br = dsa_port_bridge_dev_get(dp); + u16 vid; + + /* software untagging for standalone ports not yet necessary */ + if (!br) return skb; - __vlan_hwaccel_clear_tag(skb); + /* Move VLAN tag from data to hwaccel */ + if (!skb_vlan_tag_present(skb)) { + skb = skb_vlan_untag(skb); + if (!skb) + return NULL; + } + + if (!skb_vlan_tag_present(skb)) + return skb; + + vid = skb_vlan_tag_get_id(skb); + + if (br_vlan_enabled(br)) { + if (dp->ds->untag_vlan_aware_bridge_pvid) + dsa_software_untag_vlan_aware_bridge(skb, br, vid); + } else { + if (dp->ds->untag_bridge_pvid) + dsa_software_untag_vlan_unaware_bridge(skb, br, vid); + } return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index ee7b272ab715..281bbac5539d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -111,9 +111,10 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes) * --------------------------------------------------------------------------- * tag0 : zero-based value represents port - * (eg, 0x00=port1, 0x02=port3, 0x06=port7) + * (eg, 0x0=port1, 0x2=port3, 0x3=port4) */ +#define KSZ8795_TAIL_TAG_EG_PORT_M GENMASK(1, 0) #define KSZ8795_TAIL_TAG_OVERRIDE BIT(6) #define KSZ8795_TAIL_TAG_LOOKUP BIT(7) @@ -141,7 +142,8 @@ static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev) { u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; - return ksz_common_rcv(skb, dev, tag[0] & 7, KSZ_EGRESS_TAG_LEN); + return ksz_common_rcv(skb, dev, tag[0] & KSZ8795_TAIL_TAG_EG_PORT_M, + KSZ_EGRESS_TAG_LEN); } static const struct dsa_device_ops ksz8795_netdev_ops = { @@ -176,8 +178,9 @@ MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); #define KSZ9477_INGRESS_TAG_LEN 2 #define KSZ9477_PTP_TAG_LEN 4 -#define KSZ9477_PTP_TAG_INDICATION 0x80 +#define KSZ9477_PTP_TAG_INDICATION BIT(7) +#define KSZ9477_TAIL_TAG_EG_PORT_M GENMASK(2, 0) #define KSZ9477_TAIL_TAG_PRIO GENMASK(8, 7) #define KSZ9477_TAIL_TAG_OVERRIDE BIT(9) #define KSZ9477_TAIL_TAG_LOOKUP BIT(10) @@ -310,7 +313,7 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) { /* Tag decoding */ u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; - unsigned int port = tag[0] & 7; + unsigned int 
port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M; unsigned int len = KSZ_EGRESS_TAG_LEN; /* Extra 4-bytes PTP timestamp */ diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index e0e4300bfbd3..bf6608fc6be7 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -8,40 +8,6 @@ #define OCELOT_NAME "ocelot" #define SEVILLE_NAME "seville" -/* If the port is under a VLAN-aware bridge, remove the VLAN header from the - * payload and move it into the DSA tag, which will make the switch classify - * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, - * which is the pvid of standalone and VLAN-unaware bridge ports. - */ -static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp, - u64 *vlan_tci, u64 *tag_type) -{ - struct net_device *br = dsa_port_bridge_dev_get(dp); - struct vlan_ethhdr *hdr; - u16 proto, tci; - - if (!br || !br_vlan_enabled(br)) { - *vlan_tci = 0; - *tag_type = IFH_TAG_TYPE_C; - return; - } - - hdr = skb_vlan_eth_hdr(skb); - br_vlan_get_proto(br, &proto); - - if (ntohs(hdr->h_vlan_proto) == proto) { - vlan_remove_tag(skb, &tci); - *vlan_tci = tci; - } else { - rcu_read_lock(); - br_vlan_get_pvid_rcu(br, &tci); - rcu_read_unlock(); - *vlan_tci = tci; - } - - *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; -} - static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { @@ -53,7 +19,8 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, u32 rew_op = 0; u64 qos_class; - ocelot_xmit_get_vlan_info(skb, dp, &vlan_tci, &tag_type); + ocelot_xmit_get_vlan_info(skb, dsa_port_bridge_dev_get(dp), &vlan_tci, + &tag_type); qos_class = netdev_get_num_tc(netdev) ? netdev_get_prio_tc_map(netdev, skb->priority) : skb->priority; diff --git a/net/dsa/user.c b/net/dsa/user.c index f5adfa1d978a..74eda9b30608 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -2642,11 +2642,12 @@ void dsa_user_setup_tagger(struct net_device *user) user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; - user->features |= NETIF_F_LLTX; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 9a190635fe95..9b540644ba31 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -8,4 +8,5 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ - module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o + module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ + phy.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index f6f136ec7ddf..f22051f33868 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -13,7 +13,7 @@ const struct nla_policy ethnl_cable_test_act_policy[] = { [ETHTOOL_A_CABLE_TEST_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) @@ -58,6 +58,7 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) struct ethnl_req_info req_info = {}; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; + 
struct phy_device *phydev; struct net_device *dev; int ret; @@ -69,12 +70,16 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) return ret; dev = req_info.dev; - if (!dev->phydev) { + + rtnl_lock(); + phydev = ethnl_req_get_phydev(&req_info, + tb[ETHTOOL_A_CABLE_TEST_HEADER], + info->extack); + if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; - goto out_dev_put; + goto out_rtnl; } - rtnl_lock(); ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test) { ret = -EOPNOTSUPP; @@ -85,17 +90,15 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) if (ret < 0) goto out_rtnl; - ret = ops->start_cable_test(dev->phydev, info->extack); + ret = ops->start_cable_test(phydev, info->extack); ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev, - ETHTOOL_MSG_CABLE_TEST_NTF); + ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); out_rtnl: rtnl_unlock(); -out_dev_put: ethnl_parse_header_dev_put(&req_info); return ret; } @@ -160,7 +163,8 @@ void ethnl_cable_test_finished(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); -int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) +int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair, + u8 result, u32 src) { struct nlattr *nest; int ret = -EMSGSIZE; @@ -173,6 +177,10 @@ int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) goto err; if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, result)) goto err; + if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) { + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_RESULT_SRC, src)) + goto err; + } nla_nest_end(phydev->skb, nest); return 0; @@ -181,9 +189,10 @@ err: nla_nest_cancel(phydev->skb, nest); return ret; } -EXPORT_SYMBOL_GPL(ethnl_cable_test_result); +EXPORT_SYMBOL_GPL(ethnl_cable_test_result_with_src); -int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) +int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair, + u32 cm, u32 src) { struct nlattr *nest; int ret = -EMSGSIZE; @@ -197,6 +206,11 @@ int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) goto err; if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) goto err; + if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) { + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_SRC, + src)) + goto err; + } nla_nest_end(phydev->skb, nest); return 0; @@ -205,7 +219,7 @@ err: nla_nest_cancel(phydev->skb, nest); return ret; } -EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); +EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length_with_src); static const struct nla_policy cable_test_tdr_act_cfg_policy[] = { [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, @@ -216,7 +230,7 @@ static const struct nla_policy cable_test_tdr_act_cfg_policy[] = { const struct nla_policy ethnl_cable_test_tdr_act_policy[] = { [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, }; @@ -305,6 +319,7 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) struct ethnl_req_info req_info = {}; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; + struct phy_device *phydev; struct phy_tdr_config cfg; struct net_device *dev; int ret; @@ -317,10 +332,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) return ret; dev = req_info.dev; - if (!dev->phydev) { - ret = -EOPNOTSUPP; 
- goto out_dev_put; - } ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], info, &cfg); @@ -328,6 +339,14 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; rtnl_lock(); + phydev = ethnl_req_get_phydev(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + info->extack); + if (IS_ERR_OR_NULL(phydev)) { + ret = -EOPNOTSUPP; + goto out_rtnl; + } + ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test_tdr) { ret = -EOPNOTSUPP; @@ -338,12 +357,12 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) if (ret < 0) goto out_rtnl; - ret = ops->start_cable_test_tdr(dev->phydev, info->extack, &cfg); + ret = ops->start_cable_test_tdr(phydev, info->extack, &cfg); ethnl_ops_complete(dev); if (!ret) - ethnl_cable_test_started(dev->phydev, + ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); out_rtnl: diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index cee188da54f8..ca4f80282448 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -114,8 +114,7 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; struct nlattr **tb = info->attrs; - u32 err_attr, max_rxfh_in_use; - u64 max_rxnfc_in_use; + u32 err_attr; int ret; dev->ethtool_ops->get_channels(dev, &channels); @@ -166,20 +165,9 @@ ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) return -EINVAL; } - /* ensure the new Rx count fits within the configured Rx flow - * indirection table/rxnfc settings - */ - if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) - max_rxnfc_in_use = 0; - max_rxfh_in_use = ethtool_get_max_rxfh_channel(dev); - if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { - GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing indirection table (%d)", max_rxfh_in_use); - return -EINVAL; - } - if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { - GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); - return -EINVAL; - } + ret = ethtool_check_max_channel(dev, channels, info); + if (ret) + return ret; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index e71cc3e1b7eb..3e7c293af78c 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -108,7 +108,6 @@ void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags); void ethtool_cmis_page_init(struct ethtool_module_eeprom *page_data, u8 page, u32 offset, u32 length); -void ethtool_cmis_page_fini(struct ethtool_module_eeprom *page_data); struct ethtool_cmis_cdb * ethtool_cmis_cdb_init(struct net_device *dev, diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index 1bb08783b60d..4d5581147952 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -100,7 +100,8 @@ static u8 cmis_cdb_advert_rpl_inst_supported(struct cmis_cdb_advert_rpl *rpl) } static int cmis_cdb_advertisement_get(struct ethtool_cmis_cdb *cdb, - struct net_device *dev) + struct net_device *dev, + struct ethnl_module_fw_flash_ntf_params *ntf_params) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_module_eeprom page_data = {}; @@ -119,8 +120,12 @@ static int cmis_cdb_advertisement_get(struct ethtool_cmis_cdb *cdb, return err; } - if (!cmis_cdb_advert_rpl_inst_supported(&rpl)) + if 
(!cmis_cdb_advert_rpl_inst_supported(&rpl)) { + ethnl_module_fw_flash_ntf_err(dev, ntf_params, + "CDB functionality is not supported", + NULL); return -EOPNOTSUPP; + } cdb->read_write_len_ext = rpl.read_write_len_ext; @@ -282,7 +287,7 @@ ethtool_cmis_cdb_init(struct net_device *dev, goto err; } - err = cmis_cdb_advertisement_get(cdb, dev); + err = cmis_cdb_advertisement_get(cdb, dev, ntf_params); if (err < 0) goto err; @@ -444,6 +449,9 @@ static void cmis_cdb_status_fail_msg_get(u8 status, char **err_msg) case 0b01000101: *err_msg = "CDB status failed: CdbChkCode error"; break; + case 0b01000110: + *err_msg = "CDB status failed: Password error"; + break; default: *err_msg = "Unknown failure reason"; } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 07032babd1b6..dd345efa114b 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -6,6 +6,7 @@ #include <linux/rtnetlink.h> #include <linux/ptp_clock_kernel.h> +#include "netlink.h" #include "common.h" const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { @@ -24,8 +25,6 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", - [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", @@ -51,7 +50,6 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", - [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", [NETIF_F_RXHASH_BIT] = "rx-hashing", [NETIF_F_RXCSUM_BIT] = "rx-checksum", @@ -429,6 +427,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", + [const_ilog2(SOF_TIMESTAMPING_OPT_RX_FILTER)] = "option-rx-filter", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); @@ -539,7 +538,7 @@ static int ethtool_get_rxnfc_rule_count(struct net_device *dev) return info.rule_cnt; } -int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) +static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc *info; @@ -609,7 +608,7 @@ static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev) return max_ring; } -u32 ethtool_get_max_rxfh_channel(struct net_device *dev) +static u32 ethtool_get_max_rxfh_channel(struct net_device *dev) { struct ethtool_rxfh_param rxfh = {}; u32 dev_size, current_max; @@ -650,10 +649,47 @@ out_free: return current_max; } +int ethtool_check_max_channel(struct net_device *dev, + struct ethtool_channels channels, + struct genl_info *info) +{ + u64 max_rxnfc_in_use; + u32 max_rxfh_in_use; + int max_mp_in_use; + + /* ensure the new Rx count fits within the configured Rx flow + * indirection table/rxnfc settings + */ + if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) + max_rxnfc_in_use = 0; + max_rxfh_in_use = ethtool_get_max_rxfh_channel(dev); + if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { + if (info) + GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing indirection table 
(%d)", max_rxfh_in_use); + return -EINVAL; + } + if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { + if (info) + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); + return -EINVAL; + } + + max_mp_in_use = dev_get_min_mp_channel_count(dev); + if (channels.combined_count + channels.rx_count <= max_mp_in_use) { + if (info) + GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing memory provider setting (%d)", max_mp_in_use); + return -EINVAL; + } + + return 0; +} + int ethtool_check_ops(const struct ethtool_ops *ops) { if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params)) return -EINVAL; + if (WARN_ON(ops->rxfh_max_num_contexts == 1)) + return -EINVAL; /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime, * the fact that ops are checked at registration time does not * mean the ops attached to a netdev later on are sane. @@ -665,20 +701,21 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + int err = 0; memset(info, 0, sizeof(*info)); info->cmd = ETHTOOL_GET_TS_INFO; + info->phc_index = -1; if (phy_is_default_hwtstamp(phydev) && phy_has_tsinfo(phydev)) - return phy_ts_info(phydev, info); - if (ops->get_ts_info) - return ops->get_ts_info(dev, info); + err = phy_ts_info(phydev, info); + else if (ops->get_ts_info) + err = ops->get_ts_info(dev, info); - info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | - SOF_TIMESTAMPING_SOFTWARE; - info->phc_index = -1; + info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; - return 0; + return err; } int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 863806fcf01a..d55d5201b085 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -20,6 +20,8 @@ struct link_mode_info { u8 duplex; }; +struct genl_info; + extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; extern const char @@ -42,8 +44,9 @@ int __ethtool_get_link(struct net_device *dev); bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings); -u32 ethtool_get_max_rxfh_channel(struct net_device *dev); -int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max); +int ethtool_check_max_channel(struct net_device *dev, + struct ethtool_channels channels, + struct genl_info *info); int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index e18823bf2330..65cfe76dafbe 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -442,6 +442,9 @@ int __ethtool_get_link_ksettings(struct net_device *dev, if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } @@ -1227,7 +1230,8 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->cap_rss_ctx_supported) + if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || + 
ops->create_rxfh_context)) return -EOPNOTSUPP; rxfh.indir_size = rxfh_dev.indir_size; @@ -1260,10 +1264,15 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (rxfh_dev.indir) memcpy(rxfh_dev.indir, ethtool_rxfh_context_indir(ctx), indir_bytes); - if (rxfh_dev.key) - memcpy(rxfh_dev.key, ethtool_rxfh_context_key(ctx), - user_key_size); - rxfh_dev.hfunc = ctx->hfunc; + if (!ops->rxfh_per_ctx_key) { + rxfh_dev.key_size = 0; + } else { + if (rxfh_dev.key) + memcpy(rxfh_dev.key, + ethtool_rxfh_context_key(ctx), + user_key_size); + rxfh_dev.hfunc = ctx->hfunc; + } rxfh_dev.input_xfrm = ctx->input_xfrm; ret = 0; } else { @@ -1281,6 +1290,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, sizeof(rxfh.input_xfrm))) { ret = -EFAULT; } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, key_size), + &rxfh_dev.key_size, + sizeof(rxfh.key_size))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), rss_config, total_size)) { ret = -EFAULT; @@ -1357,7 +1371,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ - if (rxfh.rss_context && !ops->cap_rss_ctx_supported) + if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || + ops->create_rxfh_context)) return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && @@ -1387,6 +1402,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); + /* Check settings which may be global rather than per RSS-context */ + if (rxfh.rss_context && !ops->rxfh_per_ctx_key) + if (rxfh.key_size || + (rxfh.hfunc && rxfh.hfunc != ETH_RSS_HASH_NO_CHANGE) || + (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE)) + return -EOPNOTSUPP; + rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER); if (!rss_config) return -ENOMEM; @@ -2069,8 +2091,6 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u16 from_channel, to_channel; - u64 max_rxnfc_in_use; - u32 max_rxfh_in_use; unsigned int i; int ret; @@ -2100,14 +2120,9 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, (!channels.rx_count || !channels.tx_count)) return -EINVAL; - /* ensure the new Rx count fits within the configured Rx flow - * indirection table/rxnfc settings */ - if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) - max_rxnfc_in_use = 0; - max_rxfh_in_use = ethtool_get_max_rxfh_channel(dev); - if (channels.combined_count + channels.rx_count <= - max_t(u64, max_rxnfc_in_use, max_rxfh_in_use)) - return -EINVAL; + ret = ethtool_check_max_channel(dev, channels, NULL); + if (ret) + return ret; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c index 5c317d23787b..30b8ce275159 100644 --- a/net/ethtool/linkinfo.c +++ b/net/ethtool/linkinfo.c @@ -35,7 +35,7 @@ static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); - if (ret < 0 && info) + if (ret < 0) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); ethnl_ops_complete(dev); diff --git a/net/ethtool/linkmodes.c 
b/net/ethtool/linkmodes.c index b2591db49f7d..259cd9ef1f2a 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -40,7 +40,7 @@ static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); - if (ret < 0 && info) { + if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); goto out; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index cb1eea00e349..e3f0ef6b851b 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -2,6 +2,7 @@ #include <net/sock.h> #include <linux/ethtool_netlink.h> +#include <linux/phy_link_topology.h> #include <linux/pm_runtime.h> #include "netlink.h" #include "module_fw.h" @@ -31,6 +32,24 @@ const struct nla_policy ethnl_header_policy_stats[] = { ETHTOOL_FLAGS_STATS), }; +const struct nla_policy ethnl_header_policy_phy[] = { + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, + ETHTOOL_FLAGS_BASIC), + [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +const struct nla_policy ethnl_header_policy_phy_stats[] = { + [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = ALTIFNAMSIZ - 1 }, + [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, + ETHTOOL_FLAGS_STATS), + [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), +}; + int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type) { @@ -119,7 +138,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { - struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)]; + struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; struct net_device *dev = NULL; u32 flags = 0; @@ -134,7 +153,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. 
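On the wire, the new ETHTOOL_A_HEADER_PHY_INDEX attribute rides inside the usual request header nest. A user-space sketch with libmnl, assuming nlh already carries the genetlink header of a PHY-capable command such as PLCA_GET_CFG, could look like:

/* Target PHY index 2 behind eth0; mnl_attr_nest_start() sets
 * NLA_F_NESTED for us.
 */
struct nlattr *nest;

nest = mnl_attr_nest_start(nlh, ETHTOOL_A_PLCA_HEADER);
mnl_attr_put_strz(nlh, ETHTOOL_A_HEADER_DEV_NAME, "eth0");
mnl_attr_put_u32(nlh, ETHTOOL_A_HEADER_PHY_INDEX, 2);
mnl_attr_nest_end(nlh, nest);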
*/ - ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header, + ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; @@ -175,11 +194,45 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, return -EINVAL; } + if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { + if (dev) { + req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]); + } else { + NL_SET_ERR_MSG_ATTR(extack, header, + "phy_index set without a netdev"); + return -EINVAL; + } + } + req_info->dev = dev; req_info->flags = flags; return 0; } +struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, + const struct nlattr *header, + struct netlink_ext_ack *extack) +{ + struct phy_device *phydev; + + ASSERT_RTNL(); + + if (!req_info->dev) + return NULL; + + if (!req_info->phy_index) + return req_info->dev->phydev; + + phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); + if (!phydev) { + NL_SET_ERR_MSG_ATTR(extack, header, + "no phy matching phyindex"); + return ERR_PTR(-ENODEV); + } + + return phydev; +} + /** * ethnl_fill_reply_header() - Put common header into a reply message * @skb: skb with the message @@ -1128,6 +1181,8 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_RSS_GET, .doit = ethnl_default_doit, + .start = ethnl_rss_dump_start, + .dumpit = ethnl_rss_dumpit, .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, @@ -1179,6 +1234,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_module_fw_flash_act_policy, .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PHY_GET, + .doit = ethnl_phy_doit, + .start = ethnl_phy_start, + .dumpit = ethnl_phy_dumpit, + .done = ethnl_phy_done, + .policy = ethnl_phy_get_policy, + .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 46ec273a87c5..203b08eb6c6f 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -251,6 +251,9 @@ static inline unsigned int ethnl_reply_header_size(void) * @dev: network device the request is for (may be null) * @dev_tracker: refcount tracker for @dev reference * @flags: request flags common for all request types + * @phy_index: index of the phy_device connected to @dev this request is for. + * Can be 0 if the request doesn't target a phy, or if the @dev's attached + * phy is targeted. * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at @@ -260,6 +263,7 @@ struct ethnl_req_info { struct net_device *dev; netdevice_tracker dev_tracker; u32 flags; + u32 phy_index; }; static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) @@ -268,6 +272,27 @@ static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) } /** + * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, + * if any. Must be called under rtnl_lock(). + * @req_info: The ethnl request to get the phy from. + * @header: The netlink header, used for error reporting. + * @extack: The netlink extended ACK, for error reporting. + * + * The caller must hold RTNL, until it's done interacting with the returned + * phy_device. + * + * Return: A phy_device pointer corresponding to the passed phy_index, + * if one is provided.
If not, the phy_device attached to the + * net_device targeted by this request is returned. If there's no + * targeted net_device, or no phy_device is attached, NULL is + * returned. If the provided phy_index is invalid, an error pointer + * is returned. + */ +struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, + const struct nlattr *header, + struct netlink_ext_ack *extack); + +/** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to &ethnl_req_info.dev; in dumps it's different for each @@ -409,9 +434,12 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; +extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; +extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1]; +extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; @@ -449,13 +477,14 @@ extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; -extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; +extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1]; extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1]; extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1]; extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; +extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); @@ -464,6 +493,12 @@ int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); +int ethnl_rss_dump_start(struct netlink_callback *cb); +int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_phy_start(struct netlink_callback *cb); +int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); +int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_phy_done(struct netlink_callback *cb); extern const char
stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c new file mode 100644 index 000000000000..ed8f690f6bac --- /dev/null +++ b/net/ethtool/phy.c @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Bootlin + * + */ +#include "common.h" +#include "netlink.h" + +#include <linux/phy.h> +#include <linux/phy_link_topology.h> +#include <linux/sfp.h> + +struct phy_req_info { + struct ethnl_req_info base; + struct phy_device_node *pdn; +}; + +#define PHY_REQINFO(__req_base) \ + container_of(__req_base, struct phy_req_info, base) + +const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = { + [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +/* Caller holds rtnl */ +static ssize_t +ethnl_phy_reply_size(const struct ethnl_req_info *req_base, + struct netlink_ext_ack *extack) +{ + struct phy_req_info *req_info = PHY_REQINFO(req_base); + struct phy_device_node *pdn = req_info->pdn; + struct phy_device *phydev = pdn->phy; + size_t size = 0; + + ASSERT_RTNL(); + + /* ETHTOOL_A_PHY_INDEX */ + size += nla_total_size(sizeof(u32)); + + /* ETHTOOL_A_PHY_DRVNAME */ + if (phydev->drv) + size += nla_total_size(strlen(phydev->drv->name) + 1); + + /* ETHTOOL_A_PHY_NAME */ + size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1); + + /* ETHTOOL_A_PHY_UPSTREAM_TYPE */ + size += nla_total_size(sizeof(u32)); + + if (phy_on_sfp(phydev)) { + const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus); + + /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ + if (upstream_sfp_name) + size += nla_total_size(strlen(upstream_sfp_name) + 1); + + /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + size += nla_total_size(sizeof(u32)); + } + + /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */ + if (phydev->sfp_bus) { + const char *sfp_name = sfp_get_name(phydev->sfp_bus); + + if (sfp_name) + size += nla_total_size(strlen(sfp_name) + 1); + } + + return size; +} + +static int +ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) +{ + struct phy_req_info *req_info = PHY_REQINFO(req_base); + struct phy_device_node *pdn = req_info->pdn; + struct phy_device *phydev = pdn->phy; + enum phy_upstream ptype; + + ptype = pdn->upstream_type; + + if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) || + nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) || + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype)) + return -EMSGSIZE; + + if (phydev->drv && + nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name)) + return -EMSGSIZE; + + if (ptype == PHY_UPSTREAM_PHY) { + struct phy_device *upstream = pdn->upstream.phydev; + const char *sfp_upstream_name; + + /* Parent index */ + if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex)) + return -EMSGSIZE; + + if (pdn->parent_sfp_bus) { + sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus); + if (sfp_upstream_name && + nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + sfp_upstream_name)) + return -EMSGSIZE; + } + } + + if (phydev->sfp_bus) { + const char *sfp_name = sfp_get_name(phydev->sfp_bus); + + if (sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + sfp_name)) + return -EMSGSIZE; + } + + return 0; + } + +static int ethnl_phy_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct phy_link_topology *topo = req_base->dev->link_topo; + struct phy_req_info
*req_info = PHY_REQINFO(req_base); + struct phy_device *phydev; + + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PHY_HEADER], + extack); + if (!phydev) + return 0; + + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + + if (!topo) + return 0; + + req_info->pdn = xa_load(&topo->phys, phydev->phyindex); + + return 0; +} + +int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct phy_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct sk_buff *rskb; + void *reply_payload; + int reply_len; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info.base, + tb[ETHTOOL_A_PHY_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + rtnl_lock(); + + ret = ethnl_phy_parse_request(&req_info.base, tb, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + + /* No PHY, return early */ + if (!req_info.pdn) + goto err_unlock_rtnl; + + ret = ethnl_phy_reply_size(&req_info.base, info->extack); + if (ret < 0) + goto err_unlock_rtnl; + reply_len = ret + ethnl_reply_header_size(); + + rskb = ethnl_reply_init(reply_len, req_info.base.dev, + ETHTOOL_MSG_PHY_GET_REPLY, + ETHTOOL_A_PHY_HEADER, + info, &reply_payload); + if (!rskb) { + ret = -ENOMEM; + goto err_unlock_rtnl; + } + + ret = ethnl_phy_fill_reply(&req_info.base, rskb); + if (ret) + goto err_free_msg; + + rtnl_unlock(); + ethnl_parse_header_dev_put(&req_info.base); + genlmsg_end(rskb, reply_payload); + + return genlmsg_reply(rskb, info); + +err_free_msg: + nlmsg_free(rskb); +err_unlock_rtnl: + rtnl_unlock(); + ethnl_parse_header_dev_put(&req_info.base); + return ret; +} + +struct ethnl_phy_dump_ctx { + struct phy_req_info *phy_req_info; + unsigned long ifindex; + unsigned long phy_index; +}; + +int ethnl_phy_start(struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ctx->phy_req_info = kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL); + if (!ctx->phy_req_info) + return -ENOMEM; + + ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base, + info->attrs[ETHTOOL_A_PHY_HEADER], + sock_net(cb->skb->sk), cb->extack, + false); + ctx->ifindex = 0; + ctx->phy_index = 0; + + if (ret) + kfree(ctx->phy_req_info); + + return ret; +} + +int ethnl_phy_done(struct netlink_callback *cb) +{ + struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + + if (ctx->phy_req_info->base.dev) + ethnl_parse_header_dev_put(&ctx->phy_req_info->base); + + kfree(ctx->phy_req_info); + + return 0; +} + +static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + struct phy_req_info *pri = ctx->phy_req_info; + struct phy_device_node *pdn; + int ret = 0; + void *ehdr; + + if (!dev->link_topo) + return 0; + + xa_for_each_start(&dev->link_topo->phys, ctx->phy_index, pdn, ctx->phy_index) { + ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_PHY_GET_REPLY); + if (!ehdr) { + ret = -EMSGSIZE; + break; + } + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_PHY_HEADER); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + break; + } + + pri->pdn = pdn; + ret = ethnl_phy_fill_reply(&pri->base, skb); + if (ret < 0) { + genlmsg_cancel(skb, ehdr); + break; + } + + genlmsg_end(skb, ehdr); + } + + return ret; +} + +int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct 
net_device *dev; + int ret = 0; + + rtnl_lock(); + + if (ctx->phy_req_info->base.dev) { + ret = ethnl_phy_dump_one_dev(skb, ctx->phy_req_info->base.dev, cb); + } else { + for_each_netdev_dump(net, dev, ctx->ifindex) { + ret = ethnl_phy_dump_one_dev(skb, dev, cb); + if (ret) + break; + + ctx->phy_index = 0; + } + } + rtnl_unlock(); + + return ret; +} diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index b1e2e3b5027f..d95d92f173a6 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -25,7 +25,7 @@ struct plca_reply_data { const struct nla_policy ethnl_plca_get_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid, @@ -58,10 +58,14 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + info->extack); // check that the PHY device is available and connected - if (!dev->phydev) { + if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } @@ -80,7 +84,7 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, memset(&data->plca_cfg, 0xff, sizeof_field(struct plca_reply_data, plca_cfg)); - ret = ops->get_plca_cfg(dev->phydev, &data->plca_cfg); + ret = ops->get_plca_cfg(phydev, &data->plca_cfg); ethnl_ops_complete(dev); out: @@ -129,7 +133,7 @@ static int plca_get_cfg_fill_reply(struct sk_buff *skb, const struct nla_policy ethnl_plca_set_cfg_policy[] = { [ETHTOOL_A_PLCA_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255), [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255), @@ -141,15 +145,17 @@ const struct nla_policy ethnl_plca_set_cfg_policy[] = { static int ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) { - struct net_device *dev = req_info->dev; const struct ethtool_phy_ops *ops; struct nlattr **tb = info->attrs; struct phy_plca_cfg plca_cfg; + struct phy_device *phydev; bool mod = false; int ret; + phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PLCA_HEADER], + info->extack); // check that the PHY device is available and connected - if (!dev->phydev) + if (IS_ERR_OR_NULL(phydev)) return -EOPNOTSUPP; ops = ethtool_phy_ops; @@ -168,7 +174,7 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) if (!mod) return 0; - ret = ops->set_plca_cfg(dev->phydev, &plca_cfg, info->extack); + ret = ops->set_plca_cfg(phydev, &plca_cfg, info->extack); return ret < 0 ? 
ret : 1; } @@ -191,7 +197,7 @@ const struct ethnl_request_ops ethnl_plca_cfg_request_ops = { const struct nla_policy ethnl_plca_get_status_policy[] = { [ETHTOOL_A_PLCA_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), }; static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, @@ -201,10 +207,14 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct plca_reply_data *data = PLCA_REPDATA(reply_base); struct net_device *dev = reply_base->dev; const struct ethtool_phy_ops *ops; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + info->extack); // check that the PHY device is available and connected - if (!dev->phydev) { + if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; goto out; } @@ -223,7 +233,7 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, memset(&data->plca_st, 0xff, sizeof_field(struct plca_reply_data, plca_st)); - ret = ops->get_plca_status(dev->phydev, &data->plca_st); + ret = ops->get_plca_status(phydev, &data->plca_st); ethnl_ops_complete(dev); out: return ret; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index ff81aa749784..a0705edca22a 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -28,17 +28,15 @@ struct pse_reply_data { /* PSE_GET */ const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = { - [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), }; -static int pse_get_pse_attributes(struct net_device *dev, +static int pse_get_pse_attributes(struct phy_device *phydev, struct netlink_ext_ack *extack, struct pse_reply_data *data) { - struct phy_device *phydev = dev->phydev; - if (!phydev) { - NL_SET_ERR_MSG(extack, "No PHY is attached"); + NL_SET_ERR_MSG(extack, "No PHY found"); return -EOPNOTSUPP; } @@ -58,13 +56,20 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, { struct pse_reply_data *data = PSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - ret = pse_get_pse_attributes(dev, info->extack, data); + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PSE_HEADER], + info->extack); + if (IS_ERR(phydev)) + return -ENODEV; + + ret = pse_get_pse_attributes(phydev, info->extack, data); ethnl_ops_complete(dev); @@ -206,7 +211,7 @@ static void pse_cleanup_data(struct ethnl_reply_data *reply_base) /* PSE_SET */ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { - [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), @@ -217,14 +222,11 @@ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { }; static int -ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) +ethnl_set_pse_validate(struct phy_device *phydev, struct genl_info *info) { - struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct phy_device *phydev; - phydev = dev->phydev; - if (!phydev) { + if (IS_ERR_OR_NULL(phydev)) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; } @@ 
-249,18 +251,21 @@ ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) return -EOPNOTSUPP; } - return 1; + return 0; } static int ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) { - struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; struct phy_device *phydev; - int ret = 0; + int ret; - phydev = dev->phydev; + phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PSE_HEADER], + info->extack); + ret = ethnl_set_pse_validate(phydev, info); + if (ret) + return ret; if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) { unsigned int pw_limit; @@ -307,7 +312,6 @@ const struct ethnl_request_ops ethnl_pse_request_ops = { .fill_reply = pse_fill_reply, .cleanup_data = pse_cleanup_data, - .set_validate = ethnl_set_pse_validate, .set = ethnl_set_pse, /* PSE has no notification */ }; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 5c4c4505ab9a..e07386275e14 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -10,6 +10,7 @@ struct rss_req_info { struct rss_reply_data { struct ethnl_reply_data base; + bool no_key_fields; u32 indir_size; u32 hkey_size; u32 hfunc; @@ -27,6 +28,7 @@ struct rss_reply_data { const struct nla_policy ethnl_rss_get_policy[] = { [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 }, + [ETHTOOL_A_RSS_START_CONTEXT] = { .type = NLA_U32 }, }; static int @@ -37,18 +39,18 @@ rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, if (tb[ETHTOOL_A_RSS_CONTEXT]) request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); + if (tb[ETHTOOL_A_RSS_START_CONTEXT]) { + NL_SET_BAD_ATTR(extack, tb[ETHTOOL_A_RSS_START_CONTEXT]); + return -EINVAL; + } return 0; } static int -rss_prepare_data(const struct ethnl_req_info *req_base, - struct ethnl_reply_data *reply_base, - const struct genl_info *info) +rss_prepare_get(const struct rss_req_info *request, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) { - struct rss_reply_data *data = RSS_REPDATA(reply_base); - struct rss_req_info *request = RSS_REQINFO(req_base); - struct net_device *dev = reply_base->dev; struct ethtool_rxfh_param rxfh = {}; const struct ethtool_ops *ops; u32 total_size, indir_bytes; @@ -56,12 +58,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, int ret; ops = dev->ethtool_ops; - if (!ops->get_rxfh) - return -EOPNOTSUPP; - - /* Some drivers don't handle rss_context */ - if (request->rss_context && !ops->cap_rss_ctx_supported) - return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) @@ -91,7 +87,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, rxfh.indir = data->indir_table; rxfh.key_size = data->hkey_size; rxfh.key = data->hkey; - rxfh.rss_context = request->rss_context; ret = ops->get_rxfh(dev, &rxfh); if (ret) @@ -105,6 +100,67 @@ out_ops: } static int +rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, + struct rss_reply_data *data, const struct genl_info *info) +{ + struct ethtool_rxfh_context *ctx; + u32 total_size, indir_bytes; + u8 *rss_config; + + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); + if (!ctx) + return -ENOENT; + + data->indir_size = ctx->indir_size; + data->hkey_size = ctx->key_size; + data->hfunc = ctx->hfunc; + data->input_xfrm = ctx->input_xfrm; + + indir_bytes = data->indir_size * sizeof(u32); + total_size = indir_bytes + data->hkey_size; + rss_config = kzalloc(total_size, GFP_KERNEL); + if (!rss_config) + return -ENOMEM; + + data->indir_table = (u32 
*)rss_config; + memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx), indir_bytes); + + if (data->hkey_size) { + data->hkey = rss_config + indir_bytes; + memcpy(data->hkey, ethtool_rxfh_context_key(ctx), + data->hkey_size); + } + + return 0; +} + +static int +rss_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_ops *ops; + + ops = dev->ethtool_ops; + if (!ops->get_rxfh) + return -EOPNOTSUPP; + + /* Some drivers don't handle rss_context */ + if (request->rss_context) { + if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) + return -EOPNOTSUPP; + + data->no_key_fields = !ops->rxfh_per_ctx_key; + return rss_prepare_ctx(request, dev, data, info); + } + + return rss_prepare_get(request, dev, data, info); +} + +static int rss_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { @@ -131,13 +187,18 @@ rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, request->rss_context)) return -EMSGSIZE; + if ((data->indir_size && + nla_put(skb, ETHTOOL_A_RSS_INDIR, + sizeof(u32) * data->indir_size, data->indir_table))) + return -EMSGSIZE; + + if (data->no_key_fields) + return 0; + if ((data->hfunc && nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc)) || (data->input_xfrm && nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM, data->input_xfrm)) || - (data->indir_size && - nla_put(skb, ETHTOOL_A_RSS_INDIR, - sizeof(u32) * data->indir_size, data->indir_table)) || (data->hkey_size && nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey))) return -EMSGSIZE; @@ -152,6 +213,146 @@ static void rss_cleanup_data(struct ethnl_reply_data *reply_base) kfree(data->indir_table); } +struct rss_nl_dump_ctx { + unsigned long ifindex; + unsigned long ctx_idx; + + /* User wants to only dump contexts from given ifindex */ + unsigned int match_ifindex; + unsigned int start_ctx; +}; + +static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb) +{ + NL_ASSERT_DUMP_CTX_FITS(struct rss_nl_dump_ctx); + + return (struct rss_nl_dump_ctx *)cb->ctx; +} + +int ethnl_rss_dump_start(struct netlink_callback *cb) +{ + const struct genl_info *info = genl_info_dump(cb); + struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb); + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + int ret; + + /* Filtering by context not supported */ + if (tb[ETHTOOL_A_RSS_CONTEXT]) { + NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]); + return -EINVAL; + } + if (tb[ETHTOOL_A_RSS_START_CONTEXT]) { + ctx->start_ctx = nla_get_u32(tb[ETHTOOL_A_RSS_START_CONTEXT]); + ctx->ctx_idx = ctx->start_ctx; + } + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_RSS_HEADER], + sock_net(cb->skb->sk), cb->extack, + false); + if (req_info.dev) { + ctx->match_ifindex = req_info.dev->ifindex; + ctx->ifindex = ctx->match_ifindex; + ethnl_parse_header_dev_put(&req_info); + req_info.dev = NULL; + } + + return ret; +} + +static int +rss_dump_one_ctx(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, u32 rss_context) +{ + const struct genl_info *info = genl_info_dump(cb); + struct rss_reply_data data = {}; + struct rss_req_info req = {}; + void *ehdr; + int ret; + + req.rss_context = rss_context; + + ehdr = ethnl_dump_put(skb, cb, 
ETHTOOL_MSG_RSS_GET_REPLY); + if (!ehdr) + return -EMSGSIZE; + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_RSS_HEADER); + if (ret < 0) + goto err_cancel; + + /* Context 0 is not currently stored or cached in the XArray */ + if (!rss_context) + ret = rss_prepare_get(&req, dev, &data, info); + else + ret = rss_prepare_ctx(&req, dev, &data, info); + if (ret) + goto err_cancel; + + ret = rss_fill_reply(skb, &req.base, &data.base); + if (ret) + goto err_cleanup; + genlmsg_end(skb, ehdr); + + rss_cleanup_data(&data.base); + return 0; + +err_cleanup: + rss_cleanup_data(&data.base); +err_cancel: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int +rss_dump_one_dev(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev) +{ + struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb); + int ret; + + if (!dev->ethtool_ops->get_rxfh) + return 0; + + if (!ctx->ctx_idx) { + ret = rss_dump_one_ctx(skb, cb, dev, 0); + if (ret) + return ret; + ctx->ctx_idx++; + } + + for (; xa_find(&dev->ethtool->rss_ctx, &ctx->ctx_idx, + ULONG_MAX, XA_PRESENT); ctx->ctx_idx++) { + ret = rss_dump_one_ctx(skb, cb, dev, ctx->ctx_idx); + if (ret) + return ret; + } + ctx->ctx_idx = ctx->start_ctx; + + return 0; +} + +int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb); + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + for_each_netdev_dump(net, dev, ctx->ifindex) { + if (ctx->match_ifindex && ctx->match_ifindex != ctx->ifindex) + break; + + ret = rss_dump_one_dev(skb, cb, dev); + if (ret) + break; + } + rtnl_unlock(); + + return ret; +} + const struct ethnl_request_ops ethnl_rss_request_ops = { .request_cmd = ETHTOOL_MSG_RSS_GET, .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index c678b484a079..b3382b3cf325 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -126,7 +126,7 @@ struct strset_reply_data { const struct nla_policy ethnl_strset_get_policy[] = { [ETHTOOL_A_STRSET_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_phy), [ETHTOOL_A_STRSET_STRINGSETS] = { .type = NLA_NESTED }, [ETHTOOL_A_STRSET_COUNTS_ONLY] = { .type = NLA_FLAG }, }; @@ -233,17 +233,18 @@ static void strset_cleanup_data(struct ethnl_reply_data *reply_base) } static int strset_prepare_set(struct strset_info *info, struct net_device *dev, - unsigned int id, bool counts_only) + struct phy_device *phydev, unsigned int id, + bool counts_only) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; void *strings; int count, ret; - if (id == ETH_SS_PHY_STATS && dev->phydev && + if (id == ETH_SS_PHY_STATS && phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) - ret = phy_ops->get_sset_count(dev->phydev); + ret = phy_ops->get_sset_count(phydev); else if (ops->get_sset_count && ops->get_strings) ret = ops->get_sset_count(dev, id); else @@ -258,10 +259,10 @@ static int strset_prepare_set(struct strset_info *info, struct net_device *dev, strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL); if (!strings) return -ENOMEM; - if (id == ETH_SS_PHY_STATS && dev->phydev && + if (id == ETH_SS_PHY_STATS && phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) - phy_ops->get_strings(dev->phydev, strings); + phy_ops->get_strings(phydev, strings); else ops->get_strings(dev, id, strings); info->strings = strings; @@ -279,6 +280,8 @@
static int strset_prepare_data(const struct ethnl_req_info *req_base, const struct strset_req_info *req_info = STRSET_REQINFO(req_base); struct strset_reply_data *data = STRSET_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; unsigned int i; int ret; @@ -289,14 +292,20 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, for (i = 0; i < ETH_SS_COUNT; i++) { if ((req_info->req_ids & (1U << i)) && data->sets[i].per_dev) { - if (info) - GENL_SET_ERR_MSG(info, "requested per device strings without dev"); + GENL_SET_ERR_MSG(info, "requested per device strings without dev"); return -EINVAL; } } return 0; } + phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_STRSET_HEADER], + info->extack); + + /* phydev can be NULL, check for errors only */ + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + ret = ethnl_ops_begin(dev); if (ret < 0) goto err_strset; @@ -305,7 +314,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, !data->sets[i].per_dev) continue; - ret = strset_prepare_set(&data->sets[i], dev, i, + ret = strset_prepare_set(&data->sets[i], dev, phydev, i, req_info->counts_only); if (ret < 0) goto err_ops; diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 89637e732866..7e46d130dce2 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -153,7 +153,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) if (!req) { err = -EBUSY; trace_handshake_cmd_done_err(net, req, sock->sk, err); - fput(sock->file); + sockfd_put(sock); return err; } @@ -164,7 +164,7 @@ int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info) status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]); handshake_complete(req, status, info); - fput(sock->file); + sockfd_put(sock); return 0; } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e4cc6b78dcfc..ebdfd5b64e17 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -427,6 +427,9 @@ static void hsr_proxy_announce(struct timer_list *t) * of SAN nodes stored in ProxyNodeTable. */ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (!interlink) + goto done; + list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) { if (hsr_addr_is_redbox(hsr, node->macaddress_A)) continue; @@ -441,6 +444,7 @@ static void hsr_proxy_announce(struct timer_list *t) mod_timer(&hsr->announce_proxy_timer, jiffies + interval); } +done: rcu_read_unlock(); } @@ -554,6 +558,12 @@ void hsr_dev_setup(struct net_device *dev) dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; + /* Prevent recursive tx locking */ + dev->lltx = true; + /* Not sure about this. Taken from bridge code. netdevice.h says + * it means "Does not change network namespaces". + */ + dev->netns_local = true; dev->needs_free_netdev = true; @@ -563,16 +573,10 @@ void hsr_dev_setup(struct net_device *dev) dev->features = dev->hw_features; - /* Prevent recursive tx locking */ - dev->features |= NETIF_F_LLTX; /* VLAN on top of HSR needs testing and probably some work on * hsr_header_create() etc. */ dev->features |= NETIF_F_VLAN_CHALLENGED; - /* Not sure about this. Taken from bridge code. netdev_features.h says - * it means "Does not change network namespaces". - */ - dev->features |= NETIF_F_NETNS_LOCAL; } /* Return true if dev is a HSR master; return false otherwise.
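The hsr_dev_setup() hunk above follows the series-wide conversion of NETIF_F_LLTX and NETIF_F_NETNS_LOCAL from netdev feature bits into dedicated net_device booleans. A minimal sketch of the before/after pattern for a virtual device's setup callback; example_setup() is hypothetical and not part of this series:

static void example_setup(struct net_device *dev)
{
	/* before: dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL; */
	dev->lltx = true;		/* lockless transmit, no xmit_lock */
	dev->netns_local = true;	/* must not change network namespace */
}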
@@ -625,7 +629,6 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; - hsr->interlink_sequence_nr = HSR_SEQNR_START; timer_setup(&hsr->announce_timer, hsr_announce, 0); timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0); diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index ab1f8d35d9dc..fcfeb79bb040 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -203,7 +203,6 @@ struct hsr_priv { struct timer_list prune_proxy_timer; int announce_count; u16 sequence_nr; - u16 interlink_sequence_nr; /* Interlink port seq_nr */ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */ spinlock_t seqnr_lock; /* locking for sequence_nr */ diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index af6cf64a00e0..464f683e016d 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -67,7 +67,16 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); skb_reset_mac_len(skb); - hsr_forward_skb(skb, port); + /* Only the frames received over the interlink port will assign a + * sequence number and require synchronisation vs other sender. + */ + if (port->type == HSR_PT_INTERLINK) { + spin_lock_bh(&hsr->seqnr_lock); + hsr_forward_skb(skb, port); + spin_unlock_bh(&hsr->seqnr_lock); + } else { + hsr_forward_skb(skb, port); + } finish_consume: return RX_HANDLER_CONSUMED; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 77b4e92027c5..175efd860f7b 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -116,7 +116,7 @@ static void lowpan_setup(struct net_device *ldev) ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; - ldev->features |= NETIF_F_NETNS_LOCAL; + ldev->netns_local = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 60e8fff1347e..88adb04e4072 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -226,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } if (err) { @@ -242,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); - wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; + wpan_dev->netdev->netns_local = true; } return err; @@ -291,7 +291,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8e94ed7c56a0..6d2c97f8e9ef 
100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -661,7 +661,8 @@ config TCP_CONG_CDG For further details see: D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using - delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg + delay gradients." In Networking 2011. Preprint: + http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf config TCP_CONG_BBR tristate "BBR TCP" diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d96f3e452fef..ab76744383cf 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -216,17 +216,27 @@ static void devinet_sysctl_unregister(struct in_device *idev) /* Locks all the inet devices. */ -static struct in_ifaddr *inet_alloc_ifa(void) +static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT); + struct in_ifaddr *ifa; + + ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); + if (!ifa) + return NULL; + + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + + INIT_HLIST_NODE(&ifa->hash); + + return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); - if (ifa->ifa_dev) - in_dev_put(ifa->ifa_dev); + in_dev_put(ifa->ifa_dev); kfree(ifa); } @@ -574,17 +584,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) ASSERT_RTNL(); - if (!in_dev) { - inet_free_ifa(ifa); - return -ENOBUFS; - } ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); - if (ifa->ifa_dev != in_dev) { - WARN_ON(ifa->ifa_dev); - in_dev_hold(in_dev); - ifa->ifa_dev = in_dev; - } + if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); @@ -701,8 +703,6 @@ errout: return err; } -#define INFINITY_LIFE_TIME 0xFFFFFFFF - static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; @@ -875,7 +875,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (!in_dev) goto errout; - ifa = inet_alloc_ifa(); + ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays @@ -885,19 +885,15 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); - in_dev_hold(in_dev); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; - INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = tb[IFA_FLAGS] ? 
nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags; ifa->ifa_scope = ifm->ifa_scope; - ifa->ifa_dev = in_dev; - ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); @@ -1184,10 +1180,12 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) if (!ifa) { ret = -ENOBUFS; - ifa = inet_alloc_ifa(); + if (!in_dev) + break; + ifa = inet_alloc_ifa(in_dev); if (!ifa) break; - INIT_HLIST_NODE(&ifa->hash); + if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else @@ -1586,16 +1584,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa = inet_alloc_ifa(); + struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { - INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); - in_dev_hold(in_dev); - ifa->ifa_dev = in_dev; ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, @@ -1948,8 +1943,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, @@ -2145,8 +2139,7 @@ void inet_netconf_notify_devconf(struct net *net, int event, int type, rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 47378ca41904..f3281312eb5e 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -115,7 +115,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(sg_page(sg), skb->pp_recycle); + skb_page_unref(page_to_netmem(sg_page(sg)), + skb->pp_recycle); } #ifdef CONFIG_INET_ESPINTCP diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7ad2cafb9276..793e6781399a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -293,7 +293,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, + .flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? 
skb->mark : 0, }; @@ -1343,7 +1343,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos, + .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 5bdd1c016009..b07292d50ee7 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -37,6 +37,7 @@ struct fib4_rule { u8 dst_len; u8 src_len; dscp_t dscp; + u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; __be32 dst; @@ -186,7 +187,15 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, ((daddr ^ r->dst) & r->dstmask)) return 0; - if (r->dscp && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + /* When DSCP selector is used we need to match on the entire DSCP field + * in the flow information structure. When TOS selector is used we need + * to mask the upper three DSCP bits prior to matching to maintain + * legacy behavior. + */ + if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + return 0; + else if (!r->dscp_full && r->dscp && + !fib_dscp_masked_match(r->dscp, fl4)) return 0; if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) @@ -217,6 +226,20 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, + struct netlink_ext_ack *extack) +{ + if (rule4->dscp) { + NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); + return -EINVAL; + } + + rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule4->dscp_full = true; + + return 0; +} + static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, @@ -238,6 +261,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } rule4->dscp = inet_dsfield_to_dscp(frh->tos); + if (tb[FRA_DSCP] && + fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) + goto errout; + /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) @@ -320,9 +347,19 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->dst_len && (rule4->dst_len != frh->dst_len)) return 0; - if (frh->tos && inet_dscp_to_dsfield(rule4->dscp) != frh->tos) + if (frh->tos && + (rule4->dscp_full || + inet_dscp_to_dsfield(rule4->dscp) != frh->tos)) return 0; + if (tb[FRA_DSCP]) { + dscp_t dscp; + + dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); + if (!rule4->dscp_full || rule4->dscp != dscp) + return 0; + } + #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; @@ -344,7 +381,15 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->dst_len = rule4->dst_len; frh->src_len = rule4->src_len; - frh->tos = inet_dscp_to_dsfield(rule4->dscp); + + if (rule4->dscp_full) { + frh->tos = 0; + if (nla_put_u8(skb, FRA_DSCP, + inet_dscp_to_dsfield(rule4->dscp) >> 2)) + goto nla_put_failure; + } else { + frh->tos = inet_dscp_to_dsfield(rule4->dscp); + } if ((rule4->dst_len && nla_put_in_addr(skb, FRA_DST, rule4->dst)) || @@ -366,7 +411,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ - + nla_total_size(4); /* flow */ + + nla_total_size(4) /* flow */ + + nla_total_size(1); /* dscp */ } static void fib4_rule_flush_cache(struct fib_rules_ops 
*ops) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b57cd2b96e2..ba2df3d2ac15 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -543,8 +543,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, info->nlh, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, @@ -2066,8 +2065,7 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) if (fa->fa_slen != slen) continue; - if (fa->fa_dscp && - fa->fa_dscp != inet_dsfield_to_dscp(flp->flowi4_tos)) + if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; if (fa->tb_id != tb->tb_id) continue; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 8f30e3f00b7f..09e31757e96c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1580,8 +1580,7 @@ found: if (index >= (1ul << fa->fa_slen)) continue; } - if (fa->fa_dscp && - inet_dscp_to_dsfield(fa->fa_dscp) != flp->flowi4_tos) + if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 0abbc413e0fe..3e30745e2c09 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -50,7 +50,7 @@ struct fou_net { static inline struct fou *fou_from_sock(struct sock *sk) { - return sk->sk_user_data; + return rcu_dereference_sk_user_data(sk); } static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) @@ -233,9 +233,15 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, struct sk_buff *skb) { const struct net_offload __rcu **offloads; - u8 proto = fou_from_sock(sk)->protocol; + struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; + u8 proto; + + if (!fou) + goto out; + + proto = fou->protocol; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel @@ -263,14 +269,24 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { const struct net_offload __rcu **offloads; - u8 proto = fou_from_sock(sk)->protocol; + struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - int err = -ENOSYS; + u8 proto; + int err; + + if (!fou) { + err = -ENOENT; + goto out; + } + + proto = fou->protocol; offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); - if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { + err = -ENOSYS; goto out; + } err = ops->callbacks.gro_complete(skb, nhoff); @@ -322,6 +338,9 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, skb_gro_remcsum_init(&grc); + if (!fou) + goto out; + off = skb_gro_offset(skb); len = off + sizeof(*guehdr); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ab6d0d98dbc3..e1384e7331d8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -93,6 +93,7 @@ #include <net/ip_fib.h> #include <net/l3mdev.h> #include <net/addrconf.h> +#include <net/inet_dscp.h> #define CREATE_TRACE_POINTS #include <trace/events/icmp.h> @@ -220,61 +221,56 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock(&sk->sk_lock.slock); } -int sysctl_icmp_msgs_per_sec __read_mostly = 1000; -int sysctl_icmp_msgs_burst __read_mostly = 50; - -static struct { - spinlock_t lock; - u32 credit; - u32 stamp; -} icmp_global = { - .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), -}; - /** * icmp_global_allow - Are we allowed to send one more ICMP message ? + * @net: network namespace * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. - * Note: called with BH disabled + * Works in tandem with icmp_global_consume(). */ -bool icmp_global_allow(void) +bool icmp_global_allow(struct net *net) { - u32 credit, delta, incr = 0, now = (u32)jiffies; - bool rc = false; + u32 delta, now, oldstamp; + int incr, new, old; - /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. The READ_ONCE() are paired - * with the following WRITE_ONCE() in this same function. + /* Note: many cpus could find this condition true. + * Then later icmp_global_consume() could consume more credits, + * this is an acceptable race. */ - if (!READ_ONCE(icmp_global.credit)) { - delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); - if (delta < HZ / 50) - return false; - } + if (atomic_read(&net->ipv4.icmp_global_credit) > 0) + return true; - spin_lock(&icmp_global.lock); - delta = min_t(u32, now - icmp_global.stamp, HZ); - if (delta >= HZ / 50) { - incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; - if (incr) - WRITE_ONCE(icmp_global.stamp, now); - } - credit = min_t(u32, icmp_global.credit + incr, - READ_ONCE(sysctl_icmp_msgs_burst)); - if (credit) { - /* We want to use a credit of one in average, but need to randomize - * it for security reasons. - */ - credit = max_t(int, credit - get_random_u32_below(3), 0); - rc = true; + now = jiffies; + oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); + delta = min_t(u32, now - oldstamp, HZ); + if (delta < HZ / 50) + return false; + + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; + if (!incr) + return false; + + if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&net->ipv4.icmp_global_credit); + do { + new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); + } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); } - WRITE_ONCE(icmp_global.credit, credit); - spin_unlock(&icmp_global.lock); - return rc; + return true; } EXPORT_SYMBOL(icmp_global_allow); +void icmp_global_consume(struct net *net) +{ + int credits = get_random_u32_below(3); + + /* Note: this might make icmp_global.credit negative. 
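The rewritten rate limiter above splits admission from accounting: icmp_global_allow() refills and checks the per-netns bucket locklessly (cmpxchg on the stamp, atomic_try_cmpxchg on the credit), while icmp_global_consume() debits a randomized 0-2 credits only once a message was actually sent. A minimal caller sketch of that contract; example_send_icmp() is hypothetical:

static bool example_send_icmp(struct net *net)
{
	if (!icmp_global_allow(net))
		return false;	/* bucket empty: drop, no side effects */

	/* ... build and transmit the ICMP message here ... */

	icmp_global_consume(net);	/* pay ~1 credit on average */
	return true;
}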
*/ + if (credits) + atomic_sub(credits, &net->ipv4.icmp_global_credit); +} +EXPORT_SYMBOL(icmp_global_consume); + static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) @@ -291,14 +287,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code) return false; } -static bool icmpv4_global_allow(struct net *net, int type, int code) +static bool icmpv4_global_allow(struct net *net, int type, int code, + bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; - if (icmp_global_allow()) + if (icmp_global_allow(net)) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -308,15 +306,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) + struct flowi4 *fl4, int type, int code, + bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; int vif; - if (icmpv4_mask_allow(net, type, code)) - goto out; + if (!apply_ratelimit) + return true; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) @@ -331,6 +330,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); + else + icmp_global_consume(net); return rc; } @@ -402,6 +403,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool apply_ratelimit = false; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -413,11 +415,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); - /* global icmp_msgs_per_sec */ - if (!icmpv4_global_allow(net, type, code)) + /* is global icmp_msgs_per_sec exhausted ? 
*/ + if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -443,14 +445,14 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_tos = ip_hdr(skb)->tos & INET_DSCP_MASK; fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: @@ -496,7 +498,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_tos = tos & INET_DSCP_MASK; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -545,7 +547,7 @@ static struct rtable *icmp_route_lookup(struct net *net, orefdst = skb_in->_skb_refdst; /* save old refdst */ skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - RT_TOS(tos), rt2->dst.dev); + tos, rt2->dst.dev); dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); @@ -596,6 +598,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); + bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -677,7 +680,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, } } - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless @@ -685,7 +688,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && - !icmpv4_global_allow(net, type, code)) + !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -744,7 +747,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, goto out_unlock; /* peer icmp_ratelimit */ - if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. 
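Note how apply_ratelimit threads through both icmp_reply() and __icmp_send() above: icmpv4_global_allow() only sets it when the global bucket was actually consulted, so mask-allowed ICMP types (and loopback traffic in __icmp_send()) bypass the per-peer limiter and never pay global credits. A condensed sketch of the gate on the peer side, with the inet_peer lookup elided; example_xrlim_allow() is hypothetical:

static bool example_xrlim_allow(struct net *net, bool apply_ratelimit)
{
	if (!apply_ratelimit)		/* exempt type or loopback */
		return true;
	/* ... per-peer inet_peer token bucket would be checked here ... */
	icmp_global_consume(net);	/* success: debit the global bucket */
	return true;
}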
*/ @@ -1487,6 +1490,8 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_msgs_per_sec = 1000; + net->ipv4.sysctl_icmp_msgs_burst = 50; return 0; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 64d07b842e73..2c5632d4fddb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -236,7 +236,7 @@ static bool inet_bhash2_conflict(const struct sock *sk, #define sk_for_each_bound_bhash(__sk, __tb2, __tb) \ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \ - sk_for_each_bound(sk2, &(__tb2)->owners) + sk_for_each_bound((__sk), &(__tb2)->owners) /* This should be called only when the tb and tb2 hashbuckets' locks are held */ static int inet_csk_bind_conflict(const struct sock *sk, @@ -714,6 +714,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; /* atomically get the memory usage, set and charge the @@ -731,8 +732,8 @@ out: } if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, - GFP_KERNEL | __GFP_NOFAIL); + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + kmem_cache_charge(newsk, gfp); release_sock(newsk); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 9712cdb8087c..67639309163d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -442,7 +442,7 @@ static int inet_twsk_diag_fill(struct sock *sk, inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; - r->idiag_state = tw->tw_substate; + r->idiag_state = READ_ONCE(tw->tw_substate); r->idiag_timer = 3; tmo = tw->tw_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); @@ -1209,7 +1209,7 @@ next_chunk: if (num < s_num) goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_substate : sk->sk_state; + READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 48d0d494185b..9bfcfd016e18 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -310,7 +310,7 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk) return inet_lhash2_bucket(h, hash); } -static inline int compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const __be32 daddr, const int dif, const int sdif) { @@ -348,7 +348,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. 
*/ -struct sock *inet_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, @@ -374,7 +374,7 @@ EXPORT_SYMBOL_GPL(inet_lookup_reuseport); */ /* called with rcu_read_lock() : No refcount taken on the socket */ -static struct sock *inet_lhash2_lookup(struct net *net, +static struct sock *inet_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, @@ -401,7 +401,7 @@ static struct sock *inet_lhash2_lookup(struct net *net, return result; } -struct sock *inet_lookup_run_sk_lookup(struct net *net, +struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, @@ -423,7 +423,7 @@ struct sock *inet_lookup_run_sk_lookup(struct net *net, return sk; } -struct sock *__inet_lookup_listener(struct net *net, +struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, @@ -488,7 +488,7 @@ void sock_edemux(struct sk_buff *skb) } EXPORT_SYMBOL(sock_edemux); -struct sock *__inet_lookup_established(struct net *net, +struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ba205473522e..5f6fd382af38 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> +#include <net/inet_dscp.h> /* Problems & solutions @@ -930,7 +931,7 @@ static int ipgre_open(struct net_device *dev) t->parms.iph.daddr, t->parms.iph.saddr, t->parms.o_key, - RT_TOS(t->parms.iph.tos), + t->parms.iph.tos & INET_DSCP_MASK, t->parms.link); if (IS_ERR(rt)) return -EADDRNOTAVAIL; @@ -996,7 +997,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); - dev->features |= GRE_FEATURES | NETIF_F_LLTX; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 @@ -1010,6 +1011,8 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6fbcbd2358a..b6e7d4921309 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -596,9 +596,8 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; @@ -646,9 +645,8 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); diff --git a/net/ipv4/ip_output.c 
b/net/ipv4/ip_output.c index b90d0f78ac80..49811c9281d4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -77,6 +77,7 @@ #include <net/inetpeer.h> #include <net/inet_ecn.h> #include <net/lwtunnel.h> +#include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> @@ -493,7 +494,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_TOS(tos), + tos & INET_DSCP_MASK, sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; @@ -1621,7 +1622,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, - RT_TOS(arg->tos), + arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5cffad42fe8c..d591c73e2c0e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -43,6 +43,7 @@ #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> +#include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -293,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), dev_net(dev), + iph->tos & INET_DSCP_MASK, dev_net(dev), tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -609,9 +610,9 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, - tunnel_id_to_key32(key->tun_id), RT_TOS(tos), - dev_net(dev), 0, skb->mark, skb_get_hash(skb), - key->flow_flags); + tunnel_id_to_key32(key->tun_id), + tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); @@ -772,7 +773,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.o_key, tos & INET_DSCP_MASK, dev_net(dev), READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); @@ -1161,7 +1162,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. 
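The RT_TOS() to INET_DSCP_MASK conversions repeated through these files are not purely cosmetic: RT_TOS() keeps only the legacy RFC 1349 TOS bits (0x1e), dropping the upper three DSCP bits, whereas INET_DSCP_MASK (0xfc) keeps the full 6-bit DSCP field and clears just the two ECN bits. A worked example with an illustrative value:

	u8 tos = 0xb8;		/* DSCP EF (46) shifted left by two */

	u8 legacy = tos & 0x1e;	/* RT_TOS(): 0x18, EF is mangled */
	u8 dscp = tos & 0xfc;	/* INET_DSCP_MASK: 0xb8, EF preserved */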
*/ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + itn->fb_tunnel_dev->netns_local = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; @@ -1326,7 +1327,7 @@ int ip_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 14536da9f5dc..f0b4419cef34 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -443,7 +443,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 923a2ef68c2f..dc0db5895e0e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -378,7 +378,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 6c750bd13dd8..089864c6a35e 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -62,6 +62,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/rtnh.h> +#include <net/inet_dscp.h> #include <linux/nospec.h> @@ -536,7 +537,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) @@ -1868,7 +1869,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->remote, vif->local, 0, 0, IPPROTO_IPIP, - RT_TOS(iph->tos), vif->link); + iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); @@ -1876,7 +1877,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0, 0, 0, IPPROTO_IPIP, - RT_TOS(iph->tos), vif->link); + iph->tos & INET_DSCP_MASK, vif->link); if (IS_ERR(rt)) goto out_free; } @@ -2080,7 +2081,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), + .flowi4_tos = iph->tos & INET_DSCP_MASK, .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? 
@@ -2406,8 +2407,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, errout: kfree_skb(skb); - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err); } static size_t igmpmsg_netlink_msgsize(size_t payloadlen) diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 591a2737808e..e0aab66cd925 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -14,6 +14,7 @@ #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/inet_dscp.h> #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -43,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 14365b20f1c5..1cdd9c28ab2d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -826,7 +826,7 @@ static int get_info(struct net *net, void __user *user, const int *len) sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; - strcpy(info.name, name); + strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; @@ -1547,7 +1547,7 @@ int arpt_register_table(struct net *net, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index fe89a056eb06..3d101613f27f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len) sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; - strcpy(info.name, name); + strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; @@ -1767,7 +1767,7 @@ int ipt_register_table(struct net *net, const struct xt_table *table, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index ded5bef02f77..1ce7a1655b97 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> +#include <net/inet_dscp.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ip_fib.h> @@ -75,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? 
skb->mark : 0; - flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; + flow.flowi4_tos = iph->tos & INET_DSCP_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 6cc5743c553a..f4aed0789d69 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -15,6 +15,7 @@ #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> +#include <net/inet_dscp.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> @@ -32,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index a522c3a3be52..ef5dd88107dd 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -40,13 +40,13 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) - err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 9eee535c64dd..00da1332bbf1 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,6 +10,7 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> +#include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/route.h> @@ -22,8 +23,6 @@ static __be32 get_saddr(__be32 addr) return addr; } -#define DSCP_BITS 0xfc - void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { @@ -110,7 +109,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = iph->tos & DSCP_BITS; + fl4.flowi4_tos = iph->tos & INET_DSCP_MASK; if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 6b9787ee8601..93aaea0006ba 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -865,15 +865,18 @@ out: } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, - u32 op_flags) + u32 op_flags, u32 *resp_op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; + u16 weight; int i; + *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0; + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) @@ -888,9 +891,12 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { + weight = nhg->nh_entries[i].weight - 1; + *p++ = (struct nexthop_grp) { .id = nhg->nh_entries[i].nh->id, - .weight = nhg->nh_entries[i].weight - 1, + .weight = weight, + 
.weight_high = weight >> 8, }; } @@ -934,10 +940,12 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); + u32 resp_op_flags = 0; if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; - if (nla_put_nh_group(skb, nh, op_flags)) + if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) || + nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags)) goto nla_put_failure; goto out; } @@ -1050,7 +1058,9 @@ static size_t nh_nlmsg_size(struct nexthop *nh) sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) - sz += nh_nlmsg_size_grp(nh); + sz += nh_nlmsg_size_grp(nh) + + nla_total_size(4) + /* NHA_OP_FLAGS */ + 0; else sz += nh_nlmsg_size_single(nh); @@ -1080,8 +1090,7 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); + rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) @@ -1201,8 +1210,7 @@ static void nexthop_bucket_notify(struct nh_res_table *res_table, rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: - if (err < 0) - rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); + rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, @@ -1280,11 +1288,14 @@ static int nh_check_attr_group(struct net *net, nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { - if (nhg[i].resvd1 || nhg[i].resvd2) { - NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); + if (nhg[i].resvd2) { + NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0"); return -EINVAL; } - if (nhg[i].weight > 254) { + if (nexthop_grp_weight(&nhg[i]) == 0) { + /* 0xffff got passed in, representing weight of 0x10000, + * which is too heavy. 
+ */ NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } @@ -1880,9 +1891,9 @@ static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { - int prev_upper_bound = 0; - int total = 0; - int w = 0; + u16 prev_upper_bound = 0; + u32 total = 0; + u32 w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); @@ -1892,11 +1903,12 @@ static void nh_res_group_rebalance(struct nh_group *nhg, for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - int upper_bound; + u16 upper_bound; + u64 btw; w += nhge->weight; - upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, - total); + btw = ((u64)res_table->num_nh_buckets) * w; + upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; @@ -1962,8 +1974,8 @@ static void replace_nexthop_grp_res(struct nh_group *oldg, static void nh_hthr_group_rebalance(struct nh_group *nhg) { - int total = 0; - int w = 0; + u32 total = 0; + u32 w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) @@ -1971,7 +1983,7 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg) for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - int upper_bound; + u32 upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; @@ -2713,7 +2725,8 @@ static struct nexthop *nexthop_create_group(struct net *net, goto out_no_nh; } nhg->nh_entries[i].nh = nhe; - nhg->nh_entries[i].weight = entry[i].weight + 1; + nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]); + list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 13c0f1d455f3..723ac9181558 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -512,7 +512,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, sk->sk_protocol; } - flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, + flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } @@ -541,7 +541,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), - ip_sock_rt_tos(sk) & IPTOS_RT_MASK, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, @@ -1263,7 +1263,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = iph->tos & IPTOS_RT_MASK, + .flowi4_tos = iph->tos & INET_DSCP_MASK, .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -2160,7 +2160,7 @@ int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - tos &= IPTOS_RT_MASK; + tos &= INET_DSCP_MASK; err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); if (err < 0) goto martian_source; @@ -2470,7 +2470,7 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result res; int err; - tos &= IPTOS_RT_MASK; + tos &= INET_DSCP_MASK; rcu_read_lock(); err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); rcu_read_unlock(); @@ -2618,7 +2618,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= IPTOS_RT_MASK; + fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3261,7 +3261,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; + fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK; fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; @@ -3286,7 +3286,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, - rtm->rtm_tos & IPTOS_RT_MASK, dev, + rtm->rtm_tos & INET_DSCP_MASK, dev, &res); rt = skb_rtable(skb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4af0c234d8d7..a79b2a52ce01 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -601,22 +601,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_available_ulp, }, { - .procname = "icmp_msgs_per_sec", - .data = &sysctl_icmp_msgs_per_sec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "icmp_msgs_burst", - .data = &sysctl_icmp_msgs_burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), @@ -702,6 +686,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "icmp_msgs_per_sec", + .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "icmp_msgs_burst", + .data = &init_net.ipv4.sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { .procname = "ping_group_range", .data = &init_net.ipv4.ping_group_range.range, .maxlen = sizeof(gid_t)*2, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e03a342c9162..4f77bd862e95 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -285,6 +285,8 @@ #include <trace/events/tcp.h> #include <net/rps.h> +#include "../core/devmem.h" + /* Track pending CMSGs. 
*/ enum { TCP_CMSG_INQ = 1, @@ -471,6 +473,7 @@ void tcp_init_sock(struct sock *sk) set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); + xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); } EXPORT_SYMBOL(tcp_init_sock); @@ -2160,6 +2163,9 @@ static int tcp_zerocopy_receive(struct sock *sk, skb = tcp_recv_skb(sk, seq, &offset); } + if (!skb_frags_readable(skb)) + break; + if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; @@ -2177,6 +2183,9 @@ static int tcp_zerocopy_receive(struct sock *sk, break; } page = skb_frag_page(frags); + if (WARN_ON_ONCE(!page)) + break; + prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; @@ -2235,6 +2244,7 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); + u32 tsflags = READ_ONCE(sk->sk_tsflags); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { @@ -2274,14 +2284,18 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } - if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) + if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && + (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; @@ -2317,6 +2331,220 @@ static int tcp_inq_hint(struct sock *sk) return inq; } +/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ +struct tcp_xa_pool { + u8 max; /* max <= MAX_SKB_FRAGS */ + u8 idx; /* idx <= max */ + __u32 tokens[MAX_SKB_FRAGS]; + netmem_ref netmems[MAX_SKB_FRAGS]; +}; + +static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) +{ + int i; + + /* Commit part that has been copied to user space. */ + for (i = 0; i < p->idx; i++) + __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, + (__force void *)p->netmems[i], GFP_KERNEL); + /* Rollback what has been pre-allocated and is no longer needed. */ + for (; i < p->max; i++) + __xa_erase(&sk->sk_user_frags, p->tokens[i]); + + p->max = 0; + p->idx = 0; +} + +static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) +{ + if (!p->max) + return; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + xa_unlock_bh(&sk->sk_user_frags); +} + +static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, + unsigned int max_frags) +{ + int err, k; + + if (p->idx < p->max) + return 0; + + xa_lock_bh(&sk->sk_user_frags); + + tcp_xa_pool_commit_locked(sk, p); + + for (k = 0; k < max_frags; k++) { + err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], + XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); + if (err) + break; + } + + xa_unlock_bh(&sk->sk_user_frags); + + p->max = k; + p->idx = 0; + return k ? 0 : err; +} + +/* On error, returns the -errno. On success, returns number of bytes sent to the + * user. May not consume all of @remaining_len. 
+ */ +static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, + unsigned int offset, struct msghdr *msg, + int remaining_len) +{ + struct dmabuf_cmsg dmabuf_cmsg = { 0 }; + struct tcp_xa_pool tcp_xa_pool; + unsigned int start; + int i, copy, n; + int sent = 0; + int err = 0; + + tcp_xa_pool.max = 0; + tcp_xa_pool.idx = 0; + do { + start = skb_headlen(skb); + + if (skb_frags_readable(skb)) { + err = -ENODEV; + goto out; + } + + /* Copy header. */ + copy = start - offset; + if (copy > 0) { + copy = min(copy, remaining_len); + + n = copy_to_iter(skb->data + offset, copy, + &msg->msg_iter); + if (n != copy) { + err = -EFAULT; + goto out; + } + + offset += copy; + remaining_len -= copy; + + /* First a dmabuf_cmsg for # bytes copied to user + * buffer. + */ + memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); + dmabuf_cmsg.frag_size = copy; + err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), &dmabuf_cmsg); + if (err || msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~MSG_CTRUNC; + if (!err) + err = -ETOOSMALL; + goto out; + } + + sent += copy; + + if (remaining_len == 0) + goto out; + } + + /* after that, send information of dmabuf pages through a + * sequence of cmsg + */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct net_iov *niov; + u64 frag_offset; + int end; + + /* !skb_frags_readable() should indicate that ALL the + * frags in this skb are dmabuf net_iovs. We're checking + * for that flag above, but also check individual frags + * here. If the tcp stack is not setting + * skb_frags_readable() correctly, we still don't want + * to crash here. + */ + if (!skb_frag_net_iov(frag)) { + net_err_ratelimited("Found non-dmabuf skb with net_iov"); + err = -ENODEV; + goto out; + } + + niov = skb_frag_net_iov(frag); + end = start + skb_frag_size(frag); + copy = end - offset; + + if (copy > 0) { + copy = min(copy, remaining_len); + + frag_offset = net_iov_virtual_addr(niov) + + skb_frag_off(frag) + offset - + start; + dmabuf_cmsg.frag_offset = frag_offset; + dmabuf_cmsg.frag_size = copy; + err = tcp_xa_pool_refill(sk, &tcp_xa_pool, + skb_shinfo(skb)->nr_frags - i); + if (err) + goto out; + + /* Will perform the exchange later */ + dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; + dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + + offset += copy; + remaining_len -= copy; + + err = put_cmsg(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err || msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~MSG_CTRUNC; + if (!err) + err = -ETOOSMALL; + goto out; + } + + atomic_long_inc(&niov->pp_ref_count); + tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); + + sent += copy; + + if (remaining_len == 0) + goto out; + } + start = end; + } + + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!remaining_len) + goto out; + + /* if remaining_len is not satisfied yet, we need to go to the + * next frag in the frag_list to satisfy remaining_len. + */ + skb = skb_shinfo(skb)->frag_list ?: skb->next; + + offset = offset - start; + } while (skb); + + if (remaining_len) { + err = -EFAULT; + goto out; + } + +out: + tcp_xa_pool_commit(sk, &tcp_xa_pool); + if (!sent) + sent = err; + + return sent; +} + /* * This routine copies from a sock struct into the user buffer. 
* @@ -2330,6 +2558,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); + int last_copied_dmabuf = -1; /* uninitialized */ int copied = 0; u32 peek_seq; u32 *seq; @@ -2509,15 +2738,44 @@ found_ok_skb: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_msg(skb, offset, msg, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; + if (last_copied_dmabuf != -1 && + last_copied_dmabuf != !skb_frags_readable(skb)) break; + + if (skb_frags_readable(skb)) { + err = skb_copy_datagram_msg(skb, offset, msg, + used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } else { + if (!(flags & MSG_SOCK_DEVMEM)) { + /* dmabuf skbs can only be received + * with the MSG_SOCK_DEVMEM flag. + */ + if (!copied) + copied = -EFAULT; + + break; + } + + err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, + used); + if (err <= 0) { + if (!copied) + copied = -EFAULT; + + break; + } + used = err; } } + last_copied_dmabuf = !skb_frags_readable(skb); + WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; @@ -2833,7 +3091,7 @@ void __tcp_close(struct sock *sk, long timeout) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_CLOSE); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); @@ -2908,7 +3166,7 @@ adjudge_to_death: if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_LINGER); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { @@ -2927,7 +3185,7 @@ adjudge_to_death: if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_MEMORY); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { @@ -3025,13 +3283,16 @@ int tcp_disconnect(struct sock *sk, int flags) inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); - } else if (tcp_need_reset(old_state) || - (tp->snd_nxt != tp->write_seq && - (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + } else if (tcp_need_reset(old_state)) { + tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); + WRITE_ONCE(sk->sk_err, ECONNRESET); + } else if (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ - tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED); + tcp_send_active_reset(sk, gfp_any(), + SK_RST_REASON_TCP_DISCONNECT_WITH_DATA); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); @@ -4637,6 +4898,13 @@ int tcp_abort(struct sock *sk, int err) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); + /* Avoid closing the same socket twice. 
*/ + if (sk->sk_state == TCP_CLOSE) { + if (!has_current_bpf_ctx()) + release_sock(sk); + return -ENOENT; + } + if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); @@ -4646,16 +4914,13 @@ int tcp_abort(struct sock *sk, int err) local_bh_disable(); bh_lock_sock(sk); - if (!sock_flag(sk, SOCK_DEAD)) { - if (tcp_need_reset(sk->sk_state)) - tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); - tcp_done_with_error(sk, err); - } + if (tcp_need_reset(sk->sk_state)) + tcp_send_active_reset(sk, GFP_ATOMIC, + SK_RST_REASON_TCP_STATE); + tcp_done_with_error(sk, err); bh_unlock_sock(sk); local_bh_enable(); - tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2..e7658c5d6b79 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -30,7 +30,7 @@ void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, - struct sk_msg *msg, u32 apply_bytes, int flags) + struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; @@ -167,7 +167,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, if (unlikely(!psock)) return -EPIPE; - ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) : + ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; @@ -577,7 +577,7 @@ out_err: err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); - return copied ? copied : err; + return copied > 0 ? copied : err; } enum { diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 52b1f2665dfa..81b96331b2bb 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -185,7 +185,7 @@ static inline void htcp_alpha_update(struct htcp *ca) u32 scale = (HZ << 3) / (10 * minRTT); /* clamping ratio to interval [0.5,10]<<3 */ - scale = min(max(scale, 1U << 2), 10U << 3); + scale = clamp(scale, 1U << 2, 10U << 3); factor = (factor << 3) / scale; if (!factor) factor = 1; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e37488d3453f..9f314dfa1490 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5391,6 +5391,9 @@ restart: for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { n = tcp_skb_next(skb, list); + if (!skb_frags_readable(skb)) + goto skip_this; + /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { skb = tcp_collapse_one(sk, skb, list, root); @@ -5411,17 +5414,20 @@ restart: break; } - if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) && + if (n && n != tail && skb_frags_readable(n) && + tcp_skb_can_collapse_rx(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; } +skip_this: /* Decided to skip this, advance start seq. 
*/ start = TCP_SKB_CB(skb)->end_seq; } if (end_of_skbs || - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || + !skb_frags_readable(skb)) return; __skb_queue_head_init(&tmp); @@ -5463,7 +5469,8 @@ restart: if (!skb || skb == tail || !tcp_skb_can_collapse_rx(nskb, skb) || - (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) || + !skb_frags_readable(skb)) goto end; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd17f25ff288..5afe5e57c89b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -79,6 +79,7 @@ #include <linux/seq_file.h> #include <linux/inetdevice.h> #include <linux/btf_ids.h> +#include <linux/skbuff_ref.h> #include <crypto/hash.h> #include <linux/scatterlist.h> @@ -97,6 +98,8 @@ static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; +static DEFINE_MUTEX(tcp_exit_batch_mutex); + static u32 tcp_v4_init_seq(const struct sk_buff *skb) { return secure_tcp_seq(ip_hdr(skb)->daddr, @@ -118,6 +121,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; + if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) + reuse = 0; + if (reuse == 2) { /* Still does not detect *everything* that goes through * lo, since we require a loopback src or dst address @@ -1068,7 +1074,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) } tcp_v4_send_ack(sk, skb, - tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), @@ -2507,10 +2513,25 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head) } #endif +static void tcp_release_user_frags(struct sock *sk) +{ +#ifdef CONFIG_PAGE_POOL + unsigned long index; + void *netmem; + + xa_for_each(&sk->sk_user_frags, index, netmem) + WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem)); +#endif +} + void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + tcp_release_user_frags(sk); + + xa_destroy(&sk->sk_user_frags); + trace_tcp_destroy_sock(sk); tcp_clear_xmit_timers(sk); @@ -2943,7 +2964,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", - i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } @@ -3514,6 +3535,16 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; + /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work + * and failed setup_net error unwinding path are serialized. + * + * tcp_twsk_purge() handles twsk in any dead netns, not just those in + * net_exit_list, the thread that dismantles a particular twsk must + * do so without other thread progressing to refcount_dec_and_test() of + * tcp_death_row.tw_refcount. 
+ */ + mutex_lock(&tcp_exit_batch_mutex); + tcp_twsk_purge(net_exit_list); list_for_each_entry(net, net_exit_list, exit_list) { @@ -3521,6 +3552,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } + + mutex_unlock(&tcp_exit_batch_mutex); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b01eb6d94413..95669935494e 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -617,9 +617,13 @@ static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, - [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr), }, + [TCP_METRICS_ATTR_ADDR_IPV6] = + NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, + [TCP_METRICS_ATTR_SADDR_IPV6] = + NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + /* Following attributes are not received for GET/DEL, * we keep them for reference */ @@ -811,8 +815,6 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, if (a) { struct in6_addr in6; - if (nla_len(a) != sizeof(struct in6_addr)) - return -EINVAL; in6 = nla_get_in6_addr(a); inetpeer_set_addr_v6(addr, &in6); if (hash) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a19a9dbd3409..bb1fe1ba867a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -52,16 +52,17 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, return TCP_TW_SUCCESS; } -static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq) +static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, + u32 rcv_nxt) { #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao; ao = rcu_dereference(tcptw->ao_info); - if (unlikely(ao && seq < tcptw->tw_rcv_nxt)) + if (unlikely(ao && seq < rcv_nxt)) WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1); #endif - tcptw->tw_rcv_nxt = seq; + WRITE_ONCE(tcptw->tw_rcv_nxt, seq); } /* @@ -98,8 +99,9 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, u32 *tw_isn) { - struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); + struct tcp_options_received tmp_opt; bool paws_reject = false; int ts_recent_stamp; @@ -117,26 +119,26 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, } } - if (tw->tw_substate == TCP_FIN_WAIT2) { + if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcptw->tw_rcv_nxt, - tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) + rcv_nxt, + rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) + if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt)) return TCP_TW_RST; /* Dup ACK? 
*/ if (!th->ack || - !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || + !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; @@ -146,12 +148,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) + TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq); + WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT); + twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq, + rcv_nxt); if (tmp_opt.saw_tstamp) { WRITE_ONCE(tcptw->tw_ts_recent_stamp, @@ -182,7 +185,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ @@ -229,7 +232,7 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (after(TCP_SKB_CB(skb)->seq, rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; @@ -625,6 +628,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); + xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1); + return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 16c48df8df4c..4fd746bd4d54 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2344,7 +2344,8 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb) || - !skb_pure_zcopy_same(skb, next)) + !skb_pure_zcopy_same(skb, next) || + skb_frags_readable(skb) != skb_frags_readable(next)) return false; len -= skb->len; @@ -3264,6 +3265,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; + if (!skb_frags_readable(skb)) + return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -3649,7 +3652,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* skb of trace_tcp_send_reset() keeps the skb that caused RST, * skb here is different to the troublesome skb, so use NULL */ - trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED); + trace_tcp_send_reset(sk, NULL, reason); } /* Send a crossed SYN-ACK during socket establishment. 
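The tcp.c hunks above wire devmem (dmabuf-backed) receive into TCP: unreadable frags are never copied into userspace; instead each frag is exposed as a token in a SO_DEVMEM_DMABUF cmsg, linear header bytes are announced via SO_DEVMEM_LINEAR, and the tcp_xa_pool helpers batch the xarray token allocations that back those cmsgs. Below is a hedged userspace sketch of the consumer side, using only UAPI names visible in this diff (struct dmabuf_cmsg, SO_DEVMEM_LINEAR/SO_DEVMEM_DMABUF, MSG_SOCK_DEVMEM); dmabuf binding setup and token release (SO_DEVMEM_DONTNEED in the final UAPI, not shown in these hunks) are omitted:

/* Sketch only: assumes struct dmabuf_cmsg is exported via <linux/uio.h>
 * and the SO_DEVMEM_* / MSG_SOCK_DEVMEM constants are visible through
 * <sys/socket.h>, as in kernels carrying this series. Error handling
 * is trimmed for brevity.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/uio.h>		/* struct dmabuf_cmsg (assumed location) */

static void recv_devmem_once(int fd)
{
	char data[8192];
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(struct dmabuf_cmsg)) * 16];
	} ctrl;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl.buf, .msg_controllen = sizeof(ctrl.buf),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_SOCK_DEVMEM) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct dmabuf_cmsg *dc = (struct dmabuf_cmsg *)CMSG_DATA(cm);

		if (cm->cmsg_level != SOL_SOCKET)
			continue;
		if (cm->cmsg_type == SO_DEVMEM_LINEAR)
			/* Header bytes were copied into 'data'. */
			printf("linear: %u bytes\n", dc->frag_size);
		else if (cm->cmsg_type == SO_DEVMEM_DMABUF)
			/* Payload stayed in the dmabuf; the token must be
			 * handed back to the kernel once consumed.
			 */
			printf("frag: off=%llu len=%u token=%u id=%u\n",
			       (unsigned long long)dc->frag_offset,
			       dc->frag_size, dc->frag_token, dc->dmabuf_id);
	}
}

Returning tokens in bulk after consumption is exactly the pattern the kernel-side tcp_xa_pool batching anticipates: token allocation and release both happen in runs, so amortizing the xa_lock per batch rather than per frag pays off.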
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 4d40615dc8fc..79064580c8c0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -125,7 +125,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) do_reset = true; if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_ABORT_ON_MEMORY); tcp_done(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; @@ -282,6 +282,7 @@ static int tcp_write_timeout(struct sock *sk) expired = retransmits_timed_out(sk, retry_until, READ_ONCE(icsk->icsk_user_timeout)); tcp_fastopen_active_detect_blackhole(sk, expired); + mptcp_active_detect_blackhole(sk, expired); if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB, @@ -779,7 +780,7 @@ static void tcp_keepalive_timer (struct timer_list *t) goto out; } } - tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED); + tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE); goto death; } @@ -807,7 +808,7 @@ static void tcp_keepalive_timer (struct timer_list *t) (user_timeout == 0 && icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC, - SK_RST_REASON_NOT_SPECIFIED); + SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT); tcp_write_err(sk); goto out; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 49c622e743e8..8accbf4cb295 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -115,6 +115,7 @@ #include <net/addrconf.h> #include <net/udp_tunnel.h> #include <net/gro.h> +#include <net/inet_dscp.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif @@ -365,7 +366,7 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, hash2_nulladdr); } -static int compute_score(struct sock *sk, struct net *net, +static int compute_score(struct sock *sk, const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, int dif, int sdif) @@ -420,7 +421,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, } /* called with rcu_read_lock() */ -static struct sock *udp4_lib_lookup2(struct net *net, +static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, @@ -480,7 +481,7 @@ rescore: /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ -struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, +struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { @@ -561,7 +562,7 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, * Does increment socket refcount. 
*/ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) -struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, +struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { struct sock *sk; @@ -2618,7 +2619,7 @@ int udp_v4_early_demux(struct sk_buff *skb) if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, - iph->tos & IPTOS_RT_MASK, + iph->tos & INET_DSCP_MASK, skb->dev, in_dev, &itag); } return 0; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b254a5dadfcf..d842303587af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -279,7 +279,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return ERR_PTR(-EINVAL); if (unlikely(skb_checksum_start(gso_skb) != - skb_transport_header(gso_skb))) + skb_transport_header(gso_skb) && + !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST))) return ERR_PTR(-EINVAL); /* We don't know if egress device can segment and checksum the packet diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index e4e0fa869fa4..619a53eb672d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -6,6 +6,7 @@ #include <net/dst_metadata.h> #include <net/udp.h> #include <net/udp_tunnel.h> +#include <net/inet_dscp.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -232,7 +233,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = RT_TOS(tos); + fl4.flowi4_tos = tos & INET_DSCP_MASK; fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f70d8757af1a..d680beb91b0a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -92,8 +92,6 @@ #include <linux/export.h> #include <linux/ioam6.h> -#define INFINITY_LIFE_TIME 0xFFFFFFFF - #define IPV6_MAX_STRLEN \ sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255") @@ -239,6 +237,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -302,6 +301,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, .ndisc_evict_nocarrier = 1, .ra_honor_pio_life = 0, + .ra_honor_pio_pflag = 0, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -2762,6 +2762,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) u32 addr_flags = 0; struct inet6_dev *in6_dev; struct net *net = dev_net(dev); + bool ignore_autoconf = false; pinfo = (struct prefix_info *) opt; @@ -2864,7 +2865,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) /* Try to figure out our local address for this prefix */ - if (pinfo->autoconf && in6_dev->cnf.autoconf) { + ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd; + if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) { struct in6_addr addr; bool tokenized = false, dev_addr_generated = false; @@ -5617,8 +5619,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, 
err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err); } static void ipv6_store_devconf(const struct ipv6_devconf *cnf, @@ -6173,8 +6174,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err); } static inline size_t inet6_prefix_nlmsg_size(void) @@ -6241,8 +6241,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err); } static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) @@ -6926,6 +6925,15 @@ static const struct ctl_table addrconf_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "ra_honor_pio_pflag", + .data = &ipv6_devconf.ra_honor_pio_pflag, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, #ifdef CONFIG_IPV6_ROUTER_PREF { .procname = "accept_ra_rtr_pref", diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 90d2c7e3f5e9..ba69b86f1c7d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -708,6 +708,7 @@ const struct proto_ops inet6_stream_ops = { .splice_eof = inet_splice_eof, .sendmsg_locked = tcp_sendmsg_locked, .splice_read = tcp_splice_read, + .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 3920e8aa1031..b2400c226a32 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -132,7 +132,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - skb_page_unref(sg_page(sg), skb->pp_recycle); + skb_page_unref(page_to_netmem(sg_page(sg)), + skb->pp_recycle); } #ifdef CONFIG_INET6_ESPINTCP diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 9e254de7462f..04a9ed5e8310 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -27,6 +27,7 @@ struct fib6_rule { struct rt6key src; struct rt6key dst; dscp_t dscp; + u8 dscp_full:1; /* DSCP or TOS selector */ }; static bool fib6_rule_matchall(const struct fib_rule *rule) @@ -345,6 +346,20 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, return 1; } +static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + if (rule6->dscp) { + NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); + return -EINVAL; + } + + rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule6->dscp_full = true; + + return 0; +} + static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, @@ -361,6 +376,9 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } rule6->dscp = inet_dsfield_to_dscp(frh->tos); + if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) + goto errout; + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); @@ -413,9 +431,19 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; - if (frh->tos && 
inet_dscp_to_dsfield(rule6->dscp) != frh->tos) + if (frh->tos && + (rule6->dscp_full || + inet_dscp_to_dsfield(rule6->dscp) != frh->tos)) return 0; + if (tb[FRA_DSCP]) { + dscp_t dscp; + + dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); + if (!rule6->dscp_full || rule6->dscp != dscp) + return 0; + } + if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; @@ -434,7 +462,15 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; - frh->tos = inet_dscp_to_dsfield(rule6->dscp); + + if (rule6->dscp_full) { + frh->tos = 0; + if (nla_put_u8(skb, FRA_DSCP, + inet_dscp_to_dsfield(rule6->dscp) >> 2)) + goto nla_put_failure; + } else { + frh->tos = inet_dscp_to_dsfield(rule6->dscp); + } if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || @@ -450,7 +486,8 @@ nla_put_failure: static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ - + nla_total_size(16); /* src */ + + nla_total_size(16) /* src */ + + nla_total_size(1); /* dscp */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7b31674644ef..071b0bc1179d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net *net, int type) return false; } -static bool icmpv6_global_allow(struct net *net, int type) +static bool icmpv6_global_allow(struct net *net, int type, + bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) + if (icmp_global_allow(net)) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct net *net, int type) * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) + struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(net, type)) + if (!apply_ratelimit) return true; /* @@ -228,6 +230,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); + else + icmp_global_consume(net); dst_release(dst); return res; } @@ -452,6 +456,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; + bool apply_ratelimit = false; struct dst_entry *dst; struct icmp6hdr tmp_hdr; struct flowi6 fl6; @@ -533,11 +538,12 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } - /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); @@ -575,7 +581,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl6)) + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) goto out; tmp_hdr.icmp6_type = type; @@ -717,6 +723,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) struct ipv6_pinfo *np; const struct 
in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); + bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -781,8 +788,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) goto out; /* Check the ratelimit */ - if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || - !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + if ((!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index ad5f6f6ba333..85b92917849b 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -108,6 +108,7 @@ int ila_lwt_init(void); void ila_lwt_fini(void); int ila_xlat_init_net(struct net *net); +void ila_xlat_pre_exit_net(struct net *net); void ila_xlat_exit_net(struct net *net); int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c index 69caed07315f..976c78efbae1 100644 --- a/net/ipv6/ila/ila_main.c +++ b/net/ipv6/ila/ila_main.c @@ -71,6 +71,11 @@ ila_xlat_init_fail: return err; } +static __net_exit void ila_pre_exit_net(struct net *net) +{ + ila_xlat_pre_exit_net(net); +} + static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); @@ -78,6 +83,7 @@ static __net_exit void ila_exit_net(struct net *net) static struct pernet_operations ila_net_ops = { .init = ila_init_net, + .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 67e8c9440977..534a4498e280 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -619,6 +619,15 @@ int ila_xlat_init_net(struct net *net) return 0; } +void ila_xlat_pre_exit_net(struct net *net) +{ + struct ila_net *ilan = net_generic(net, ila_net_id); + + if (ilan->xlat.hooks_registered) + nf_unregister_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); +} + void ila_xlat_exit_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -626,10 +635,6 @@ void ila_xlat_exit_net(struct net *net) rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL); free_bucket_spinlocks(ilan->xlat.locks); - - if (ilan->xlat.hooks_registered) - nf_unregister_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); } static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 6db71bb1cd30..9ec05e354baa 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -46,7 +46,7 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * * The sockhash lock must be held as a reader here. 
*/ -struct sock *__inet6_lookup_established(struct net *net, +struct sock *__inet6_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, @@ -89,7 +89,7 @@ found: } EXPORT_SYMBOL(__inet6_lookup_established); -static inline int compute_score(struct sock *sk, struct net *net, +static inline int compute_score(struct sock *sk, const struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) @@ -126,7 +126,7 @@ static inline int compute_score(struct sock *sk, struct net *net, * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ -struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, +struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, @@ -147,7 +147,7 @@ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ -static struct sock *inet6_lhash2_lookup(struct net *net, +static struct sock *inet6_lhash2_lookup(const struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -174,7 +174,7 @@ static struct sock *inet6_lhash2_lookup(struct net *net, return result; } -struct sock *inet6_lookup_run_sk_lookup(struct net *net, +struct sock *inet6_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -199,7 +199,7 @@ struct sock *inet6_lookup_run_sk_lookup(struct net *net, } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); -struct sock *inet6_lookup_listener(struct net *net, +struct sock *inet6_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, @@ -243,7 +243,8 @@ done: } EXPORT_SYMBOL_GPL(inet6_lookup_listener); -struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, +struct sock *inet6_lookup(const struct net *net, + struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index bf7120ecea1e..beb6b4cfc551 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -42,8 +42,10 @@ struct ioam6_lwt { struct ioam6_lwt_freq freq; atomic_t pkt_cnt; u8 mode; + bool has_tunsrc; + struct in6_addr tunsrc; struct in6_addr tundst; - struct ioam6_lwt_encap tuninfo; + struct ioam6_lwt_encap tuninfo; }; static const struct netlink_range_validation freq_range = { @@ -72,8 +74,10 @@ static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, IOAM6_IPTUNNEL_MODE_MIN, IOAM6_IPTUNNEL_MODE_MAX), + [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), - [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN( + sizeof(struct ioam6_trace_hdr)), }; static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) @@ -85,7 +89,7 @@ static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | 
trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | - trace->type.bit21) + trace->type.bit21 | trace->type.bit23) return false; trace->nodelen = 0; @@ -143,6 +147,11 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, else mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) { + NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode"); + return -EINVAL; + } + if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); return -EINVAL; @@ -167,19 +176,40 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, ilwt = ioam6_lwt_state(lwt); err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); - if (err) { - kfree(lwt); - return err; - } + if (err) + goto free_lwt; atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; ilwt->mode = mode; - if (tb[IOAM6_IPTUNNEL_DST]) + + if (!tb[IOAM6_IPTUNNEL_SRC]) { + ilwt->has_tunsrc = false; + } else { + ilwt->has_tunsrc = true; + ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]); + + if (ipv6_addr_any(&ilwt->tunsrc)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC], + "invalid tunnel source address"); + err = -EINVAL; + goto free_cache; + } + } + + if (tb[IOAM6_IPTUNNEL_DST]) { ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); + if (ipv6_addr_any(&ilwt->tundst)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST], + "invalid tunnel dest address"); + err = -EINVAL; + goto free_cache; + } + } + tuninfo = ioam6_lwt_info(lwt); tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; tuninfo->pad[0] = IPV6_TLV_PADN; @@ -201,6 +231,11 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, *ts = lwt; return 0; +free_cache: + dst_cache_destroy(&ilwt->cache); +free_lwt: + kfree(lwt); + return err; } static int ioam6_do_fill(struct net *net, struct sk_buff *skb) @@ -256,6 +291,8 @@ static int ioam6_do_inline(struct net *net, struct sk_buff *skb, static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, + bool has_tunsrc, + struct in6_addr *tunsrc, struct in6_addr *tundst) { struct dst_entry *dst = skb_dst(skb); @@ -285,8 +322,12 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdr->nexthdr = NEXTHDR_HOP; hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); hdr->daddr = *tundst; - ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, - IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); + + if (has_tunsrc) + memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); + else + ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); skb_postpush_rcsum(skb, hdr, len); @@ -328,7 +369,9 @@ do_inline: case IOAM6_IPTUNNEL_MODE_ENCAP: do_encap: /* Encapsulation (ip6ip6) */ - err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst); + err = ioam6_do_encap(net, skb, &ilwt->tuninfo, + ilwt->has_tunsrc, &ilwt->tunsrc, + &ilwt->tundst); if (unlikely(err)) goto drop; @@ -414,6 +457,13 @@ static int ioam6_fill_encap_info(struct sk_buff *skb, goto ret; if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) { + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC, + &ilwt->tunsrc); + if (err) + goto ret; + } + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); if (err) goto ret; @@ -435,8 +485,12 @@ static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) nla_total_size(sizeof(ilwt->mode)) + nla_total_size(sizeof(ilwt->tuninfo.traceh)); - if (ilwt->mode != 
IOAM6_IPTUNNEL_MODE_INLINE) + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + if (ilwt->has_tunsrc) + nlsize += nla_total_size(sizeof(ilwt->tunsrc)); + nlsize += nla_total_size(sizeof(ilwt->tundst)); + } return nlsize; } @@ -451,17 +505,21 @@ static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) return (ilwt_a->freq.k != ilwt_b->freq.k || ilwt_a->freq.n != ilwt_b->freq.n || ilwt_a->mode != ilwt_b->mode || + ilwt_a->has_tunsrc != ilwt_b->has_tunsrc || (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || + (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && + ilwt_a->has_tunsrc && + !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc)) || trace_a->namespace_id != trace_b->namespace_id); } static const struct lwtunnel_encap_ops ioam6_iptun_ops = { .build_state = ioam6_build_state, .destroy_state = ioam6_destroy_state, - .output = ioam6_output, + .output = ioam6_output, .fill_encap = ioam6_fill_encap_info, - .get_encap_size = ioam6_encap_nlsize, + .get_encap_size = ioam6_encap_nlsize, .cmp_encap = ioam6_encap_cmp, .owner = THIS_MODULE, }; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3942bd2ade78..235808cfec70 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1471,7 +1471,7 @@ static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - dev->features |= GRE6_FEATURES | NETIF_F_LLTX; + dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 @@ -1485,6 +1485,8 @@ static void ip6gre_tnl_init_features(struct net_device *dev) dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) @@ -1619,8 +1621,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - + ign->fb_tunnel_dev->netns_local = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 133610a49da6..70c0e16c0ae6 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -111,9 +111,8 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk, { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct dst_entry *dst; @@ -327,9 +326,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); - INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ab504d31f0cd..f26841f1490f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -70,11 +70,15 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. 
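 *
 * [Editor's note, not part of the patch: the rcu_read_lock()/unlock()
 * pairs added in the hunks below cover the failure path, where
 * IP6_INC_STATS() dereferences idev after skb_expand_head() has failed
 * and freed the original skb (and with it the dst reference that kept
 * idev alive); idev is RCU-freed, so holding the read lock keeps it
 * valid, per the patch's own "Make sure idev stays alive" comment.
 * A sketch of the resulting idiom:
 *
 *	rcu_read_lock();
 *	skb = skb_expand_head(skb, hh_len);	/* frees skb on failure */
 *	if (!skb) {
 *		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 *		rcu_read_unlock();
 *		return -ENOMEM;
 *	}
 *	rcu_read_unlock();]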
*/ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOMEM; } + rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -283,11 +287,15 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { + /* Make sure idev stays alive */ + rcu_read_lock(); skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); return -ENOBUFS; } + rcu_read_unlock(); } if (opt) { @@ -1956,6 +1964,7 @@ int ip6_send_skb(struct sk_buff *skb) struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); int err; + rcu_read_lock(); err = ip6_local_out(net, skb->sk, skb); if (err) { if (err > 0) @@ -1965,6 +1974,7 @@ int ip6_send_skb(struct sk_buff *skb) IPSTATS_MIB_OUTDISCARDS); } + rcu_read_unlock(); return err; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9dee0c127955..b60e13c42bca 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -53,6 +53,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/dst_metadata.h> +#include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); @@ -608,7 +609,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, - 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + 0, 0, 0, IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; @@ -619,7 +621,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, - IPPROTO_IPIP, RT_TOS(eiph->tos), 0); + IPPROTO_IPIP, + eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); @@ -1507,7 +1510,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { - dev->hard_header_len = tdev->hard_header_len + t_hlen; + dev->needed_headroom = tdev->hard_header_len + + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; @@ -1731,7 +1735,9 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); + int t_hlen; + t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; @@ -1740,10 +1746,10 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { - if (new_mtu > IP6_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { - if (new_mtu > IP_MAX_MTU - dev->hard_header_len) + if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); @@ -1846,7 +1852,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); 
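[Editor's note, not part of the patch: a conversion that recurs throughout
this series replaces the NETIF_F_LLTX feature flag with the new dev->lltx
boolean, and NETIF_F_NETNS_LOCAL with dev->netns_local, freeing two netdev
feature bits. A hedged sketch of the new-style setup for a hypothetical
virtual device; only the two fields and netif_keep_dst() are from this diff:

	static void example_tunnel_setup(struct net_device *dev)
	{
		dev->lltx = true;		/* driver serializes its own TX */
		dev->netns_local = true;	/* device cannot change netns */
		netif_keep_dst(dev);
	}

Being plain struct net_device fields, neither property is reported through
the features API any more.]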
@@ -1887,12 +1893,11 @@ ip6_tnl_dev_init_gen(struct net_device *dev) t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); @@ -2256,7 +2261,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; + ip6n->fb_tnl_dev->netns_local = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index dd342e6ecf3f..2ce4ae0d8dc3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -640,7 +640,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = &reg_vif_netdev_ops; dev->needs_free_netdev = true; - dev->features |= NETIF_F_NETNS_LOCAL; + dev->netns_local = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) @@ -2431,8 +2431,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, errout: kfree_skb(skb); - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index cd342d5015c6..1e225e6489ea 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -985,7 +985,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.setsockopt(sk, level, optname, optval, optlen); + return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; @@ -1475,7 +1475,7 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) - return udp_prot.getsockopt(sk, level, optname, optval, optlen); + return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 7ba01d8cfbae..b244dbf61d5f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -586,7 +586,8 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, const struct in6_addr *group; struct ipv6_mc_socklist *pmc; struct ip6_sf_socklist *psl; - int i, count, copycount; + unsigned int count; + int i, copycount; group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; @@ -610,7 +611,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, psl = sock_dereference(pmc->sflist, sk); count = psl ? psl->sl_count : 0; - copycount = count < gsf->gf_numsrc ?
count : gsf->gf_numsrc; + copycount = min(count, gsf->gf_numsrc); gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b8eec1b6cc2c..aba94a348673 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -200,9 +200,9 @@ static inline int ndisc_is_useropt(const struct net_device *dev, return opt->nd_opt_type == ND_OPT_PREFIX_INFO || opt->nd_opt_type == ND_OPT_RDNSS || opt->nd_opt_type == ND_OPT_DNSSL || + opt->nd_opt_type == ND_OPT_6CO || opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL || - opt->nd_opt_type == ND_OPT_PREF64 || - ndisc_ops_is_useropt(dev, opt->nd_opt_type); + opt->nd_opt_type == ND_OPT_PREF64; } static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev, @@ -1944,7 +1944,7 @@ static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl, static char warncomm[TASK_COMM_LEN]; static int warned; if (strcmp(warncomm, current->comm) && warned < 5) { - strcpy(warncomm, current->comm); + strscpy(warncomm, current->comm); pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n", warncomm, func, dev_name, ctl->procname, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 131f7bb2110d..7d5602950ae7 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1773,7 +1773,7 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, goto out_free; } - ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index c82f3fdd4a65..492a811828a7 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -38,13 +38,13 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_ADDR] == NULL) return -EINVAL; - err = nft_parse_register_load(tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr, sizeof(struct in6_addr)); if (err < 0) return err; if (tb[NFTA_DUP_SREG_DEV]) - err = nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], + err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); return err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 219701caba1e..b4251915585f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -174,7 +174,7 @@ static void rt6_uncached_list_flush_dev(struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; bool handled = false; - if (rt_idev->dev == dev) { + if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; @@ -6193,8 +6193,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, @@ -6220,8 +6219,7 @@ void fib6_rt_update(struct net *net, struct fib6_info *rt, info->nlh, gfp_any()); return; errout: - if (err < 0) - rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 
2c83b7586422..db3c19a42e1c 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -263,10 +263,8 @@ static int rpl_input(struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } + if (unlikely(err)) + goto drop; local_bh_disable(); dst = dst_cache_get(&rlwt->cache); @@ -286,9 +284,13 @@ static int rpl_input(struct sk_buff *skb) err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) - return err; + goto drop; return dst_input(skb); + +drop: + kfree_skb(skb); + return err; } static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 83b195f09561..39bd8951bfca 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,7 @@ #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/inet_dscp.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c @@ -935,8 +936,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, - RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, - 0, dst, tiph->saddr, 0, 0, + tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, + IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); @@ -1111,7 +1112,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, - RT_TOS(iph->tos), + iph->tos & INET_DSCP_MASK, tunnel->parms.link); if (!IS_ERR(rt)) { @@ -1435,7 +1436,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; @@ -1855,7 +1856,7 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ - sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; + sitn->fb_tunnel_dev->netns_local = true; err = register_netdev(sitn->fb_tunnel_dev); if (err) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 200fea92f12f..d71ab4e1efe1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1193,7 +1193,8 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) #endif } - tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, + READ_ONCE(tcptw->tw_rcv_nxt), tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, @@ -2258,7 +2259,7 @@ static void get_timewait6_sock(struct seq_file *seq, src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, - tw->tw_substate, 0, 0, + READ_ONCE(tw->tw_substate), 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6602a2e9cdb5..52dfbb2ff1a8 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -114,7 +114,7 @@ void udp_v6_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -static int compute_score(struct sock *sk, struct net *net, +static int compute_score(struct sock *sk, const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, int dif, int sdif) @@ -160,7 +160,7 @@ static int compute_score(struct sock *sk, struct net *net, } /* called with rcu_read_lock() */ -static struct sock *udp6_lib_lookup2(struct net *net, +static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, @@ -217,7 +217,7 @@ rescore: } /* rcu_read_lock() must be held */ -struct sock *__udp6_lib_lookup(struct net *net, +struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, @@ -300,7 +300,7 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, * Does increment socket refcount. 
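 * [Editor's note, not part of the patch: i.e. a non-NULL result must be
 * released with sock_put() once the caller is done, e.g. (sketch):
 *
 *	sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, dif);
 *	if (sk) {
 *		/* ... use sk ... */
 *		sock_put(sk);
 *	}]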
*/ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6) -struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, +struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { struct sock *sk; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 1e42e13ad24e..d3e9efab7f4b 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -86,13 +86,15 @@ struct device *iucv_alloc_device(const struct attribute_group **attrs, { struct device *dev; va_list vargs; + char buf[20]; int rc; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) goto out_error; va_start(vargs, fmt); - rc = dev_set_name(dev, fmt, vargs); + vsnprintf(buf, sizeof(buf), fmt, vargs); + rc = dev_set_name(dev, "%s", buf); va_end(vargs); if (rc) goto out_error; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 2f191e50d4fc..d4118c796290 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -755,6 +755,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); int err = -EPIPE; + mutex_lock(&kcm->tx_mutex); lock_sock(sk); /* Per tcp_sendmsg this should be in poll */ @@ -926,6 +927,7 @@ partial_message: KCM_STATS_ADD(kcm->stats.tx_bytes, copied); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return copied; out_error: @@ -951,6 +953,7 @@ out_error: sk->sk_write_space(sk); release_sock(sk); + mutex_unlock(&kcm->tx_mutex); return err; } @@ -1204,6 +1207,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) spin_unlock_bh(&mux->lock); INIT_WORK(&kcm->tx_work, kcm_tx_work); + mutex_init(&kcm->tx_mutex); spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 2e86f520f799..3eec23ac5ab1 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -117,12 +117,12 @@ struct l2tp_net { struct hlist_head l2tp_v3_session_htable[16]; }; -static inline u32 l2tp_v2_session_key(u16 tunnel_id, u16 session_id) +static u32 l2tp_v2_session_key(u16 tunnel_id, u16 session_id) { return ((u32)tunnel_id) << 16 | session_id; } -static inline unsigned long l2tp_v3_session_hashkey(struct sock *sk, u32 session_id) +static unsigned long l2tp_v3_session_hashkey(struct sock *sk, u32 session_id) { return ((unsigned long)sk) + session_id; } @@ -135,63 +135,81 @@ static bool l2tp_sk_is_v6(struct sock *sk) } #endif -static inline struct l2tp_net *l2tp_pernet(const struct net *net) +static struct l2tp_net *l2tp_pernet(const struct net *net) { return net_generic(net, l2tp_net_id); } static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) { + struct sock *sk = tunnel->sock; + trace_free_tunnel(tunnel); - sock_put(tunnel->sock); - /* the tunnel is freed in the socket destructor */ + + if (sk) { + /* Disable udp encapsulation */ + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: + /* No longer an encapsulation socket. 
See net/ipv4/udp.c */ + WRITE_ONCE(udp_sk(sk)->encap_type, 0); + udp_sk(sk)->encap_rcv = NULL; + udp_sk(sk)->encap_destroy = NULL; + break; + case L2TP_ENCAPTYPE_IP: + break; + } + + tunnel->sock = NULL; + sock_put(sk); + } + + kfree_rcu(tunnel, rcu); } static void l2tp_session_free(struct l2tp_session *session) { trace_free_session(session); if (session->tunnel) - l2tp_tunnel_dec_refcount(session->tunnel); - kfree(session); + l2tp_tunnel_put(session->tunnel); + kfree_rcu(session, rcu); } -struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk) +struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk) { - struct l2tp_tunnel *tunnel = sk->sk_user_data; + const struct net *net = sock_net(sk); + unsigned long tunnel_id, tmp; + struct l2tp_tunnel *tunnel; + struct l2tp_net *pn; - if (tunnel) - if (WARN_ON(tunnel->magic != L2TP_TUNNEL_MAGIC)) - return NULL; + rcu_read_lock_bh(); + pn = l2tp_pernet(net); + idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { + if (tunnel && + tunnel->sock == sk && + refcount_inc_not_zero(&tunnel->ref_count)) { + rcu_read_unlock_bh(); + return tunnel; + } + } + rcu_read_unlock_bh(); - return tunnel; + return NULL; } EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel); -void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) -{ - refcount_inc(&tunnel->ref_count); -} -EXPORT_SYMBOL_GPL(l2tp_tunnel_inc_refcount); - -void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_put(struct l2tp_tunnel *tunnel) { if (refcount_dec_and_test(&tunnel->ref_count)) l2tp_tunnel_free(tunnel); } -EXPORT_SYMBOL_GPL(l2tp_tunnel_dec_refcount); +EXPORT_SYMBOL_GPL(l2tp_tunnel_put); -void l2tp_session_inc_refcount(struct l2tp_session *session) -{ - refcount_inc(&session->ref_count); -} -EXPORT_SYMBOL_GPL(l2tp_session_inc_refcount); - -void l2tp_session_dec_refcount(struct l2tp_session *session) +void l2tp_session_put(struct l2tp_session *session) { if (refcount_dec_and_test(&session->ref_count)) l2tp_session_free(session); } -EXPORT_SYMBOL_GPL(l2tp_session_dec_refcount); +EXPORT_SYMBOL_GPL(l2tp_session_put); /* Lookup a tunnel. A new reference is held on the returned tunnel. 
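 * [Editor's note, not part of the patch: with the renamed helpers the
 * lookup reference is dropped via l2tp_tunnel_put(), e.g. (sketch):
 *
 *	tunnel = l2tp_tunnel_get(net, tunnel_id);
 *	if (tunnel) {
 *		/* ... use tunnel ... */
 *		l2tp_tunnel_put(tunnel);
 *	}]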
*/ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) @@ -211,26 +229,27 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) } EXPORT_SYMBOL_GPL(l2tp_tunnel_get); -struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth) +struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key) { struct l2tp_net *pn = l2tp_pernet(net); - unsigned long tunnel_id, tmp; - struct l2tp_tunnel *tunnel; - int count = 0; + struct l2tp_tunnel *tunnel = NULL; rcu_read_lock_bh(); - idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) { - if (tunnel && ++count > nth && - refcount_inc_not_zero(&tunnel->ref_count)) { +again: + tunnel = idr_get_next_ul(&pn->l2tp_tunnel_idr, key); + if (tunnel) { + if (refcount_inc_not_zero(&tunnel->ref_count)) { rcu_read_unlock_bh(); return tunnel; } + (*key)++; + goto again; } rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth); +EXPORT_SYMBOL_GPL(l2tp_tunnel_get_next); struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id) { @@ -254,7 +273,15 @@ struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session, hlist, key) { - if (session->tunnel->sock == sk && + /* session->tunnel may be NULL if another thread is in + * l2tp_session_register and has added an item to + * l2tp_v3_session_htable but hasn't yet added the + * session to its tunnel's session_list. + */ + struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); + + if (session->session_id == session_id && + tunnel && tunnel->sock == sk && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; @@ -295,24 +322,109 @@ struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, in } EXPORT_SYMBOL_GPL(l2tp_session_get); -struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth) +static struct l2tp_session *l2tp_v2_session_get_next(const struct net *net, + u16 tid, + unsigned long *key) { - struct l2tp_session *session; - int count = 0; + struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_session *session = NULL; + + /* Start searching within the range of the tid */ + if (*key == 0) + *key = l2tp_v2_session_key(tid, 0); rcu_read_lock_bh(); - list_for_each_entry_rcu(session, &tunnel->session_list, list) { - if (++count > nth) { - l2tp_session_inc_refcount(session); +again: + session = idr_get_next_ul(&pn->l2tp_v2_session_idr, key); + if (session) { + struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); + + /* ignore sessions with id 0 as they are internal for pppol2tp */ + if (session->session_id == 0) { + (*key)++; + goto again; + } + + if (tunnel->tunnel_id == tid && + refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock_bh(); return session; } + + (*key)++; + if (tunnel->tunnel_id == tid) + goto again; } rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(l2tp_session_get_nth); + +static struct l2tp_session *l2tp_v3_session_get_next(const struct net *net, + u32 tid, struct sock *sk, + unsigned long *key) +{ + struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_session *session = NULL; + + rcu_read_lock_bh(); +again: + session = idr_get_next_ul(&pn->l2tp_v3_session_idr, key); + if (session && !hash_hashed(&session->hlist)) { + struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); + + if (tunnel && tunnel->tunnel_id == tid && + refcount_inc_not_zero(&session->ref_count)) { + 
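			/* [Editor's note, not part of the patch:
			 * refcount_inc_not_zero() is the usual RCU-lookup
			 * guard. An entry observed under rcu_read_lock_bh()
			 * may already be dying; the conditional increment
			 * fails once the refcount has reached zero, so only
			 * live sessions are handed back to the caller.]
			 */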
rcu_read_unlock_bh(); + return session; + } + + (*key)++; + goto again; + } + + /* If we get here and session is non-NULL, the IDR entry may be one + * where the session_id collides with one in another tunnel. Check + * session_htable for a match. There can only be one session of a given + * ID per tunnel so we can return as soon as a match is found. + */ + if (session && hash_hashed(&session->hlist)) { + unsigned long hkey = l2tp_v3_session_hashkey(sk, session->session_id); + u32 sid = session->session_id; + + hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session, + hlist, hkey) { + struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel); + + if (session->session_id == sid && + tunnel && tunnel->tunnel_id == tid && + refcount_inc_not_zero(&session->ref_count)) { + rcu_read_unlock_bh(); + return session; + } + } + + /* If no match found, the colliding session ID isn't in our + * tunnel so try the next session ID. + */ + (*key)++; + goto again; + } + + rcu_read_unlock_bh(); + + return NULL; +} + +struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver, + u32 tunnel_id, unsigned long *key) +{ + if (pver == L2TP_HDR_VER_2) + return l2tp_v2_session_get_next(net, tunnel_id, key); + else + return l2tp_v3_session_get_next(net, tunnel_id, sk, key); +} +EXPORT_SYMBOL_GPL(l2tp_session_get_next); /* Lookup a session by interface name. * This is very inefficient but is only used by management interfaces. @@ -330,7 +442,7 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, if (tunnel) { list_for_each_entry_rcu(session, &tunnel->session_list, list) { if (!strcmp(session->ifname, ifname)) { - l2tp_session_inc_refcount(session); + refcount_inc(&session->ref_count); rcu_read_unlock_bh(); return session; @@ -347,7 +459,7 @@ EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname); static void l2tp_session_coll_list_add(struct l2tp_session_coll_list *clist, struct l2tp_session *session) { - l2tp_session_inc_refcount(session); + refcount_inc(&session->ref_count); WARN_ON_ONCE(session->coll_list); session->coll_list = clist; spin_lock(&clist->lock); @@ -387,12 +499,12 @@ static int l2tp_session_collision_add(struct l2tp_net *pn, /* If existing session isn't already in the session hlist, add it. */ if (!hash_hashed(&session2->hlist)) - hash_add(pn->l2tp_v3_session_htable, &session2->hlist, - session2->hlist_key); + hash_add_rcu(pn->l2tp_v3_session_htable, &session2->hlist, + session2->hlist_key); /* Add new session to the hlist and collision list */ - hash_add(pn->l2tp_v3_session_htable, &session1->hlist, - session1->hlist_key); + hash_add_rcu(pn->l2tp_v3_session_htable, &session1->hlist, + session1->hlist_key); refcount_inc(&clist->ref_count); l2tp_session_coll_list_add(clist, session1); @@ -408,7 +520,7 @@ static void l2tp_session_collision_del(struct l2tp_net *pn, lockdep_assert_held(&pn->l2tp_session_idr_lock); - hash_del(&session->hlist); + hash_del_rcu(&session->hlist); if (clist) { /* Remove session from its collision list. 
If there @@ -433,7 +545,7 @@ static void l2tp_session_collision_del(struct l2tp_net *pn, spin_unlock(&clist->lock); if (refcount_dec_and_test(&clist->ref_count)) kfree(clist); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); } } @@ -442,6 +554,7 @@ int l2tp_session_register(struct l2tp_session *session, { struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net); struct l2tp_session *other_session = NULL; + void *old = NULL; u32 session_key; int err; @@ -481,16 +594,23 @@ int l2tp_session_register(struct l2tp_session *session, goto out; } - l2tp_tunnel_inc_refcount(tunnel); - list_add(&session->list, &tunnel->session_list); + refcount_inc(&tunnel->ref_count); + WRITE_ONCE(session->tunnel, tunnel); + list_add_rcu(&session->list, &tunnel->session_list); + /* this makes session available to lockless getters */ if (tunnel->version == L2TP_HDR_VER_3) { if (!other_session) - idr_replace(&pn->l2tp_v3_session_idr, session, session_key); + old = idr_replace(&pn->l2tp_v3_session_idr, session, session_key); } else { - idr_replace(&pn->l2tp_v2_session_idr, session, session_key); + old = idr_replace(&pn->l2tp_v2_session_idr, session, session_key); } + /* old should be NULL, unless something removed or modified + * the IDR entry after our idr_alloc_32 above (which shouldn't + * happen). + */ + WARN_ON_ONCE(old); out: spin_unlock_bh(&pn->l2tp_session_idr_lock); spin_unlock_bh(&tunnel->list_lock); @@ -797,7 +917,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, if (!session->lns_mode && !session->send_seq) { trace_session_seqnum_lns_enable(session); session->send_seq = 1; - l2tp_session_set_header_len(session, tunnel->version); + l2tp_session_set_header_len(session, tunnel->version, + tunnel->encap); } } else { /* No sequence numbers. @@ -818,7 +939,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, if (!session->lns_mode && session->send_seq) { trace_session_seqnum_lns_disable(session); session->send_seq = 0; - l2tp_session_set_header_len(session, tunnel->version); + l2tp_session_set_header_len(session, tunnel->version, + tunnel->encap); } else if (session->send_seq) { pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n", session->name); @@ -955,7 +1077,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (!session || !session->recv_skb) { if (session) - l2tp_session_dec_refcount(session); + l2tp_session_put(session); /* Not found? Pass to userspace to deal with */ goto pass; @@ -969,12 +1091,12 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) if (version == L2TP_HDR_VER_3 && l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { - l2tp_session_dec_refcount(session); + l2tp_session_put(session); goto invalid; } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return 0; @@ -1218,44 +1340,6 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); * Tunnel and session create/destroy. *****************************************************************************/ -/* Tunnel socket destruct hook. - * The tunnel context is deleted only when all session sockets have been - * closed. - */ -static void l2tp_tunnel_destruct(struct sock *sk) -{ - struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); - - if (!tunnel) - goto end; - - /* Disable udp encapsulation */ - switch (tunnel->encap) { - case L2TP_ENCAPTYPE_UDP: - /* No longer an encapsulation socket.
See net/ipv4/udp.c */ - WRITE_ONCE(udp_sk(sk)->encap_type, 0); - udp_sk(sk)->encap_rcv = NULL; - udp_sk(sk)->encap_destroy = NULL; - break; - case L2TP_ENCAPTYPE_IP: - break; - } - - /* Remove hooks into tunnel socket */ - write_lock_bh(&sk->sk_callback_lock); - sk->sk_destruct = tunnel->old_sk_destruct; - sk->sk_user_data = NULL; - write_unlock_bh(&sk->sk_callback_lock); - - /* Call the original destructor */ - if (sk->sk_destruct) - (*sk->sk_destruct)(sk); - - kfree_rcu(tunnel, rcu); -end: - return; -} - /* Remove an l2tp session from l2tp_core's lists. */ static void l2tp_session_unhash(struct l2tp_session *session) { @@ -1288,8 +1372,6 @@ static void l2tp_session_unhash(struct l2tp_session *session) spin_unlock_bh(&pn->l2tp_session_idr_lock); spin_unlock_bh(&tunnel->list_lock); - - synchronize_rcu(); } } @@ -1301,28 +1383,21 @@ static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel) spin_lock_bh(&tunnel->list_lock); tunnel->acpt_newsess = false; - for (;;) { - session = list_first_entry_or_null(&tunnel->session_list, - struct l2tp_session, list); - if (!session) - break; - l2tp_session_inc_refcount(session); - list_del_init(&session->list); - spin_unlock_bh(&tunnel->list_lock); + list_for_each_entry(session, &tunnel->session_list, list) l2tp_session_delete(session); - spin_lock_bh(&tunnel->list_lock); - l2tp_session_dec_refcount(session); - } spin_unlock_bh(&tunnel->list_lock); } /* Tunnel socket destroy hook for UDP encapsulation */ static void l2tp_udp_encap_destroy(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); + struct l2tp_tunnel *tunnel; - if (tunnel) + tunnel = l2tp_sk_to_tunnel(sk); + if (tunnel) { l2tp_tunnel_delete(tunnel); + l2tp_tunnel_put(tunnel); + } } static void l2tp_tunnel_remove(struct net *net, struct l2tp_tunnel *tunnel) @@ -1356,10 +1431,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) l2tp_tunnel_remove(tunnel->l2tp_net, tunnel); /* drop initial ref */ - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); /* drop workqueue ref */ - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); } /* Create a socket for the tunnel, if one isn't set up by @@ -1505,7 +1580,6 @@ int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; - tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); spin_lock_init(&tunnel->list_lock); tunnel->acpt_newsess = true; @@ -1531,6 +1605,8 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); static int l2tp_validate_socket(const struct sock *sk, const struct net *net, enum l2tp_encap_type encap) { + struct l2tp_tunnel *tunnel; + if (!net_eq(sock_net(sk), net)) return -EINVAL; @@ -1544,8 +1620,14 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net, (encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP)) return -EPROTONOSUPPORT; - if (sk->sk_user_data) + if (encap == L2TP_ENCAPTYPE_UDP && sk->sk_user_data) + return -EBUSY; + + tunnel = l2tp_sk_to_tunnel(sk); + if (tunnel) { + l2tp_tunnel_put(tunnel); return -EBUSY; + } return 0; } @@ -1584,12 +1666,10 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, ret = l2tp_validate_socket(sk, net, tunnel->encap); if (ret < 0) goto err_inval_sock; - rcu_assign_sk_user_data(sk, tunnel); write_unlock_bh(&sk->sk_callback_lock); if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { struct udp_tunnel_sock_cfg udp_cfg = { - .sk_user_data = tunnel, .encap_type = UDP_ENCAP_L2TPINUDP, .encap_rcv = 
l2tp_udp_encap_recv, .encap_err_rcv = l2tp_udp_encap_err_recv, @@ -1599,8 +1679,6 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, setup_udp_tunnel_sock(net, sock, &udp_cfg); } - tunnel->old_sk_destruct = sk->sk_destruct; - sk->sk_destruct = &l2tp_tunnel_destruct; sk->sk_allocation = GFP_ATOMIC; release_sock(sk); @@ -1639,7 +1717,7 @@ void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { if (!test_and_set_bit(0, &tunnel->dead)) { trace_delete_tunnel(tunnel); - l2tp_tunnel_inc_refcount(tunnel); + refcount_inc(&tunnel->ref_count); queue_work(l2tp_wq, &tunnel->del_work); } } @@ -1647,23 +1725,37 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); void l2tp_session_delete(struct l2tp_session *session) { - if (test_and_set_bit(0, &session->dead)) - return; + if (!test_and_set_bit(0, &session->dead)) { + trace_delete_session(session); + refcount_inc(&session->ref_count); + queue_work(l2tp_wq, &session->del_work); + } +} +EXPORT_SYMBOL_GPL(l2tp_session_delete); + +/* Workqueue session deletion function */ +static void l2tp_session_del_work(struct work_struct *work) +{ + struct l2tp_session *session = container_of(work, struct l2tp_session, + del_work); - trace_delete_session(session); l2tp_session_unhash(session); l2tp_session_queue_purge(session); if (session->session_close) (*session->session_close)(session); - l2tp_session_dec_refcount(session); + /* drop initial ref */ + l2tp_session_put(session); + + /* drop workqueue ref */ + l2tp_session_put(session); } -EXPORT_SYMBOL_GPL(l2tp_session_delete); /* We come here whenever a session's send_seq, cookie_len or * l2specific_type parameters are set. */ -void l2tp_session_set_header_len(struct l2tp_session *session, int version) +void l2tp_session_set_header_len(struct l2tp_session *session, int version, + enum l2tp_encap_type encap) { if (version == L2TP_HDR_VER_2) { session->hdr_len = 6; @@ -1672,7 +1764,7 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version) } else { session->hdr_len = 4 + session->cookie_len; session->hdr_len += l2tp_get_l2specific_len(session); - if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP) + if (encap == L2TP_ENCAPTYPE_UDP) session->hdr_len += 4; } } @@ -1686,7 +1778,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL); if (session) { session->magic = L2TP_SESSION_MAGIC; - session->tunnel = tunnel; session->session_id = session_id; session->peer_session_id = peer_session_id; @@ -1710,6 +1801,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn INIT_HLIST_NODE(&session->hlist); INIT_LIST_HEAD(&session->clist); INIT_LIST_HEAD(&session->list); + INIT_WORK(&session->del_work, l2tp_session_del_work); if (cfg) { session->pwtype = cfg->pw_type; @@ -1724,7 +1816,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len); } - l2tp_session_set_header_len(session, tunnel->version); + l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); refcount_set(&session->ref_count, 1); @@ -1753,7 +1845,7 @@ static __net_init int l2tp_init_net(struct net *net) return 0; } -static __net_exit void l2tp_exit_net(struct net *net) +static __net_exit void l2tp_pre_exit_net(struct net *net) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel = NULL; @@ -1766,18 +1858,34 @@ static __net_exit void l2tp_exit_net(struct net *net) } rcu_read_unlock_bh(); - if 
(l2tp_wq) - flush_workqueue(l2tp_wq); - rcu_barrier(); + if (l2tp_wq) { + /* Run all TUNNEL_DELETE work items just queued. */ + __flush_workqueue(l2tp_wq); + + /* Each TUNNEL_DELETE work item will queue a SESSION_DELETE + * work item for each session in the tunnel. Flush the + * workqueue again to process these. + */ + __flush_workqueue(l2tp_wq); + } +} + +static __net_exit void l2tp_exit_net(struct net *net) +{ + struct l2tp_net *pn = l2tp_pernet(net); + WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_v2_session_idr)); idr_destroy(&pn->l2tp_v2_session_idr); + WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_v3_session_idr)); idr_destroy(&pn->l2tp_v3_session_idr); + WARN_ON_ONCE(!idr_is_empty(&pn->l2tp_tunnel_idr)); idr_destroy(&pn->l2tp_tunnel_idr); } static struct pernet_operations l2tp_net_ops = { .init = l2tp_init_net, .exit = l2tp_exit_net, + .pre_exit = l2tp_pre_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), }; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 8ac81bc1bc6f..ffd8ced3a51f 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -16,7 +16,6 @@ #endif /* Random numbers used for internal consistency checks of tunnel and session structures */ -#define L2TP_TUNNEL_MAGIC 0x42114DDA #define L2TP_SESSION_MAGIC 0x0C04EB7D struct sk_buff; @@ -67,6 +66,7 @@ struct l2tp_session_coll_list { struct l2tp_session { int magic; /* should be L2TP_SESSION_MAGIC */ long dead; + struct rcu_head rcu; struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */ u32 session_id; @@ -103,6 +103,7 @@ struct l2tp_session { int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; struct l2tp_stats stats; + struct work_struct del_work; /* Session receive handler for data packets. * Each pseudowire implementation should implement this callback in order to @@ -155,8 +156,6 @@ struct l2tp_tunnel_cfg { */ #define L2TP_TUNNEL_NAME_MAX 20 struct l2tp_tunnel { - int magic; /* Should be L2TP_TUNNEL_MAGIC */ - unsigned long dead; struct rcu_head rcu; @@ -176,7 +175,6 @@ struct l2tp_tunnel { struct net *l2tp_net; /* the net we belong to */ refcount_t ref_count; - void (*old_sk_destruct)(struct sock *sk); struct sock *sock; /* parent socket */ int fd; /* parent fd, if tunnel socket was created * by userspace @@ -211,23 +209,22 @@ static inline void *l2tp_session_priv(struct l2tp_session *session) } /* Tunnel and session refcounts */ -void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel); -void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel); -void l2tp_session_inc_refcount(struct l2tp_session *session); -void l2tp_session_dec_refcount(struct l2tp_session *session); +void l2tp_tunnel_put(struct l2tp_tunnel *tunnel); +void l2tp_session_put(struct l2tp_session *session); /* Tunnel and session lookup. * These functions take a reference on the instances they return, so * the caller must ensure that the reference is dropped appropriately. 
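 * [Editor's note, not part of the patch: the *_get_next() walkers added
 * here take a cursor key which the caller advances between calls, as the
 * debugfs code later in this series does, e.g. (sketch):
 *
 *	unsigned long key = 0;
 *
 *	while ((tunnel = l2tp_tunnel_get_next(net, &key)) != NULL) {
 *		/* ... use tunnel ... */
 *		l2tp_tunnel_put(tunnel);
 *		key++;
 *	}]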
*/ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); -struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth); +struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key); struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id); struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id); struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver, u32 tunnel_id, u32 session_id); -struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth); +struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver, + u32 tunnel_id, unsigned long *key); struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, const char *ifname); @@ -260,7 +257,8 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb, int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb); /* Transmit path helpers for sending packets over the tunnel socket. */ -void l2tp_session_set_header_len(struct l2tp_session *session, int version); +void l2tp_session_set_header_len(struct l2tp_session *session, int version, + enum l2tp_encap_type encap); int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb); /* Pseudowire management. @@ -273,10 +271,7 @@ void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ int l2tp_ioctl(struct sock *sk, int cmd, int *karg); -/* Extract the tunnel structure from a socket's sk_user_data pointer, - * validating the tunnel magic feather. - */ -struct l2tp_tunnel *l2tp_sk_to_tunnel(struct sock *sk); +struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk); static inline int l2tp_get_l2specific_len(struct l2tp_session *session) { diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 8755ae521154..2d0c8275a3a8 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -34,8 +34,8 @@ static struct dentry *rootdir; struct l2tp_dfs_seq_data { struct net *net; netns_tracker ns_tracker; - int tunnel_idx; /* current tunnel */ - int session_idx; /* index of session within current tunnel */ + unsigned long tkey; /* lookup key of current tunnel */ + unsigned long skey; /* lookup key of current session */ struct l2tp_tunnel *tunnel; struct l2tp_session *session; /* NULL means get next tunnel */ }; @@ -44,23 +44,25 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->tunnel) - l2tp_tunnel_dec_refcount(pd->tunnel); + l2tp_tunnel_put(pd->tunnel); - pd->tunnel = l2tp_tunnel_get_nth(pd->net, pd->tunnel_idx); - pd->tunnel_idx++; + pd->tunnel = l2tp_tunnel_get_next(pd->net, &pd->tkey); + pd->tkey++; } static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->session) - l2tp_session_dec_refcount(pd->session); + l2tp_session_put(pd->session); - pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); - pd->session_idx++; + pd->session = l2tp_session_get_next(pd->net, pd->tunnel->sock, + pd->tunnel->version, + pd->tunnel->tunnel_id, &pd->skey); + pd->skey++; if (!pd->session) { - pd->session_idx = 0; + pd->skey = 0; l2tp_dfs_next_tunnel(pd); } } @@ -109,11 +111,11 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v) * or l2tp_dfs_next_tunnel(). 
*/ if (pd->session) { - l2tp_session_dec_refcount(pd->session); + l2tp_session_put(pd->session); pd->session = NULL; } if (pd->tunnel) { - l2tp_tunnel_dec_refcount(pd->tunnel); + l2tp_tunnel_put(pd->tunnel); pd->tunnel = NULL; } } diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 8ba00ad433c2..d692b902e120 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -72,31 +72,19 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev unsigned int len = skb->len; int ret = l2tp_xmit_skb(session, skb); - if (likely(ret == NET_XMIT_SUCCESS)) { - DEV_STATS_ADD(dev, tx_bytes, len); - DEV_STATS_INC(dev, tx_packets); - } else { + if (likely(ret == NET_XMIT_SUCCESS)) + dev_sw_netstats_tx_add(dev, 1, len); + else DEV_STATS_INC(dev, tx_dropped); - } - return NETDEV_TX_OK; -} -static void l2tp_eth_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) -{ - stats->tx_bytes = DEV_STATS_READ(dev, tx_bytes); - stats->tx_packets = DEV_STATS_READ(dev, tx_packets); - stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); - stats->rx_bytes = DEV_STATS_READ(dev, rx_bytes); - stats->rx_packets = DEV_STATS_READ(dev, rx_packets); - stats->rx_errors = DEV_STATS_READ(dev, rx_errors); + return NETDEV_TX_OK; } static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, - .ndo_get_stats64 = l2tp_eth_get_stats64, + .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, }; @@ -109,9 +97,10 @@ static void l2tp_eth_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &l2tpeth_type); ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; } static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) @@ -138,12 +127,11 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, if (!dev) goto error_rcu; - if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { - DEV_STATS_INC(dev, rx_packets); - DEV_STATS_ADD(dev, rx_bytes, data_len); - } else { + if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) + dev_sw_netstats_rx_add(dev, data_len); + else DEV_STATS_INC(dev, rx_errors); - } + rcu_read_unlock(); return; @@ -283,7 +271,7 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, spriv = l2tp_session_priv(session); - l2tp_session_inc_refcount(session); + refcount_inc(&session->ref_count); rtnl_lock(); @@ -301,7 +289,7 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, if (rc < 0) { rtnl_unlock(); l2tp_session_delete(session); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); free_netdev(dev); return rc; @@ -312,17 +300,17 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, rtnl_unlock(); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); __module_get(THIS_MODULE); return 0; err_sess_dev: - l2tp_session_dec_refcount(session); + l2tp_session_put(session); free_netdev(dev); err_sess: - kfree(session); + l2tp_session_put(session); err: return rc; } diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index e48aa177d74c..4bc24fddfd52 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -22,9 +22,19 @@ #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> +#include <net/net_namespace.h> +#include 
<net/netns/generic.h> #include "l2tp_core.h" +/* per-net private data for this module */ +static unsigned int l2tp_ip_net_id; +struct l2tp_ip_net { + rwlock_t l2tp_ip_lock; + struct hlist_head l2tp_ip_table; + struct hlist_head l2tp_ip_bind_table; +}; + struct l2tp_ip_sock { /* inet_sock has to be the first member of l2tp_ip_sock */ struct inet_sock inet; @@ -33,21 +43,23 @@ struct l2tp_ip_sock { u32 peer_conn_id; }; -static DEFINE_RWLOCK(l2tp_ip_lock); -static struct hlist_head l2tp_ip_table; -static struct hlist_head l2tp_ip_bind_table; - -static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) +static struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) { return (struct l2tp_ip_sock *)sk; } +static struct l2tp_ip_net *l2tp_ip_pernet(const struct net *net) +{ + return net_generic(net, l2tp_ip_net_id); +} + static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, __be32 raddr, int dif, u32 tunnel_id) { + struct l2tp_ip_net *pn = l2tp_ip_pernet(net); struct sock *sk; - sk_for_each_bound(sk, &l2tp_ip_bind_table) { + sk_for_each_bound(sk, &pn->l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); int bound_dev_if; @@ -113,6 +125,7 @@ found: static int l2tp_ip_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct l2tp_ip_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; @@ -121,6 +134,8 @@ static int l2tp_ip_recv(struct sk_buff *skb) struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; + pn = l2tp_ip_pernet(net); + if (!pskb_may_pull(skb, 4)) goto discard; @@ -152,7 +167,7 @@ static int l2tp_ip_recv(struct sk_buff *skb) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return 0; @@ -167,15 +182,15 @@ pass_up: tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); - read_lock_bh(&l2tp_ip_lock); + read_lock_bh(&pn->l2tp_ip_lock); sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), tunnel_id); if (!sk) { - read_unlock_bh(&l2tp_ip_lock); + read_unlock_bh(&pn->l2tp_ip_lock); goto discard; } sock_hold(sk); - read_unlock_bh(&l2tp_ip_lock); + read_unlock_bh(&pn->l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -185,7 +200,7 @@ pass_up: return sk_receive_skb(sk, skb, 1); discard_sess: - l2tp_session_dec_refcount(session); + l2tp_session_put(session); goto discard; discard_put: @@ -198,21 +213,25 @@ discard: static int l2tp_ip_hash(struct sock *sk) { + struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); + if (sk_unhashed(sk)) { - write_lock_bh(&l2tp_ip_lock); - sk_add_node(sk, &l2tp_ip_table); - write_unlock_bh(&l2tp_ip_lock); + write_lock_bh(&pn->l2tp_ip_lock); + sk_add_node(sk, &pn->l2tp_ip_table); + write_unlock_bh(&pn->l2tp_ip_lock); } return 0; } static void l2tp_ip_unhash(struct sock *sk) { + struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); + if (sk_unhashed(sk)) return; - write_lock_bh(&l2tp_ip_lock); + write_lock_bh(&pn->l2tp_ip_lock); sk_del_node_init(sk); - write_unlock_bh(&l2tp_ip_lock); + write_unlock_bh(&pn->l2tp_ip_lock); } static int l2tp_ip_open(struct sock *sk) @@ -226,23 +245,26 @@ static int l2tp_ip_open(struct sock *sk) static void l2tp_ip_close(struct sock *sk, long timeout) { - write_lock_bh(&l2tp_ip_lock); + struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); + + write_lock_bh(&pn->l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); - 
write_unlock_bh(&l2tp_ip_lock); + write_unlock_bh(&pn->l2tp_ip_lock); sk_common_release(sk); } static void l2tp_ip_destroy_sock(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); - struct sk_buff *skb; + struct l2tp_tunnel *tunnel; - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) - kfree_skb(skb); + __skb_queue_purge(&sk->sk_write_queue); - if (tunnel) + tunnel = l2tp_sk_to_tunnel(sk); + if (tunnel) { l2tp_tunnel_delete(tunnel); + l2tp_tunnel_put(tunnel); + } } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -250,6 +272,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); + struct l2tp_ip_net *pn; int ret; int chk_addr_ret; @@ -280,10 +303,11 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ - write_lock_bh(&l2tp_ip_lock); + pn = l2tp_ip_pernet(net); + write_lock_bh(&pn->l2tp_ip_lock); if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { - write_unlock_bh(&l2tp_ip_lock); + write_unlock_bh(&pn->l2tp_ip_lock); ret = -EADDRINUSE; goto out; } @@ -291,9 +315,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; - sk_add_bind_node(sk, &l2tp_ip_bind_table); + sk_add_bind_node(sk, &pn->l2tp_ip_bind_table); sk_del_node_init(sk); - write_unlock_bh(&l2tp_ip_lock); + write_unlock_bh(&pn->l2tp_ip_lock); ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); @@ -307,6 +331,7 @@ out: static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; + struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); int rc; if (addr_len < sizeof(*lsa)) @@ -329,10 +354,10 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; - write_lock_bh(&l2tp_ip_lock); + write_lock_bh(&pn->l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); - sk_add_bind_node(sk, &l2tp_ip_bind_table); - write_unlock_bh(&l2tp_ip_lock); + sk_add_bind_node(sk, &pn->l2tp_ip_bind_table); + write_unlock_bh(&pn->l2tp_ip_lock); out_sk: release_sock(sk); @@ -637,25 +662,58 @@ static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, }; +static __net_init int l2tp_ip_init_net(struct net *net) +{ + struct l2tp_ip_net *pn = net_generic(net, l2tp_ip_net_id); + + rwlock_init(&pn->l2tp_ip_lock); + INIT_HLIST_HEAD(&pn->l2tp_ip_table); + INIT_HLIST_HEAD(&pn->l2tp_ip_bind_table); + return 0; +} + +static __net_exit void l2tp_ip_exit_net(struct net *net) +{ + struct l2tp_ip_net *pn = l2tp_ip_pernet(net); + + write_lock_bh(&pn->l2tp_ip_lock); + WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_table) != 0); + WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_bind_table) != 0); + write_unlock_bh(&pn->l2tp_ip_lock); +} + +static struct pernet_operations l2tp_ip_net_ops = { + .init = l2tp_ip_init_net, + .exit = l2tp_ip_exit_net, + .id = &l2tp_ip_net_id, + .size = sizeof(struct l2tp_ip_net), +}; + static int __init l2tp_ip_init(void) { int err; pr_info("L2TP IP encapsulation support (L2TPv3)\n"); + err = register_pernet_device(&l2tp_ip_net_ops); + if (err) + goto out; + err = 
proto_register(&l2tp_ip_prot, 1); if (err != 0) - goto out; + goto out1; err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); if (err) - goto out1; + goto out2; inet_register_protosw(&l2tp_ip_protosw); return 0; -out1: +out2: proto_unregister(&l2tp_ip_prot); +out1: + unregister_pernet_device(&l2tp_ip_net_ops); out: return err; } @@ -665,6 +723,7 @@ static void __exit l2tp_ip_exit(void) inet_unregister_protosw(&l2tp_ip_protosw); inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip_prot); + unregister_pernet_device(&l2tp_ip_net_ops); } module_init(l2tp_ip_init); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index d217ff1f229e..f4c1da070826 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -22,6 +22,8 @@ #include <net/tcp_states.h> #include <net/protocol.h> #include <net/xfrm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <net/transp_v6.h> #include <net/addrconf.h> @@ -29,6 +31,14 @@ #include "l2tp_core.h" +/* per-net private data for this module */ +static unsigned int l2tp_ip6_net_id; +struct l2tp_ip6_net { + rwlock_t l2tp_ip6_lock; + struct hlist_head l2tp_ip6_table; + struct hlist_head l2tp_ip6_bind_table; +}; + struct l2tp_ip6_sock { /* inet_sock has to be the first member of l2tp_ip6_sock */ struct inet_sock inet; @@ -39,23 +49,25 @@ struct l2tp_ip6_sock { struct ipv6_pinfo inet6; }; -static DEFINE_RWLOCK(l2tp_ip6_lock); -static struct hlist_head l2tp_ip6_table; -static struct hlist_head l2tp_ip6_bind_table; - -static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) +static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) { return (struct l2tp_ip6_sock *)sk; } +static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net) +{ + return net_generic(net, l2tp_ip6_net_id); +} + static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { + struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); struct sock *sk; - sk_for_each_bound(sk, &l2tp_ip6_bind_table) { + sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); @@ -123,6 +135,7 @@ found: static int l2tp_ip6_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct l2tp_ip6_net *pn; struct sock *sk; u32 session_id; u32 tunnel_id; @@ -131,6 +144,8 @@ static int l2tp_ip6_recv(struct sk_buff *skb) struct l2tp_tunnel *tunnel = NULL; struct ipv6hdr *iph; + pn = l2tp_ip6_pernet(net); + if (!pskb_may_pull(skb, 4)) goto discard; @@ -162,7 +177,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return 0; @@ -177,15 +192,15 @@ pass_up: tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = ipv6_hdr(skb); - read_lock_bh(&l2tp_ip6_lock); + read_lock_bh(&pn->l2tp_ip6_lock); sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr, inet6_iif(skb), tunnel_id); if (!sk) { - read_unlock_bh(&l2tp_ip6_lock); + read_unlock_bh(&pn->l2tp_ip6_lock); goto discard; } sock_hold(sk); - read_unlock_bh(&l2tp_ip6_lock); + read_unlock_bh(&pn->l2tp_ip6_lock); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; @@ -195,7 +210,7 @@ pass_up: return sk_receive_skb(sk, skb, 1); discard_sess: - l2tp_session_dec_refcount(session); + l2tp_session_put(session); goto 
discard; discard_put: @@ -208,21 +223,25 @@ discard: static int l2tp_ip6_hash(struct sock *sk) { + struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); + if (sk_unhashed(sk)) { - write_lock_bh(&l2tp_ip6_lock); - sk_add_node(sk, &l2tp_ip6_table); - write_unlock_bh(&l2tp_ip6_lock); + write_lock_bh(&pn->l2tp_ip6_lock); + sk_add_node(sk, &pn->l2tp_ip6_table); + write_unlock_bh(&pn->l2tp_ip6_lock); } return 0; } static void l2tp_ip6_unhash(struct sock *sk) { + struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); + if (sk_unhashed(sk)) return; - write_lock_bh(&l2tp_ip6_lock); + write_lock_bh(&pn->l2tp_ip6_lock); sk_del_node_init(sk); - write_unlock_bh(&l2tp_ip6_lock); + write_unlock_bh(&pn->l2tp_ip6_lock); } static int l2tp_ip6_open(struct sock *sk) @@ -236,24 +255,29 @@ static int l2tp_ip6_open(struct sock *sk) static void l2tp_ip6_close(struct sock *sk, long timeout) { - write_lock_bh(&l2tp_ip6_lock); + struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk)); + + write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); - write_unlock_bh(&l2tp_ip6_lock); + write_unlock_bh(&pn->l2tp_ip6_lock); sk_common_release(sk); } static void l2tp_ip6_destroy_sock(struct sock *sk) { - struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); + struct l2tp_tunnel *tunnel; lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); - if (tunnel) + tunnel = l2tp_sk_to_tunnel(sk); + if (tunnel) { l2tp_tunnel_delete(tunnel); + l2tp_tunnel_put(tunnel); + } } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -262,11 +286,14 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr; struct net *net = sock_net(sk); + struct l2tp_ip6_net *pn; __be32 v4addr = 0; int bound_dev_if; int addr_type; int err; + pn = l2tp_ip6_pernet(net); + if (addr->l2tp_family != AF_INET6) return -EINVAL; if (addr_len < sizeof(*addr)) @@ -324,10 +351,10 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) } rcu_read_unlock(); - write_lock_bh(&l2tp_ip6_lock); + write_lock_bh(&pn->l2tp_ip6_lock); if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if, addr->l2tp_conn_id)) { - write_unlock_bh(&l2tp_ip6_lock); + write_unlock_bh(&pn->l2tp_ip6_lock); err = -EADDRINUSE; goto out_unlock; } @@ -340,9 +367,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id; - sk_add_bind_node(sk, &l2tp_ip6_bind_table); + sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); sk_del_node_init(sk); - write_unlock_bh(&l2tp_ip6_lock); + write_unlock_bh(&pn->l2tp_ip6_lock); sock_reset_flag(sk, SOCK_ZAPPED); release_sock(sk); @@ -364,6 +391,7 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, struct in6_addr *daddr; int addr_type; int rc; + struct l2tp_ip6_net *pn; if (addr_len < sizeof(*lsa)) return -EINVAL; @@ -395,10 +423,11 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; - write_lock_bh(&l2tp_ip6_lock); + pn = l2tp_ip6_pernet(sock_net(sk)); + write_lock_bh(&pn->l2tp_ip6_lock); hlist_del_init(&sk->sk_bind_node); - sk_add_bind_node(sk, &l2tp_ip6_bind_table); - write_unlock_bh(&l2tp_ip6_lock); + sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table); + write_unlock_bh(&pn->l2tp_ip6_lock); out_sk: release_sock(sk); @@ -765,25 +794,58 @@ static struct inet6_protocol 
l2tp_ip6_protocol __read_mostly = { .handler = l2tp_ip6_recv, }; +static __net_init int l2tp_ip6_init_net(struct net *net) +{ + struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id); + + rwlock_init(&pn->l2tp_ip6_lock); + INIT_HLIST_HEAD(&pn->l2tp_ip6_table); + INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table); + return 0; +} + +static __net_exit void l2tp_ip6_exit_net(struct net *net) +{ + struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net); + + write_lock_bh(&pn->l2tp_ip6_lock); + WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0); + WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0); + write_unlock_bh(&pn->l2tp_ip6_lock); +} + +static struct pernet_operations l2tp_ip6_net_ops = { + .init = l2tp_ip6_init_net, + .exit = l2tp_ip6_exit_net, + .id = &l2tp_ip6_net_id, + .size = sizeof(struct l2tp_ip6_net), +}; + static int __init l2tp_ip6_init(void) { int err; pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n"); + err = register_pernet_device(&l2tp_ip6_net_ops); + if (err) + goto out; + err = proto_register(&l2tp_ip6_prot, 1); if (err != 0) - goto out; + goto out1; err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); if (err) - goto out1; + goto out2; inet6_register_protosw(&l2tp_ip6_protosw); return 0; -out1: +out2: proto_unregister(&l2tp_ip6_prot); +out1: + unregister_pernet_device(&l2tp_ip6_net_ops); out: return err; } @@ -793,6 +855,7 @@ static void __exit l2tp_ip6_exit(void) inet6_unregister_protosw(&l2tp_ip6_protosw); inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip6_prot); + unregister_pernet_device(&l2tp_ip6_net_ops); } module_init(l2tp_ip6_init); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index d105030520f9..284f1dec1b56 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -63,7 +63,7 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info) if (tunnel) { session = l2tp_session_get(net, tunnel->sock, tunnel->version, tunnel_id, session_id); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); } } @@ -242,7 +242,7 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info if (ret < 0) goto out; - l2tp_tunnel_inc_refcount(tunnel); + refcount_inc(&tunnel->ref_count); ret = l2tp_tunnel_register(tunnel, net, &cfg); if (ret < 0) { kfree(tunnel); @@ -250,7 +250,7 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info } ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_CREATE); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); out: return ret; @@ -280,7 +280,7 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info l2tp_tunnel_delete(tunnel); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); out: return ret; @@ -308,7 +308,7 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); out: return ret; @@ -479,42 +479,48 @@ static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) if (ret < 0) goto err_nlmsg_tunnel; - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); err_nlmsg_tunnel: - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); err_nlmsg: nlmsg_free(msg); err: return ret; } +struct l2tp_nl_cb_data { + unsigned long tkey; + unsigned long skey; +}; + static int 
l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int ti = cb->args[0]; + struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; + unsigned long key = cbd->tkey; struct l2tp_tunnel *tunnel; struct net *net = sock_net(skb->sk); for (;;) { - tunnel = l2tp_tunnel_get_nth(net, ti); + tunnel = l2tp_tunnel_get_next(net, &key); if (!tunnel) goto out; if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, tunnel, L2TP_CMD_TUNNEL_GET) < 0) { - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); goto out; } - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); - ti++; + key++; } out: - cb->args[0] = ti; + cbd->tkey = key; return skb->len; } @@ -641,12 +647,12 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (session) { ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_CREATE); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); } } out_tunnel: - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); out: return ret; } @@ -671,7 +677,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete) l2tp_nl_cmd_ops[pw_type]->session_delete(session); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); out: return ret; @@ -692,8 +698,10 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); if (info->attrs[L2TP_ATTR_SEND_SEQ]) { + struct l2tp_tunnel *tunnel = session->tunnel; + session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]); - l2tp_session_set_header_len(session, session->tunnel->version); + l2tp_session_set_header_len(session, tunnel->version, tunnel->encap); } if (info->attrs[L2TP_ATTR_LNS_MODE]) @@ -705,7 +713,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf ret = l2tp_session_notify(&l2tp_nl_family, info, session, L2TP_CMD_SESSION_MODIFY); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); out: return ret; @@ -816,57 +824,59 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info) ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return ret; err_ref_msg: nlmsg_free(msg); err_ref: - l2tp_session_dec_refcount(session); + l2tp_session_put(session); err: return ret; } static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0]; struct net *net = sock_net(skb->sk); struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; - int ti = cb->args[0]; - int si = cb->args[1]; + unsigned long tkey = cbd->tkey; + unsigned long skey = cbd->skey; for (;;) { if (!tunnel) { - tunnel = l2tp_tunnel_get_nth(net, ti); + tunnel = l2tp_tunnel_get_next(net, &tkey); if (!tunnel) goto out; } - session = l2tp_session_get_nth(tunnel, si); + session = l2tp_session_get_next(net, tunnel->sock, tunnel->version, + tunnel->tunnel_id, &skey); if (!session) { - ti++; - l2tp_tunnel_dec_refcount(tunnel); + tkey++; + l2tp_tunnel_put(tunnel); tunnel = NULL; - si = 0; + skey = 0; continue; } if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, session, L2TP_CMD_SESSION_GET) < 0) { - l2tp_session_dec_refcount(session); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_session_put(session); + 
l2tp_tunnel_put(tunnel); break; } - l2tp_session_dec_refcount(session); + l2tp_session_put(session); - si++; + skey++; } out: - cb->args[0] = ti; - cb->args[1] = si; + cbd->tkey = tkey; + cbd->skey = skey; return skb->len; } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 3596290047b2..53baf2dd5d5d 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -119,7 +119,6 @@ struct pppol2tp_session { struct mutex sk_lock; /* Protects .sk */ struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ - struct rcu_head rcu; /* For asynchronous release */ }; static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); @@ -150,27 +149,23 @@ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) /* Helpers to obtain tunnel/session contexts from sockets. */ -static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) +static struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; if (!sk) return NULL; - sock_hold(sk); - session = (struct l2tp_session *)(sk->sk_user_data); - if (!session) { - sock_put(sk); - goto out; - } - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) { - session = NULL; - sock_put(sk); - goto out; + rcu_read_lock(); + session = rcu_dereference_sk_user_data(sk); + if (session && refcount_inc_not_zero(&session->ref_count)) { + rcu_read_unlock(); + WARN_ON_ONCE(session->magic != L2TP_SESSION_MAGIC); + return session; } + rcu_read_unlock(); -out: - return session; + return NULL; } /***************************************************************************** @@ -318,12 +313,12 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, l2tp_xmit_skb(session, skb); local_bh_enable(); - sock_put(sk); + l2tp_session_put(session); return total_len; error_put_sess: - sock_put(sk); + l2tp_session_put(session); error: return error; } @@ -377,12 +372,12 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) l2tp_xmit_skb(session, skb); local_bh_enable(); - sock_put(sk); + l2tp_session_put(session); return 1; abort_put_sess: - sock_put(sk); + l2tp_session_put(session); abort: /* Free the original skb */ kfree_skb(skb); @@ -393,29 +388,32 @@ abort: * Session (and tunnel control) socket create/destroy. *****************************************************************************/ -static void pppol2tp_put_sk(struct rcu_head *head) -{ - struct pppol2tp_session *ps; - - ps = container_of(head, typeof(*ps), rcu); - sock_put(ps->__sk); -} - /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) 
*/ static void pppol2tp_session_destruct(struct sock *sk) { - struct l2tp_session *session = sk->sk_user_data; - skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); +} - if (session) { - sk->sk_user_data = NULL; - if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) - return; - l2tp_session_dec_refcount(session); +static void pppol2tp_session_close(struct l2tp_session *session) +{ + struct pppol2tp_session *ps; + + ps = l2tp_session_priv(session); + mutex_lock(&ps->sk_lock); + ps->__sk = rcu_dereference_protected(ps->sk, + lockdep_is_held(&ps->sk_lock)); + RCU_INIT_POINTER(ps->sk, NULL); + mutex_unlock(&ps->sk_lock); + if (ps->__sk) { + /* detach socket */ + rcu_assign_sk_user_data(ps->__sk, NULL); + sock_put(ps->__sk); + + /* drop ref taken when we referenced socket via sk_user_data */ + l2tp_session_put(session); } } @@ -444,30 +442,13 @@ static int pppol2tp_release(struct socket *sock) session = pppol2tp_sock_to_session(sk); if (session) { - struct pppol2tp_session *ps; - l2tp_session_delete(session); - - ps = l2tp_session_priv(session); - mutex_lock(&ps->sk_lock); - ps->__sk = rcu_dereference_protected(ps->sk, - lockdep_is_held(&ps->sk_lock)); - RCU_INIT_POINTER(ps->sk, NULL); - mutex_unlock(&ps->sk_lock); - call_rcu(&ps->rcu, pppol2tp_put_sk); - - /* Rely on the sock_put() call at the end of the function for - * dropping the reference held by pppol2tp_sock_to_session(). - * The last reference will be dropped by pppol2tp_put_sk(). - */ + /* drop ref taken by pppol2tp_sock_to_session */ + l2tp_session_put(session); } release_sock(sk); - /* This will delete the session context via - * pppol2tp_session_destruct() if the socket's refcnt drops to - * zero. - */ sock_put(sk); return 0; @@ -506,6 +487,7 @@ static int pppol2tp_create(struct net *net, struct socket *sock, int kern) goto out; sock_init_data(sock, sk); + sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppol2tp_ops; @@ -542,6 +524,7 @@ static void pppol2tp_session_init(struct l2tp_session *session) struct pppol2tp_session *ps; session->recv_skb = pppol2tp_recv; + session->session_close = pppol2tp_session_close; if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) session->show = pppol2tp_show; @@ -685,7 +668,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, if (error < 0) return ERR_PTR(error); - l2tp_tunnel_inc_refcount(tunnel); + refcount_inc(&tunnel->ref_count); error = l2tp_tunnel_register(tunnel, net, &tcfg); if (error < 0) { kfree(tunnel); @@ -701,7 +684,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, /* Error if socket is not prepped */ if (!tunnel->sock) { - l2tp_tunnel_dec_refcount(tunnel); + l2tp_tunnel_put(tunnel); return ERR_PTR(-ENOENT); } } @@ -787,18 +770,20 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, goto end; } + drop_refcnt = true; + pppol2tp_session_init(session); ps = l2tp_session_priv(session); - l2tp_session_inc_refcount(session); + refcount_inc(&session->ref_count); mutex_lock(&ps->sk_lock); error = l2tp_session_register(session, tunnel); if (error < 0) { mutex_unlock(&ps->sk_lock); - kfree(session); + l2tp_session_put(session); goto end; } - drop_refcnt = true; + new_session = true; } @@ -830,12 +815,13 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, out_no_ppp: /* This is how we get the session context from the socket. 
*/ - sk->sk_user_data = session; + sock_hold(sk); + rcu_assign_sk_user_data(sk, session); rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); /* Keep the reference we've grabbed on the session: sk doesn't expect - * the session to disappear. pppol2tp_session_destruct() is responsible + * the session to disappear. pppol2tp_session_close() is responsible * for dropping it. */ drop_refcnt = false; @@ -850,8 +836,8 @@ end: l2tp_tunnel_delete(tunnel); } if (drop_refcnt) - l2tp_session_dec_refcount(session); - l2tp_tunnel_dec_refcount(tunnel); + l2tp_session_put(session); + l2tp_tunnel_put(tunnel); release_sock(sk); return error; @@ -891,7 +877,7 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, return 0; err_sess: - kfree(session); + l2tp_session_put(session); err: return error; } @@ -1002,7 +988,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, error = len; - sock_put(sk); + l2tp_session_put(session); end: return error; } @@ -1052,12 +1038,12 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, return -EBADR; if (session->pwtype != L2TP_PWTYPE_PPP) { - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return -EBADR; } pppol2tp_copy_stats(stats, &session->stats); - l2tp_session_dec_refcount(session); + l2tp_session_put(session); return 0; } @@ -1205,7 +1191,8 @@ static int pppol2tp_session_setsockopt(struct sock *sk, po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } - l2tp_session_set_header_len(session, session->tunnel->version); + l2tp_session_set_header_len(session, session->tunnel->version, + session->tunnel->encap); break; case PPPOL2TP_SO_LNSMODE: @@ -1274,7 +1261,7 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, err = pppol2tp_session_setsockopt(sk, session, optname, val); } - sock_put(sk); + l2tp_session_put(session); end: return err; } @@ -1395,7 +1382,7 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, err = 0; end_put_sess: - sock_put(sk); + l2tp_session_put(session); end: return err; } @@ -1406,14 +1393,12 @@ end: * L2TPv2, we dump only L2TPv2 tunnels and sessions here. 
*****************************************************************************/ -static unsigned int pppol2tp_net_id; - #ifdef CONFIG_PROC_FS struct pppol2tp_seq_data { struct seq_net_private p; - int tunnel_idx; /* current tunnel */ - int session_idx; /* index of session within current tunnel */ + unsigned long tkey; /* lookup key of current tunnel */ + unsigned long skey; /* lookup key of current session */ struct l2tp_tunnel *tunnel; struct l2tp_session *session; /* NULL means get next tunnel */ }; @@ -1422,17 +1407,17 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->tunnel) - l2tp_tunnel_dec_refcount(pd->tunnel); + l2tp_tunnel_put(pd->tunnel); for (;;) { - pd->tunnel = l2tp_tunnel_get_nth(net, pd->tunnel_idx); - pd->tunnel_idx++; + pd->tunnel = l2tp_tunnel_get_next(net, &pd->tkey); + pd->tkey++; /* Only accept L2TPv2 tunnels */ if (!pd->tunnel || pd->tunnel->version == 2) return; - l2tp_tunnel_dec_refcount(pd->tunnel); + l2tp_tunnel_put(pd->tunnel); } } @@ -1440,13 +1425,15 @@ static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->session) - l2tp_session_dec_refcount(pd->session); + l2tp_session_put(pd->session); - pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); - pd->session_idx++; + pd->session = l2tp_session_get_next(net, pd->tunnel->sock, + pd->tunnel->version, + pd->tunnel->tunnel_id, &pd->skey); + pd->skey++; if (!pd->session) { - pd->session_idx = 0; + pd->skey = 0; pppol2tp_next_tunnel(net, pd); } } @@ -1498,11 +1485,11 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v) * or pppol2tp_next_tunnel(). */ if (pd->session) { - l2tp_session_dec_refcount(pd->session); + l2tp_session_put(pd->session); pd->session = NULL; } if (pd->tunnel) { - l2tp_tunnel_dec_refcount(pd->tunnel); + l2tp_tunnel_put(pd->tunnel); pd->tunnel = NULL; } } @@ -1513,7 +1500,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) seq_printf(m, "\nTUNNEL '%s', %c %d\n", tunnel->name, - (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', + tunnel->sock ? 
'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", 0, @@ -1641,7 +1628,6 @@ static __net_exit void pppol2tp_exit_net(struct net *net) static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, - .id = &pppol2tp_net_id, }; /***************************************************************************** diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 9bffac7a4974..fe7eab4b681b 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -207,20 +207,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, 24); - memcpy(mgmt->da, da, ETH_ALEN); - memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); + mgmt = ieee80211_mgmt_ba(skb, da, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); mgmt->u.action.category = WLAN_CATEGORY_BACK; diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 677bbbac9f16..1c18b862ef8c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -74,20 +74,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, 24); - memcpy(mgmt->da, da, ETH_ALEN); - memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); + mgmt = ieee80211_mgmt_ba(skb, da, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c index fdf8b658fede..c61df637232a 100644 --- a/net/mac80211/airtime.c +++ b/net/mac80211/airtime.c @@ -55,10 +55,21 @@ #define HE_DURATION_S(shift, streams, gi, bps) \ (HE_DURATION(streams, gi, bps) >> shift) +/* gi in HE/EHT is identical. 
It matches enum nl80211_eht_gi as well */ +#define EHT_GI_08 HE_GI_08 +#define EHT_GI_16 HE_GI_16 +#define EHT_GI_32 HE_GI_32 + +#define EHT_DURATION(streams, gi, bps) \ + HE_DURATION(streams, gi, bps) +#define EHT_DURATION_S(shift, streams, gi, bps) \ + HE_DURATION_S(shift, streams, gi, bps) + #define BW_20 0 #define BW_40 1 #define BW_80 2 #define BW_160 3 +#define BW_320 4 /* * Define group sort order: HT40 -> SGI -> #streams @@ -68,17 +79,26 @@ #define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */ #define IEEE80211_HE_MAX_STREAMS 8 +#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */ + +#define IEEE80211_EHT_MAX_STREAMS 8 +#define IEEE80211_EHT_STREAM_GROUPS 15 /* BW(=5) * GI(=3) */ #define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ IEEE80211_HT_STREAM_GROUPS) #define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \ IEEE80211_VHT_STREAM_GROUPS) +#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \ + IEEE80211_HE_STREAM_GROUPS) +#define IEEE80211_EHT_GROUPS_NB (IEEE80211_EHT_MAX_STREAMS * \ + IEEE80211_EHT_STREAM_GROUPS) #define IEEE80211_HT_GROUP_0 0 #define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB) #define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB) +#define IEEE80211_EHT_GROUP_0 (IEEE80211_HE_GROUP_0 + IEEE80211_HE_GROUPS_NB) -#define MCS_GROUP_RATES 12 +#define MCS_GROUP_RATES 14 #define HT_GROUP_IDX(_streams, _sgi, _ht40) \ IEEE80211_HT_GROUP_0 + \ @@ -203,6 +223,69 @@ #define HE_GROUP(_streams, _gi, _bw) \ __HE_GROUP(_streams, _gi, _bw, \ HE_GROUP_SHIFT(_streams, _gi, _bw)) + +#define EHT_BW2VBPS(_bw, r5, r4, r3, r2, r1) \ + ((_bw) == BW_320 ? r5 : BW2VBPS(_bw, r4, r3, r2, r1)) + +#define EHT_GROUP_IDX(_streams, _gi, _bw) \ + (IEEE80211_EHT_GROUP_0 + \ + IEEE80211_EHT_MAX_STREAMS * 3 * (_bw) + \ + IEEE80211_EHT_MAX_STREAMS * (_gi) + \ + (_streams) - 1) + +#define __EHT_GROUP(_streams, _gi, _bw, _s) \ + [EHT_GROUP_IDX(_streams, _gi, _bw)] = { \ + .shift = _s, \ + .duration = { \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 1960, 980, 490, 234, 117)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 3920, 1960, 980, 468, 234)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 5880, 2937, 1470, 702, 351)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 7840, 3920, 1960, 936, 468)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 11760, 5880, 2940, 1404, 702)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 15680, 7840, 3920, 1872, 936)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 17640, 8820, 4410, 2106, 1053)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 19600, 9800, 4900, 2340, 1170)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 23520, 11760, 5880, 2808, 1404)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 26133, 13066, 6533, 3120, 1560)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 29400, 14700, 7350, 3510, 1755)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 32666, 16333, 8166, 3900, 1950)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 35280, 17640, 8820, 4212, 2106)), \ + EHT_DURATION_S(_s, _streams, _gi, \ + EHT_BW2VBPS(_bw, 39200, 19600, 9800, 4680, 2340)) \ + } \ +} + +#define EHT_GROUP_SHIFT(_streams, _gi, _bw) \ + GROUP_SHIFT(EHT_DURATION(_streams, _gi, \ + EHT_BW2VBPS(_bw, 1960, 980, 490, 234, 117))) + +#define EHT_GROUP(_streams, _gi, _bw) \ + __EHT_GROUP(_streams, _gi, _bw, \ + EHT_GROUP_SHIFT(_streams, _gi, _bw)) + 
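
A standalone sketch of the group-index arithmetic introduced above. The EHT constants mirror the patch; the HT/VHT stream-group counts (4 streams with 4 and 8 groups respectively) are assumptions inferred from the existing table layout, not taken from this hunk:

#include <assert.h>
#include <stdio.h>

#define MAX_STREAMS            4   /* assumed, matches pre-EHT table */
#define HT_STREAM_GROUPS       4   /* assumed: BW(=2) * SGI(=2) */
#define VHT_STREAM_GROUPS      8   /* BW(=4) * SGI(=2) */
#define HE_MAX_STREAMS         8
#define HE_STREAM_GROUPS      12   /* BW(=4) * GI(=3) */
#define EHT_MAX_STREAMS        8
#define EHT_STREAM_GROUPS     15   /* BW(=5) * GI(=3) */

#define HT_GROUP_0   0
#define VHT_GROUP_0  (HT_GROUP_0 + MAX_STREAMS * HT_STREAM_GROUPS)
#define HE_GROUP_0   (VHT_GROUP_0 + MAX_STREAMS * VHT_STREAM_GROUPS)
#define EHT_GROUP_0  (HE_GROUP_0 + HE_MAX_STREAMS * HE_STREAM_GROUPS)

/* same shape as EHT_GROUP_IDX() above: bandwidth-major, then GI,
 * then streams */
static int eht_group_idx(int streams, int gi, int bw)
{
	return EHT_GROUP_0 +
	       EHT_MAX_STREAMS * 3 * bw +
	       EHT_MAX_STREAMS * gi +
	       streams - 1;
}

int main(void)
{
	/* first EHT entry sits right after the last HE group */
	assert(eht_group_idx(1, 0, 0) == EHT_GROUP_0);
	/* last entry: 8 streams, GI index 2, BW_320 (= 4) */
	assert(eht_group_idx(8, 2, 4) ==
	       EHT_GROUP_0 + EHT_MAX_STREAMS * EHT_STREAM_GROUPS - 1);
	printf("EHT groups occupy [%d, %d]\n",
	       eht_group_idx(1, 0, 0), eht_group_idx(8, 2, 4));
	return 0;
}

With the assumed HT/VHT counts, the table grows to 264 mcs_group entries, 120 of them EHT, which is why MCS_GROUP_RATES also grows from 12 to 14 to cover EHT MCS 0-13.
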
+#define EHT_GROUP_RANGE(_gi, _bw) \ + EHT_GROUP(1, _gi, _bw), \ + EHT_GROUP(2, _gi, _bw), \ + EHT_GROUP(3, _gi, _bw), \ + EHT_GROUP(4, _gi, _bw), \ + EHT_GROUP(5, _gi, _bw), \ + EHT_GROUP(6, _gi, _bw), \ + EHT_GROUP(7, _gi, _bw), \ + EHT_GROUP(8, _gi, _bw) + struct mcs_group { u8 shift; u16 duration[MCS_GROUP_RATES]; @@ -376,6 +459,26 @@ static const struct mcs_group airtime_mcs_groups[] = { HE_GROUP(6, HE_GI_32, BW_160), HE_GROUP(7, HE_GI_32, BW_160), HE_GROUP(8, HE_GI_32, BW_160), + + EHT_GROUP_RANGE(EHT_GI_08, BW_20), + EHT_GROUP_RANGE(EHT_GI_16, BW_20), + EHT_GROUP_RANGE(EHT_GI_32, BW_20), + + EHT_GROUP_RANGE(EHT_GI_08, BW_40), + EHT_GROUP_RANGE(EHT_GI_16, BW_40), + EHT_GROUP_RANGE(EHT_GI_32, BW_40), + + EHT_GROUP_RANGE(EHT_GI_08, BW_80), + EHT_GROUP_RANGE(EHT_GI_16, BW_80), + EHT_GROUP_RANGE(EHT_GI_32, BW_80), + + EHT_GROUP_RANGE(EHT_GI_08, BW_160), + EHT_GROUP_RANGE(EHT_GI_16, BW_160), + EHT_GROUP_RANGE(EHT_GI_32, BW_160), + + EHT_GROUP_RANGE(EHT_GI_08, BW_320), + EHT_GROUP_RANGE(EHT_GI_16, BW_320), + EHT_GROUP_RANGE(EHT_GI_32, BW_320), }; static u32 @@ -422,6 +525,9 @@ static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw, case RATE_INFO_BW_160: bw = BW_160; break; + case RATE_INFO_BW_320: + bw = BW_320; + break; default: WARN_ON_ONCE(1); return 0; @@ -443,14 +549,27 @@ static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw, idx = status->rate_idx; group = HE_GROUP_IDX(streams, status->he_gi, bw); break; + case RX_ENC_EHT: + streams = status->nss; + idx = status->rate_idx; + group = EHT_GROUP_IDX(streams, status->eht.gi, bw); + break; default: WARN_ON_ONCE(1); return 0; } - if (WARN_ON_ONCE((status->encoding != RX_ENC_HE && streams > 4) || - (status->encoding == RX_ENC_HE && streams > 8))) - return 0; + switch (status->encoding) { + case RX_ENC_EHT: + case RX_ENC_HE: + if (WARN_ON_ONCE(streams > 8)) + return 0; + break; + default: + if (WARN_ON_ONCE(streams > 4)) + return 0; + break; + } if (idx >= MCS_GROUP_RATES) return 0; @@ -517,7 +636,9 @@ static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw, stat->nss = ri->nss; stat->rate_idx = ri->mcs; - if (ri->flags & RATE_INFO_FLAGS_HE_MCS) + if (ri->flags & RATE_INFO_FLAGS_EHT_MCS) + stat->encoding = RX_ENC_EHT; + else if (ri->flags & RATE_INFO_FLAGS_HE_MCS) stat->encoding = RX_ENC_HE; else if (ri->flags & RATE_INFO_FLAGS_VHT_MCS) stat->encoding = RX_ENC_VHT; @@ -529,7 +650,14 @@ static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw, if (ri->flags & RATE_INFO_FLAGS_SHORT_GI) stat->enc_flags |= RX_ENC_FLAG_SHORT_GI; - stat->he_gi = ri->he_gi; + switch (stat->encoding) { + case RX_ENC_EHT: + stat->eht.gi = ri->eht_gi; + break; + default: + stat->he_gi = ri->he_gi; + break; + } if (stat->encoding != RX_ENC_LEGACY) return true; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b02b84ce2130..847304a3a29a 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1662,12 +1662,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); - if (sdata->wdev.cac_started) { + if (sdata->wdev.links[link_id].cac_started) { chandef = link_conf->chanreq.oper; - wiphy_delayed_work_cancel(wiphy, &sdata->dfs_cac_timer_work); + wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, - GFP_KERNEL); + GFP_KERNEL, link_id); } drv_stop_ap(sdata->local, sdata, link_conf); @@ -3462,51 +3462,58 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, 
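
The hunk below threads a link_id through ieee80211_start_radar_detection(), part of moving DFS CAC state from the interface (sdata/wdev) to the individual link. A hypothetical userspace model (all names and types invented for illustration) of why the cancel path then wants an optional channel-context filter, which the later ieee80211_dfs_cac_cancel() change implements with a struct ieee80211_chanctx pointer:

#include <stdbool.h>
#include <stdio.h>

#define MAX_LINKS 15

struct link_state {
	bool valid;
	bool cac_started;
	int chanctx_id;		/* channel context this link uses */
};

struct iface_state {
	struct link_state links[MAX_LINKS];
};

/* cancel CAC on every link, or only on links bound to @ctx_id
 * when ctx_id >= 0 */
static void cac_cancel(struct iface_state *ifc, int ctx_id)
{
	for (int id = 0; id < MAX_LINKS; id++) {
		struct link_state *link = &ifc->links[id];

		if (!link->valid || !link->cac_started)
			continue;
		if (ctx_id >= 0 && link->chanctx_id != ctx_id)
			continue;
		link->cac_started = false;
		printf("link %d: CAC aborted\n", id);
	}
}

int main(void)
{
	struct iface_state ifc = {
		.links = {
			[0] = { .valid = true, .cac_started = true, .chanctx_id = 1 },
			[1] = { .valid = true, .cac_started = true, .chanctx_id = 2 },
		},
	};

	cac_cancel(&ifc, 2);	/* radar on ctx 2: only link 1 aborts */
	cac_cancel(&ifc, -1);	/* suspend path: abort everything left */
	return 0;
}

With MLO, one link can finish its CAC while another aborts, so a single per-interface cac_started flag no longer suffices; the filter also lets a radar event on one channel context leave unrelated links undisturbed.
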
static int ieee80211_start_radar_detection(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef, - u32 cac_time_ms) + u32 cac_time_ms, int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chan_req chanreq = { .oper = *chandef }; struct ieee80211_local *local = sdata->local; + struct ieee80211_link_data *link_data; int err; lockdep_assert_wiphy(local->hw.wiphy); - if (!list_empty(&local->roc_list) || local->scanning) { - err = -EBUSY; - goto out_unlock; - } + if (!list_empty(&local->roc_list) || local->scanning) + return -EBUSY; + + link_data = sdata_dereference(sdata->link[link_id], sdata); + if (!link_data) + return -ENOLINK; /* whatever, but channel contexts should not complain about that one */ - sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; - sdata->deflink.needed_rx_chains = local->rx_chains; + link_data->smps_mode = IEEE80211_SMPS_OFF; + link_data->needed_rx_chains = local->rx_chains; - err = ieee80211_link_use_channel(&sdata->deflink, &chanreq, + err = ieee80211_link_use_channel(link_data, &chanreq, IEEE80211_CHANCTX_SHARED); if (err) - goto out_unlock; + return err; - wiphy_delayed_work_queue(wiphy, &sdata->dfs_cac_timer_work, + wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work, msecs_to_jiffies(cac_time_ms)); - out_unlock: - return err; + return 0; } static void ieee80211_end_cac(struct wiphy *wiphy, - struct net_device *dev) + struct net_device *dev, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + struct ieee80211_link_data *link_data; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { + link_data = sdata_dereference(sdata->link[link_id], sdata); + if (!link_data) + continue; + wiphy_delayed_work_cancel(wiphy, - &sdata->dfs_cac_timer_work); + &link_data->dfs_cac_timer_work); - if (sdata->wdev.cac_started) { - ieee80211_link_release_channel(&sdata->deflink); - sdata->wdev.cac_started = false; + if (sdata->wdev.links[link_id].cac_started) { + ieee80211_link_release_channel(link_data); + sdata->wdev.links[link_id].cac_started = false; } } } @@ -3961,7 +3968,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, if (!list_empty(&local->roc_list) || local->scanning) return -EBUSY; - if (sdata->wdev.cac_started) + if (sdata->wdev.links[link_id].cac_started) return -EBUSY; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index e8567723e94d..cca6d14084d2 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -286,7 +286,9 @@ ieee80211_get_max_required_bw(struct ieee80211_link_data *link) enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT; struct sta_info *sta; - list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + list_for_each_entry(sta, &sdata->local->sta_list, list) { if (sdata != sta->sdata && !(sta->sdata->bss && sta->sdata->bss == sdata->bss)) continue; @@ -681,6 +683,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, ctx->mode = mode; ctx->conf.radar_enabled = false; ctx->conf.radio_idx = radio_idx; + ctx->radar_detected = false; _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); return ctx; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 79caeb485fd5..1c2b7dd8976a 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -467,20 +467,7 @@ void ieee80211_send_delba(struct 
ieee80211_sub_if_data *sdata, return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, 24); - memcpy(mgmt->da, da, ETH_ALEN); - memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); - else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); - - mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | - IEEE80211_STYPE_ACTION); + mgmt = ieee80211_mgmt_ba(skb, da, sdata); skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a3485e4c6132..4f0390918b60 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -893,6 +893,8 @@ struct ieee80211_chanctx { struct ieee80211_chan_req req; struct ieee80211_chanctx_conf conf; + + bool radar_detected; }; struct mac80211_qos_map { @@ -1067,6 +1069,7 @@ struct ieee80211_link_data { int ap_power_level; /* in dBm */ bool radar_required; + struct wiphy_delayed_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; @@ -1165,8 +1168,6 @@ struct ieee80211_sub_if_data { struct ieee80211_link_data deflink; struct ieee80211_link_data __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; - struct wiphy_delayed_work dfs_cac_timer_work; - /* for ieee80211_set_active_links_async() */ struct wiphy_work activate_links_work; u16 desired_active_links; @@ -2072,8 +2073,6 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, u32 info_flags, u32 ctrl_flags, u64 *cookie); -void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, - struct sk_buff_head *skbs); struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); @@ -2136,6 +2135,29 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len); +static inline struct ieee80211_mgmt * +ieee80211_mgmt_ba(struct sk_buff *skb, const u8 *da, + struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_mgmt *mgmt = skb_put_zero(skb, 24); + + ether_addr_copy(mgmt->da, da); + ether_addr_copy(mgmt->sa, sdata->vif.addr); + + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN || + sdata->vif.type == NL80211_IFTYPE_MESH_POINT) + ether_addr_copy(mgmt->bssid, sdata->vif.addr); + else if (sdata->vif.type == NL80211_IFTYPE_STATION) + ether_addr_copy(mgmt->bssid, sdata->vif.cfg.ap_addr); + else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + ether_addr_copy(mgmt->bssid, sdata->u.ibss.bssid); + + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + return mgmt; +} + int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, enum ieee80211_agg_stop_reason reason); void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, @@ -2629,7 +2651,8 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, bool ieee80211_is_radar_required(struct ieee80211_local *local); void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work); -void ieee80211_dfs_cac_cancel(struct ieee80211_local *local); +void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, + struct ieee80211_chanctx *chanctx); void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct wiphy_work *work); 
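
The ieee80211_mgmt_ba() helper added above deduplicates the 24-byte header setup that agg-rx.c, agg-tx.c and ht.c each open-coded (see those hunks). Condensed from the converted callers, the shared sequence looks as follows; this is kernel context rather than a standalone program, the allocation size varies per caller, and the addba_resp member stands in for the addba_req/delba variants:

	skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
	if (!skb)
		return;
	skb_reserve(skb, local->hw.extra_tx_headroom);

	/* fills DA/SA, picks the BSSID by interface type (AP/mesh use
	 * vif.addr, station uses ap_addr, IBSS uses ibss.bssid) and sets
	 * the management/action frame_control word */
	mgmt = ieee80211_mgmt_ba(skb, da, sdata);

	skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
	mgmt->u.action.category = WLAN_CATEGORY_BACK;

Making the helper a static inline in ieee80211_i.h keeps the per-iftype BSSID selection in one place, so a future interface type only needs updating once.
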
int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b4ad66af3af3..6ef0990d3d29 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -462,6 +462,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do { struct ieee80211_local *local = sdata->local; unsigned long flags; + struct sk_buff_head freeq; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; int i, flushed; @@ -550,15 +551,15 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.color_change_finalize_work); wiphy_delayed_work_cancel(local->hw.wiphy, - &sdata->dfs_cac_timer_work); + &sdata->deflink.dfs_cac_timer_work); - if (sdata->wdev.cac_started) { + if (sdata->wdev.links[0].cac_started) { chandef = sdata->vif.bss_conf.chanreq.oper; WARN_ON(local->suspended); ieee80211_link_release_channel(&sdata->deflink); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, - GFP_KERNEL); + GFP_KERNEL, 0); } if (sdata->vif.type == NL80211_IFTYPE_AP) { @@ -637,18 +638,32 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do skb_queue_purge(&sdata->status_queue); } + /* + * Since ieee80211_free_txskb() may issue __dev_queue_xmit() + * which should be called with interrupts enabled, reclamation + * is done in two phases: + */ + __skb_queue_head_init(&freeq); + + /* unlink from local queues... */ spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_walk_safe(&local->pending[i], skb, tmp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (info->control.vif == &sdata->vif) { __skb_unlink(skb, &local->pending[i]); - ieee80211_free_txskb(&local->hw, skb); + __skb_queue_tail(&freeq, skb); } } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + /* ... and perform actual reclamation with interrupts enabled. 
*/ + skb_queue_walk_safe(&freeq, skb, tmp) { + __skb_unlink(skb, &freeq); + ieee80211_free_txskb(&local->hw, skb); + } + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ieee80211_txq_remove_vlan(local, sdata); @@ -1729,8 +1744,6 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, wiphy_work_init(&sdata->work, ieee80211_iface_work); wiphy_work_init(&sdata->activate_links_work, ieee80211_activate_links_work); - wiphy_delayed_work_init(&sdata->dfs_cac_timer_work, - ieee80211_dfs_cac_timer_work); switch (type) { case NL80211_IFTYPE_P2P_GO: diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 1a211b8d4057..0bbac64d5fa0 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -45,6 +45,8 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_color_collision_detection_work); INIT_LIST_HEAD(&link->assigned_chanctx_list); INIT_LIST_HEAD(&link->reserved_chanctx_list); + wiphy_delayed_work_init(&link->dfs_cac_timer_work, + ieee80211_dfs_cac_timer_work); if (!deflink) { switch (sdata->vif.type) { @@ -75,6 +77,16 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) &link->color_change_finalize_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->csa.finalize_work); + + if (link->sdata->wdev.links[link->link_id].cac_started) { + wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + &link->dfs_cac_timer_work); + cfg80211_cac_event(link->sdata->dev, + &link->conf->chanreq.oper, + NL80211_RADAR_CAC_ABORTED, + GFP_KERNEL, link->link_id); + } + ieee80211_link_release_channel(link); } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a3104b6ea6f0..89084690350f 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1051,9 +1051,9 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) return 0; /* Driver provides cipher suites, but we need to exclude WEP */ - suites = kmemdup(local->hw.wiphy->cipher_suites, - sizeof(u32) * local->hw.wiphy->n_cipher_suites, - GFP_KERNEL); + suites = kmemdup_array(local->hw.wiphy->cipher_suites, + local->hw.wiphy->n_cipher_suites, + sizeof(u32), GFP_KERNEL); if (!suites) return -ENOMEM; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index c0a5c75cddcb..30c0d89203af 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -580,7 +580,7 @@ void mesh_fast_tx_cache(struct ieee80211_sub_if_data *sdata, prev = rhashtable_lookup_get_insert_fast(&cache->rht, &entry->rhash, fast_tx_rht_params); - if (unlikely(IS_ERR(prev))) { + if (IS_ERR(prev)) { kfree(entry); goto unlock_cache; } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4779a18ab75d..735e78adb0db 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1231,7 +1231,7 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, bool disable_mu_mimo = false; struct ieee80211_sub_if_data *other; - list_for_each_entry_rcu(other, &local->interfaces, list) { + list_for_each_entry(other, &local->interfaces, list) { if (other->vif.bss_conf.mu_mimo_owner) { disable_mu_mimo = true; break; @@ -3031,18 +3031,19 @@ void ieee80211_dynamic_ps_timer(struct timer_list *t) void ieee80211_dfs_cac_timer_work(struct wiphy *wiphy, struct wiphy_work *work) { - struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, + struct ieee80211_link_data *link = + container_of(work, struct ieee80211_link_data, dfs_cac_timer_work.work); - struct cfg80211_chan_def chandef = sdata->vif.bss_conf.chanreq.oper; + struct cfg80211_chan_def chandef = 
link->conf->chanreq.oper; + struct ieee80211_sub_if_data *sdata = link->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); - if (sdata->wdev.cac_started) { - ieee80211_link_release_channel(&sdata->deflink); + if (sdata->wdev.links[link->link_id].cac_started) { + ieee80211_link_release_channel(link); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_FINISHED, - GFP_KERNEL); + GFP_KERNEL, link->link_id); } } @@ -4715,7 +4716,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || - (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && + (is_5ghz && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems *bss_elems; @@ -4763,19 +4764,22 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && - link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { - elems->vht_cap_elem = bss_elems->vht_cap_elem; - sdata_info(sdata, - "AP bug: VHT capa missing from AssocResp\n"); - } - if (!elems->vht_operation && bss_elems->vht_operation && - link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { - elems->vht_operation = bss_elems->vht_operation; - sdata_info(sdata, - "AP bug: VHT operation missing from AssocResp\n"); - } + if (is_5ghz) { + if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { + elems->vht_cap_elem = bss_elems->vht_cap_elem; + sdata_info(sdata, + "AP bug: VHT capa missing from AssocResp\n"); + } + + if (!elems->vht_operation && bss_elems->vht_operation && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_VHT) { + elems->vht_operation = bss_elems->vht_operation; + sdata_info(sdata, + "AP bug: VHT operation missing from AssocResp\n"); + } + } kfree(bss_elems); } @@ -6664,7 +6668,7 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return true; /* hidden SSID: zeroed out */ - if (memcmp(elems->ssid, zero_ssid, elems->ssid_len)) + if (!memcmp(elems->ssid, zero_ssid, elems->ssid_len)) return false; return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); @@ -7660,6 +7664,7 @@ static int ieee80211_do_assoc(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(sdata->local->hw.wiphy); assoc_data->tries++; + assoc_data->comeback = false; if (assoc_data->tries > IEEE80211_ASSOC_MAX_TRIES) { sdata_info(sdata, "association with %pM timed out\n", assoc_data->ap_addr); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 28d03196ef75..29fab7ae47b4 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -997,6 +997,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, } IEEE80211_SKB_CB(skb)->flags = flags; + IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; skb->dev = sdata->dev; diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index d823d58303e8..7be52345f218 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -32,7 +32,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) ieee80211_scan_cancel(local); - ieee80211_dfs_cac_cancel(local); + ieee80211_dfs_cac_cancel(local, NULL); ieee80211_roc_purge(local, NULL); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 
4dc1def69548..3dc9752188d5 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -890,7 +890,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); - if (!(info->control.flags & IEEE80211_TX_CTRL_SCAN_TX)) + if (!(info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK)) mask = sdata->rc_rateidx_mask[info->band]; if (dest[0].idx < 0) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index b5f2df61c7f6..adb88c06b598 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -504,7 +504,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ - list_for_each_entry_rcu(sdata, &local->interfaces, list) { + list_for_each_entry(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } @@ -575,6 +575,7 @@ static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; + unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); @@ -585,8 +586,9 @@ static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { - if (sdata_iter->wdev.cac_started) - return false; + for_each_valid_link(&sdata_iter->wdev, link_id) + if (sdata_iter->wdev.links[link_id].cac_started) + return false; } return true; @@ -649,7 +651,7 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; - IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_SCAN_TX; + IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } @@ -1013,10 +1015,8 @@ set_channel: */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { - *next_delay = msecs_to_jiffies(scan_req->duration) > - IEEE80211_PASSIVE_CHANNEL_TIME ? 
- msecs_to_jiffies(scan_req->duration) : - IEEE80211_PASSIVE_CHANNEL_TIME; + *next_delay = max(msecs_to_jiffies(scan_req->duration), + IEEE80211_PASSIVE_CHANNEL_TIME); local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index dd8f857a1fbc..d1cf987de13b 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -1301,3 +1301,4 @@ void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); } +EXPORT_SYMBOL(ieee80211_purge_tx_queue); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index edba4a31844f..a9ee86982259 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -699,7 +699,7 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.skb = tx->skb; txrc.reported_rate.idx = -1; - if (unlikely(info->control.flags & IEEE80211_TX_CTRL_SCAN_TX)) { + if (unlikely(info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK)) { txrc.rate_idx_mask = ~0; } else { txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; @@ -5348,8 +5348,10 @@ ieee80211_beacon_get_ap(struct ieee80211_hw *hw, if (beacon->tail) skb_put_data(skb, beacon->tail, beacon->tail_len); - if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) + if (ieee80211_beacon_protect(skb, local, sdata, link) < 0) { + dev_kfree_skb(skb); return NULL; + } ieee80211_beacon_get_finish(hw, vif, link, offs, beacon, skb, chanctx_conf, csa_off_base); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c7ad9bc5973a..f94faa86ba8a 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -751,7 +751,9 @@ static void __iterate_interfaces(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; - list_for_each_entry_rcu(sdata, &local->interfaces, list) { + list_for_each_entry_rcu(sdata, &local->interfaces, list, + lockdep_is_held(&local->iflist_mtx) || + lockdep_is_held(&local->hw.wiphy->mtx)) { switch (sdata->vif.type) { case NL80211_IFTYPE_MONITOR: if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)) @@ -833,7 +835,8 @@ static void __iterate_stations(struct ieee80211_local *local, { struct sta_info *sta; - list_for_each_entry_rcu(sta, &local->sta_list, list) { + list_for_each_entry_rcu(sta, &local->sta_list, list, + lockdep_is_held(&local->hw.wiphy->mtx)) { if (!sta->uploaded) continue; @@ -854,6 +857,19 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); +void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, + void (*iterator)(void *data, + struct ieee80211_sta *sta), + void *data) +{ + struct ieee80211_local *local = hw_to_local(hw); + + lockdep_assert_wiphy(local->hw.wiphy); + + __iterate_stations(local, iterator, data); +} +EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); + struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -3451,24 +3467,44 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, return ts; } -void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) +/* Cancel CAC for the interfaces under the specified @local. If @ctx is + * also provided, only the interfaces using that ctx will be canceled. 
+ */ +void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; + struct ieee80211_link_data *link; + struct ieee80211_chanctx_conf *chanctx_conf; + unsigned int link_id; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(sdata, &local->interfaces, list) { - wiphy_delayed_work_cancel(local->hw.wiphy, - &sdata->dfs_cac_timer_work); - - if (sdata->wdev.cac_started) { - chandef = sdata->vif.bss_conf.chanreq.oper; - ieee80211_link_release_channel(&sdata->deflink); - cfg80211_cac_event(sdata->dev, - &chandef, + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) { + link = sdata_dereference(sdata->link[link_id], + sdata); + if (!link) + continue; + + chanctx_conf = sdata_dereference(link->conf->chanctx_conf, + sdata); + if (ctx && &ctx->conf != chanctx_conf) + continue; + + wiphy_delayed_work_cancel(local->hw.wiphy, + &link->dfs_cac_timer_work); + + if (!sdata->wdev.links[link_id].cac_started) + continue; + + chandef = link->conf->chanreq.oper; + ieee80211_link_release_channel(link); + cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, - GFP_KERNEL); + GFP_KERNEL, link_id); } } } @@ -3478,9 +3514,8 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, { struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); - struct cfg80211_chan_def chandef = local->hw.conf.chandef; + struct cfg80211_chan_def chandef; struct ieee80211_chanctx *ctx; - int num_chanctx = 0; lockdep_assert_wiphy(local->hw.wiphy); @@ -3488,25 +3523,46 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; - num_chanctx++; + if (!ctx->radar_detected) + continue; + + ctx->radar_detected = false; + chandef = ctx->conf.def; + + ieee80211_dfs_cac_cancel(local, ctx); + cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); } +} - ieee80211_dfs_cac_cancel(local); +static void +ieee80211_radar_mark_chan_ctx_iterator(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data) +{ + struct ieee80211_chanctx *ctx = + container_of(chanctx_conf, struct ieee80211_chanctx, + conf); - if (num_chanctx > 1) - /* XXX: multi-channel is not supported yet */ - WARN_ON(1); - else - cfg80211_radar_event(local->hw.wiphy, &chandef, GFP_KERNEL); + if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) + return; + + if (data && data != chanctx_conf) + return; + + ctx->radar_detected = true; } -void ieee80211_radar_detected(struct ieee80211_hw *hw) +void ieee80211_radar_detected(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf) { struct ieee80211_local *local = hw_to_local(hw); trace_api_radar_detected(local); + ieee80211_iter_chan_contexts_atomic(hw, ieee80211_radar_mark_chan_ctx_iterator, + chanctx_conf); + wiphy_work_queue(hw->wiphy, &local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index de52a9191da0..43288b408fde 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -486,6 +486,9 @@ static int mctp_ioctl_droptag(struct mctp_sock *msk, bool tagv2, tag = ctl.tag & MCTP_TAG_MASK; rc = -EINVAL; + if (ctl.peer_addr == MCTP_ADDR_NULL) + ctl.peer_addr = MCTP_ADDR_ANY; + spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { /* we do an irqsave here, even though we know the irq state, diff --git 
index 77e5dd422258..8551dab1d1e6 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -366,7 +366,7 @@ static void mctp_test_route_input_sk(struct kunit *test)

 		skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
 		KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2);
-		KUNIT_EXPECT_EQ(test, skb->len, 1);
+		KUNIT_EXPECT_EQ(test, skb2->len, 1);

 		skb_free_datagram(sock->sk, skb2);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 0e6c94a8c2bc..aba983531ed3 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1201,8 +1201,7 @@ static void mpls_netconf_notify_devconf(struct net *net, int event,
 	rtnl_notify(skb, net, 0, RTNLGRP_MPLS_NETCONF, NULL, GFP_KERNEL);
 	return;
 errout:
-	if (err < 0)
-		rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
+	rtnl_set_sk_err(net, RTNLGRP_MPLS_NETCONF, err);
 }

 static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = {
@@ -2278,8 +2277,7 @@ static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 	return;

 errout:
-	if (err < 0)
-		rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err);
+	rtnl_set_sk_err(net, RTNLGRP_MPLS_ROUTE, err);
 }

 static int mpls_valid_getroute_req(struct sk_buff *skb,
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 4385fd3b13be..6e73da94af7f 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -106,7 +106,7 @@ static int mpls_xmit(struct sk_buff *skb)
 		hh_len = 0;

 	/* Ensure there is enough space for the headers in the skb */
-	if (skb_cow(skb, hh_len + new_header_size))
+	if (skb_cow_head(skb, hh_len + new_header_size))
 		goto drop;

 	skb_set_inner_protocol(skb, skb->protocol);
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 99382c317ebb..38d8121331d4 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -12,6 +12,7 @@
 #include <net/netns/generic.h>

 #include "protocol.h"
+#include "mib.h"

 #define MPTCP_SYSCTL_PATH "net/mptcp"

@@ -27,8 +28,11 @@ struct mptcp_pernet {
 #endif

 	unsigned int add_addr_timeout;
+	unsigned int blackhole_timeout;
 	unsigned int close_timeout;
 	unsigned int stale_loss_cnt;
+	atomic_t active_disable_times;
+	unsigned long active_disable_stamp;
 	u8 mptcp_enabled;
 	u8 checksum_enabled;
 	u8 allow_join_initial_addr_port;
@@ -87,6 +91,8 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
 {
 	pernet->mptcp_enabled = 1;
 	pernet->add_addr_timeout = TCP_RTO_MAX;
+	pernet->blackhole_timeout = 3600;
+	atomic_set(&pernet->active_disable_times, 0);
 	pernet->close_timeout = TCP_TIMEWAIT_LEN;
 	pernet->checksum_enabled = 0;
 	pernet->allow_join_initial_addr_port = 1;
@@ -151,6 +157,20 @@ static int proc_available_schedulers(const struct ctl_table *ctl,
 	return ret;
 }

+static int proc_blackhole_detect_timeout(const struct ctl_table *table,
+					 int write, void *buffer, size_t *lenp,
+					 loff_t *ppos)
+{
+	struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns);
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		atomic_set(&pernet->active_disable_times, 0);
+
+	return ret;
+}
+
 static struct ctl_table mptcp_sysctl_table[] = {
 	{
 		.procname = "enabled",
@@ -217,6 +237,13 @@ static struct ctl_table mptcp_sysctl_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_dointvec_jiffies,
 	},
+	{
+		.procname = "blackhole_timeout",
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = proc_blackhole_detect_timeout,
+		.extra1 = SYSCTL_ZERO,
+	},
 };

 static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -240,6 +267,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
 	table[6].data = &pernet->scheduler;
 	/* table[7] is for available_schedulers which is read-only info */
 	table[8].data = &pernet->close_timeout;
+	table[9].data = &pernet->blackhole_timeout;

 	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
 				     ARRAY_SIZE(mptcp_sysctl_table));
@@ -277,6 +305,111 @@ static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {}

 #endif /* CONFIG_SYSCTL */

+/* The following code block is to deal with middlebox issues with MPTCP,
+ * similar to what is done with TFO.
+ * The proposed solution is to disable active MPTCP globally when SYNs with
+ * MPC are dropped while SYNs without MPC aren't. In this case, active side
+ * MPTCP is disabled globally for 1h at first. Then if it happens again, it is
+ * disabled for 2h, then 4h, 8h, ...
+ * The timeout is reset back to 1h when a successful active MPTCP connection is
+ * fully established.
+ */
+
+/* Disable active MPTCP and record current jiffies and active_disable_times */
+void mptcp_active_disable(struct sock *sk)
+{
+	struct net *net = sock_net(sk);
+	struct mptcp_pernet *pernet;
+
+	pernet = mptcp_get_pernet(net);
+
+	if (!READ_ONCE(pernet->blackhole_timeout))
+		return;
+
+	/* Paired with READ_ONCE() in mptcp_active_should_disable() */
+	WRITE_ONCE(pernet->active_disable_stamp, jiffies);
+
+	/* Paired with smp_rmb() in mptcp_active_should_disable().
+	 * We want pernet->active_disable_stamp to be updated first.
+	 */
+	smp_mb__before_atomic();
+	atomic_inc(&pernet->active_disable_times);
+
+	MPTCP_INC_STATS(net, MPTCP_MIB_BLACKHOLE);
+}
+
+/* Calculate timeout for MPTCP active disable
+ * Return true if we are still in the active MPTCP disable period
+ * Return false if timeout already expired and we should use active MPTCP
+ */
+bool mptcp_active_should_disable(struct sock *ssk)
+{
+	struct net *net = sock_net(ssk);
+	unsigned int blackhole_timeout;
+	struct mptcp_pernet *pernet;
+	unsigned long timeout;
+	int disable_times;
+	int multiplier;
+
+	pernet = mptcp_get_pernet(net);
+	blackhole_timeout = READ_ONCE(pernet->blackhole_timeout);
+
+	if (!blackhole_timeout)
+		return false;
+
+	disable_times = atomic_read(&pernet->active_disable_times);
+	if (!disable_times)
+		return false;
+
+	/* Paired with smp_mb__before_atomic() in mptcp_active_disable() */
+	smp_rmb();
+
+	/* Limit timeout to max: 2^6 * initial timeout */
+	multiplier = 1 << min(disable_times - 1, 6);
+
+	/* Paired with the WRITE_ONCE() in mptcp_active_disable(). */
+	timeout = READ_ONCE(pernet->active_disable_stamp) +
+		  multiplier * blackhole_timeout * HZ;
+
+	return time_before(jiffies, timeout);
+}
+
+/* Enable active MPTCP and reset active_disable_times if needed */
+void mptcp_active_enable(struct sock *sk)
+{
+	struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));
+
+	if (atomic_read(&pernet->active_disable_times)) {
+		struct dst_entry *dst = sk_dst_get(sk);
+
+		if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))
+			atomic_set(&pernet->active_disable_times, 0);
+	}
+}
+
+/* Check the number of retransmissions, and fall back to TCP if needed */
+void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
+{
+	struct mptcp_subflow_context *subflow;
+	u32 timeouts;
+
+	if (!sk_is_mptcp(ssk))
+		return;
+
+	timeouts = inet_csk(ssk)->icsk_retransmits;
+	subflow = mptcp_subflow_ctx(ssk);
+
+	if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) {
+		if (timeouts == 2 || (timeouts < 2 && expired)) {
+			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDROP);
+			subflow->mpc_drop = 1;
+			mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow);
+		} else {
+			subflow->mpc_drop = 0;
+		}
+	}
+}
+
 static int __net_init mptcp_net_init(struct net *net)
 {
 	struct mptcp_pernet *pernet = mptcp_get_pernet(net);
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c
index ad28da655f8b..a29ff901df75 100644
--- a/net/mptcp/fastopen.c
+++ b/net/mptcp/fastopen.c
@@ -68,12 +68,12 @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo
 	skb = skb_peek_tail(&sk->sk_receive_queue);
 	if (skb) {
 		WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq);
-		pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk,
+		pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk,
 			 MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq,
 			 MPTCP_SKB_CB(skb)->end_seq,
 			 MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq);
 		MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq;
 		MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq;
 	}

-	pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq);
+	pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq);
 }
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 7884217f33eb..38c2efc82b94 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -15,6 +15,8 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
 	SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
 	SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+	SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP),
+	SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED),
 	SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
 	SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
 	SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
@@ -25,6 +27,10 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
 	SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
 	SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+	SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX),
+	SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR),
+	SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR),
+	SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR),
 	SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
 	SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX),
 	SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
@@ -69,6 +75,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE),
 	SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT),
 	SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB),
+	SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE),
 	SNMP_MIB_SENTINEL
 };
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 66aa67f49d03..c8ffe18a8722 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -10,6 +10,8 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_MPCAPABLEPASSIVEACK,	/* Received third ACK with MP_CAPABLE */
 	MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
 	MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+	MPTCP_MIB_MPCAPABLEACTIVEDROP,	/* Client-side fallback due to an MPC drop */
+	MPTCP_MIB_MPCAPABLEACTIVEDISABLED, /* Client-side disabled due to past issues */
 	MPTCP_MIB_TOKENFALLBACKINIT,	/* Could not init/allocate token */
 	MPTCP_MIB_RETRANSSEGS,		/* Segments retransmitted at the MPTCP-level */
 	MPTCP_MIB_JOINNOTOKEN,		/* Received MP_JOIN but the token was not found */
@@ -20,6 +22,10 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_JOINSYNACKMAC,	/* HMAC was wrong on SYN/ACK + MP_JOIN */
 	MPTCP_MIB_JOINACKRX,		/* Received an ACK + MP_JOIN */
 	MPTCP_MIB_JOINACKMAC,		/* HMAC was wrong on ACK + MP_JOIN */
+	MPTCP_MIB_JOINSYNTX,		/* Sending a SYN + MP_JOIN */
+	MPTCP_MIB_JOINSYNTXCREATSKERR,	/* Not able to create a socket when sending a SYN + MP_JOIN */
+	MPTCP_MIB_JOINSYNTXBINDERR,	/* Not able to bind() the address when sending a SYN + MP_JOIN */
+	MPTCP_MIB_JOINSYNTXCONNECTERR,	/* Not able to connect() when sending a SYN + MP_JOIN */
 	MPTCP_MIB_DSSNOMATCH,		/* Received a new mapping that did not match the previous one */
 	MPTCP_MIB_INFINITEMAPTX,	/* Sent an infinite mapping */
 	MPTCP_MIB_INFINITEMAPRX,	/* Received an infinite mapping */
@@ -70,6 +76,7 @@ enum linux_mptcp_mib_field {
 					 */
 	MPTCP_MIB_RCVWNDCONFLICT,	/* Conflict while updating msk rcv wnd */
 	MPTCP_MIB_CURRESTAB,		/* Current established MPTCP connections */
+	MPTCP_MIB_BLACKHOLE,		/* A blackhole has been detected */
 	__MPTCP_MIB_MAX
 };
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index ac2f1a54cc43..370c3836b771 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -117,7 +117,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 			mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD;
 			ptr += 2;
 		}
-		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u",
+		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n",
 			 version, flags, opsize, mp_opt->sndr_key,
 			 mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum);
 		break;
@@ -131,7 +131,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 			ptr += 4;
 			mp_opt->nonce = get_unaligned_be32(ptr);
 			ptr += 4;
-			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n",
 				 mp_opt->backup, mp_opt->join_id,
 				 mp_opt->token, mp_opt->nonce);
 		} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
@@ -142,19 +142,19 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 			ptr += 8;
 			mp_opt->nonce = get_unaligned_be32(ptr);
 			ptr += 4;
-			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
 				 mp_opt->backup, mp_opt->join_id,
 				 mp_opt->thmac, mp_opt->nonce);
 		} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
 			mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK;
 			ptr += 2;
 			memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
-			pr_debug("MP_JOIN hmac");
+			pr_debug("MP_JOIN hmac\n");
 		}
 		break;

 	case MPTCPOPT_DSS:
-		pr_debug("DSS");
+		pr_debug("DSS\n");
 		ptr++;

 		/* we must clear 'mpc_map' to be able to detect MP_CAPABLE
@@ -169,7 +169,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
 		mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);

-		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
+		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n",
 			 mp_opt->data_fin, mp_opt->dsn64,
 			 mp_opt->use_map, mp_opt->ack64,
 			 mp_opt->use_ack);
@@ -207,7 +207,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 				ptr += 4;
 			}

-			pr_debug("data_ack=%llu", mp_opt->data_ack);
+			pr_debug("data_ack=%llu\n", mp_opt->data_ack);
 		}

 		if (mp_opt->use_map) {
@@ -231,7 +231,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 				ptr += 2;
 			}

-			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
+			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n",
 				 mp_opt->data_seq, mp_opt->subflow_seq,
 				 mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD),
 				 mp_opt->csum);
@@ -293,7 +293,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 			mp_opt->ahmac = get_unaligned_be64(ptr);
 			ptr += 8;
 		}
-		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
+		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n",
 			 (mp_opt->addr.family == AF_INET6) ? "6" : "",
 			 mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port));
 		break;
@@ -309,7 +309,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
 		for (i = 0; i < mp_opt->rm_list.nr; i++)
 			mp_opt->rm_list.ids[i] = *ptr++;
-		pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
+		pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr);
 		break;

 	case MPTCPOPT_MP_PRIO:
@@ -318,7 +318,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,

 		mp_opt->suboptions |= OPTION_MPTCP_PRIO;
 		mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
-		pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
+		pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup);
 		break;

 	case MPTCPOPT_MP_FASTCLOSE:
@@ -329,7 +329,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		mp_opt->rcvr_key = get_unaligned_be64(ptr);
 		ptr += 8;
 		mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE;
-		pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key);
+		pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key);
 		break;

 	case MPTCPOPT_RST:
@@ -343,7 +343,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		flags = *ptr++;
 		mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT;
 		mp_opt->reset_reason = *ptr;
-		pr_debug("MP_RST: transient=%u reason=%u",
+		pr_debug("MP_RST: transient=%u reason=%u\n",
 			 mp_opt->reset_transient, mp_opt->reset_reason);
 		break;

@@ -354,7 +354,7 @@ static void mptcp_parse_option(const struct sk_buff *skb,
 		ptr += 2;
 		mp_opt->suboptions |= OPTION_MPTCP_FAIL;
 		mp_opt->fail_seq = get_unaligned_be64(ptr);
-		pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq);
+		pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq);
 		break;

 	default:
@@ -417,7 +417,7 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
 		*size = TCPOLEN_MPTCP_MPC_SYN;
 		return true;
 	} else if (subflow->request_join) {
-		pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+		pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token,
 			 subflow->local_nonce);
 		opts->suboptions = OPTION_MPTCP_MPJ_SYN;
 		opts->join_id = subflow->local_id;
@@ -500,7 +500,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
 			*size = TCPOLEN_MPTCP_MPC_ACK;
 		}

-		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
+		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n",
 			 subflow, subflow->local_key, subflow->remote_key,
 			 data_len);

@@ -509,7 +509,7 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
 		opts->suboptions = OPTION_MPTCP_MPJ_ACK;
 		memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
 		*size = TCPOLEN_MPTCP_MPJ_ACK;
-		pr_debug("subflow=%p", subflow);
+		pr_debug("subflow=%p\n", subflow);

 		/* we can use the full delegate action helper only from BH context
 		 * If we are in process context - sk is flushing the backlog at
@@ -675,7 +675,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
 	*size = len;
 	if (drop_other_suboptions) {
-		pr_debug("drop other suboptions");
+		pr_debug("drop other suboptions\n");
 		opts->suboptions = 0;

 		/* note that e.g. DSS could have written into the memory
@@ -695,7 +695,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
 	} else {
 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX);
 	}
-	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
+	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n",
 		 opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port));

 	return true;
@@ -726,7 +726,7 @@ static bool mptcp_established_options_rm_addr(struct sock *sk,
 	opts->rm_list = rm_list;

 	for (i = 0; i < opts->rm_list.nr; i++)
-		pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
+		pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]);
 	MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr);
 	return true;
 }
@@ -752,7 +752,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk,
 	opts->suboptions |= OPTION_MPTCP_PRIO;
 	opts->backup = subflow->request_bkup;

-	pr_debug("prio=%d", opts->backup);
+	pr_debug("prio=%d\n", opts->backup);

 	return true;
 }
@@ -794,7 +794,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk,
 	opts->suboptions |= OPTION_MPTCP_FASTCLOSE;
 	opts->rcvr_key = READ_ONCE(msk->remote_key);

-	pr_debug("FASTCLOSE key=%llu", opts->rcvr_key);
+	pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key);
 	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX);
 	return true;
 }
@@ -816,7 +816,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk,
 	opts->suboptions |= OPTION_MPTCP_FAIL;
 	opts->fail_seq = subflow->map_seq;

-	pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq);
+	pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq);
 	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX);

 	return true;
@@ -904,7 +904,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 		opts->csum_reqd = subflow_req->csum_reqd;
 		opts->allow_join_id0 = subflow_req->allow_join_id0;
 		*size = TCPOLEN_MPTCP_MPC_SYNACK;
-		pr_debug("subflow_req=%p, local_key=%llu",
+		pr_debug("subflow_req=%p, local_key=%llu\n",
 			 subflow_req, subflow_req->local_key);
 		return true;
 	} else if (subflow_req->mp_join) {
@@ -913,7 +913,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 		opts->join_id = subflow_req->local_id;
 		opts->thmac = subflow_req->thmac;
 		opts->nonce = subflow_req->local_nonce;
-		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n",
 			 subflow_req, opts->backup, opts->join_id,
 			 opts->thmac, opts->nonce);
 		*size = TCPOLEN_MPTCP_MPJ_SYNACK;
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 23bb89c94e90..620264c75dc2 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -19,7 +19,7 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
 {
 	u8 add_addr = READ_ONCE(msk->pm.addr_signal);

-	pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);
+	pr_debug("msk=%p, local_id=%d, echo=%d\n", msk, addr->id, echo);

 	lockdep_assert_held(&msk->pm.lock);

@@ -45,7 +45,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
 {
 	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);

-	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
+	pr_debug("msk=%p, rm_list_nr=%d\n", msk, rm_list->nr);

 	if (rm_addr) {
 		MPTCP_ADD_STATS(sock_net((struct sock *)msk),
@@ -60,23 +60,13 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_
 	return 0;
 }

-int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
-{
-	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);
-
-	spin_lock_bh(&msk->pm.lock);
-	mptcp_pm_nl_rm_subflow_received(msk, rm_list);
-	spin_unlock_bh(&msk->pm.lock);
-	return 0;
-}
-
 /* path manager event handlers */

 void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
 {
 	struct mptcp_pm_data *pm = &msk->pm;

-	pr_debug("msk=%p, token=%u side=%d", msk, READ_ONCE(msk->token), server_side);
+	pr_debug("msk=%p, token=%u side=%d\n", msk, READ_ONCE(msk->token), server_side);

 	WRITE_ONCE(pm->server_side, server_side);
 	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
@@ -100,7 +90,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)

 	subflows_max = mptcp_pm_get_subflows_max(msk);

-	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+	pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows,
 		 subflows_max, READ_ONCE(pm->accept_subflow));

 	/* try to avoid acquiring the lock below */
@@ -124,7 +114,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
 				   enum mptcp_pm_status new_status)
 {
-	pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+	pr_debug("msk=%p status=%x new=%lx\n", msk, msk->pm.status,
 		 BIT(new_status));
 	if (msk->pm.status & BIT(new_status))
 		return false;
@@ -139,7 +129,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)
 	struct mptcp_pm_data *pm = &msk->pm;
 	bool announce = false;

-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);

 	spin_lock_bh(&pm->lock);

@@ -163,14 +153,14 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk)

 void mptcp_pm_connection_closed(struct mptcp_sock *msk)
 {
-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);
 }

 void mptcp_pm_subflow_established(struct mptcp_sock *msk)
 {
 	struct mptcp_pm_data *pm = &msk->pm;

-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);

 	if (!READ_ONCE(pm->work_pending))
 		return;
@@ -222,7 +212,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
 	struct mptcp_pm_data *pm = &msk->pm;

-	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+	pr_debug("msk=%p remote_id=%d accept=%d\n", msk, addr->id,
 		 READ_ONCE(pm->accept_addr));

 	mptcp_event_addr_announced(ssk, addr);
@@ -236,7 +226,9 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
 		} else {
 			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
 		}
-	} else if (!READ_ONCE(pm->accept_addr)) {
+	/* id0 should not have a different address */
+	} else if ((addr->id == 0 &&
+		    !mptcp_pm_nl_is_init_remote_addr(msk, addr)) ||
+		   (addr->id > 0 && !READ_ONCE(pm->accept_addr))) {
 		mptcp_pm_announce_addr(msk, addr, true);
 		mptcp_pm_add_addr_send_ack(msk);
 	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
@@ -253,7 +245,7 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
 {
 	struct mptcp_pm_data *pm = &msk->pm;

-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);

 	spin_lock_bh(&pm->lock);

@@ -277,7 +269,7 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
 	struct mptcp_pm_data *pm = &msk->pm;
 	u8 i;

-	pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);
+	pr_debug("msk=%p remote_ids_nr=%d\n", msk, rm_list->nr);

 	for (i = 0; i < rm_list->nr; i++)
 		mptcp_event_addr_removed(msk, rm_list->ids[i]);
@@ -309,19 +301,19 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

-	pr_debug("fail_seq=%llu", fail_seq);
+	pr_debug("fail_seq=%llu\n", fail_seq);

 	if (!READ_ONCE(msk->allow_infinite_fallback))
 		return;

 	if (!subflow->fail_tout) {
-		pr_debug("send MP_FAIL response and infinite map");
+		pr_debug("send MP_FAIL response and infinite map\n");

 		subflow->send_mp_fail = 1;
 		subflow->send_infinite_map = 1;
 		tcp_send_ack(sk);
 	} else {
-		pr_debug("MP_FAIL response received");
+		pr_debug("MP_FAIL response received\n");
 		WRITE_ONCE(subflow->fail_tout, 0);
 	}
 }
@@ -438,20 +430,6 @@ bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc)
 	return mptcp_pm_nl_is_backup(msk, &skc_local);
 }

-int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
-					 u8 *flags, int *ifindex)
-{
-	*flags = 0;
-	*ifindex = 0;
-
-	if (!id)
-		return 0;
-
-	if (mptcp_pm_is_userspace(msk))
-		return mptcp_userspace_pm_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
-	return mptcp_pm_nl_get_flags_and_ifindex_by_id(msk, id, flags, ifindex);
-}
-
 int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info)
 {
 	if (info->attrs[MPTCP_PM_ATTR_TOKEN])
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 4cae2aa7be5c..64fe0e7d87d7 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -130,12 +130,15 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
 {
 	struct mptcp_subflow_context *subflow;
 	struct mptcp_addr_info cur;
-	struct sock_common *skc;

 	list_for_each_entry(subflow, list, node) {
-		skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

-		remote_address(skc, &cur);
+		if (!((1 << inet_sk_state_load(ssk)) &
+		      (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV)))
+			continue;
+
+		remote_address((struct sock_common *)ssk, &cur);
 		if (mptcp_addresses_equal(&cur, daddr, daddr->port))
 			return true;
 	}
@@ -143,11 +146,13 @@ static bool lookup_subflow_by_daddr(const struct list_head *list,
 	return false;
 }

-static struct mptcp_pm_addr_entry *
+static bool
 select_local_address(const struct pm_nl_pernet *pernet,
-		     const struct mptcp_sock *msk)
+		     const struct mptcp_sock *msk,
+		     struct mptcp_pm_local *new_local)
 {
-	struct mptcp_pm_addr_entry *entry, *ret = NULL;
+	struct mptcp_pm_addr_entry *entry;
+	bool found = false;

 	msk_owned_by_me(msk);

@@ -159,17 +164,23 @@ select_local_address(const struct pm_nl_pernet *pernet,
 		if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
 			continue;

-		ret = entry;
+		new_local->addr = entry->addr;
+		new_local->flags = entry->flags;
+		new_local->ifindex = entry->ifindex;
+		found = true;
 		break;
 	}
 	rcu_read_unlock();
-	return ret;
+
+	return found;
 }

-static struct mptcp_pm_addr_entry *
-select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
+static bool
+select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
+		      struct mptcp_pm_local *new_local)
 {
-	struct mptcp_pm_addr_entry *entry, *ret = NULL;
+	struct mptcp_pm_addr_entry *entry;
+	bool found = false;

 	rcu_read_lock();
 	/* do not keep any additional per socket state, just signal
@@ -184,11 +195,15 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk)
 		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
 			continue;

-		ret = entry;
+		new_local->addr = entry->addr;
+		new_local->flags = entry->flags;
+		new_local->ifindex = entry->ifindex;
+		found = true;
 		break;
 	}
 	rcu_read_unlock();
-	return ret;
+
+	return found;
 }

 unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
@@ -279,7 +294,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
 	struct mptcp_sock *msk = entry->sock;
 	struct sock *sk = (struct sock *)msk;

-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);

 	if (!msk)
 		return;
@@ -298,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
 	spin_lock_bh(&msk->pm.lock);

 	if (!mptcp_pm_should_add_signal_addr(msk)) {
-		pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id);
+		pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id);
 		mptcp_pm_announce_addr(msk, &entry->addr, false);
 		mptcp_pm_add_addr_send_ack(msk);
 		entry->retrans_times++;
@@ -323,15 +338,21 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
 {
 	struct mptcp_pm_add_entry *entry;
 	struct sock *sk = (struct sock *)msk;
+	struct timer_list *add_timer = NULL;

 	spin_lock_bh(&msk->pm.lock);
 	entry = mptcp_lookup_anno_list_by_saddr(msk, addr);
-	if (entry && (!check_id || entry->addr.id == addr->id))
+	if (entry && (!check_id || entry->addr.id == addr->id)) {
 		entry->retrans_times = ADD_ADDR_RETRANS_MAX;
+		add_timer = &entry->add_timer;
+	}
+	if (!check_id && entry)
+		list_del(&entry->list);
 	spin_unlock_bh(&msk->pm.lock);

-	if (entry && (!check_id || entry->addr.id == addr->id))
-		sk_stop_timer_sync(sk, &entry->add_timer);
+	/* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */
+	if (add_timer)
+		sk_stop_timer_sync(sk, add_timer);

 	return entry;
 }
@@ -379,7 +400,7 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
 	struct sock *sk = (struct sock *)msk;
 	LIST_HEAD(free_list);

-	pr_debug("msk=%p", msk);
+	pr_debug("msk=%p\n", msk);

 	spin_lock_bh(&msk->pm.lock);
 	list_splice_init(&msk->pm.anno_list, &free_list);
@@ -465,7 +486,7 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
 	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 	bool slow;

-	pr_debug("send ack for %s",
+	pr_debug("send ack for %s\n",
 		 prio ? "mp_prio" :
 		 (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"));
"add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); @@ -512,11 +533,12 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { - struct mptcp_pm_addr_entry *local, *signal_and_subflow = NULL; struct sock *sk = (struct sock *)msk; unsigned int add_addr_signal_max; + bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; + struct mptcp_pm_local local; unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); @@ -565,23 +587,27 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; - local = select_signal_address(pernet, msk); - if (!local) + if (!select_signal_address(pernet, msk, &local)) goto subflow; /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. */ - if (!mptcp_pm_alloc_anno_list(msk, &local->addr)) + if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; - mptcp_pm_announce_addr(msk, &local->addr, false); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + + mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_nl_addr_send_ack(msk); - if (local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - signal_and_subflow = local; + if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = true; } subflow: @@ -592,26 +618,28 @@ subflow: bool fullmesh; int i, nr; - if (signal_and_subflow) { - local = signal_and_subflow; - signal_and_subflow = NULL; - } else { - local = select_local_address(pernet, msk); - if (!local) - break; - } + if (signal_and_subflow) + signal_and_subflow = false; + else if (!select_local_address(pernet, msk, &local)) + break; - fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - msk->pm.local_addr_used++; - __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + else /* local_addr_used is not decr for ID 0 */ + msk->pm.local_addr_used++; + + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); if (nr == 0) continue; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); + __mptcp_subflow_connect(sk, &local, &addrs[i]); spin_lock_bh(&msk->pm.lock); } mptcp_pm_nl_check_work_pending(msk); @@ -632,10 +660,11 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, - struct mptcp_addr_info *addrs) + struct mptcp_pm_local *locals) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; @@ -643,6 +672,8 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + rcu_read_lock(); list_for_each_entry_rcu(entry, 
 		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
 			continue;

 		if (msk->pm.subflows < subflows_max) {
+			locals[i].addr = entry->addr;
+			locals[i].flags = entry->flags;
+			locals[i].ifindex = entry->ifindex;
+
+			/* Special case for ID0: set the correct ID */
+			if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port))
+				locals[i].addr.id = 0;
+
 			msk->pm.subflows++;
-			addrs[i++] = entry->addr;
+			i++;
 		}
 	}
 	rcu_read_unlock();
@@ -662,21 +701,19 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
 	 * 'IPADDRANY' local address
 	 */
 	if (!i) {
-		struct mptcp_addr_info local;
-
-		memset(&local, 0, sizeof(local));
-		local.family =
+		memset(&locals[i], 0, sizeof(locals[i]));
+		locals[i].addr.family =
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
 			       remote->family == AF_INET6 &&
 			       ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
#endif
 			       remote->family;

-		if (!mptcp_pm_addr_families_match(sk, &local, remote))
+		if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote))
 			return 0;

 		msk->pm.subflows++;
-		addrs[i++] = local;
+		i++;
 	}

 	return i;
@@ -684,7 +721,7 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,

 static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
 {
-	struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
+	struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
 	struct sock *sk = (struct sock *)msk;
 	unsigned int add_addr_accept_max;
 	struct mptcp_addr_info remote;
@@ -695,7 +732,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
 	add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
 	subflows_max = mptcp_pm_get_subflows_max(msk);

-	pr_debug("accepted %d:%d remote family %d",
+	pr_debug("accepted %d:%d remote family %d\n",
 		 msk->pm.add_addr_accepted, add_addr_accept_max,
 		 msk->pm.remote.family);

@@ -713,24 +750,35 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
 	/* connect to the specified remote address, using whatever
 	 * local address the routing configuration will pick.
 	 */
-	nr = fill_local_addresses_vec(msk, &remote, addrs);
+	nr = fill_local_addresses_vec(msk, &remote, locals);
 	if (nr == 0)
 		return;

 	spin_unlock_bh(&msk->pm.lock);
 	for (i = 0; i < nr; i++)
-		if (__mptcp_subflow_connect(sk, &addrs[i], &remote) == 0)
+		if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0)
 			sf_created = true;
 	spin_lock_bh(&msk->pm.lock);

 	if (sf_created) {
-		msk->pm.add_addr_accepted++;
+		/* add_addr_accepted is not decr for ID 0 */
+		if (remote.id)
+			msk->pm.add_addr_accepted++;
 		if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
 		    msk->pm.subflows >= subflows_max)
 			WRITE_ONCE(msk->pm.accept_addr, false);
 	}
 }

+bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk,
+				     const struct mptcp_addr_info *remote)
+{
+	struct mptcp_addr_info mpc_remote;
+
+	remote_address((struct sock_common *)msk, &mpc_remote);
+	return mptcp_addresses_equal(&mpc_remote, remote, remote->port);
+}
+
 void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
@@ -742,9 +790,12 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk)
 	    !mptcp_pm_should_rm_signal(msk))
 		return;

-	subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node);
-	if (subflow)
-		mptcp_pm_send_ack(msk, subflow, false, false);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (__mptcp_subflow_active(subflow)) {
+			mptcp_pm_send_ack(msk, subflow, false, false);
+			break;
+		}
+	}
 }

 int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
@@ -754,7 +805,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
 {
 	struct mptcp_subflow_context *subflow;

-	pr_debug("bkup=%d", bkup);
+	pr_debug("bkup=%d\n", bkup);

 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -777,11 +828,6 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk,
 	return -EINVAL;
 }

-static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id)
-{
-	return local_id == id || (!local_id && msk->mpc_endpoint_id == id);
-}
-
 static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
 					   const struct mptcp_rm_list *rm_list,
 					   enum linux_mptcp_mib_field rm_type)
@@ -790,7 +836,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
 	struct sock *sk = (struct sock *)msk;
 	u8 i;

-	pr_debug("%s rm_list_nr %d",
+	pr_debug("%s rm_list_nr %d\n",
 		 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr);

 	msk_owned_by_me(msk);
@@ -814,12 +860,14 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
 			int how = RCV_SHUTDOWN | SEND_SHUTDOWN;
 			u8 id = subflow_get_local_id(subflow);

+			if (inet_sk_state_load(ssk) == TCP_CLOSE)
+				continue;
 			if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id)
 				continue;
-			if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id))
+			if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id)
 				continue;

-			pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u",
+			pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n",
 				 rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow",
 				 i, rm_id, id, remote_id, msk->mpc_endpoint_id);
 			spin_unlock_bh(&msk->pm.lock);
@@ -829,25 +877,27 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk,
 			mptcp_close_ssk(sk, ssk, subflow);
 			spin_lock_bh(&msk->pm.lock);

-			removed = true;
+			removed |= subflow->request_join;
 			if (rm_type == MPTCP_MIB_RMSUBFLOW)
 				__MPTCP_INC_STATS(sock_net(sk), rm_type);
 		}

-		if (rm_type == MPTCP_MIB_RMSUBFLOW)
-			__set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap);
-		else if (rm_type == MPTCP_MIB_RMADDR)
+
+		if (rm_type == MPTCP_MIB_RMADDR)
 			__MPTCP_INC_STATS(sock_net(sk), rm_type);
+
 		if (!removed)
 			continue;

 		if (!mptcp_pm_is_kernel(msk))
 			continue;

-		if (rm_type == MPTCP_MIB_RMADDR) {
-			msk->pm.add_addr_accepted--;
-			WRITE_ONCE(msk->pm.accept_addr, true);
-		} else if (rm_type == MPTCP_MIB_RMSUBFLOW) {
-			msk->pm.local_addr_used--;
+		if (rm_type == MPTCP_MIB_RMADDR && rm_id &&
+		    !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
+			/* Note: if the subflow has been closed before, this
+			 * add_addr_accepted counter will not be decremented.
+			 */
+			if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk))
+				WRITE_ONCE(msk->pm.accept_addr, true);
 		}
 	}
 }
@@ -857,8 +907,8 @@ static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk)
 	mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR);
 }

-void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
-				     const struct mptcp_rm_list *rm_list)
+static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk,
+					    const struct mptcp_rm_list *rm_list)
 {
 	mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW);
 }
@@ -874,7 +924,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk)

 	spin_lock_bh(&msk->pm.lock);

-	pr_debug("msk=%p status=%x", msk, pm->status);
+	pr_debug("msk=%p status=%x\n", msk, pm->status);
 	if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
 		pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
 		mptcp_pm_nl_add_addr_received(msk);
@@ -1292,20 +1342,27 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
 	return pm_nl_get_pernet(genl_info_net(info));
 }

-static int mptcp_nl_add_subflow_or_signal_addr(struct net *net)
+static int mptcp_nl_add_subflow_or_signal_addr(struct net *net,
+					       struct mptcp_addr_info *addr)
 {
 	struct mptcp_sock *msk;
 	long s_slot = 0, s_num = 0;

 	while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
 		struct sock *sk = (struct sock *)msk;
+		struct mptcp_addr_info mpc_addr;

 		if (!READ_ONCE(msk->fully_established) ||
 		    mptcp_pm_is_userspace(msk))
 			goto next;

+		/* if the endp linked to the init sf is re-added with a != ID */
+		mptcp_local_address((struct sock_common *)msk, &mpc_addr);
+
 		lock_sock(sk);
 		spin_lock_bh(&msk->pm.lock);
+		if (mptcp_addresses_equal(addr, &mpc_addr, addr->port))
+			msk->mpc_endpoint_id = addr->id;
 		mptcp_pm_create_subflow_or_signal_addr(msk);
 		spin_unlock_bh(&msk->pm.lock);
 		release_sock(sk);
@@ -1378,7 +1435,7 @@ int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info)
 		goto out_free;
 	}

-	mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk));
+	mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr);
 	return 0;

out_free:
@@ -1386,24 +1443,6 @@ out_free:
 	return ret;
 }

-int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id,
-					    u8 *flags, int *ifindex)
-{
-	struct mptcp_pm_addr_entry *entry;
-	struct sock *sk = (struct sock *)msk;
-	struct net *net = sock_net(sk);
-
-	rcu_read_lock();
-	entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id);
-	if (entry) {
-		*flags = entry->flags;
-		*ifindex = entry->ifindex;
-	}
-	rcu_read_unlock();
-
-	return 0;
-}
-
 static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
 				      const struct mptcp_addr_info *addr)
 {
@@ -1411,7 +1450,6 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,

 	entry = mptcp_pm_del_add_timer(msk, addr, false);
 	if (entry) {
-		list_del(&entry->list);
 		kfree(entry);
 		return true;
 	}
@@ -1419,6 +1457,12 @@ static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
 	return false;
 }

+static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
+				  const struct mptcp_addr_info *addr)
+{
+	return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
+}
+
 static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
 				      const struct mptcp_addr_info *addr,
 				      bool force)
@@ -1426,29 +1470,38 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
 	struct mptcp_rm_list list = { .nr = 0 };
 	bool ret;

-	list.ids[list.nr++] = addr->id;
+	list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);

 	ret = remove_anno_list_by_saddr(msk, addr);
 	if (ret || force) {
 		spin_lock_bh(&msk->pm.lock);
-		msk->pm.add_addr_signaled -= ret;
+		if (ret) {
+			__set_bit(addr->id, msk->pm.id_avail_bitmap);
+			msk->pm.add_addr_signaled--;
+		}
 		mptcp_pm_remove_addr(msk, &list);
 		spin_unlock_bh(&msk->pm.lock);
 	}
 	return ret;
 }

+static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
+{
+	/* If it was marked as used, and not ID 0, decrement local_addr_used */
+	if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) &&
+	    id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0))
+		msk->pm.local_addr_used--;
+}
+
 static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
 						   const struct mptcp_pm_addr_entry *entry)
 {
 	const struct mptcp_addr_info *addr = &entry->addr;
-	struct mptcp_rm_list list = { .nr = 0 };
+	struct mptcp_rm_list list = { .nr = 1 };
 	long s_slot = 0, s_num = 0;
 	struct mptcp_sock *msk;

-	pr_debug("remove_id=%d", addr->id);
-
-	list.ids[list.nr++] = addr->id;
+	pr_debug("remove_id=%d\n", addr->id);

 	while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) {
 		struct sock *sk = (struct sock *)msk;
@@ -1466,8 +1519,22 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
 		remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
 		mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
 					  !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
-		if (remove_subflow)
-			mptcp_pm_remove_subflow(msk, &list);
+
+		list.ids[0] = mptcp_endp_get_local_id(msk, addr);
+		if (remove_subflow) {
+			spin_lock_bh(&msk->pm.lock);
+			mptcp_pm_nl_rm_subflow_received(msk, &list);
+			spin_unlock_bh(&msk->pm.lock);
+		}
+
+		if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+			spin_lock_bh(&msk->pm.lock);
+			__mark_subflow_endp_available(msk, list.ids[0]);
+			spin_unlock_bh(&msk->pm.lock);
+		}
+
+		if (msk->mpc_endpoint_id == entry->addr.id)
+			msk->mpc_endpoint_id = 0;
 		release_sock(sk);

next:
@@ -1502,6 +1569,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net,
 		spin_lock_bh(&msk->pm.lock);
 		mptcp_pm_remove_addr(msk, &list);
 		mptcp_pm_nl_rm_subflow_received(msk, &list);
+		__mark_subflow_endp_available(msk, 0);
 		spin_unlock_bh(&msk->pm.lock);
 		release_sock(sk);

@@ -1561,6 +1629,7 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
 	return ret;
 }

+/* Called from the userspace PM only */
 void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
 {
 	struct mptcp_rm_list alist = { .nr = 0 };
@@ -1589,8 +1658,9 @@ void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
 	}
 }

-static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
-					       struct list_head *rm_list)
+/* Called from the in-kernel PM only */
+static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
+					      struct list_head *rm_list)
 {
 	struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 };
 	struct mptcp_pm_addr_entry *entry;
@@ -1598,25 +1668,28 @@ static void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk,
 	list_for_each_entry(entry, rm_list, list) {
 		if (slist.nr < MPTCP_RM_IDS_MAX &&
 		    lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
-			slist.ids[slist.nr++] = entry->addr.id;
+			slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);

 		if (alist.nr < MPTCP_RM_IDS_MAX &&
 		    remove_anno_list_by_saddr(msk, &entry->addr))
-			alist.ids[alist.nr++] = entry->addr.id;
+			alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
 	}

+	spin_lock_bh(&msk->pm.lock);
 	if (alist.nr) {
-		spin_lock_bh(&msk->pm.lock);
 		msk->pm.add_addr_signaled -= alist.nr;
 		mptcp_pm_remove_addr(msk, &alist);
-		spin_unlock_bh(&msk->pm.lock);
 	}
 	if (slist.nr)
-		mptcp_pm_remove_subflow(msk, &slist);
+		mptcp_pm_nl_rm_subflow_received(msk, &slist);
+	/* Reset counters: maybe some subflows have been removed before */
+	bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
+	msk->pm.local_addr_used = 0;
+	spin_unlock_bh(&msk->pm.lock);
 }

-static void mptcp_nl_remove_addrs_list(struct net *net,
-				       struct list_head *rm_list)
+static void mptcp_nl_flush_addrs_list(struct net *net,
+				      struct list_head *rm_list)
 {
 	long s_slot = 0, s_num = 0;
 	struct mptcp_sock *msk;
@@ -1629,7 +1702,7 @@ static void mptcp_nl_remove_addrs_list(struct net *net,

 		if (!mptcp_pm_is_userspace(msk)) {
 			lock_sock(sk);
-			mptcp_pm_remove_addrs_and_subflows(msk, rm_list);
+			mptcp_pm_flush_addrs_and_subflows(msk, rm_list);
 			release_sock(sk);
 		}

@@ -1670,7 +1743,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
 	pernet->next_id = 1;
 	bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
 	spin_unlock_bh(&pernet->lock);
-	mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list);
+	mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list);
 	synchronize_rcu();
 	__flush_addrs(&free_list);
 	return 0;
@@ -1896,10 +1969,11 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk,
 {
 	struct mptcp_rm_list list = { .nr = 0 };

-	list.ids[list.nr++] = addr->id;
+	list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);

 	spin_lock_bh(&msk->pm.lock);
 	mptcp_pm_nl_rm_subflow_received(msk, &list);
+	__mark_subflow_endp_available(msk, list.ids[0]);
 	mptcp_pm_create_subflow_or_signal_addr(msk);
 	spin_unlock_bh(&msk->pm.lock);
 }
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 8eaa9fbe3e34..2cceded3a83a 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -119,23 +119,6 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id)
 	return NULL;
 }

-int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk,
-						   unsigned int id,
-						   u8 *flags, int *ifindex)
-{
-	struct mptcp_pm_addr_entry *match;
-
-	spin_lock_bh(&msk->pm.lock);
-	match = mptcp_userspace_pm_lookup_addr_by_id(msk, id);
-	spin_unlock_bh(&msk->pm.lock);
-	if (match) {
-		*flags = match->flags;
-		*ifindex = match->ifindex;
-	}
-
-	return 0;
-}
-
 int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
 				    struct mptcp_addr_info *skc)
 {
@@ -352,8 +335,9 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
 	struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
 	struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
 	struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
-	struct mptcp_pm_addr_entry local = { 0 };
+	struct mptcp_pm_addr_entry entry = { 0 };
 	struct mptcp_addr_info addr_r;
+	struct mptcp_pm_local local;
 	struct mptcp_sock *msk;
 	int err = -EINVAL;
 	struct sock *sk;
@@ -379,18 +363,18 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
 		goto create_err;
 	}

-	err = mptcp_pm_parse_entry(laddr, info, true, &local);
+	err = mptcp_pm_parse_entry(laddr, info, true, &entry);
 	if (err < 0) {
 		NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
 		goto create_err;
 	}

-	if (local.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+	if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
 		GENL_SET_ERR_MSG(info, "invalid addr flags");
 		err = -EINVAL;
 		goto create_err;
 	}
-	local.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
+	entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;

 	err = mptcp_pm_parse_addr(raddr, info, &addr_r);
 	if (err < 0) {
@@ -398,27 +382,29 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
 		goto create_err;
 	}

-	if (!mptcp_pm_addr_families_match(sk, &local.addr, &addr_r)) {
+	if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) {
 		GENL_SET_ERR_MSG(info, "families mismatch");
 		err = -EINVAL;
 		goto create_err;
 	}

-	err = mptcp_userspace_pm_append_new_local_addr(msk, &local, false);
+	err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false);
 	if (err < 0) {
 		GENL_SET_ERR_MSG(info, "did not match address and id");
 		goto create_err;
 	}

-	lock_sock(sk);
-
-	err = __mptcp_subflow_connect(sk, &local.addr, &addr_r);
+	local.addr = entry.addr;
+	local.flags = entry.flags;
+	local.ifindex = entry.ifindex;

+	lock_sock(sk);
+	err = __mptcp_subflow_connect(sk, &local, &addr_r);
 	release_sock(sk);

 	spin_lock_bh(&msk->pm.lock);
 	if (err)
-		mptcp_userspace_pm_delete_local_addr(msk, &local);
+		mptcp_userspace_pm_delete_local_addr(msk, &entry);
 	else
 		msk->pm.subflows++;
 	spin_unlock_bh(&msk->pm.lock);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 0d536b183a6c..c2317919fc14 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -139,7 +139,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
 	    !skb_try_coalesce(to, from, &fragstolen, &delta))
 		return false;

-	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx",
+	pr_debug("coalesced seq %llx into %llx new len %d new end seq %llx\n",
 		 MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
 		 to->len, MPTCP_SKB_CB(from)->end_seq);
 	MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
@@ -217,7 +217,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 	end_seq = MPTCP_SKB_CB(skb)->end_seq;
 	max_seq = atomic64_read(&msk->rcv_wnd_sent);

-	pr_debug("msk=%p seq=%llx limit=%llx empty=%d", msk, seq, max_seq,
+	pr_debug("msk=%p seq=%llx limit=%llx empty=%d\n", msk, seq, max_seq,
 		 RB_EMPTY_ROOT(&msk->out_of_order_queue));
 	if (after64(end_seq, max_seq)) {
 		/* out of window */
@@ -643,7 +643,7 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		}
 	}

-	pr_debug("msk=%p ssk=%p", msk, ssk);
+	pr_debug("msk=%p ssk=%p\n", msk, ssk);
 	tp = tcp_sk(ssk);
 	do {
 		u32 map_remaining, offset;
@@ -724,7 +724,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 	u64 end_seq;

 	p = rb_first(&msk->out_of_order_queue);
-	pr_debug("msk=%p empty=%d", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
+	pr_debug("msk=%p empty=%d\n", msk, RB_EMPTY_ROOT(&msk->out_of_order_queue));
 	while (p) {
 		skb = rb_to_skb(p);
 		if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq))
@@ -746,7 +746,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 			int delta = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq;

 			/* skip overlapping data, if any */
-			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d",
+			pr_debug("uncoalesced seq=%llx ack seq=%llx delta=%d\n",
 				 MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq,
 				 delta);
 			MPTCP_SKB_CB(skb)->offset += delta;
@@ -1240,7 +1240,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 	size_t copy;
 	int i;

-	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
+	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u\n",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);

 	if (WARN_ON_ONCE(info->sent > info->limit ||
@@ -1341,7 +1341,7 @@ alloc_skb:
 	mpext->use_map = 1;
 	mpext->dsn64 = 1;

-	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d",
+	pr_debug("data_seq=%llu subflow_seq=%u data_len=%u dsn64=%d\n",
 		 mpext->data_seq, mpext->subflow_seq, mpext->data_len,
 		 mpext->dsn64);

@@ -1892,7 +1892,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			if (!msk->first_pending)
 				WRITE_ONCE(msk->first_pending, dfrag);
 		}
-		pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
+		pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
 			 dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
 			 !dfrag_collapsed);

@@ -2248,7 +2248,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}

-		pr_debug("block timeout %ld", timeo);
+		pr_debug("block timeout %ld\n", timeo);
 		sk_wait_data(sk, &timeo, NULL);
 	}

@@ -2264,7 +2264,7 @@ out_err:
 		}
 	}

-	pr_debug("msk=%p rx queue empty=%d:%d copied=%d",
+	pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n",
 		 msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
 		 skb_queue_empty(&msk->receive_queue), copied);
 	if (!(flags & MSG_PEEK))
@@ -2326,7 +2326,7 @@ struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
 			continue;
 		}

-		if (subflow->backup) {
+		if (subflow->backup || subflow->request_bkup) {
 			if (!backup)
 				backup = ssk;
 			continue;
@@ -2508,6 +2508,12 @@ out:
 void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		     struct mptcp_subflow_context *subflow)
 {
+	/* The first subflow can already be closed and still in the list */
+	if (subflow->close_event_done)
+		return;
+
+	subflow->close_event_done = true;
+
 	if (sk->sk_state == TCP_ESTABLISHED)
 		mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL);

@@ -2533,8 +2539,11 @@ static void __mptcp_close_subflow(struct sock *sk)

 	mptcp_for_each_subflow_safe(msk, subflow, tmp) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		int ssk_state = inet_sk_state_load(ssk);

-		if (inet_sk_state_load(ssk) != TCP_CLOSE)
+		if (ssk_state != TCP_CLOSE &&
+		    (ssk_state != TCP_CLOSE_WAIT ||
+		     inet_sk_state_load(sk) != TCP_ESTABLISHED))
 			continue;

 		/* 'subflow_data_ready' will re-sched once rx queue is empty */
@@ -2714,7 +2723,7 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
 	if (!ssk)
 		return;

-	pr_debug("MP_FAIL doesn't respond, reset the subflow");
+	pr_debug("MP_FAIL doesn't respond, reset the subflow\n");

 	slow = lock_sock_fast(ssk);
 	mptcp_subflow_reset(ssk);
@@ -2888,7 +2897,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
 		break;
 	default:
 		if (__mptcp_check_fallback(mptcp_sk(sk))) {
-			pr_debug("Fallback");
+			pr_debug("Fallback\n");
 			ssk->sk_shutdown |= how;
 			tcp_shutdown(ssk, how);

@@ -2898,7 +2907,7 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
 			WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
 			mptcp_schedule_work(sk);
 		} else {
-			pr_debug("Sending DATA_FIN on subflow %p", ssk);
+			pr_debug("Sending DATA_FIN on subflow %p\n", ssk);
 			tcp_send_ack(ssk);
 			if (!mptcp_rtx_timer_pending(sk))
 				mptcp_reset_rtx_timer(sk);
mptcp_check_send_data_fin(struct sock *sk) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu", + pr_debug("msk=%p snd_data_fin_enable=%d pending=%d snd_nxt=%llu write_seq=%llu\n", msk, msk->snd_data_fin_enable, !!mptcp_send_head(sk), msk->snd_nxt, msk->write_seq); @@ -2988,7 +2997,7 @@ static void __mptcp_wr_shutdown(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d", + pr_debug("msk=%p snd_data_fin_enable=%d shutdown=%x state=%d pending=%d\n", msk, msk->snd_data_fin_enable, sk->sk_shutdown, sk->sk_state, !!mptcp_send_head(sk)); @@ -3003,7 +3012,7 @@ static void __mptcp_destroy_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); might_sleep(); @@ -3111,7 +3120,7 @@ cleanup: mptcp_set_state(sk, TCP_CLOSE); sock_hold(sk); - pr_debug("msk=%p state=%d", sk, sk->sk_state); + pr_debug("msk=%p state=%d\n", sk, sk->sk_state); if (msk->token) mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); @@ -3543,7 +3552,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum) { struct mptcp_sock *msk = mptcp_sk(sk); - pr_debug("msk=%p, ssk=%p", msk, msk->first); + pr_debug("msk=%p, ssk=%p\n", msk, msk->first); if (WARN_ON_ONCE(!msk->first)) return -EINVAL; @@ -3560,7 +3569,7 @@ void mptcp_finish_connect(struct sock *ssk) sk = subflow->conn; msk = mptcp_sk(sk); - pr_debug("msk=%p, token=%u", sk, subflow->token); + pr_debug("msk=%p, token=%u\n", sk, subflow->token); subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; @@ -3589,7 +3598,7 @@ bool mptcp_finish_join(struct sock *ssk) struct sock *parent = (void *)msk; bool ret = true; - pr_debug("msk=%p, subflow=%p", msk, subflow); + pr_debug("msk=%p, subflow=%p\n", msk, subflow); /* mptcp socket already closing? 
*/ if (!mptcp_is_fully_established(parent)) { @@ -3635,7 +3644,7 @@ err_prohibited: static void mptcp_shutdown(struct sock *sk, int how) { - pr_debug("sk=%p, how=%d", sk, how); + pr_debug("sk=%p, how=%d\n", sk, how); if ((how & SEND_SHUTDOWN) && mptcp_close_state(sk)) __mptcp_wr_shutdown(sk); @@ -3708,13 +3717,6 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); -} - static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct mptcp_subflow_context *subflow; @@ -3735,9 +3737,14 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (rcu_access_pointer(tcp_sk(ssk)->md5sig_info)) mptcp_subflow_early_fallback(msk, subflow); #endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssk)) { - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); + if (subflow->request_mptcp) { + if (mptcp_active_should_disable(sk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDISABLED); + mptcp_subflow_early_fallback(msk, subflow); + } else if (mptcp_token_new_connect(ssk) < 0) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_TOKENFALLBACKINIT); + mptcp_subflow_early_fallback(msk, subflow); + } } WRITE_ONCE(msk->write_seq, subflow->idsn); @@ -3856,7 +3863,7 @@ static int mptcp_listen(struct socket *sock, int backlog) struct sock *ssk; int err; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); lock_sock(sk); @@ -3895,7 +3902,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *newsk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); /* Buggy applications can call accept on socket states other then LISTEN * but no need to allocate the first subflow just to error out. 
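/*
 * The mptcp_connect() hunk above adds active-open "blackhole"
 * avoidance: when recent MPC SYNs look like they were dropped by a
 * middlebox, mptcp_active_should_disable() makes new connections start
 * as plain TCP immediately. A rough sketch of how such a gate can be
 * built; the state below is hypothetical, for illustration only, and
 * the real helpers live elsewhere in this series:
 *
 *	static unsigned long mptcp_blackhole_until;	// assumed state
 *
 *	bool mptcp_active_should_disable(struct sock *ssk)
 *	{
 *		// still inside the back-off window opened by
 *		// mptcp_active_disable()?
 *		return time_before(jiffies,
 *				   READ_ONCE(mptcp_blackhole_until));
 *	}
 */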
@@ -3904,12 +3911,12 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk) return -EINVAL; - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + pr_debug("ssk=%p, listener=%p\n", ssk, mptcp_subflow_ctx(ssk)); newsk = inet_csk_accept(ssk, arg); if (!newsk) return arg->err; - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); + pr_debug("newsk=%p, subflow is mptcp=%d\n", newsk, sk_is_mptcp(newsk)); if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; struct sock *new_mptcp_sock; @@ -4002,7 +4009,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); - pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); + pr_debug("msk=%p state=%d flags=%lx\n", msk, state, msk->flags); if (state == TCP_LISTEN) { struct sock *ssk = READ_ONCE(msk->first); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 60c6b073d65f..74417aae08d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -236,6 +236,12 @@ struct mptcp_pm_data { struct mptcp_rm_list rm_list_rx; }; +struct mptcp_pm_local { + struct mptcp_addr_info addr; + u8 flags; + int ifindex; +}; + struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; @@ -524,7 +530,9 @@ struct mptcp_subflow_context { stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ - __unused : 10; + close_event_done : 1, /* has done the post-closed part */ + mpc_drop : 1, /* the MPC option has been dropped in a rtx */ + __unused : 8; bool data_avail; bool scheduled; u32 remote_nonce; @@ -690,6 +698,11 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); + +void mptcp_active_disable(struct sock *sk); +bool mptcp_active_should_disable(struct sock *ssk); +void mptcp_active_enable(struct sock *sk); + void mptcp_get_available_schedulers(char *buf, size_t maxlen); void __mptcp_subflow_fully_established(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, @@ -718,7 +731,7 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a, void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); /* called with sk socket lock held */ -int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, +int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, struct socket **new_sock); @@ -992,6 +1005,8 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); +bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote); void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); @@ -1011,14 +1026,6 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, - unsigned int id, - u8 
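/*
 * The new structure introduced in the protocol.h hunk nearby carries
 * everything __mptcp_subflow_connect() needs about the local endpoint:
 *
 *	struct mptcp_pm_local {
 *		struct mptcp_addr_info	addr;
 *		u8			flags;
 *		int			ifindex;
 *	};
 *
 * Since flags and ifindex now travel with the address itself, the
 * mptcp_pm_get_flags_and_ifindex_by_id() lookup helpers whose
 * declarations are deleted here become unnecessary.
 */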
*flags, int *ifindex); -int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, - u8 *flags, int *ifindex); -int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, - unsigned int id, - u8 *flags, int *ifindex); int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); @@ -1026,7 +1033,6 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); void mptcp_free_local_addr_list(struct mptcp_sock *msk); @@ -1133,8 +1139,6 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo void __init mptcp_pm_nl_init(void); void mptcp_pm_nl_work(struct mptcp_sock *msk); -void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); @@ -1154,7 +1158,6 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } -void mptcp_sockopt_sync(struct mptcp_sock *msk, struct sock *ssk); void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) @@ -1180,7 +1183,7 @@ static inline bool mptcp_check_fallback(const struct sock *sk) static inline void __mptcp_do_fallback(struct mptcp_sock *msk) { if (__mptcp_check_fallback(msk)) { - pr_debug("TCP fallback already done (msk=%p)", msk); + pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } set_bit(MPTCP_FALLBACK_DONE, &msk->flags); @@ -1216,7 +1219,15 @@ static inline void mptcp_do_fallback(struct sock *ssk) } } -#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) +#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a) + +static inline void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + pr_fallback(msk); + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} static inline bool mptcp_check_infinite_map(struct sk_buff *skb) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 4a7fd0508ad2..78ed508ebc1b 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -86,7 +86,7 @@ int mptcp_register_scheduler(struct mptcp_sched_ops *sched) list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); - pr_debug("%s registered", sched->name); + pr_debug("%s registered\n", sched->name); return 0; } @@ -118,7 +118,7 @@ int mptcp_init_sched(struct mptcp_sock *msk, if (msk->sched->init) msk->sched->init(msk); - pr_debug("sched=%s", msk->sched->name); + pr_debug("sched=%s\n", msk->sched->name); return 0; } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2026a9a36f80..505445a9598f 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -873,7 +873,7 @@ int mptcp_setsockopt(struct sock *sk, int level, int optname, struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; - pr_debug("msk=%p", msk); + 
pr_debug("msk=%p\n", msk); if (level == SOL_SOCKET) return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen); @@ -1453,7 +1453,7 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; - pr_debug("msk=%p", msk); + pr_debug("msk=%p\n", msk); /* @@ the meaning of setsockopt() when the socket is connected and * there are multiple subflows is not yet defined. It is up to the diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a21c712350c3..1040b3b9696b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -39,7 +39,7 @@ static void subflow_req_destructor(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); - pr_debug("subflow_req=%p", subflow_req); + pr_debug("subflow_req=%p\n", subflow_req); if (subflow_req->msk) sock_put((struct sock *)subflow_req->msk); @@ -146,7 +146,7 @@ static int subflow_check_req(struct request_sock *req, struct mptcp_options_received mp_opt; bool opt_mp_capable, opt_mp_join; - pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); + pr_debug("subflow_req=%p, listener=%p\n", subflow_req, listener); #ifdef CONFIG_TCP_MD5SIG /* no MPTCP if MD5SIG is enabled on this socket or we may run out of @@ -221,7 +221,7 @@ again: } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { - pr_debug("syn inet_sport=%d %d", + pr_debug("syn inet_sport=%d %d\n", ntohs(inet_sk(sk_listener)->inet_sport), ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) { @@ -243,7 +243,7 @@ again: subflow_init_req_cookie_join_save(subflow_req, skb); } - pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token, + pr_debug("token=%u, remote_nonce=%u msk=%p\n", subflow_req->token, subflow_req->remote_nonce, subflow_req->msk); } @@ -527,7 +527,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->rel_write_seq = 1; subflow->conn_finished = 1; subflow->ssn_offset = TCP_SKB_CB(skb)->seq; - pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset); + pr_debug("subflow=%p synack seq=%x\n", subflow, subflow->ssn_offset); mptcp_get_options(skb, &mp_opt); if (subflow->request_mptcp) { @@ -546,6 +546,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_capable = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); + mptcp_active_enable(parent); mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -559,7 +560,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; WRITE_ONCE(subflow->remote_id, mp_opt.join_id); - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d\n", subflow, subflow->thmac, subflow->remote_nonce, subflow->backup); @@ -585,12 +586,15 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKBACKUPRX); if (subflow_use_different_dport(msk, sk)) { - pr_debug("synack inet_dport=%d %d", + pr_debug("synack inet_dport=%d %d\n", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX); } } else if (mptcp_check_fallback(sk)) { + /* It looks like MPTCP is blocked, while 
TCP is not */ + if (subflow->mpc_drop) + mptcp_active_disable(parent); fallback: mptcp_propagate_state(parent, sk, subflow, NULL); } @@ -655,7 +659,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); /* Never answer to SYNs sent to broadcast or multicast */ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -686,7 +690,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); if (skb->protocol == htons(ETH_P_IP)) return subflow_v4_conn_request(sk, skb); @@ -807,7 +811,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_sock *owner; struct sock *child; - pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + pr_debug("listener=%p, req=%p, conn=%p\n", listener, req, listener->conn); /* After child creation we must look for MPC even when options * are not parsed @@ -898,7 +902,7 @@ create_child: ctx->conn = (struct sock *)owner; if (subflow_use_different_sport(owner, sk)) { - pr_debug("ack inet_sport=%d %d", + pr_debug("ack inet_sport=%d %d\n", ntohs(inet_sk(sk)->inet_sport), ntohs(inet_sk((struct sock *)owner)->inet_sport)); if (!mptcp_pm_sport_in_anno_list(owner, sk)) { @@ -961,7 +965,7 @@ enum mapping_status { static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) { - pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d", + pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d\n", ssn, subflow->map_subflow_seq, subflow->map_data_len); } @@ -1121,7 +1125,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_len = mpext->data_len; if (data_len == 0) { - pr_debug("infinite mapping received"); + pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; return MAPPING_INVALID; @@ -1133,7 +1137,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 1) { bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq, mpext->dsn64); - pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq); + pr_debug("DATA_FIN with no payload seq=%llu\n", mpext->data_seq); if (subflow->map_valid) { /* A DATA_FIN might arrive in a DSS * option before the previous mapping @@ -1159,7 +1163,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, data_fin_seq &= GENMASK_ULL(31, 0); mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64); - pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d", + pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d\n", data_fin_seq, mpext->dsn64); /* Adjust for DATA_FIN using 1 byte of sequence space */ @@ -1205,7 +1209,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (unlikely(subflow->map_csum_reqd != csum_reqd)) return MAPPING_INVALID; - pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", + pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", subflow->map_seq, subflow->map_subflow_seq, subflow->map_data_len, subflow->map_csum_reqd, subflow->map_data_csum); @@ -1240,7 +1244,7 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, avail_len = skb->len - offset; incr = limit >= avail_len ? 
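/*
 * This expression in mptcp_subflow_discard_data() picks how much
 * duplicate data to drop: "limit" is how far the MPTCP-level ACK has
 * already advanced past this skb's mapping, "avail_len" is what the
 * skb still holds past the current offset. Worked example with made-up
 * numbers: avail_len = 1000, limit = 300 gives incr = 300 (drop only
 * the overlap); limit >= 1000 gives incr = avail_len + fin (drop the
 * whole remainder, with one extra sequence unit for a FIN, if set).
 */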
avail_len + fin : limit; - pr_debug("discarding=%d len=%d offset=%d seq=%d", incr, skb->len, + pr_debug("discarding=%d len=%d offset=%d seq=%d\n", incr, skb->len, offset, subflow->map_subflow_seq); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA); tcp_sk(ssk)->copied_seq += incr; @@ -1255,12 +1259,16 @@ out: /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - if (likely(ssk->sk_state != TCP_CLOSE)) + struct sock *sk = (struct sock *)msk; + + if (likely(ssk->sk_state != TCP_CLOSE && + (ssk->sk_state != TCP_CLOSE_WAIT || + inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; if (skb_queue_empty(&ssk->sk_receive_queue) && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - mptcp_schedule_work((struct sock *)msk); + mptcp_schedule_work(sk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) @@ -1337,7 +1345,7 @@ static bool subflow_check_data_avail(struct sock *ssk) old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); - pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack, + pr_debug("msk ack_seq=%llx subflow ack_seq=%llx\n", old_ack, ack_seq); if (unlikely(before64(ack_seq, old_ack))) { mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq); @@ -1409,7 +1417,7 @@ bool mptcp_subflow_data_available(struct sock *sk) subflow->map_valid = 0; WRITE_ONCE(subflow->data_avail, false); - pr_debug("Done with mapping: seq=%u data_len=%u", + pr_debug("Done with mapping: seq=%u data_len=%u\n", subflow->map_subflow_seq, subflow->map_data_len); } @@ -1519,7 +1527,7 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk); - pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d", + pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d\n", subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped); if (likely(icsk->icsk_af_ops == target)) @@ -1561,28 +1569,31 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, #endif } -int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, +int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; + int local_id = local->addr.id; struct sockaddr_storage addr; int remote_id = remote->id; - int local_id = loc->id; int err = -ENOTCONN; struct socket *sf; struct sock *ssk; u32 remote_token; int addrlen; - int ifindex; - u8 flags; + /* The userspace PM sent the request too early? 
*/ if (!mptcp_is_fully_established(sk)) goto err_out; - err = mptcp_subflow_create_socket(sk, loc->family, &sf); - if (err) + err = mptcp_subflow_create_socket(sk, local->addr.family, &sf); + if (err) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCREATSKERR); + pr_debug("msk=%p local=%d remote=%d create sock error: %d\n", + msk, local_id, remote_id, err); goto err_out; + } ssk = sf->sk; subflow = mptcp_subflow_ctx(ssk); @@ -1590,42 +1601,61 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, get_random_bytes(&subflow->local_nonce, sizeof(u32)); } while (!subflow->local_nonce); - if (local_id) + /* if 'IPADDRANY', the ID will be set later, after the routing */ + if (local->addr.family == AF_INET) { + if (!local->addr.addr.s_addr) + local_id = -1; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + } else if (sk->sk_family == AF_INET6) { + if (ipv6_addr_any(&local->addr.addr6)) + local_id = -1; +#endif + } + + if (local_id >= 0) subflow_set_local_id(subflow, local_id); - mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, - &flags, &ifindex); subflow->remote_key_valid = 1; subflow->remote_key = READ_ONCE(msk->remote_key); subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; - mptcp_info2sockaddr(loc, &addr, ssk->sk_family); + mptcp_info2sockaddr(&local->addr, &addr, ssk->sk_family); addrlen = sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr.ss_family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif - ssk->sk_bound_dev_if = ifindex; + ssk->sk_bound_dev_if = local->ifindex; err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); - if (err) + if (err) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); + pr_debug("msk=%p local=%d remote=%d bind error: %d\n", + msk, local_id, remote_id, err); goto failed; + } mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); - pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk, + pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d\n", msk, remote_token, local_id, remote_id); subflow->remote_token = remote_token; WRITE_ONCE(subflow->remote_id, remote_id); subflow->request_join = 1; - subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); + subflow->request_bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); subflow->subflow_id = msk->subflow_id++; mptcp_info2sockaddr(remote, &addr, ssk->sk_family); sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); - if (err && err != -EINPROGRESS) + if (err && err != -EINPROGRESS) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); + pr_debug("msk=%p local=%d remote=%d connect error: %d\n", + msk, local_id, remote_id, err); goto failed_unlink; + } + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTX); /* discard the subflow socket */ mptcp_sock_graft(ssk, sk->sk_socket); @@ -1747,7 +1777,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; subflow = mptcp_subflow_ctx(sf->sk); - pr_debug("subflow=%p", subflow); + pr_debug("subflow=%p\n", subflow); *new_sock = sf; sock_hold(sk); @@ -1776,7 +1806,7 @@ static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk, INIT_LIST_HEAD(&ctx->node); INIT_LIST_HEAD(&ctx->delegated_node); - pr_debug("subflow=%p", ctx); + pr_debug("subflow=%p\n", ctx); ctx->tcp_sock = sk; WRITE_ONCE(ctx->local_id, -1); @@ -1927,7 +1957,7 @@ static int subflow_ulp_init(struct sock *sk) goto 
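/*
 * Observability note: each failure leg of __mptcp_subflow_connect()
 * above now bumps a dedicated MIB counter before bailing out, e.g.
 *
 *	MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR);
 *
 * with JOINSYNTXCREATSKERR / JOINSYNTXBINDERR / JOINSYNTXCONNECTERR
 * covering the three error paths and JOINSYNTX counting attempts that
 * actually sent a SYN. Failed join attempts thus become visible in the
 * standard MPTCP MIB dumps (nstat) instead of vanishing silently; the
 * exported MPTcpExt* names are defined in mib.c, outside this excerpt.
 */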
out; } - pr_debug("subflow=%p, family=%d", ctx, sk->sk_family); + pr_debug("subflow=%p, family=%d\n", ctx, sk->sk_family); tp->is_mptcp = 1; ctx->icsk_af_ops = icsk->icsk_af_ops; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b00fc285b334..b9f551f02c81 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -655,11 +655,9 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; - struct list_head sublist; + LIST_HEAD(sublist); int ret; - INIT_LIST_HEAD(&sublist); - list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 34ba14e59e95..4890af4dc263 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -522,11 +522,10 @@ unsigned int nf_conncount_count(struct net *net, } EXPORT_SYMBOL_GPL(nf_conncount_count); -struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen) +struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { struct nf_conncount_data *data; - int ret, i; + int i; if (keylen % sizeof(u32) || keylen / sizeof(u32) > MAX_KEYLEN || @@ -539,12 +538,6 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family if (!data) return ERR_PTR(-ENOMEM); - ret = nf_ct_netns_get(net, family); - if (ret < 0) { - kfree(data); - return ERR_PTR(ret); - } - for (i = 0; i < ARRAY_SIZE(data->root); ++i) data->root[i] = RB_ROOT; @@ -581,13 +574,11 @@ static void destroy_tree(struct rb_root *r) } } -void nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data) +void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data) { unsigned int i; cancel_work_sync(&data->gc_work); - nf_ct_netns_put(net, family); for (i = 0; i < ARRAY_SIZE(data->root); ++i) destroy_tree(&data->root[i]); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9384426ddc06..d3cb53b008f5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1722,7 +1722,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) - return (struct nf_conntrack_tuple_hash *)ct; + return ERR_CAST(ct); if (!nf_ct_add_synproxy(ct, tmpl)) { nf_conntrack_free(ct); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4cbf71d0786b..123e2e933e9b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1579,9 +1579,6 @@ static int ctnetlink_flush_conntrack(struct net *net, }; if (ctnetlink_needs_filter(family, cda)) { - if (cda[CTA_FILTER]) - return -EOPNOTSUPP; - filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1610,14 +1607,14 @@ static int ctnetlink_del_conntrack(struct sk_buff *skb, if (err < 0) return err; - if (cda[CTA_TUPLE_ORIG]) + if (cda[CTA_TUPLE_ORIG] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, family, &zone); - else if (cda[CTA_TUPLE_REPLY]) + else if (cda[CTA_TUPLE_REPLY] && !cda[CTA_FILTER]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, family, &zone); else { - u_int8_t u3 = info->nfmsg->version ? family : AF_UNSPEC; + u8 u3 = info->nfmsg->version || cda[CTA_FILTER] ? 
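/*
 * ctnetlink dispatch after this change: a delete request carrying
 * CTA_FILTER no longer takes the exact-tuple branches, it falls
 * through to the filtered flush path, and the family is then passed on
 * (rather than AF_UNSPEC) so the filter can be interpreted per address
 * family:
 *
 *	CTA_TUPLE_ORIG  && !CTA_FILTER -> delete one entry (orig tuple)
 *	CTA_TUPLE_REPLY && !CTA_FILTER -> delete one entry (reply tuple)
 *	anything else                  -> ctnetlink_flush_conntrack()
 */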
family : AF_UNSPEC; return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 5c1ff07eaee0..df72b0376970 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -670,8 +670,14 @@ static int __init nf_flow_table_module_init(void) if (ret) goto out_offload; + ret = nf_flow_register_bpf(); + if (ret) + goto out_bpf; + return 0; +out_bpf: + nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); return ret; diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 88787b45e30d..b0f199171932 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -17,6 +17,9 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return NF_ACCEPT; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); proto = veth->h_vlan_encapsulated_proto; break; @@ -98,7 +101,7 @@ static int __init nf_flow_inet_module_init(void) nft_register_flowtable_type(&flowtable_ipv6); nft_register_flowtable_type(&flowtable_inet); - return nf_flow_register_bpf(); + return 0; } static void __exit nf_flow_inet_module_exit(void) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index c2c005234dcd..98edcaa37b38 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -281,6 +281,9 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, switch (skb->protocol) { case htons(ETH_P_8021Q): + if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) + return false; + veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 016c816d91cb..6d8da6dddf99 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1104,7 +1104,7 @@ int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); - nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); + nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0a2f79346958..57259b5f3ef5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -146,6 +146,8 @@ static void nft_ctx_init(struct nft_ctx *ctx, ctx->report = nlmsg_report(nlh); ctx->flags = nlh->nlmsg_flags; ctx->seq = nlh->nlmsg_seq; + + bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, @@ -393,6 +395,7 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans_binding *binding; + struct nft_trans_set *trans_set; list_add_tail(&trans->list, &nft_net->commit_list); @@ -402,9 +405,13 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr switch (trans->msg_type) { case NFT_MSG_NEWSET: + trans_set = nft_trans_container_set(trans); + if (!nft_trans_set_update(trans) && nft_set_is_anonymous(nft_trans_set(trans))) list_add_tail(&binding->binding_list, 
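/*
 * The transaction hunks around here add nft_net->commit_set_list, a
 * side list that links only NFT_MSG_NEWSET transactions. Resolving a
 * set by its transaction id then touches just the pending new sets
 * rather than the whole commit_list, and walking it backwards pays off
 * because the id being resolved usually belongs to the newest set:
 *
 *	list_for_each_entry_reverse(trans, &nft_net->commit_set_list,
 *				    list_trans_newset) {
 *		struct nft_set *set = trans->set;
 *
 *		if (id == trans->set_id && set->table == table &&
 *		    nft_active_genmask(set, genmask))
 *			return set;
 *	}
 */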
&nft_net->binding_list); + + list_add_tail(&trans_set->list_trans_newset, &nft_net->commit_set_list); break; case NFT_MSG_NEWCHAIN: if (!nft_trans_chain_update(trans) && @@ -611,6 +618,7 @@ static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, trans_set = nft_trans_container_set(trans); INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list); + INIT_LIST_HEAD(&trans_set->list_trans_newset); if (msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID] && !desc) { nft_trans_set_id(trans) = @@ -3878,7 +3886,6 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) { struct nft_expr *expr, *last; - const struct nft_data *data; struct nft_rule *rule; int err; @@ -3899,7 +3906,7 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) /* This may call nft_chain_validate() recursively, * callers that do so must increment ctx->level. */ - err = expr->ops->validate(ctx, expr, &data); + err = expr->ops->validate(ctx, expr); if (err < 0) return err; } @@ -4485,17 +4492,16 @@ static struct nft_set *nft_set_lookup_byid(const struct net *net, { struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); - struct nft_trans *trans; + struct nft_trans_set *trans; - list_for_each_entry(trans, &nft_net->commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET) { - struct nft_set *set = nft_trans_set(trans); + /* its likely the id we need is at the tail, not at start */ + list_for_each_entry_reverse(trans, &nft_net->commit_set_list, list_trans_newset) { + struct nft_set *set = trans->set; - if (id == nft_trans_set_id(trans) && - set->table == table && - nft_active_genmask(set, genmask)) - return set; - } + if (id == trans->set_id && + set->table == table && + nft_active_genmask(set, genmask)) + return set; } return ERR_PTR(-ENOENT); } @@ -4587,7 +4593,7 @@ int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return -ERANGE; ms *= NSEC_PER_MSEC; - *result = nsecs_to_jiffies64(ms); + *result = nsecs_to_jiffies64(ms) ? 
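/*
 * The statement here uses the GNU "a ?: b" form: the result is
 * nsecs_to_jiffies64(ms) when that is non-zero, and !!ms otherwise.
 * That clamps a non-zero timeout too small to represent up to one
 * jiffy instead of letting it degrade to "no timeout". Example with
 * HZ=100 (10 ms per jiffy): a 3 ms request gives nsecs_to_jiffies64()
 * == 0, so *result becomes !!ms == 1 jiffy.
 */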
: !!ms; return 0; } @@ -5688,12 +5694,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u8), }, [NFT_SET_EXT_TIMEOUT] = { - .len = sizeof(u64), - .align = __alignof__(u64), - }, - [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(u64), - .align = __alignof__(u64), + .len = sizeof(struct nft_timeout), + .align = __alignof__(struct nft_timeout), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -5812,25 +5814,32 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, htonl(*nft_set_ext_flags(ext)))) goto nla_put_failure; - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && - nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout); + u64 set_timeout = READ_ONCE(set->timeout); + __be64 msecs = 0; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - u64 expires, now = get_jiffies_64(); + if (set_timeout != timeout) { + msecs = nf_jiffies64_to_msecs(timeout); + if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs, + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } - expires = *nft_set_ext_expiration(ext); - if (time_before64(now, expires)) - expires -= now; - else - expires = 0; + if (timeout > 0) { + u64 expires, now = get_jiffies_64(); - if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - nf_jiffies64_to_msecs(expires), - NFTA_SET_ELEM_PAD)) - goto nla_put_failure; + expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration); + if (time_before64(now, expires)) + expires -= now; + else + expires = 0; + + if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, + nf_jiffies64_to_msecs(expires), + NFTA_SET_ELEM_PAD)) + goto nla_put_failure; + } } if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) { @@ -6493,13 +6502,14 @@ struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, nft_set_ext_data(ext), data, set->dlen) < 0) goto err_ext_check; - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - *nft_set_ext_expiration(ext) = get_jiffies_64() + expiration; + if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) { + nft_set_ext_timeout(ext)->timeout = timeout; + if (expiration == 0) - *nft_set_ext_expiration(ext) += timeout; + expiration = timeout; + + nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + expiration; } - if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) - *nft_set_ext_timeout(ext) = timeout; return elem; @@ -6842,6 +6852,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data_desc desc; enum nft_registers dreg; struct nft_trans *trans; + u8 update_flags; u64 expiration; u64 timeout; int err, i; @@ -6910,17 +6921,23 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } else if (set->flags & NFT_SET_TIMEOUT && !(flags & NFT_SET_ELEM_INTERVAL_END)) { - timeout = READ_ONCE(set->timeout); + timeout = set->timeout; } expiration = 0; if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; + if (timeout == 0) + return -EOPNOTSUPP; + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION], &expiration); if (err) return err; + + if (expiration > timeout) + return -ERANGE; } if (nla[NFTA_SET_ELEM_EXPR]) { @@ -7006,16 +7023,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_parse_key_end; } - if (timeout > 0) { - err = nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION); + if (set->flags & NFT_SET_TIMEOUT) { 
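/*
 * NFT_SET_EXT_EXPIRATION disappears as a separate extension in these
 * hunks: element timeout and expiration now share a single
 * NFT_SET_EXT_TIMEOUT extension. Judging from the accessors used here,
 * the backing type is effectively:
 *
 *	struct nft_timeout {
 *		u64 timeout;	// configured lifetime, in jiffies
 *		u64 expiration;	// absolute jiffies64 deadline
 *	};
 *
 * Keeping the pair adjacent lets the commit path refresh both with
 * WRITE_ONCE() while dump and datapath code samples them via
 * READ_ONCE().
 */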
+ err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); if (err < 0) goto err_parse_key_end; - - if (timeout != READ_ONCE(set->timeout)) { - err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT); - if (err < 0) - goto err_parse_key_end; - } } if (num_exprs) { @@ -7153,8 +7164,30 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) && *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2))) goto err_element_clash; - else if (!(nlmsg_flags & NLM_F_EXCL)) + else if (!(nlmsg_flags & NLM_F_EXCL)) { err = 0; + if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) { + update_flags = 0; + if (timeout != nft_set_ext_timeout(ext2)->timeout) { + nft_trans_elem_timeout(trans) = timeout; + if (expiration == 0) + expiration = timeout; + + update_flags |= NFT_TRANS_UPD_TIMEOUT; + } + if (expiration) { + nft_trans_elem_expiration(trans) = expiration; + update_flags |= NFT_TRANS_UPD_EXPIRATION; + } + + if (update_flags) { + nft_trans_elem_priv(trans) = elem_priv; + nft_trans_elem_update_flags(trans) = update_flags; + nft_trans_commit_list_add_tail(ctx->net, trans); + goto err_elem_free; + } + } + } } else if (err == -ENOTEMPTY) { /* ENOTEMPTY reports overlapping between this element * and an existing one. @@ -10447,6 +10480,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_flow_rule_destroy(nft_trans_flow_rule(trans)); break; case NFT_MSG_NEWSET: + list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { struct nft_set *set = nft_trans_set(trans); @@ -10478,7 +10512,22 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = nft_trans_container_elem(trans); - nft_setelem_activate(net, te->set, te->elem_priv); + if (te->update_flags) { + const struct nft_set_ext *ext = + nft_set_elem_ext(te->set, te->elem_priv); + + if (te->update_flags & NFT_TRANS_UPD_TIMEOUT) { + WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, + te->timeout); + } + if (te->update_flags & NFT_TRANS_UPD_EXPIRATION) { + WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, + get_jiffies_64() + te->expiration); + } + } else { + nft_setelem_activate(net, te->set, te->elem_priv); + } + nf_tables_setelem_notify(&ctx, te->set, te->elem_priv, NFT_MSG_NEWSETELEM); @@ -10755,6 +10804,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSET: + list_del(&nft_trans_container_set(trans)->list_trans_newset); if (nft_trans_set_update(trans)) { nft_trans_destroy(trans); break; @@ -10777,12 +10827,16 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_set_bound(trans)) { + if (nft_trans_elem_update_flags(trans) || + nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } te = nft_trans_container_elem(trans); - nft_setelem_remove(net, te->set, te->elem_priv); + if (!te->set->ops->abort || + nft_setelem_is_catchall(te->set, te->elem_priv)) + nft_setelem_remove(net, te->set, te->elem_priv); + if (!nft_setelem_is_catchall(te->set, te->elem_priv)) atomic_dec(&te->set->nelems); @@ -10850,6 +10904,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } } + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); + nft_set_abort_update(&set_update_list); synchronize_rcu(); @@ -11026,10 +11082,11 @@ static int nft_validate_register_load(enum nft_registers reg, unsigned int len) return 0; } -int 
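/*
 * nft_parse_register_load() grows an nft_ctx argument so it can check
 * ctx->reg_inited, the bitmap (cleared in nft_ctx_init() earlier in
 * this diff) in which nft_validate_register_store() records every
 * 32-bit register a rule has written so far. A load covering any
 * never-written register now fails, which stops expressions from
 * reading uninitialized register contents. The core of the check, as
 * added below:
 *
 *	next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg;
 *	invalid_reg = find_next_zero_bit(ctx->reg_inited,
 *					 NFT_REG32_NUM, reg);
 *	if (invalid_reg < next_register)
 *		return -ENODATA;
 */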
nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) +int nft_parse_register_load(const struct nft_ctx *ctx, + const struct nlattr *attr, u8 *sreg, u32 len) { - u32 reg; - int err; + int err, invalid_reg; + u32 reg, next_register; err = nft_parse_register(attr, ®); if (err < 0) @@ -11039,11 +11096,36 @@ int nft_parse_register_load(const struct nlattr *attr, u8 *sreg, u32 len) if (err < 0) return err; + next_register = DIV_ROUND_UP(len, NFT_REG32_SIZE) + reg; + + /* Can't happen: nft_validate_register_load() should have failed */ + if (WARN_ON_ONCE(next_register > NFT_REG32_NUM)) + return -EINVAL; + + /* find first register that did not see an earlier store. */ + invalid_reg = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, reg); + + /* invalid register within the range that we're loading from? */ + if (invalid_reg < next_register) + return -ENODATA; + *sreg = reg; return 0; } EXPORT_SYMBOL_GPL(nft_parse_register_load); +static void nft_saw_register_store(const struct nft_ctx *__ctx, + int reg, unsigned int len) +{ + unsigned int registers = DIV_ROUND_UP(len, NFT_REG32_SIZE); + struct nft_ctx *ctx = (struct nft_ctx *)__ctx; + + if (WARN_ON_ONCE(len == 0 || reg < 0)) + return; + + bitmap_set(ctx->reg_inited, reg, registers); +} + static int nft_validate_register_store(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, @@ -11065,7 +11147,7 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, return err; } - return 0; + break; default: if (type != NFT_DATA_VALUE) return -EINVAL; @@ -11078,8 +11160,11 @@ static int nft_validate_register_store(const struct nft_ctx *ctx, sizeof_field(struct nft_regs, data)) return -ERANGE; - return 0; + break; } + + nft_saw_register_store(ctx, reg, len); + return 0; } int nft_parse_register_store(const struct nft_ctx *ctx, @@ -11519,6 +11604,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->commit_set_list); INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); @@ -11549,6 +11635,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) gc_seq = nft_gc_seq_begin(nft_net); WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list)); if (!list_empty(&nft_net->module_list)) nf_tables_module_autoload_cleanup(net); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index a48d5f0e2f3e..75598520b0fa 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -256,7 +256,7 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) const struct net *net = nft_net(pkt); const struct nft_expr *expr, *last; const struct nft_rule_dp *rule; - struct nft_regs regs = {}; + struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 932b3ddb34f1..7784ec094097 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -402,27 +402,27 @@ replay_abort: { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } } if (!ss->valid_genid || !ss->commit || !ss->abort) { nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!try_module_get(ss->owner)) { 
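/*
 * The nfnetlink batch handler around here consistently switches from
 * kfree_skb() to consume_skb() on the paths where the message was
 * handled as intended. Both release the skb; the distinction is purely
 * diagnostic: kfree_skb() is traced as a packet drop (skb:kfree_skb,
 * drop monitor), consume_skb() as normal end-of-life. Rule of thumb:
 *
 *	kfree_skb(skb);		// the packet was discarded on error
 *	consume_skb(skb);	// the packet was processed, just free it
 */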
nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); - return kfree_skb(skb); + return consume_skb(skb); } if (!ss->valid_genid(net, genid)) { module_put(ss->owner); nfnl_unlock(subsys_id); netlink_ack(oskb, nlh, -ERESTART, NULL); - return kfree_skb(skb); + return consume_skb(skb); } nfnl_unlock(subsys_id); @@ -567,7 +567,7 @@ done: if (status & NFNL_BATCH_REPLAY) { ss->abort(net, oskb, NFNL_ABORT_AUTOLOAD); nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); goto replay; } else if (status == NFNL_BATCH_DONE) { @@ -593,7 +593,7 @@ done: err = ss->abort(net, oskb, abort_action); if (err == -EAGAIN) { nfnl_err_reset(&err_list); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); status |= NFNL_BATCH_FAILURE; goto replay_abort; @@ -601,7 +601,7 @@ done: } nfnl_err_deliver(&err_list, oskb); - kfree_skb(skb); + consume_skb(skb); module_put(ss->owner); } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index e0716da256bf..d2773ce9b585 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -540,6 +540,14 @@ nla_put_failure: return -1; } +static int nf_queue_checksum_help(struct sk_buff *entskb) +{ + if (skb_csum_is_sctp(entskb)) + return skb_crc32c_csum_help(entskb); + + return skb_checksum_help(entskb); +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -602,7 +610,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(entskb)) + nf_queue_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); @@ -1014,7 +1022,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } - if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index ca857afbf061..7de95674fd8c 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -171,7 +171,7 @@ static int nft_bitwise_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, priv->len); if (err < 0) return err; @@ -365,7 +365,7 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); int err; - err = nft_parse_register_load(tb[NFTA_BITWISE_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BITWISE_SREG], &priv->sreg, sizeof(u32)); if (err < 0) return err; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index f6e791a68101..2f82a444d21b 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -139,7 +139,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len = len; - err = nft_parse_register_load(tb[NFTA_BYTEORDER_SREG], &priv->sreg, + err = nft_parse_register_load(ctx, tb[NFTA_BYTEORDER_SREG], &priv->sreg, priv->len); if (err < 0) return err; diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index cd4652259095..2605f43737bc 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -83,7 +83,7 @@ static int 
nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -222,7 +222,7 @@ static int nft_cmp_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; @@ -323,7 +323,7 @@ static int nft_cmp16_fast_init(const struct nft_ctx *ctx, if (err < 0) return err; - err = nft_parse_register_load(tb[NFTA_CMP_SREG], &priv->sreg, desc.len); + err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index d3d11dede545..52cdfee17f73 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -350,8 +350,7 @@ nla_put_failure: } static int nft_target_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_target *target = expr->ops->data; unsigned int hook_mask = 0; @@ -611,8 +610,7 @@ static int nft_match_large_dump(struct sk_buff *skb, } static int nft_match_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_expr *expr) { struct xt_match *match = expr->ops->data; unsigned int hook_mask = 0; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 291ed2026367..cc7325329496 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -8,7 +8,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/seqlock.h> +#include <linux/u64_stats_sync.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -17,6 +17,11 @@ #include <net/netfilter/nf_tables_offload.h> struct nft_counter { + u64_stats_t bytes; + u64_stats_t packets; +}; + +struct nft_counter_tot { s64 bytes; s64 packets; }; @@ -25,25 +30,24 @@ struct nft_counter_percpu_priv { struct nft_counter __percpu *counter; }; -static DEFINE_PER_CPU(seqcount_t, nft_counter_seq); +static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync); static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); - - write_seqcount_begin(myseq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - this_cpu->bytes += pkt->skb->len; - this_cpu->packets++; + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->bytes, pkt->skb->len); + u64_stats_inc(&this_cpu->packets); + u64_stats_update_end(nft_sync); - write_seqcount_end(myseq); local_bh_enable(); } @@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[], if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); + this_cpu = raw_cpu_ptr(cpu_stats); if (tb[NFTA_COUNTER_PACKETS]) { - this_cpu->packets = - be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + u64_stats_set(&this_cpu->packets, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]))); } if (tb[NFTA_COUNTER_BYTES]) { - this_cpu->bytes = - 
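/*
 * nft_counter drops its open-coded seqcount in favour of the generic
 * u64_stats_sync API, which is a no-op wrapper on 64-bit SMP builds
 * and only falls back to a real seqcount where 64-bit counter reads
 * could tear. The canonical usage, as in the hunks here:
 *
 *	// writer side (per-CPU data, BH disabled)
 *	u64_stats_update_begin(nft_sync);
 *	u64_stats_add(&this_cpu->bytes, skb->len);
 *	u64_stats_inc(&this_cpu->packets);
 *	u64_stats_update_end(nft_sync);
 *
 *	// reader side: retry until a consistent snapshot is observed
 *	do {
 *		seq = u64_stats_fetch_begin(nft_sync);
 *		bytes = u64_stats_read(&this_cpu->bytes);
 *		packets = u64_stats_read(&this_cpu->packets);
 *	} while (u64_stats_fetch_retry(nft_sync, seq));
 */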
be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + u64_stats_set(&this_cpu->bytes, + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]))); } - preempt_enable(); + priv->counter = cpu_stats; return 0; } @@ -104,35 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx, } static void nft_counter_reset(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - this_cpu->packets -= total->packets; - this_cpu->bytes -= total->bytes; + nft_sync = this_cpu_ptr(&nft_counter_sync); + + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, -total->packets); + u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } static void nft_counter_fetch(struct nft_counter_percpu_priv *priv, - struct nft_counter *total) + struct nft_counter_tot *total) { struct nft_counter *this_cpu; - const seqcount_t *myseq; u64 bytes, packets; unsigned int seq; int cpu; memset(total, 0, sizeof(*total)); for_each_possible_cpu(cpu) { - myseq = per_cpu_ptr(&nft_counter_seq, cpu); + struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu); + this_cpu = per_cpu_ptr(priv->counter, cpu); do { - seq = read_seqcount_begin(myseq); - bytes = this_cpu->bytes; - packets = this_cpu->packets; - } while (read_seqcount_retry(myseq, seq)); + seq = u64_stats_fetch_begin(nft_sync); + bytes = u64_stats_read(&this_cpu->bytes); + packets = u64_stats_read(&this_cpu->packets); + } while (u64_stats_fetch_retry(nft_sync, seq)); total->bytes += bytes; total->packets += packets; @@ -143,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb, struct nft_counter_percpu_priv *priv, bool reset) { - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -232,7 +241,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, g struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); struct nft_counter __percpu *cpu_stats; struct nft_counter *this_cpu; - struct nft_counter total; + struct nft_counter_tot total; nft_counter_fetch(priv, &total); @@ -240,11 +249,9 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, g if (cpu_stats == NULL) return -ENOMEM; - preempt_disable(); - this_cpu = this_cpu_ptr(cpu_stats); - this_cpu->packets = total.packets; - this_cpu->bytes = total.bytes; - preempt_enable(); + this_cpu = raw_cpu_ptr(cpu_stats); + u64_stats_set(&this_cpu->packets, total.packets); + u64_stats_set(&this_cpu->bytes, total.bytes); priv_clone->counter = cpu_stats; return 0; @@ -262,18 +269,18 @@ static void nft_counter_offload_stats(struct nft_expr *expr, const struct flow_stats *stats) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct u64_stats_sync *nft_sync; struct nft_counter *this_cpu; - seqcount_t *myseq; - preempt_disable(); + local_bh_disable(); this_cpu = this_cpu_ptr(priv->counter); - myseq = this_cpu_ptr(&nft_counter_seq); + nft_sync = this_cpu_ptr(&nft_counter_sync); - write_seqcount_begin(myseq); - this_cpu->packets += stats->pkts; - this_cpu->bytes += stats->bytes; - write_seqcount_end(myseq); - preempt_enable(); + u64_stats_update_begin(nft_sync); + u64_stats_add(&this_cpu->packets, stats->pkts); + u64_stats_add(&this_cpu->bytes, stats->bytes); + u64_stats_update_end(nft_sync); + local_bh_enable(); } void nft_counter_init_seqcount(void) @@ -281,7 +288,7 
@@ void nft_counter_init_seqcount(void) int cpu; for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu)); + u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu)); } struct nft_expr_type nft_counter_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 452ed94c3a4d..67a41cd2baaf 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -606,7 +606,7 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, } priv->len = len; - err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); + err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index e5739a59ebf1..0573f96ce079 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -40,7 +40,7 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; - return nft_parse_register_load(tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, + return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index b4ada3ab2167..88922e0e8e83 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -56,7 +56,7 @@ static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; - timeout = priv->timeout ? : set->timeout; + timeout = priv->timeout ? : READ_ONCE(set->timeout); elem_priv = nft_set_elem_init(set, &priv->tmpl, ®s->data[priv->sreg_key], NULL, ®s->data[priv->sreg_data], @@ -94,9 +94,10 @@ void nft_dynset_eval(const struct nft_expr *expr, if (set->ops->update(set, ®s->data[priv->sreg_key], nft_dynset_new, expr, regs, &ext)) { if (priv->op == NFT_DYNSET_OP_UPDATE && - nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - timeout = priv->timeout ? : set->timeout; - *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; + nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && + READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { + timeout = priv->timeout ? 
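/*
 * Note the READ_ONCE(set->timeout) on the fallback arm here (GNU
 * "a ?: b", taken when no per-expression timeout is configured): the
 * dynset datapath can race with a transaction updating the set's
 * default timeout, so the default is sampled once, tear-free, per
 * event. The refresh of nft_set_ext_timeout(ext)->expiration just
 * below follows the same rule via WRITE_ONCE().
 */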
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index 6eb571d0c3fd..6bfd33516241 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -588,7 +588,7 @@ static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx,
     priv->flags = flags;
     priv->op = op;
 
-    return nft_parse_register_load(tb[NFTA_EXTHDR_SREG], &priv->sreg,
+    return nft_parse_register_load(ctx, tb[NFTA_EXTHDR_SREG], &priv->sreg,
                                    priv->len);
 }
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index b58f62195ff3..96e02a83c045 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -26,8 +26,7 @@ const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = {
 };
 EXPORT_SYMBOL(nft_fib_policy);
 
-int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
-                     const struct nft_data **data)
+int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
     const struct nft_fib *priv = nft_expr_priv(expr);
     unsigned int hooks;
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index ab9576098701..2f732fae5a83 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -9,6 +9,7 @@
 #include <linux/netfilter/nf_conntrack_common.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/ip.h> /* for ipv4 options. */
+#include <net/inet_dscp.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_conntrack_core.h>
@@ -235,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
         fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
         fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
         fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
-        fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
+        fl.u.ip4.flowi4_tos = ip_hdr(pkt->skb)->tos & INET_DSCP_MASK;
         fl.u.ip4.flowi4_mark = pkt->skb->mark;
         fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
         break;
@@ -380,8 +381,7 @@ out:
 }
 
 static int nft_flow_offload_validate(const struct nft_ctx *ctx,
-                                     const struct nft_expr *expr,
-                                     const struct nft_data **data)
+                                     const struct nft_expr *expr)
 {
     unsigned int hook_mask = (1 << NF_INET_FORWARD);
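Editorial note: the flowi4_tos change above (also seen later in sctp/protocol.c) replaces RT_TOS() with a mask against INET_DSCP_MASK. The ToS byte carries DSCP in its upper six bits and ECN in the lower two, and route lookups should key on DSCP only. A standalone illustration, assuming INET_DSCP_MASK expands to 0xfc as in include/net/inet_dscp.h:

#include <stdint.h>

#define DEMO_DSCP_MASK 0xfc    /* upper six bits: DSCP */
#define DEMO_ECN_MASK  0x03    /* lower two bits: ECN, irrelevant to routing */

static inline uint8_t tos_to_flow_key(uint8_t tos)
{
    /* e.g. tos 0xb9 (DSCP 46, ECN 01) becomes 0xb8 in the flow key */
    return tos & DEMO_DSCP_MASK;
}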
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 358e742afad7..152a9fb4d23a 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -52,7 +52,7 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
     if (tb[NFTA_FWD_SREG_DEV] == NULL)
         return -EINVAL;
 
-    return nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+    return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
                                    sizeof(int));
 }
 
@@ -178,12 +178,12 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx,
         return -EOPNOTSUPP;
     }
 
-    err = nft_parse_register_load(tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
+    err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev,
                                   sizeof(int));
     if (err < 0)
         return err;
 
-    return nft_parse_register_load(tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
+    return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr,
                                    addr_len);
 }
 
@@ -204,8 +204,7 @@ nla_put_failure:
 }
 
 static int nft_fwd_validate(const struct nft_ctx *ctx,
-                            const struct nft_expr *expr,
-                            const struct nft_data **data)
+                            const struct nft_expr *expr)
 {
     return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) |
                                                 (1 << NF_NETDEV_EGRESS));
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 868d68302d22..5d034bbb6913 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -92,7 +92,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx,
 
     priv->len = len;
 
-    err = nft_parse_register_load(tb[NFTA_HASH_SREG], &priv->sreg, len);
+    err = nft_parse_register_load(ctx, tb[NFTA_HASH_SREG], &priv->sreg, len);
     if (err < 0)
         return err;
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index ac2422c215e5..02ee5fb69871 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -244,8 +244,7 @@ nla_put_failure:
 }
 
 static int nft_immediate_validate(const struct nft_ctx *ctx,
-                                  const struct nft_expr *expr,
-                                  const struct nft_data **d)
+                                  const struct nft_expr *expr)
 {
     const struct nft_immediate_expr *priv = nft_expr_priv(expr);
     struct nft_ctx *pctx = (struct nft_ctx *)ctx;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index f3080fa1b226..63ef832b8aa7 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -113,7 +113,7 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
     if (IS_ERR(set))
         return PTR_ERR(set);
 
-    err = nft_parse_register_load(tb[NFTA_LOOKUP_SREG], &priv->sreg,
+    err = nft_parse_register_load(ctx, tb[NFTA_LOOKUP_SREG], &priv->sreg,
                                   set->klen);
     if (err < 0)
         return err;
@@ -206,8 +206,7 @@ nla_put_failure:
 }
 
 static int nft_lookup_validate(const struct nft_ctx *ctx,
-                               const struct nft_expr *expr,
-                               const struct nft_data **d)
+                               const struct nft_expr *expr)
 {
     const struct nft_lookup *priv = nft_expr_priv(expr);
     struct nft_set_iter iter;
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 8a14aaca93bb..868bd4d73555 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -27,8 +27,7 @@ static const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
 };
 
 static int nft_masq_validate(const struct nft_ctx *ctx,
-                             const struct nft_expr *expr,
-                             const struct nft_data **data)
+                             const struct nft_expr *expr)
 {
     int err;
 
@@ -52,13 +51,13 @@ static int nft_masq_init(const struct nft_ctx *ctx,
         priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
 
     if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
-        err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MIN],
+        err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MIN],
                                       &priv->sreg_proto_min, plen);
         if (err < 0)
             return err;
 
         if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
-            err = nft_parse_register_load(tb[NFTA_MASQ_REG_PROTO_MAX],
+            err = nft_parse_register_load(ctx, tb[NFTA_MASQ_REG_PROTO_MAX],
                                           &priv->sreg_proto_max,
                                           plen);
             if (err < 0)
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 9139ce38ea7b..8c8eb14d647b 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -581,8 +581,7 @@ static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
 }
 
 static int nft_meta_get_validate(const struct nft_ctx *ctx,
-                                 const struct nft_expr *expr,
-                                 const struct nft_data **data)
+                                 const struct nft_expr *expr)
 {
     const struct nft_meta *priv = nft_expr_priv(expr);
 
@@ -600,8 +599,7 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
 }
 
 int nft_meta_set_validate(const struct nft_ctx *ctx,
-                          const struct nft_expr *expr,
-                          const struct nft_data **data)
+                          const struct nft_expr *expr)
 {
     struct nft_meta *priv = nft_expr_priv(expr);
     unsigned int hooks;
@@ -657,7 +655,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
     }
 
     priv->len = len;
-    err = nft_parse_register_load(tb[NFTA_META_SREG], &priv->sreg, len);
+    err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
     if (err < 0)
         return err;
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index 808f5802c270..6e21f72c5b57 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -137,8 +137,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
 };
 
 static int nft_nat_validate(const struct nft_ctx *ctx,
-                            const struct nft_expr *expr,
-                            const struct nft_data **data)
+                            const struct nft_expr *expr)
 {
     struct nft_nat *priv = nft_expr_priv(expr);
     int err;
@@ -214,13 +213,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
     priv->family = family;
 
     if (tb[NFTA_NAT_REG_ADDR_MIN]) {
-        err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MIN],
+        err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MIN],
                                       &priv->sreg_addr_min, alen);
         if (err < 0)
             return err;
 
         if (tb[NFTA_NAT_REG_ADDR_MAX]) {
-            err = nft_parse_register_load(tb[NFTA_NAT_REG_ADDR_MAX],
+            err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_ADDR_MAX],
                                           &priv->sreg_addr_max,
                                           alen);
             if (err < 0)
@@ -234,13 +233,13 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
     plen = sizeof_field(struct nf_nat_range, min_proto.all);
     if (tb[NFTA_NAT_REG_PROTO_MIN]) {
-        err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MIN],
+        err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MIN],
                                       &priv->sreg_proto_min, plen);
         if (err < 0)
             return err;
 
         if (tb[NFTA_NAT_REG_PROTO_MAX]) {
-            err = nft_parse_register_load(tb[NFTA_NAT_REG_PROTO_MAX],
+            err = nft_parse_register_load(ctx, tb[NFTA_NAT_REG_PROTO_MAX],
                                           &priv->sreg_proto_max,
                                           plen);
             if (err < 0)
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 509011b1ef59..09da7a3f9f96 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -143,7 +143,7 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
     if (!(set->flags & NFT_SET_OBJECT))
         return -EINVAL;
 
-    err = nft_parse_register_load(tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
+    err = nft_parse_register_load(ctx, tb[NFTA_OBJREF_SET_SREG], &priv->sreg,
                                   set->klen);
     if (err < 0)
         return err;
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
index 7fec57ff736f..1c0b493ef0a9 100644
--- a/net/netfilter/nft_osf.c
+++ b/net/netfilter/nft_osf.c
@@ -108,8 +108,7 @@ nla_put_failure:
 }
 
 static int nft_osf_validate(const struct nft_ctx *ctx,
-                            const struct nft_expr *expr,
-                            const struct nft_data **data)
+                            const struct nft_expr *expr)
 {
     unsigned int hooks;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 50429cbd42da..330609a76fb2 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -981,7 +981,7 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
     }
     priv->csum_type = csum_type;
 
-    return nft_parse_register_load(tb[NFTA_PAYLOAD_SREG], &priv->sreg,
+    return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg,
                                    priv->len);
 }
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index b2b8127c8d43..344fe311878f 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -69,8 +69,7 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
 }
 
 static int nft_queue_validate(const struct nft_ctx *ctx,
-                              const struct nft_expr *expr,
-                              const struct nft_data **data)
+                              const struct nft_expr *expr)
 {
     static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
                                                  (1 << NF_INET_LOCAL_IN) |
@@ -136,7 +135,7 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx,
     struct nft_queue *priv = nft_expr_priv(expr);
     int err;
 
-    err = nft_parse_register_load(tb[NFTA_QUEUE_SREG_QNUM],
+    err = nft_parse_register_load(ctx, tb[NFTA_QUEUE_SREG_QNUM],
                                   &priv->sreg_qnum, sizeof(u32));
     if (err < 0)
         return err;
diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c
index 51ae64cd268f..ea382f7bbd78 100644
--- a/net/netfilter/nft_range.c
+++ b/net/netfilter/nft_range.c
@@ -83,7 +83,7 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr
         goto err2;
     }
 
-    err = nft_parse_register_load(tb[NFTA_RANGE_SREG], &priv->sreg,
+    err = nft_parse_register_load(ctx, tb[NFTA_RANGE_SREG], &priv->sreg,
                                   desc_from.len);
     if (err < 0)
         goto err2;
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index a58bd8d291ff..95eedad85c83 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -27,8 +27,7 @@ static const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = {
 };
 
 static int nft_redir_validate(const struct nft_ctx *ctx,
-                              const struct nft_expr *expr,
-                              const struct nft_data **data)
+                              const struct nft_expr *expr)
 {
     int err;
 
@@ -51,13 +50,13 @@ static int nft_redir_init(const struct nft_ctx *ctx,
     plen = sizeof_field(struct nf_nat_range, min_proto.all);
     if (tb[NFTA_REDIR_REG_PROTO_MIN]) {
-        err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MIN],
+        err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MIN],
                                       &priv->sreg_proto_min, plen);
         if (err < 0)
             return err;
 
         if (tb[NFTA_REDIR_REG_PROTO_MAX]) {
-            err = nft_parse_register_load(tb[NFTA_REDIR_REG_PROTO_MAX],
+            err = nft_parse_register_load(ctx, tb[NFTA_REDIR_REG_PROTO_MAX],
                                           &priv->sreg_proto_max,
                                           plen);
             if (err < 0)
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index ed2e668474d6..196a92c7ea09 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -24,8 +24,7 @@ const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = {
 EXPORT_SYMBOL_GPL(nft_reject_policy);
 
 int nft_reject_validate(const struct nft_ctx *ctx,
-                        const struct nft_expr *expr,
-                        const struct nft_data **data)
+                        const struct nft_expr *expr)
 {
     return nft_chain_validate_hooks(ctx->chain,
                                     (1 << NF_INET_LOCAL_IN) |
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 973fa31a9dd6..49020e67304a 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -61,8 +61,7 @@ static void nft_reject_inet_eval(const struct nft_expr *expr,
 }
 
 static int nft_reject_inet_validate(const struct nft_ctx *ctx,
-                                    const struct nft_expr *expr,
-                                    const struct nft_data **data)
+                                    const struct nft_expr *expr)
 {
     return nft_chain_validate_hooks(ctx->chain,
                                     (1 << NF_INET_LOCAL_IN) |
diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c
index 7865cd8b11bb..2558ce1505d9 100644
--- a/net/netfilter/nft_reject_netdev.c
+++ b/net/netfilter/nft_reject_netdev.c
@@ -145,8 +145,7 @@ out:
 }
 
 static int nft_reject_netdev_validate(const struct nft_ctx *ctx,
-                                      const struct nft_expr *expr,
-                                      const struct nft_data **data)
+                                      const struct nft_expr *expr)
 {
     return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
 }
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 14d88394bcb7..dc50b9a5bd68 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -160,8 +160,7 @@ nla_put_failure:
     return -1;
 }
 
-static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
-                           const struct nft_data **data)
+static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
     const struct nft_rt *priv = nft_expr_priv(expr);
     unsigned int hooks;
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index f30163e2ca62..f5da0c1775f2 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -9,7 +9,8 @@
 struct nft_socket {
     enum nft_socket_keys key:8;
-    u8 level;
+    u8 level;        /* cgroupv2 level to extract */
+    u8 level_user;   /* cgroupv2 level provided by userspace */
     u8 len;
     union {
         u8 dreg;
@@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo
     memcpy(dest, &cgid, sizeof(u64));
     return true;
 }
+
+/* process context only, uses current->nsproxy. */
+static noinline int nft_socket_cgroup_subtree_level(void)
+{
+    struct cgroup *cgrp = cgroup_get_from_path("/");
+    int level;
+
+    if (IS_ERR(cgrp))
+        return PTR_ERR(cgrp);
+
+    level = cgrp->level;
+
+    cgroup_put(cgrp);
+
+    if (WARN_ON_ONCE(level > 255))
+        return -ERANGE;
+
+    if (WARN_ON_ONCE(level < 0))
+        return -EINVAL;
+
+    return level;
+}
 #endif
 
 static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
@@ -110,13 +133,13 @@ static void nft_socket_eval(const struct nft_expr *expr,
             *dest = READ_ONCE(sk->sk_mark);
         } else {
             regs->verdict.code = NFT_BREAK;
-            return;
+            goto out_put_sk;
         }
         break;
     case NFT_SOCKET_WILDCARD:
         if (!sk_fullsock(sk)) {
             regs->verdict.code = NFT_BREAK;
-            return;
+            goto out_put_sk;
         }
         nft_socket_wildcard(pkt, regs, sk, dest);
         break;
@@ -124,7 +147,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
     case NFT_SOCKET_CGROUPV2:
         if (!nft_sock_get_eval_cgroupv2(dest, sk, pkt, priv->level)) {
             regs->verdict.code = NFT_BREAK;
-            return;
+            goto out_put_sk;
         }
         break;
 #endif
@@ -133,6 +156,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
         regs->verdict.code = NFT_BREAK;
     }
 
+out_put_sk:
     if (sk != skb->sk)
         sock_gen_put(sk);
 }
@@ -173,9 +197,10 @@ static int nft_socket_init(const struct nft_ctx *ctx,
     case NFT_SOCKET_MARK:
         len = sizeof(u32);
         break;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_SOCK_CGROUP_DATA
     case NFT_SOCKET_CGROUPV2: {
         unsigned int level;
+        int err;
 
         if (!tb[NFTA_SOCKET_LEVEL])
             return -EINVAL;
@@ -184,6 +209,17 @@ static int nft_socket_init(const struct nft_ctx *ctx,
         if (level > 255)
             return -EOPNOTSUPP;
 
+        err = nft_socket_cgroup_subtree_level();
+        if (err < 0)
+            return err;
+
+        priv->level_user = level;
+
+        level += err;
+        /* Implies a giant cgroup tree */
+        if (WARN_ON_ONCE(level > 255))
+            return -EOPNOTSUPP;
+
         priv->level = level;
         len = sizeof(u64);
         break;
@@ -208,7 +244,7 @@ static int nft_socket_dump(struct sk_buff *skb,
     if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
         return -1;
     if (priv->key == NFT_SOCKET_CGROUPV2 &&
-        nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
+        nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user)))
         return -1;
     return 0;
 }
@@ -239,8 +275,7 @@ static bool nft_socket_reduce(struct nft_regs_track *track,
 }
 
 static int nft_socket_validate(const struct nft_ctx *ctx,
-                               const struct nft_expr *expr,
-                               const struct nft_data **data)
+                               const struct nft_expr *expr)
 {
     if (ctx->family != NFPROTO_IPV4 &&
         ctx->family != NFPROTO_IPV6 &&
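Editorial note: the nft_socket change above rebases the user-supplied cgroup level onto the level of whatever cgroup the current namespace sees as "/", so a rule written inside a container matches the same ancestors as one written on the host. A hypothetical helper condensing that init-path arithmetic (rebase_cgroup_level is not a kernel function):

static int rebase_cgroup_level(int root_level, unsigned int user_level)
{
    unsigned int level = user_level + root_level;

    /* mirrors the "Implies a giant cgroup tree" guard: the combined
     * level must still fit the u8 that nft_socket stores it in
     */
    if (level > 255)
        return -EOPNOTSUPP;

    return level;
}

For example, if the namespace root sits at /sys/fs/cgroup/a/b (level 2) and the user asks for level 1, the level actually extracted at eval time is 3.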
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 1d737f89dfc1..5d3e51825985 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -248,8 +248,7 @@ static void nft_synproxy_eval(const struct nft_expr *expr,
 }
 
 static int nft_synproxy_validate(const struct nft_ctx *ctx,
-                                 const struct nft_expr *expr,
-                                 const struct nft_data **data)
+                                 const struct nft_expr *expr)
 {
     if (ctx->family != NFPROTO_IPV4 &&
         ctx->family != NFPROTO_IPV6 &&
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index 71412adb73d4..50481280abd2 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -254,14 +254,14 @@ static int nft_tproxy_init(const struct nft_ctx *ctx,
     }
 
     if (tb[NFTA_TPROXY_REG_ADDR]) {
-        err = nft_parse_register_load(tb[NFTA_TPROXY_REG_ADDR],
+        err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR],
                                       &priv->sreg_addr, alen);
         if (err < 0)
             return err;
     }
 
     if (tb[NFTA_TPROXY_REG_PORT]) {
-        err = nft_parse_register_load(tb[NFTA_TPROXY_REG_PORT],
+        err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT],
                                       &priv->sreg_port, sizeof(u16));
         if (err < 0)
             return err;
@@ -313,8 +313,7 @@ static int nft_tproxy_dump(struct sk_buff *skb,
 }
 
 static int nft_tproxy_validate(const struct nft_ctx *ctx,
-                               const struct nft_expr *expr,
-                               const struct nft_data **data)
+                               const struct nft_expr *expr)
 {
     if (ctx->family != NFPROTO_IPV4 &&
         ctx->family != NFPROTO_IPV6 &&
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 1c866757db55..8a07b46cc8fb 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -229,8 +229,7 @@ static int nft_xfrm_get_dump(struct sk_buff *skb,
     return 0;
 }
 
-static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr,
-                             const struct nft_data **data)
+static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
     const struct nft_xfrm *priv = nft_expr_priv(expr);
     unsigned int hooks;
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5d04ef80a61d..0e762277bcf8 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -86,6 +86,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 {
     struct xt_connlimit_info *info = par->matchinfo;
     unsigned int keylen;
+    int ret;
 
     keylen = sizeof(u32);
     if (par->family == NFPROTO_IPV6)
@@ -93,8 +94,17 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
     else
         keylen += sizeof(struct in_addr);
 
+    ret = nf_ct_netns_get(par->net, par->family);
+    if (ret < 0) {
+        pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
+                            par->family);
+        return ret;
+    }
+
     /* init private data */
-    info->data = nf_conncount_init(par->net, par->family, keylen);
+    info->data = nf_conncount_init(par->net, keylen);
+    if (IS_ERR(info->data))
+        nf_ct_netns_put(par->net, par->family);
 
     return PTR_ERR_OR_ZERO(info->data);
 }
@@ -103,7 +113,8 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 {
     const struct xt_connlimit_info *info = par->matchinfo;
 
-    nf_conncount_destroy(par->net, par->family, info->data);
+    nf_conncount_destroy(par->net, info->data);
+    nf_ct_netns_put(par->net, par->family);
 }
 
 static struct xt_match connlimit_mt_reg __read_mostly = {
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 9751e29d4bbb..5b0e4e62ab8b 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -41,7 +41,6 @@ struct netlink_sock {
     struct netlink_callback cb;
     struct mutex nl_cb_mutex;
 
-    struct mutex *dump_cb_mutex;
     void (*netlink_rcv)(struct sk_buff *skb);
     int (*netlink_bind)(struct net *net, int group);
     void (*netlink_unbind)(struct net *net, int group);
diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c
index bd2b17b219ae..2b5e246b8d9a 100644
--- a/net/netrom/nr_route.c
+++ b/net/netrom/nr_route.c
@@ -189,7 +189,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic,
     }
 
     nr_node->callsign = *nr;
-    strcpy(nr_node->mnemonic, mnemonic);
+    strscpy(nr_node->mnemonic, mnemonic);
 
     nr_node->which = 0;
     nr_node->count = 1;
@@ -214,7 +214,7 @@ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic,
     nr_node_lock(nr_node);
 
     if (quality != 0)
-        strcpy(nr_node->mnemonic, mnemonic);
+        strscpy(nr_node->mnemonic, mnemonic);
 
     for (found = 0, i = 0; i < nr_node->count; i++) {
         if (nr_node->routes[i].neighbour == nr_neigh) {
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 101f9a23792c..16e260014684 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -237,14 +237,18 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
                      const struct ovs_action_push_vlan *vlan)
 {
+    int err;
+
     if (skb_vlan_tag_present(skb)) {
         invalidate_flow_key(key);
     } else {
         key->eth.vlan.tci = vlan->vlan_tci;
         key->eth.vlan.tpid = vlan->vlan_tpid;
     }
-    return skb_vlan_push(skb, vlan->vlan_tpid,
-                         ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK);
+    err = skb_vlan_push(skb, vlan->vlan_tpid,
+                        ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK);
+    skb_reset_mac_len(skb);
+    return err;
 }
 
 /* 'src' is already properly masked. */
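Editorial note: the push_vlan hunk above (and the act_vlan.c hunk later in this series) follows skb_vlan_push()/pop with skb_reset_mac_len(), because pushing or popping a tag moves the network header relative to the MAC header and any cached mac_len goes stale. A sketch of the idiom, using a hypothetical wrapper name:

static int push_vlan_and_fix_len(struct sk_buff *skb, __be16 tpid, u16 tci)
{
    int err = skb_vlan_push(skb, tpid, tci);

    /* recompute mac_len = network header offset - mac header offset,
     * so later actions that parse from skb->data + mac_len stay correct
     */
    skb_reset_mac_len(skb);
    return err;
}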
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 8eb1d644b741..3bb4810234aa 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1368,11 +1368,8 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
         attr == OVS_KEY_ATTR_CT_MARK)
         return true;
     if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
-        attr == OVS_KEY_ATTR_CT_LABELS) {
-        struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
-
-        return ovs_net->xt_label;
-    }
+        attr == OVS_KEY_ATTR_CT_LABELS)
+        return true;
 
     return false;
 }
@@ -1381,6 +1378,7 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
                        const struct sw_flow_key *key,
                        struct sw_flow_actions **sfa, bool log)
 {
+    unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
     struct ovs_conntrack_info ct_info;
     const char *helper = NULL;
     u16 family;
@@ -1409,6 +1407,12 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
         return -ENOMEM;
     }
 
+    if (nf_connlabels_get(net, n_bits - 1)) {
+        nf_ct_tmpl_free(ct_info.ct);
+        OVS_NLERR(log, "Failed to set connlabel length");
+        return -EOPNOTSUPP;
+    }
+
     if (ct_info.timeout[0]) {
         if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto,
                               ct_info.timeout))
@@ -1577,6 +1581,7 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
     if (ct_info->ct) {
         if (ct_info->timeout[0])
             nf_ct_destroy_timeout(ct_info->ct);
+        nf_connlabels_put(nf_ct_net(ct_info->ct));
         nf_ct_tmpl_free(ct_info->ct);
     }
 }
@@ -1603,8 +1608,7 @@ static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
     for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
         INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
 
-    ovs_net->ct_limit_info->data =
-        nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+    ovs_net->ct_limit_info->data = nf_conncount_init(net, sizeof(u32));
 
     if (IS_ERR(ovs_net->ct_limit_info->data)) {
         err = PTR_ERR(ovs_net->ct_limit_info->data);
@@ -1621,7 +1625,7 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
     const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
     int i;
 
-    nf_conncount_destroy(net, NFPROTO_INET, info->data);
+    nf_conncount_destroy(net, info->data);
     for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
         struct hlist_head *head = &info->limits[i];
         struct ovs_ct_limit *ct_limit;
@@ -2002,17 +2006,9 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = {
 
 int ovs_ct_init(struct net *net)
 {
-    unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
     struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
-    if (nf_connlabels_get(net, n_bits - 1)) {
-        ovs_net->xt_label = false;
-        OVS_NLERR(true, "Failed to set connlabel length");
-    } else {
-        ovs_net->xt_label = true;
-    }
-
-#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
     return ovs_ct_limit_init(net, ovs_net);
 #else
     return 0;
@@ -2021,12 +2017,9 @@ int ovs_ct_init(struct net *net)
 
 void ovs_ct_exit(struct net *net)
 {
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
     struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
 
-#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
     ovs_ct_limit_exit(net, ovs_net);
 #endif
-
-    if (ovs_net->xt_label)
-        nf_connlabels_put(net);
 }
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 99d72543abd3..78d9961fcd44 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2706,7 +2706,7 @@ static struct pernet_operations ovs_net_ops = {
 };
 
 static const char * const ovs_drop_reasons[] = {
-#define S(x) (#x),
+#define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x),
     OVS_DROP_REASONS(S)
 #undef S
 };
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 9ca6231ea647..365b9bb7f546 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -160,9 +160,6 @@ struct ovs_net {
 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
     struct ovs_ct_limit_info *ct_limit_info;
 #endif
-
-    /* Module reference for configuring conntrack. */
-    bool xt_label;
 };
 
 /**
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index c92bdc4dfe19..729ef582a3a8 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2491,7 +2491,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
 
         acts = nla_alloc_flow_actions(new_acts_size);
         if (IS_ERR(acts))
-            return (void *)acts;
+            return ERR_CAST(acts);
 
         memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
         acts->actions_len = (*sfa)->actions_len;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 4b33133cbdff..5858d65ea1a9 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -102,19 +102,20 @@ static void do_setup(struct net_device *netdev)
     netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
     netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
                           IFF_NO_QUEUE;
+    netdev->lltx = true;
     netdev->needs_free_netdev = true;
     netdev->priv_destructor = NULL;
     netdev->ethtool_ops = &internal_dev_ethtool_ops;
     netdev->rtnl_link_ops = &internal_dev_link_ops;
 
-    netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST |
-                       NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
-                       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL;
+    netdev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
+                       NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE |
+                       NETIF_F_GSO_ENCAP_ALL;
 
     netdev->vlan_features = netdev->features;
     netdev->hw_enc_features = netdev->features;
     netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
-    netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
+    netdev->hw_features = netdev->features;
 
     eth_hw_addr_random(netdev);
 }
@@ -148,7 +149,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
 
     /* Restrict bridge port to current netns. */
     if (vport->port_no == OVSP_LOCAL)
-        vport->dev->features |= NETIF_F_NETNS_LOCAL;
+        vport->dev->netns_local = true;
 
     rtnl_lock();
     err = register_netdevice(vport->dev);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 4a364cdd445e..a705ec214254 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2216,7 +2216,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
         }
     }
 
-    snaplen = skb->len;
+    snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
     res = run_filter(skb, sk, snaplen);
     if (!res)
@@ -2336,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
         }
     }
 
-    snaplen = skb->len;
+    snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
     res = run_filter(skb, sk, snaplen);
     if (!res)
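Editorial note: the datapath.c hunk above changes the S() stringification macro into a designated initializer, because subsystem drop reasons are sparse values carrying a subsystem tag in their high bits, so positional initialization no longer lines the strings up with the enum. A self-contained illustration with invented names:

enum demo_drop_reason {
    DEMO_DROP_FIRST = 0x30000,  /* subsystem tag lives in the high bits */
    DEMO_DROP_SECOND,
};
#define DEMO_SUBSYS_MASK 0xffff0000

static const char * const demo_drop_strings[] = {
/* mask off the tag so each string lands at its real array offset */
#define S(x) [(x) & ~DEMO_SUBSYS_MASK] = (#x),
    S(DEMO_DROP_FIRST)
    S(DEMO_DROP_SECOND)
#undef S
};
/* demo_drop_strings[0] == "DEMO_DROP_FIRST", [1] == "DEMO_DROP_SECOND" */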
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index 75cd696963b2..f007730aa2bb 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -26,3 +26,12 @@ config RDS_DEBUG
     bool "RDS debugging messages"
     depends on RDS
     default n
+
+config GCOV_PROFILE_RDS
+    bool "Enable GCOV profiling on RDS"
+    depends on GCOV_KERNEL
+    help
+      Enable GCOV profiling on RDS for checking which functions/lines
+      are executed.
+
+      If unsure, say N.
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 8fdc118e2927..3af1ca1d965c 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -15,3 +15,8 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
     tcp_send.o tcp_stats.o
 
 ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
+
+# for GCOV coverage profiling
+ifdef CONFIG_GCOV_PROFILE_RDS
+GCOV_PROFILE := y
+endif
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 2ba71102b1f1..8ef3178ed4d6 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -369,9 +369,6 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
 void rds_ib_conn_free(void *arg);
 int rds_ib_conn_path_connect(struct rds_conn_path *cp);
 void rds_ib_conn_path_shutdown(struct rds_conn_path *cp);
-void rds_ib_state_change(struct sock *sk);
-int rds_ib_listen_init(void);
-void rds_ib_listen_stop(void);
 __printf(2, 3)
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -402,7 +399,6 @@ void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
 void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc,
                              struct rds_ib_ack_state *state);
-void rds_ib_recv_tasklet_fn(unsigned long data);
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
 void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
 void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 7a5367628c05..13a5126bc36e 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -539,18 +539,14 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
 #endif
 
 bool rfkill_set_hw_state_reason(struct rfkill *rfkill,
-                                bool blocked, unsigned long reason)
+                                bool blocked,
+                                enum rfkill_hard_block_reasons reason)
 {
     unsigned long flags;
     bool ret, prev;
 
     BUG_ON(!rfkill);
 
-    if (WARN(reason & ~(RFKILL_HARD_BLOCK_SIGNAL |
-                        RFKILL_HARD_BLOCK_NOT_OWNER),
-             "hw_state reason not supported: 0x%lx", reason))
-        return rfkill_blocked(rfkill);
-
     spin_lock_irqsave(&rfkill->lock, flags);
     prev = !!(rfkill->hard_block_reasons & reason);
     if (blocked) {
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 84529886c2e6..c268c2b011f4 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2011, NVIDIA Corporation.
  */
 
+#include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -72,6 +73,20 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
     return devm_acpi_dev_add_driver_gpios(dev, acpi_rfkill_default_gpios);
 }
 
+/* List of DMI matches for devices on which rfkill-gpio should not load,
+ * to avoid firmware bugs.
+ */
+static const struct dmi_system_id rfkill_gpio_deny_table[] = {
+    {
+        /* Lenovo Yoga Tab 3 Pro YT3-X90, bogus "BCM4752" device in DSDT */
+        .matches = {
+            DMI_MATCH(DMI_SYS_VENDOR, "Intel Corporation"),
+            DMI_MATCH(DMI_PRODUCT_VERSION, "Blade3-10A-001"),
+        },
+    },
+    { }
+};
+
 static int rfkill_gpio_probe(struct platform_device *pdev)
 {
     struct rfkill_gpio_data *rfkill;
@@ -81,6 +96,9 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
     const char *type_name;
     int ret;
 
+    if (dmi_check_system(rfkill_gpio_deny_table))
+        return -ENODEV;
+
     rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL);
     if (!rfkill)
         return -ENOMEM;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 08de24658f4f..80d682f89b23 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -856,7 +856,6 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
                              struct rxrpc_connection *conn,
                              struct sockaddr_rxrpc *peer_srx,
                              struct sk_buff *skb);
-void rxrpc_accept_incoming_calls(struct rxrpc_local *);
 int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long);
 
 /*
@@ -969,7 +968,6 @@ void rxrpc_connect_client_calls(struct rxrpc_local *local);
 void rxrpc_expose_client_call(struct rxrpc_call *);
 void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *);
 void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle);
-void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace);
 void rxrpc_discard_expired_client_conns(struct rxrpc_local *local);
 void rxrpc_clean_up_local_conns(struct rxrpc_local *);
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 3ba8e7e739b5..2197eb625658 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -44,8 +44,6 @@ static DEFINE_MUTEX(zones_mutex);
 struct zones_ht_key {
     struct net *net;
     u16 zone;
-    /* Note : pad[] must be the last field. */
-    u8 pad[];
 };
 
 struct tcf_ct_flow_table {
@@ -62,7 +60,7 @@ struct tcf_ct_flow_table {
 static const struct rhashtable_params zones_params = {
     .head_offset = offsetof(struct tcf_ct_flow_table, node),
     .key_offset = offsetof(struct tcf_ct_flow_table, key),
-    .key_len = offsetof(struct zones_ht_key, pad),
+    .key_len = offsetofend(struct zones_ht_key, zone),
     .automatic_shrinking = true,
 };
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 22f4b1e8ade9..383bf18b6862 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -96,6 +96,7 @@ out:
     if (skb_at_tc_ingress(skb))
         skb_pull_rcsum(skb, skb->mac_len);
 
+    skb_reset_mac_len(skb);
     return action;
 
 drop:
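Editorial note: the act_ct.c hunk above sizes the rhashtable key with offsetofend() of the last real member instead of a zero-length pad[] sentinel, so trailing struct padding (which may hold uninitialized bytes) never feeds the hash. A reduced illustration; offsetofend() is a kernel helper from <linux/stddef.h>, reproduced here with its conventional definition for self-containment:

#include <stddef.h>

#define offsetofend(TYPE, MEMBER) \
    (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct zones_key_demo {
    void *net;   /* 8 bytes on 64-bit */
    unsigned short zone;  /* 2 bytes, followed by 6 bytes of tail padding */
};

/* key_len = 10: covers net + zone, excludes the padding that
 * sizeof(struct zones_key_demo) == 16 would have pulled in
 */
static const size_t demo_key_len = offsetofend(struct zones_key_demo, zone);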
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 9602dafe32e6..f2f9b75008bb 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -361,8 +361,24 @@ static const u8 besteffort[] = {
 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 static const u8 bulk_order[] = {1, 0, 2, 3};
 
+/* There is a big difference in timing between the accurate values placed in the
+ * cache and the approximations given by a single Newton step for small count
+ * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
+ * these values are calculated using eight Newton steps, using the
+ * implementation below. Above 16, a single Newton step gives sufficient
+ * accuracy in either direction, given the precision stored.
+ *
+ * The magnitude of the error when stepping up to count 2 is such as to give the
+ * value that *should* have been produced at count 4.
+ */
+
 #define REC_INV_SQRT_CACHE (16)
-static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
+static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
+    ~0, ~0, 3037000500, 2479700525,
+    2147483647, 1920767767, 1753413056, 1623345051,
+    1518500250, 1431655765, 1358187914, 1294981364,
+    1239850263, 1191209601, 1147878294, 1108955788
+};
 
 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
@@ -388,47 +404,14 @@ static void cobalt_newton_step(struct cobalt_vars *vars)
 static void cobalt_invsqrt(struct cobalt_vars *vars)
 {
     if (vars->count < REC_INV_SQRT_CACHE)
-        vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
+        vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
     else
         cobalt_newton_step(vars);
 }
 
-/* There is a big difference in timing between the accurate values placed in
- * the cache and the approximations given by a single Newton step for small
- * count values, particularly when stepping from count 1 to 2 or vice versa.
- * Above 16, a single Newton step gives sufficient accuracy in either
- * direction, given the precision stored.
- *
- * The magnitude of the error when stepping up to count 2 is such as to give
- * the value that *should* have been produced at count 4.
- */
-
-static void cobalt_cache_init(void)
-{
-    struct cobalt_vars v;
-
-    memset(&v, 0, sizeof(v));
-    v.rec_inv_sqrt = ~0U;
-    cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
-
-    for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
-        cobalt_newton_step(&v);
-        cobalt_newton_step(&v);
-        cobalt_newton_step(&v);
-        cobalt_newton_step(&v);
-
-        cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
-    }
-}
-
 static void cobalt_vars_init(struct cobalt_vars *vars)
 {
     memset(vars, 0, sizeof(*vars));
-
-    if (!cobalt_rec_inv_sqrt_cache[0]) {
-        cobalt_cache_init();
-        cobalt_rec_inv_sqrt_cache[0] = ~0;
-    }
 }
 
 /* CoDel control_law is t + interval/sqrt(count)
@@ -786,12 +769,15 @@ skip_hash:
          * queue, accept the collision, update the host tags.
          */
         q->way_collisions++;
-        if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
-            q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
-            q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
-        }
         allocate_src = cake_dsrc(flow_mode);
         allocate_dst = cake_ddst(flow_mode);
+
+        if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
+            if (allocate_src)
+                q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
+            if (allocate_dst)
+                q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
+        }
 found:
         /* reserve queue for future packets in same flow */
         reduced_hash = outer_hash + k;
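Editorial note: the now-constant inv_sqrt_cache[] values above can be reproduced offline. A standalone userspace sketch (assumptions: it mirrors the fixed-point math of cobalt_newton_step() as shown in the kernel source, and applies eight steps per entry as the new comment describes; starting from ~0, i.e. roughly 1.0 in 0.32 fixed point):

#include <stdint.h>
#include <stdio.h>

/* One Newton iteration for y ~= 1/sqrt(count), y in 0.32 fixed point:
 * y' = (y / 2) * (3 - count * y^2), with the same shift layout as
 * cobalt_newton_step() to avoid 64-bit overflow.
 */
static uint32_t newton_step(uint32_t invsqrt, uint32_t count)
{
    uint32_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
    uint64_t val = (3ULL << 32) - ((uint64_t)count * invsqrt2);

    val >>= 2;  /* avoid overflow in following multiply */
    return (val * invsqrt) >> (32 - 2 + 1);
}

int main(void)
{
    for (uint32_t count = 2; count < 16; count++) {
        uint32_t y = ~0U;
        for (int i = 0; i < 8; i++)
            y = newton_step(y, count);
        printf("count %2u: %u\n", count, y);  /* e.g. count 2: ~3037000500 */
    }
    return 0;
}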
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 238974725679..19a49af5a9e5 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -663,7 +663,9 @@ begin:
         pband = &q->band_flows[q->band_nr];
         pband->credit = min(pband->credit + pband->quantum,
                             pband->quantum);
-        goto begin;
+        if (pband->credit > 0)
+            goto begin;
+        retry = 0;
     }
     if (q->time_next_delayed_flow != ~0ULL)
         qdisc_watchdog_schedule_range_ns(&q->watchdog,
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index edc72962ae63..39382ee1e331 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -446,12 +446,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
     struct netem_sched_data *q = qdisc_priv(sch);
     /* We don't fill cb now as skb_unshare() may invalidate it */
     struct netem_skb_cb *cb;
-    struct sk_buff *skb2;
+    struct sk_buff *skb2 = NULL;
     struct sk_buff *segs = NULL;
     unsigned int prev_len = qdisc_pkt_len(skb);
     int count = 1;
-    int rc = NET_XMIT_SUCCESS;
-    int rc_drop = NET_XMIT_DROP;
 
     /* Do not fool qdisc_drop_all() */
     skb->prev = NULL;
@@ -480,19 +478,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
         skb_orphan_partial(skb);
 
     /*
-     * If we need to duplicate packet, then re-insert at top of the
-     * qdisc tree, since parent queuer expects that only one
-     * skb will be queued.
+     * If we need to duplicate packet, then clone it before
+     * original is modified.
      */
-    if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
-        struct Qdisc *rootq = qdisc_root_bh(sch);
-        u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
-
-        q->duplicate = 0;
-        rootq->enqueue(skb2, rootq, to_free);
-        q->duplicate = dupsave;
-        rc_drop = NET_XMIT_SUCCESS;
-    }
+    if (count > 1)
+        skb2 = skb_clone(skb, GFP_ATOMIC);
 
     /*
      * Randomized packet corruption.
@@ -504,7 +494,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
         if (skb_is_gso(skb)) {
             skb = netem_segment(skb, sch, to_free);
             if (!skb)
-                return rc_drop;
+                goto finish_segs;
+
             segs = skb->next;
             skb_mark_not_on_list(skb);
             qdisc_skb_cb(skb)->pkt_len = skb->len;
@@ -530,7 +521,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
         /* re-link segs, so that qdisc_drop_all() frees them all */
         skb->next = segs;
         qdisc_drop_all(skb, sch, to_free);
-        return rc_drop;
+        if (skb2)
+            __qdisc_drop(skb2, to_free);
+        return NET_XMIT_DROP;
+    }
+
+    /*
+     * If doing duplication then re-insert at top of the
+     * qdisc tree, since parent queuer expects that only one
+     * skb will be queued.
+     */
+    if (skb2) {
+        struct Qdisc *rootq = qdisc_root_bh(sch);
+        u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+
+        q->duplicate = 0;
+        rootq->enqueue(skb2, rootq, to_free);
+        q->duplicate = dupsave;
+        skb2 = NULL;
     }
 
     qdisc_qstats_backlog_inc(sch, skb);
@@ -601,9 +609,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
     }
 
 finish_segs:
+    if (skb2)
+        __qdisc_drop(skb2, to_free);
+
     if (segs) {
         unsigned int len, last_len;
-        int nb;
+        int rc, nb;
 
         len = skb ? skb->len : 0;
         nb = skb ? 1 : 0;
@@ -731,11 +742,10 @@ deliver:
                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
                 kfree_skb_list(to_free);
-                if (err != NET_XMIT_SUCCESS &&
-                    net_xmit_drop_count(err)) {
-                    qdisc_qstats_drop(sch);
-                    qdisc_tree_reduce_backlog(sch, 1,
-                                              pkt_len);
+                if (err != NET_XMIT_SUCCESS) {
+                    if (net_xmit_drop_count(err))
+                        qdisc_qstats_drop(sch);
+                    qdisc_tree_reduce_backlog(sch, 1, pkt_len);
                 }
                 goto tfifo_dequeue;
             }
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index cc2df9f8c14a..8498d0606b24 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1952,7 +1952,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
             goto unlock;
         }
 
-        rcu_assign_pointer(q->admin_sched, new_admin);
+        /* Not going to race against advance_sched(), but still */
+        admin = rcu_replace_pointer(q->admin_sched, new_admin,
+                                    lockdep_rtnl_is_held());
         if (admin)
             call_rcu(&admin->rcu, taprio_free_sched_cb);
     } else {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5a7436a13b74..39ca5403d4d7 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -44,6 +44,7 @@
 #include <net/inet_common.h>
 #include <net/inet_ecn.h>
 #include <net/udp_tunnel.h>
+#include <net/inet_dscp.h>
 
 #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
 
@@ -435,7 +436,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
     fl4->fl4_dport = daddr->v4.sin_port;
     fl4->flowi4_proto = IPPROTO_SCTP;
     if (asoc) {
-        fl4->flowi4_tos = RT_TOS(tos);
+        fl4->flowi4_tos = tos & INET_DSCP_MASK;
         fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
         fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
         fl4->fl4_sport = htons(asoc->base.bind_addr.port);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 5adf0c0a6c1a..7d315a18612b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2260,12 +2260,6 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
         }
     }
 
-    /* Update socket peer label if first association. */
-    if (security_sctp_assoc_request(new_asoc, chunk->head_skb ?: chunk->skb)) {
-        sctp_association_free(new_asoc);
-        return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
-    }
-
     /* Set temp so that it won't be added into hashtable */
     new_asoc->temp = 1;
 
@@ -2274,6 +2268,22 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook(
      */
     action = sctp_tietags_compare(new_asoc, asoc);
 
+    /* In cases C and E the association doesn't enter the ESTABLISHED
+     * state, so there is no need to call security_sctp_assoc_request().
+     */
+    switch (action) {
+    case 'A': /* Association restart. */
+    case 'B': /* Collision case B. */
+    case 'D': /* Collision case D. */
+        /* Update socket peer label if first association. */
+        if (security_sctp_assoc_request((struct sctp_association *)asoc,
+                                        chunk->head_skb ?: chunk->skb)) {
+            sctp_association_free(new_asoc);
+            return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+        }
+        break;
+    }
+
     switch (action) {
     case 'A': /* Association restart. */
         retval = sctp_sf_do_dupcook_a(net, ep, asoc, chunk, commands,
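Editorial note: the taprio hunk above swaps rcu_assign_pointer() for rcu_replace_pointer(), which publishes the new pointer and returns the old one in a single annotated step, with a lockdep expression naming the lock that serializes updaters (RTNL here). A minimal sketch of the pattern under the assumption that the caller holds RTNL; struct sched_cfg is a hypothetical stand-in:

struct sched_cfg;    /* hypothetical RCU-protected config object */

static struct sched_cfg *swap_cfg(struct sched_cfg __rcu **slot,
                                  struct sched_cfg *new_cfg)
{
    /* returns the previous pointer so the caller can free it after a
     * grace period, e.g. via call_rcu()
     */
    return rcu_replace_pointer(*slot, new_cfg, lockdep_rtnl_is_held());
}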
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 8e3093938cd2..0316217b7687 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1466,10 +1466,6 @@ connect_abort:
 static int smc_connect_check_aclc(struct smc_init_info *ini,
                                   struct smc_clc_msg_accept_confirm *aclc)
 {
-    if (aclc->hdr.typev1 != SMC_TYPE_R &&
-        aclc->hdr.typev1 != SMC_TYPE_D)
-        return SMC_CLC_DECL_MODEUNSUPP;
-
     if (aclc->hdr.version >= SMC_V2) {
         if ((aclc->hdr.typev1 == SMC_TYPE_R &&
              !smcr_indicated(ini->smc_type_v2)) ||
@@ -1523,10 +1519,6 @@ static int __smc_connect(struct smc_sock *smc)
             ini->smcd_version &= ~SMC_V1;
             ini->smcr_version = 0;
             ini->smc_type_v1 = SMC_TYPE_N;
-            if (!ini->smcd_version) {
-                rc = SMC_CLC_DECL_GETVLANERR;
-                goto fallback;
-            }
         }
 
     rc = smc_find_proposal_devices(smc, ini);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 34b781e463c4..ad77d6b6b8d3 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -284,6 +284,9 @@ struct smc_connection {
 
 struct smc_sock {  /* smc sock container */
     struct sock sk;
+#if IS_ENABLED(CONFIG_IPV6)
+    struct ipv6_pinfo *pinet6;
+#endif
     struct socket *clcsock;  /* internal tcp socket */
     void (*clcsk_state_change)(struct sock *sk);
                              /* original stat_change fct. */
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 467effb50cd6..5625fda2960b 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -145,6 +145,8 @@ struct smc_clc_v2_extension {
     );
     u8 user_eids[][SMC_MAX_EID_LEN];
 };
+static_assert(offsetof(struct smc_clc_v2_extension, user_eids) == sizeof(struct smc_clc_v2_extension_fixed),
+              "struct member likely outside of struct_group_tagged()");
 
 struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/
     __be32 outgoing_subnet;  /* subnet mask */
@@ -169,6 +171,8 @@ struct smc_clc_smcd_v2_extension {
     );
     struct smc_clc_smcd_gid_chid gidchid[];
 };
+static_assert(offsetof(struct smc_clc_smcd_v2_extension, gidchid) == sizeof(struct smc_clc_smcd_v2_extension_fixed),
+              "struct member likely outside of struct_group_tagged()");
 
 struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */
     struct smc_clc_msg_hdr hdr;
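Editorial note: the smc_clc.h hunks above pin a flexible-array member to the end of a struct_group_tagged() region at compile time. A reduced illustration of the idea, with invented struct names (the *_fixed type stands for the tagged group the kernel generates; includes shown for a userspace build):

#include <assert.h>   /* static_assert in C11 */
#include <stddef.h>
#include <stdint.h>

struct hdr_fixed {           /* stand-in for the tagged fixed-size group */
    uint32_t a;
    uint16_t b;
} __attribute__((packed));

struct hdr {
    uint32_t a;
    uint16_t b;
    uint8_t payload[];       /* must begin right after the fixed part */
} __attribute__((packed));

/* Fails the build if someone later adds a field between b and payload
 * without extending the fixed group to match.
 */
static_assert(offsetof(struct hdr, payload) == sizeof(struct hdr_fixed),
              "struct member likely outside of struct_group_tagged()");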
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 3b95828d9976..4e694860ece4 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -221,6 +221,35 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
     write_unlock_bh(&lgr->conns_lock);
 }
 
+static void smc_lgr_buf_list_add(struct smc_link_group *lgr,
+                                 bool is_rmb,
+                                 struct list_head *buf_list,
+                                 struct smc_buf_desc *buf_desc)
+{
+    list_add(&buf_desc->list, buf_list);
+    if (is_rmb) {
+        lgr->alloc_rmbs += buf_desc->len;
+        lgr->alloc_rmbs +=
+            lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0;
+    } else {
+        lgr->alloc_sndbufs += buf_desc->len;
+    }
+}
+
+static void smc_lgr_buf_list_del(struct smc_link_group *lgr,
+                                 bool is_rmb,
+                                 struct smc_buf_desc *buf_desc)
+{
+    list_del(&buf_desc->list);
+    if (is_rmb) {
+        lgr->alloc_rmbs -= buf_desc->len;
+        lgr->alloc_rmbs -=
+            lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0;
+    } else {
+        lgr->alloc_sndbufs -= buf_desc->len;
+    }
+}
+
 int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
     struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
@@ -363,6 +392,10 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr,
     smc_target[SMC_MAX_PNETID_LEN] = 0;
     if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
         goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_LGR_R_SNDBUF_ALLOC, lgr->alloc_sndbufs))
+        goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_LGR_R_RMB_ALLOC, lgr->alloc_rmbs))
+        goto errattr;
     if (lgr->smc_version > SMC_V1) {
         v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON);
         if (!v2_attrs)
@@ -541,6 +574,10 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
         goto errattr;
     if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd)))
         goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_LGR_D_SNDBUF_ALLOC, lgr->alloc_sndbufs))
+        goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_LGR_D_DMB_ALLOC, lgr->alloc_rmbs))
+        goto errattr;
     memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN);
     smc_pnet[SMC_MAX_PNETID_LEN] = 0;
     if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet))
@@ -1138,7 +1175,7 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
 
     lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock;
     down_write(lock);
-    list_del(&buf_desc->list);
+    smc_lgr_buf_list_del(lgr, is_rmb, buf_desc);
     up_write(lock);
 
     smc_buf_free(lgr, is_rmb, buf_desc);
@@ -1166,22 +1203,30 @@ static void smcd_buf_detach(struct smc_connection *conn)
 static void smc_buf_unuse(struct smc_connection *conn,
                           struct smc_link_group *lgr)
 {
+    struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+    bool is_smcd = lgr->is_smcd;
+    int bufsize;
+
     if (conn->sndbuf_desc) {
-        if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) {
+        bufsize = conn->sndbuf_desc->len;
+        if (!is_smcd && conn->sndbuf_desc->is_vm) {
             smcr_buf_unuse(conn->sndbuf_desc, false, lgr);
         } else {
-            memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len);
+            memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize);
             WRITE_ONCE(conn->sndbuf_desc->used, 0);
         }
+        SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize);
     }
     if (conn->rmb_desc) {
-        if (!lgr->is_smcd) {
+        bufsize = conn->rmb_desc->len;
+        if (!is_smcd) {
             smcr_buf_unuse(conn->rmb_desc, true, lgr);
         } else {
-            memzero_explicit(conn->rmb_desc->cpu_addr,
-                             conn->rmb_desc->len + sizeof(struct smcd_cdc_msg));
+            bufsize += sizeof(struct smcd_cdc_msg);
+            memzero_explicit(conn->rmb_desc->cpu_addr, bufsize);
             WRITE_ONCE(conn->rmb_desc->used, 0);
         }
+        SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize);
     }
 }
 
@@ -1377,7 +1422,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
             buf_list = &lgr->sndbufs[i];
         list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
                                  list) {
-            list_del(&buf_desc->list);
+            smc_lgr_buf_list_del(lgr, is_rmb, buf_desc);
             smc_buf_free(lgr, is_rmb, buf_desc);
         }
     }
@@ -2250,7 +2295,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk)
 }
 
 static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr,
-                                                bool is_rmb, int bufsize)
+                                                int bufsize)
 {
     struct smc_buf_desc *buf_desc;
 
@@ -2390,7 +2435,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
         buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
         if (buf_desc) {
             buf_desc->is_dma_need_sync = 0;
-            SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
+            SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
             SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb);
             break; /* found reusable slot */
         }
@@ -2398,7 +2443,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
         if (is_smcd)
             buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize);
         else
-            buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize);
+            buf_desc = smcr_new_buf_create(lgr, bufsize);
 
         if (PTR_ERR(buf_desc) == -ENOMEM)
             break;
@@ -2411,10 +2456,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
         }
 
         SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
-        SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
+        SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
         buf_desc->used = 1;
         down_write(lock);
-        list_add(&buf_desc->list, buf_list);
+        smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc);
         up_write(lock);
         break; /* found */
     }
@@ -2496,7 +2541,8 @@ create_rmb:
     rc = __smc_buf_create(smc, is_smcd, true);
     if (rc && smc->conn.sndbuf_desc) {
         down_write(&smc->conn.lgr->sndbufs_lock);
-        list_del(&smc->conn.sndbuf_desc->list);
+        smc_lgr_buf_list_del(smc->conn.lgr, false,
+                             smc->conn.sndbuf_desc);
         up_write(&smc->conn.lgr->sndbufs_lock);
         smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
         smc->conn.sndbuf_desc = NULL;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index d93cf51dbd7c..0db4e5f79ac4 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -281,6 +281,8 @@ struct smc_link_group {
     struct rw_semaphore sndbufs_lock;  /* protects tx buffers */
     struct list_head rmbs[SMC_RMBE_SIZES];  /* rx buffers */
     struct rw_semaphore rmbs_lock;  /* protects rx buffers */
+    u64 alloc_sndbufs;  /* stats of tx buffers */
+    u64 alloc_rmbs;  /* stats of rx buffers */
 
     u8 id[SMC_LGR_ID_SIZE];  /* unique lgr id */
     struct delayed_work free_work;  /* delayed freeing of an lgr */
diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c
index bece346dd8e9..a5b2041600f9 100644
--- a/net/smc/smc_inet.c
+++ b/net/smc/smc_inet.c
@@ -60,6 +60,11 @@ static struct inet_protosw smc_inet_protosw = {
 };
 
 #if IS_ENABLED(CONFIG_IPV6)
+struct smc6_sock {
+    struct smc_sock smc;
+    struct ipv6_pinfo inet6;
+};
+
 static struct proto smc_inet6_prot = {
     .name = "INET6_SMC",
     .owner = THIS_MODULE,
@@ -67,9 +72,10 @@ static struct proto smc_inet6_prot = {
     .hash = smc_hash_sk,
     .unhash = smc_unhash_sk,
     .release_cb = smc_release_cb,
-    .obj_size = sizeof(struct smc_sock),
+    .obj_size = sizeof(struct smc6_sock),
     .h.smc_hash = &smc_v6_hashinfo,
     .slab_flags = SLAB_TYPESAFE_BY_RCU,
+    .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6),
 };
 
 static const struct proto_ops smc_inet6_stream_ops = {
diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h
index 6dd4292dae56..04dc6808d2e1 100644
--- a/net/smc/smc_loopback.h
+++ b/net/smc/smc_loopback.h
@@ -15,7 +15,6 @@
 #define _SMC_LOOPBACK_H
 
 #include <linux/device.h>
-#include <linux/err.h>
 #include <net/smc.h>
 
 #if IS_ENABLED(CONFIG_SMC_LO)
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 2adb92b8c469..1dd362326c0a 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -887,9 +887,6 @@ int smc_pnet_net_init(struct net *net)
 
     smc_pnet_create_pnetids_list(net);
 
-    /* disable handshake limitation by default */
-    net->smc.limit_smc_hs = 0;
-
     return 0;
 }
diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c
index ca14c0f3a07d..e71b17d1e21c 100644
--- a/net/smc/smc_stats.c
+++ b/net/smc/smc_stats.c
@@ -218,6 +218,12 @@ static int smc_nl_fill_stats_tech_data(struct sk_buff *skb,
                           smc_tech->tx_bytes,
                           SMC_NLA_STATS_PAD))
         goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_STATS_T_RX_RMB_USAGE,
+                     smc_tech->rx_rmbuse))
+        goto errattr;
+    if (nla_put_uint(skb, SMC_NLA_STATS_T_TX_RMB_USAGE,
+                     smc_tech->tx_rmbuse))
+        goto errattr;
     if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT,
                           smc_tech->rx_cnt,
                           SMC_NLA_STATS_PAD))
         goto errattr;
diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h
index e19177ce4092..571f9d9e7814 100644
--- a/net/smc/smc_stats.h
+++ b/net/smc/smc_stats.h
@@ -79,6 +79,8 @@ struct smc_stats_tech {
     u64 tx_bytes;
     u64 rx_cnt;
     u64 tx_cnt;
+    u64 rx_rmbuse;
+    u64 tx_rmbuse;
 };
 
 struct smc_stats {
@@ -135,38 +137,46 @@ do { \
 } \
 while (0)
 
-#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \
+#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _is_add, _len) \
 do { \
+    typeof(_smc_stats) stats = (_smc_stats); \
+    typeof(_is_add) is_a = (_is_add); \
     typeof(_len) _l = (_len); \
     typeof(_tech) t = (_tech); \
     int _pos; \
     int m = SMC_BUF_MAX - 1; \
     if (_l <= 0) \
         break; \
-    _pos = fls((_l - 1) >> 13); \
-    _pos = (_pos <= m) ? _pos : m; \
-    this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \
+    if (is_a) { \
+        _pos = fls((_l - 1) >> 13); \
+        _pos = (_pos <= m) ? _pos : m; \
+        this_cpu_inc((*stats).smc[t].k ## _rmbsize.buf[_pos]); \
+        this_cpu_add((*stats).smc[t].k ## _rmbuse, _l); \
+    } else { \
+        this_cpu_sub((*stats).smc[t].k ## _rmbuse, _l); \
+    } \
} \
while (0)
 
 #define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \
     this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt)
 
-#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \
+#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _is_add, _len) \
 do { \
     struct net *_net = sock_net(&(_smc)->sk); \
     struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \
+    typeof(_is_add) is_add = (_is_add); \
     typeof(_is_smcd) is_d = (_is_smcd); \
     typeof(_is_rx) is_r = (_is_rx); \
     typeof(_len) l = (_len); \
     if ((is_d) && (is_r)) \
-        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \
+        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, is_add, l); \
     if ((is_d) && !(is_r)) \
-        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \
+        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, is_add, l); \
     if (!(is_d) && (is_r)) \
-        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \
+        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, is_add, l); \
     if (!(is_d) && !(is_r)) \
-        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \
+        SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, is_add, l); \
 } \
 while (0)
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index 13f2bc092db1..2fab6456f765 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -90,6 +90,15 @@ static struct ctl_table smc_table[] = {
         .extra1 = &conns_per_lgr_min,
         .extra2 = &conns_per_lgr_max,
     },
+    {
+        .procname = "limit_smc_hs",
+        .data = &init_net.smc.limit_smc_hs,
+        .maxlen = sizeof(int),
+        .mode = 0644,
+        .proc_handler = proc_dointvec_minmax,
+        .extra1 = SYSCTL_ZERO,
+        .extra2 = SYSCTL_ONE,
+    },
 };
 
 int __net_init smc_sysctl_net_init(struct net *net)
@@ -121,6 +130,8 @@ int __net_init smc_sysctl_net_init(struct net *net)
     WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init);
     net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER;
     net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER;
+    /* disable handshake limitation by default */
+    net->smc.limit_smc_hs = 0;
 
     return 0;
diff --git a/net/socket.c b/net/socket.c
index fcbdd5bc47ac..8d8b84fa404a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -946,11 +946,17 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 
     memset(&tss, 0, sizeof(tss));
     tsflags = READ_ONCE(sk->sk_tsflags);
-    if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+    if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+         (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+          skb_is_err_queue(skb) ||
+          !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
         ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
         empty = 0;
     if (shhwtstamps &&
-        (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+        (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+         (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+          skb_is_err_queue(skb) ||
+          !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
         !skb_is_swtx_tstamp(skb, false_tstamp)) {
         if_index = 0;
         if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
@@ -2362,7 +2368,7 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
 int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                        int optname, sockptr_t optval, sockptr_t optlen)
 {
-    int max_optlen __maybe_unused;
+    int max_optlen __maybe_unused = 0;
     const struct proto_ops *ops;
     int err;
 
@@ -2371,7 +2377,7 @@ int do_sock_getsockopt(struct socket *sock, bool compat, int level,
         return err;
 
     if (!compat)
-        max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+        copy_from_sockptr(&max_optlen, optlen, sizeof(int));
 
     ops = READ_ONCE(sock->ops);
     if (level == SOL_SOCKET) {
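Editorial note: the __sock_recv_timestamp() condition above grows dense; the following hypothetical helper restates the software-timestamp branch in isolation. The key point is that clearing SOF_TIMESTAMPING_OPT_RX_FILTER preserves the historical "report whatever timestamp is present" behaviour, while setting it suppresses RX reporting unless RX stamping was explicitly requested (error-queue skbs, i.e. TX completions, always pass):

static bool report_sw_rx_tstamp(u32 tsflags, bool on_error_queue)
{
    if (!(tsflags & SOF_TIMESTAMPING_SOFTWARE))
        return false;

    return (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
           on_error_queue ||
           !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER);
}

The hardware branch is identical with SOF_TIMESTAMPING_RAW_HARDWARE and SOF_TIMESTAMPING_RX_HARDWARE substituted.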
diff --git a/net/socket.c b/net/socket.c
index fcbdd5bc47ac..8d8b84fa404a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -946,11 +946,17 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	memset(&tss, 0, sizeof(tss));
 	tsflags = READ_ONCE(sk->sk_tsflags);
-	if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+	if ((tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+	     (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+	      skb_is_err_queue(skb) ||
+	      !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
 	    ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
 		empty = 0;
 	if (shhwtstamps &&
-	    (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+	    (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+	     (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+	      skb_is_err_queue(skb) ||
+	      !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) &&
 	    !skb_is_swtx_tstamp(skb, false_tstamp)) {
 		if_index = 0;
 		if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
@@ -2362,7 +2368,7 @@ INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level,
 int do_sock_getsockopt(struct socket *sock, bool compat, int level,
 		       int optname, sockptr_t optval, sockptr_t optlen)
 {
-	int max_optlen __maybe_unused;
+	int max_optlen __maybe_unused = 0;
 	const struct proto_ops *ops;
 	int err;
 
@@ -2371,7 +2377,7 @@ int do_sock_getsockopt(struct socket *sock, bool compat, int level,
 		return err;
 
 	if (!compat)
-		max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+		copy_from_sockptr(&max_optlen, optlen, sizeof(int));
 
 	ops = READ_ONCE(sock->ops);
 	if (level == SOL_SOCKET) {
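The __sock_recv_timestamp() change gates RX timestamp reporting behind an explicit opt-in once the filter flag is set. A hedged userspace sketch of the intended usage, assuming this tree's linux/net_tstamp.h exports SOF_TIMESTAMPING_OPT_RX_FILTER:

	#include <linux/net_tstamp.h>
	#include <sys/socket.h>

	/* Sketch: request software TX timestamps only. With the new
	 * OPT_RX_FILTER bit set and no RX report flag, plain recvmsg()
	 * data stops carrying stale SCM_TIMESTAMPING records; error-queue
	 * (TX completion) timestamps are still delivered.
	 */
	static int enable_tx_only_tstamps(int fd)
	{
		unsigned int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
				     SOF_TIMESTAMPING_SOFTWARE |
				     SOF_TIMESTAMPING_OPT_RX_FILTER;

		return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
				  &flags, sizeof(flags));
	}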
diff --git a/net/sunrpc/xprtrdma/ib_client.c b/net/sunrpc/xprtrdma/ib_client.c
index a938c19c3490..8507cd4d8921 100644
--- a/net/sunrpc/xprtrdma/ib_client.c
+++ b/net/sunrpc/xprtrdma/ib_client.c
@@ -62,10 +62,11 @@ int rpcrdma_rn_register(struct ib_device *device,
 	if (!rd || test_bit(RPCRDMA_RD_F_REMOVING, &rd->rd_flags))
 		return -ENETUNREACH;
 
-	kref_get(&rd->rd_kref);
 	if (xa_alloc(&rd->rd_xa, &rn->rn_index, rn, xa_limit_32b, GFP_KERNEL) < 0)
 		return -ENOMEM;
+	kref_get(&rd->rd_kref);
 	rn->rn_done = done;
+	trace_rpcrdma_client_register(device, rn);
 	return 0;
 }
@@ -91,6 +92,7 @@ void rpcrdma_rn_unregister(struct ib_device *device,
 	if (!rd)
 		return;
 
+	trace_rpcrdma_client_unregister(device, rn);
 	xa_erase(&rd->rd_xa, rn->rn_index);
 	kref_put(&rd->rd_kref, rpcrdma_rn_release);
 }
@@ -111,7 +113,7 @@ static int rpcrdma_add_one(struct ib_device *device)
 		return -ENOMEM;
 
 	kref_init(&rd->rd_kref);
-	xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC1);
+	xa_init_flags(&rd->rd_xa, XA_FLAGS_ALLOC);
 	rd->rd_device = device;
 	init_completion(&rd->rd_done);
 	ib_set_client_data(device, &rpcrdma_ib_client, rd);
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 593846d25214..114fef65f92e 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -320,8 +320,8 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
 {
 	struct tipc_msg *hdr, *_hdr;
 	struct sk_buff_head tmpq;
+	u16 cong_link_cnt = 0;
 	struct sk_buff *_skb;
-	u16 cong_link_cnt;
 	int rc = 0;
 
 	/* Is a cluster supporting with new capabilities ? */
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 5a526ebafeb4..ae1ddbf71853 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -163,8 +163,12 @@ static int bearer_name_validate(const char *name,
 
 	/* return bearer name components, if necessary */
 	if (name_parts) {
-		strcpy(name_parts->media_name, media_name);
-		strcpy(name_parts->if_name, if_name);
+		if (strscpy(name_parts->media_name, media_name,
+			    TIPC_MAX_MEDIA_NAME) < 0)
+			return 0;
+		if (strscpy(name_parts->if_name, if_name,
+			    TIPC_MAX_IF_NAME) < 0)
+			return 0;
 	}
 	return 1;
 }
@@ -322,7 +326,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	if (!b)
 		return -ENOMEM;
 
-	strcpy(b->name, name);
+	strscpy(b->name, name);
 	b->media = m;
 	res = m->enable_media(net, b, attr);
 	if (res) {
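For reference, strscpy() bounds the copy, always NUL-terminates, and returns -E2BIG on truncation, which is what bearer_name_validate() now relies on; a short illustrative fragment (buffer and source names invented):

	/* Illustrative only: reject over-long names instead of overflowing. */
	char media_name[TIPC_MAX_MEDIA_NAME];

	if (strscpy(media_name, user_supplied, sizeof(media_name)) < 0)
		return 0;	/* truncated: treat the name as invalid */

The two-argument form strscpy(b->name, name) used above infers the bound from the destination array's size.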
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 77a3d016cade..e2f19627e43d 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -149,7 +149,7 @@ static int dom_size(int peers)
 
 	while ((i * i) < peers)
 		i++;
-	return i < MAX_MON_DOMAIN ? i : MAX_MON_DOMAIN;
+	return min(i, MAX_MON_DOMAIN);
 }
 
 static void map_set(u64 *up_map, int i, unsigned int v)
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 1a0cd06f0eae..65dcbb54f55d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1009,12 +1009,11 @@ static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m,
 	struct tipc_member *mbr = NULL;
 	struct net *net = sock_net(sk);
 	u32 node, port, exclude;
-	struct list_head dsts;
+	LIST_HEAD(dsts);
 	int lookups = 0;
 	int dstcnt, rc;
 	bool cong;
 
-	INIT_LIST_HEAD(&dsts);
 	ua->sa.type = msg_nametype(hdr);
 	ua->scope = msg_lookup_scope(hdr);
 
@@ -1161,10 +1160,9 @@ static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m,
 	struct tipc_group *grp = tsk->group;
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct net *net = sock_net(sk);
-	struct list_head dsts;
 	u32 dstcnt, exclude;
+	LIST_HEAD(dsts);
 
-	INIT_LIST_HEAD(&dsts);
 	ua->sa.type = msg_nametype(hdr);
 	ua->scope = msg_lookup_scope(hdr);
 	exclude = tipc_group_exclude(grp);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 305a412785f5..bbf26cc4f6ee 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1201,7 +1201,7 @@ trim_sgl:
 
 	if (!num_async) {
 		goto send_end;
-	} else if (num_zc) {
+	} else if (num_zc || eor) {
 		int err;
 
 		/* Wait for pending encryptions to get completed */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0be0dcb07f7b..001ccc55ef0f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -693,10 +693,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
 	unix_state_unlock(sk);
 
 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-	if (u->oob_skb) {
-		kfree_skb(u->oob_skb);
-		u->oob_skb = NULL;
-	}
+	u->oob_skb = NULL;
 #endif
 
 	wake_up_interruptible_all(&u->peer_wait);
@@ -2226,13 +2223,9 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
 	}
 
 	maybe_add_creds(skb, sock, other);
-	skb_get(skb);
-
 	scm_stat_add(other, skb);
 
 	spin_lock(&other->sk_receive_queue.lock);
-	if (ousk->oob_skb)
-		consume_skb(ousk->oob_skb);
 	WRITE_ONCE(ousk->oob_skb, skb);
 	__skb_queue_tail(&other->sk_receive_queue, skb);
 	spin_unlock(&other->sk_receive_queue.lock);
@@ -2640,8 +2633,6 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 
 	if (!(state->flags & MSG_PEEK))
 		WRITE_ONCE(u->oob_skb, NULL);
-	else
-		skb_get(oob_skb);
 
 	spin_unlock(&sk->sk_receive_queue.lock);
 	unix_state_unlock(sk);
@@ -2651,8 +2642,6 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 	if (!(state->flags & MSG_PEEK))
 		UNIXCB(oob_skb).consumed += 1;
 
-	consume_skb(oob_skb);
-
 	mutex_unlock(&u->iolock);
 
 	if (chunk < 0)
@@ -2665,56 +2654,52 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state)
 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
 				  int flags, int copied)
 {
+	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
 	struct unix_sock *u = unix_sk(sk);
 
-	if (!unix_skb_len(skb)) {
-		struct sk_buff *unlinked_skb = NULL;
+	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
+		return skb;
 
-		spin_lock(&sk->sk_receive_queue.lock);
+	spin_lock(&sk->sk_receive_queue.lock);
 
+	if (!unix_skb_len(skb)) {
 		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
 			skb = NULL;
 		} else if (flags & MSG_PEEK) {
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
 		} else {
-			unlinked_skb = skb;
+			read_skb = skb;
 			skb = skb_peek_next(skb, &sk->sk_receive_queue);
-			__skb_unlink(unlinked_skb, &sk->sk_receive_queue);
+			__skb_unlink(read_skb, &sk->sk_receive_queue);
 		}
 
-		spin_unlock(&sk->sk_receive_queue.lock);
+		if (!skb)
+			goto unlock;
+	}
 
-		consume_skb(unlinked_skb);
-	} else {
-		struct sk_buff *unlinked_skb = NULL;
+	if (skb != u->oob_skb)
+		goto unlock;
 
-		spin_lock(&sk->sk_receive_queue.lock);
+	if (copied) {
+		skb = NULL;
+	} else if (!(flags & MSG_PEEK)) {
+		WRITE_ONCE(u->oob_skb, NULL);
 
-		if (skb == u->oob_skb) {
-			if (copied) {
-				skb = NULL;
-			} else if (!(flags & MSG_PEEK)) {
-				if (sock_flag(sk, SOCK_URGINLINE)) {
-					WRITE_ONCE(u->oob_skb, NULL);
-					consume_skb(skb);
-				} else {
-					__skb_unlink(skb, &sk->sk_receive_queue);
-					WRITE_ONCE(u->oob_skb, NULL);
-					unlinked_skb = skb;
-					skb = skb_peek(&sk->sk_receive_queue);
-				}
-			} else if (!sock_flag(sk, SOCK_URGINLINE)) {
-				skb = skb_peek_next(skb, &sk->sk_receive_queue);
-			}
+		if (!sock_flag(sk, SOCK_URGINLINE)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			unread_skb = skb;
+			skb = skb_peek(&sk->sk_receive_queue);
 		}
+	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
+		skb = skb_peek_next(skb, &sk->sk_receive_queue);
+	}
 
-		spin_unlock(&sk->sk_receive_queue.lock);
+unlock:
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	consume_skb(read_skb);
+	kfree_skb(unread_skb);
 
-		if (unlinked_skb) {
-			WARN_ON_ONCE(skb_unref(unlinked_skb));
-			kfree_skb(unlinked_skb);
-		}
-	}
 	return skb;
 }
 #endif
@@ -2756,7 +2741,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 	unix_state_unlock(sk);
 
 	if (drop) {
-		WARN_ON_ONCE(skb_unref(skb));
 		kfree_skb(skb);
 		return -EAGAIN;
 	}
@@ -3192,9 +3176,13 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		skb = skb_peek(&sk->sk_receive_queue);
 		if (skb) {
 			struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
+			struct sk_buff *next_skb;
+
+			next_skb = skb_peek_next(skb, &sk->sk_receive_queue);
 
 			if (skb == oob_skb ||
-			    (!oob_skb && !unix_skb_len(skb)))
+			    (!unix_skb_len(skb) &&
+			     (!oob_skb || next_skb == oob_skb)))
 				answ = 1;
 		}
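The af_unix hunks above re-plumb out-of-band skb ownership (dropping the extra skb_get()/consume_skb() pairs). For orientation, the userspace-visible behavior they preserve looks like this; a minimal sketch, not taken from the patch:

	/* Minimal AF_UNIX MSG_OOB round trip; error handling elided. */
	#include <sys/socket.h>
	#include <sys/ioctl.h>
	#include <stdio.h>

	int main(void)
	{
		int sv[2], atmark = 0;
		char c;

		socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
		send(sv[0], "a", 1, 0);
		send(sv[0], "b", 1, MSG_OOB);	/* becomes the peer's oob_skb */

		recv(sv[1], &c, 1, 0);			/* reads 'a' */
		ioctl(sv[1], SIOCATMARK, &atmark);	/* now at the OOB mark */
		recv(sv[1], &c, 1, MSG_OOB);		/* reads 'b' */
		printf("oob=%c atmark=%d\n", c, atmark);
		return 0;
	}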
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 06d94ad999e9..0068e758be4d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -337,18 +337,6 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
 	return true;
 }
 
-static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
-{
-	skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
-
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-	if (u->oob_skb) {
-		WARN_ON_ONCE(skb_unref(u->oob_skb));
-		u->oob_skb = NULL;
-	}
-#endif
-}
-
 static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
 {
 	struct unix_vertex *vertex;
@@ -371,11 +359,11 @@ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist
 			struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
 
 			spin_lock(&embryo_queue->lock);
-			unix_collect_queue(unix_sk(skb->sk), hitlist);
+			skb_queue_splice_init(embryo_queue, hitlist);
 			spin_unlock(&embryo_queue->lock);
 		}
 	} else {
-		unix_collect_queue(u, hitlist);
+		skb_queue_splice_init(queue, hitlist);
 	}
 
 	spin_unlock(&queue->lock);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 0ff9b2dd86ba..35681adedd9a 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -112,6 +112,7 @@
 #include <net/sock.h>
 #include <net/af_vsock.h>
 #include <uapi/linux/vm_sockets.h>
+#include <uapi/asm-generic/ioctls.h>
 
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
 static void vsock_sk_destruct(struct sock *sk);
@@ -1295,6 +1296,57 @@ int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg);
 
+static int vsock_do_ioctl(struct socket *sock, unsigned int cmd,
+			  int __user *arg)
+{
+	struct sock *sk = sock->sk;
+	struct vsock_sock *vsk;
+	int ret;
+
+	vsk = vsock_sk(sk);
+
+	switch (cmd) {
+	case SIOCOUTQ: {
+		ssize_t n_bytes;
+
+		if (!vsk->transport || !vsk->transport->unsent_bytes) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) {
+			ret = -EINVAL;
+			break;
+		}
+
+		n_bytes = vsk->transport->unsent_bytes(vsk);
+		if (n_bytes < 0) {
+			ret = n_bytes;
+			break;
+		}
+
+		ret = put_user(n_bytes, arg);
+		break;
+	}
+	default:
+		ret = -ENOIOCTLCMD;
+	}
+
+	return ret;
+}
+
+static int vsock_ioctl(struct socket *sock, unsigned int cmd,
+		       unsigned long arg)
+{
+	int ret;
+
+	lock_sock(sock->sk);
+	ret = vsock_do_ioctl(sock, cmd, (int __user *)arg);
+	release_sock(sock->sk);
+
+	return ret;
+}
+
 static const struct proto_ops vsock_dgram_ops = {
 	.family = PF_VSOCK,
 	.owner = THIS_MODULE,
@@ -1305,7 +1357,7 @@ static const struct proto_ops vsock_dgram_ops = {
 	.accept = sock_no_accept,
 	.getname = vsock_getname,
 	.poll = vsock_poll,
-	.ioctl = sock_no_ioctl,
+	.ioctl = vsock_ioctl,
 	.listen = sock_no_listen,
 	.shutdown = vsock_shutdown,
 	.sendmsg = vsock_dgram_sendmsg,
@@ -2294,7 +2346,7 @@ static const struct proto_ops vsock_stream_ops = {
 	.accept = vsock_accept,
 	.getname = vsock_getname,
 	.poll = vsock_poll,
-	.ioctl = sock_no_ioctl,
+	.ioctl = vsock_ioctl,
 	.listen = vsock_listen,
 	.shutdown = vsock_shutdown,
 	.setsockopt = vsock_connectible_setsockopt,
@@ -2316,7 +2368,7 @@ static const struct proto_ops vsock_seqpacket_ops = {
 	.accept = vsock_accept,
 	.getname = vsock_getname,
 	.poll = vsock_poll,
-	.ioctl = sock_no_ioctl,
+	.ioctl = vsock_ioctl,
 	.listen = vsock_listen,
 	.shutdown = vsock_shutdown,
 	.setsockopt = vsock_connectible_setsockopt,
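With .ioctl now wired to vsock_ioctl(), userspace can query unsent bytes on a vsock socket with the standard SIOCOUTQ request; a small sketch, not from the patch:

	#include <sys/ioctl.h>
	#include <linux/sockios.h>
	#include <stdio.h>

	/* Query how many bytes are queued but not yet sent on vsock_fd. */
	static void report_unsent(int vsock_fd)
	{
		int unsent = 0;

		if (ioctl(vsock_fd, SIOCOUTQ, &unsent) == 0)
			printf("bytes not yet sent: %d\n", unsent);
		else
			perror("SIOCOUTQ");	/* EOPNOTSUPP on transports
						 * without unsent_bytes */
	}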
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 64a07acfef12..e0160da4ef43 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -311,7 +311,7 @@ static void virtio_transport_tx_work(struct work_struct *work)
 
 		virtqueue_disable_cb(vq);
 		while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
-			consume_skb(skb);
+			virtio_transport_consume_skb_sent(skb, true);
 			added = true;
 		}
 	} while (!virtqueue_enable_cb(vq));
@@ -540,6 +540,8 @@ static struct virtio_transport virtio_transport = {
 		.notify_buffer_size = virtio_transport_notify_buffer_size,
 		.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,
 
+		.unsent_bytes = virtio_transport_unsent_bytes,
+
 		.read_skb = virtio_transport_read_skb,
 	},
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 16ff976a86e3..884ee128851e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -463,6 +463,26 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *
 }
 EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
 
+void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume)
+{
+	struct sock *s = skb->sk;
+
+	if (s && skb->len) {
+		struct vsock_sock *vs = vsock_sk(s);
+		struct virtio_vsock_sock *vvs;
+
+		vvs = vs->trans;
+
+		spin_lock_bh(&vvs->tx_lock);
+		vvs->bytes_unsent -= skb->len;
+		spin_unlock_bh(&vvs->tx_lock);
+	}
+
+	if (consume)
+		consume_skb(skb);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent);
+
 u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
 {
 	u32 ret;
@@ -475,6 +495,7 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit)
 	if (ret > credit)
 		ret = credit;
 	vvs->tx_cnt += ret;
+	vvs->bytes_unsent += ret;
 	spin_unlock_bh(&vvs->tx_lock);
 
 	return ret;
@@ -488,6 +509,7 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit)
 
 	spin_lock_bh(&vvs->tx_lock);
 	vvs->tx_cnt -= credit;
+	vvs->bytes_unsent -= credit;
 	spin_unlock_bh(&vvs->tx_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_transport_put_credit);
@@ -1090,6 +1112,19 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_destruct);
 
+ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk)
+{
+	struct virtio_vsock_sock *vvs = vsk->trans;
+	size_t ret;
+
+	spin_lock_bh(&vvs->tx_lock);
+	ret = vvs->bytes_unsent;
+	spin_unlock_bh(&vvs->tx_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(virtio_transport_unsent_bytes);
+
 static int virtio_transport_reset(struct vsock_sock *vsk,
 				  struct sk_buff *skb)
 {
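A condensed model of the accounting introduced above (not kernel code; locking and the skb free are elided): every byte that obtains TX credit counts as unsent until its buffer completes or the credit is returned.

	/* Toy model of the bytes_unsent invariant. */
	struct tx_acct { long bytes_unsent; };

	static void got_credit(struct tx_acct *a, long bytes)
	{
		a->bytes_unsent += bytes;	/* virtio_transport_get_credit() */
	}

	static void tx_done(struct tx_acct *a, long len)
	{
		a->bytes_unsent -= len;	/* virtio_transport_consume_skb_sent() */
	}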
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 6dea6119f5b2..6e78927a598e 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -98,6 +98,8 @@ static struct virtio_transport loopback_transport = {
 		.notify_buffer_size = virtio_transport_notify_buffer_size,
 		.notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat,
 
+		.unsent_bytes = virtio_transport_unsent_bytes,
+
 		.read_skb = virtio_transport_read_skb,
 	},
@@ -123,6 +125,10 @@ static void vsock_loopback_work(struct work_struct *work)
 	spin_unlock_bh(&vsock->pkt_queue.lock);
 
 	while ((skb = __skb_dequeue(&pkts))) {
+		/* Decrement the bytes_unsent counter without deallocating skb
+		 * It is freed by the receiver.
+		 */
+		virtio_transport_consume_skb_sent(skb, false);
 		virtio_transport_deliver_tap_pkt(skb);
 		virtio_transport_recv_pkt(&loopback_transport, skb);
 	}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 4d5d351bd0b5..661adfc77644 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -165,11 +165,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		if (!wdev->netdev)
 			continue;
-		wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+		wdev->netdev->netns_local = false;
 		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
 		if (err)
 			break;
-		wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+		wdev->netdev->netns_local = true;
 	}
 
 	if (err) {
@@ -181,11 +181,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 				     list) {
 			if (!wdev->netdev)
 				continue;
-			wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+			wdev->netdev->netns_local = false;
 			err = dev_change_net_namespace(wdev->netdev, net,
 						       "wlan%d");
 			WARN_ON(err);
-			wdev->netdev->features |= NETIF_F_NETNS_LOCAL;
+			wdev->netdev->netns_local = true;
 		}
 
 		return err;
@@ -1473,7 +1473,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		SET_NETDEV_DEVTYPE(dev, &wiphy_type);
 		wdev->netdev = dev;
 		/* can only change netns with wiphy */
-		dev->features |= NETIF_F_NETNS_LOCAL;
+		dev->netns_local = true;
 
 		cfg80211_init_wdev(wdev);
 		break;
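Namespace pinning moves from a netdev feature flag to a dedicated boolean. A hedged before/after sketch for a driver's setup callback (driver and function name invented):

	/* Sketch only: converting from the feature flag to the new field. */
	static void mydev_setup(struct net_device *dev)
	{
		/* before: dev->features |= NETIF_F_NETNS_LOCAL; */
		dev->netns_local = true;	/* device may not change netns */
	}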
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 41c8c0e3ba2e..3b3e3cd7027a 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -170,6 +170,12 @@ static inline int for_each_rdev_check_rtnl(void)
 	if (for_each_rdev_check_rtnl()) {} else \
 		list_for_each_entry(rdev, &cfg80211_rdev_list, list)
 
+enum bss_source_type {
+	BSS_SOURCE_DIRECT = 0,
+	BSS_SOURCE_MBSSID,
+	BSS_SOURCE_STA_PROFILE,
+};
+
 struct cfg80211_internal_bss {
 	struct list_head list;
 	struct list_head hidden_list;
@@ -191,6 +197,8 @@ struct cfg80211_internal_bss {
 	 */
 	u8 parent_bssid[ETH_ALEN] __aligned(2);
 
+	enum bss_source_type bss_source;
+
 	/* must be last because of priv member */
 	struct cfg80211_bss pub;
 };
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index 34e5acff3935..1e3ed29f7cfc 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -94,7 +94,7 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 
 	lockdep_assert_held(&rdev->wiphy.mtx);
 
-	if (wdev->cac_started)
+	if (wdev->links[0].cac_started)
 		return -EBUSY;
 
 	if (wdev->u.ibss.ssid_len)
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index d66a913027e0..64c447040786 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -34,7 +34,7 @@ MODULE_LICENSE("GPL");
 
 struct lib80211_crypto_alg {
 	struct list_head list;
-	struct lib80211_crypto_ops *ops;
+	const struct lib80211_crypto_ops *ops;
 };
 
 static LIST_HEAD(lib80211_crypto_algs);
@@ -161,7 +161,7 @@ void lib80211_crypt_delayed_deinit(struct lib80211_crypt_info *info,
 }
 EXPORT_SYMBOL(lib80211_crypt_delayed_deinit);
 
-int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops)
+int lib80211_register_crypto_ops(const struct lib80211_crypto_ops *ops)
 {
 	unsigned long flags;
 	struct lib80211_crypto_alg *alg;
@@ -183,7 +183,7 @@ int lib80211_register_crypto_ops(struct lib80211_crypto_ops *ops)
 }
 EXPORT_SYMBOL(lib80211_register_crypto_ops);
 
-int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops)
+int lib80211_unregister_crypto_ops(const struct lib80211_crypto_ops *ops)
 {
 	struct lib80211_crypto_alg *alg;
 	unsigned long flags;
@@ -206,7 +206,7 @@ int lib80211_unregister_crypto_ops(struct lib80211_crypto_ops *ops)
 }
 EXPORT_SYMBOL(lib80211_unregister_crypto_ops);
 
-struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name)
+const struct lib80211_crypto_ops *lib80211_get_crypto_ops(const char *name)
 {
 	struct lib80211_crypto_alg *alg;
 	unsigned long flags;
@@ -234,7 +234,7 @@ static void lib80211_crypt_null_deinit(void *priv)
 {
 }
 
-static struct lib80211_crypto_ops lib80211_crypt_null = {
+static const struct lib80211_crypto_ops lib80211_crypt_null = {
 	.name = "NULL",
 	.init = lib80211_crypt_null_init,
 	.deinit = lib80211_crypt_null_deinit,
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index cca5e1cf089e..5aad139130e1 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -418,7 +418,7 @@ static void lib80211_ccmp_print_stats(struct seq_file *m, void *priv)
 		   ccmp->dot11RSNAStatsCCMPDecryptErrors);
 }
 
-static struct lib80211_crypto_ops lib80211_crypt_ccmp = {
+static const struct lib80211_crypto_ops lib80211_crypt_ccmp = {
 	.name = "CCMP",
 	.init = lib80211_ccmp_init,
 	.deinit = lib80211_ccmp_deinit,
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 5c8cdf7681e3..63e68e5e121e 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -705,7 +705,7 @@ static void lib80211_tkip_print_stats(struct seq_file *m, void *priv)
 		   tkip->dot11RSNAStatsTKIPLocalMICFailures);
 }
 
-static struct lib80211_crypto_ops lib80211_crypt_tkip = {
+static const struct lib80211_crypto_ops lib80211_crypt_tkip = {
 	.name = "TKIP",
 	.init = lib80211_tkip_init,
 	.deinit = lib80211_tkip_deinit,
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 6ab9957b8f96..3b148c7bef85 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -226,7 +226,7 @@ static void lib80211_wep_print_stats(struct seq_file *m, void *priv)
 	seq_printf(m, "key[%d] alg=WEP len=%d\n", wep->key_idx, wep->key_len);
 }
 
-static struct lib80211_crypto_ops lib80211_crypt_wep = {
+static const struct lib80211_crypto_ops lib80211_crypt_wep = {
 	.name = "WEP",
 	.init = lib80211_wep_init,
 	.deinit = lib80211_wep_deinit,
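With the ops tables constified end to end, a cipher provider can keep its table in read-only memory; an illustrative registration sketch (the EXAMPLE ops and callbacks are hypothetical, modeled on the in-tree ciphers):

	/* Sketch of a provider after the constification. */
	static const struct lib80211_crypto_ops example_crypt_ops = {
		.name = "EXAMPLE",
		.init = example_init,		/* hypothetical callbacks */
		.deinit = example_deinit,
	};

	static int __init example_crypto_init(void)
	{
		return lib80211_register_crypto_ops(&example_crypt_ops);
	}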
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index aaca65b66af4..2c6654075ca9 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -127,7 +127,7 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 	if (!rdev->ops->join_mesh)
 		return -EOPNOTSUPP;
 
-	if (wdev->cac_started)
+	if (wdev->links[0].cac_started)
 		return -EBUSY;
 
 	if (!setup->chandef.chan) {
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 4052041a19ea..4dac81854721 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -1110,26 +1110,28 @@ EXPORT_SYMBOL(__cfg80211_radar_event);
 
 void cfg80211_cac_event(struct net_device *netdev,
 			const struct cfg80211_chan_def *chandef,
-			enum nl80211_radar_event event, gfp_t gfp)
+			enum nl80211_radar_event event, gfp_t gfp,
+			unsigned int link_id)
 {
 	struct wireless_dev *wdev = netdev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
 	unsigned long timeout;
 
-	/* not yet supported */
-	if (wdev->valid_links)
+	if (WARN_ON(wdev->valid_links &&
+		    !(wdev->valid_links & BIT(link_id))))
 		return;
 
-	trace_cfg80211_cac_event(netdev, event);
+	trace_cfg80211_cac_event(netdev, event, link_id);
 
-	if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED))
+	if (WARN_ON(!wdev->links[link_id].cac_started &&
+		    event != NL80211_RADAR_CAC_STARTED))
 		return;
 
 	switch (event) {
 	case NL80211_RADAR_CAC_FINISHED:
-		timeout = wdev->cac_start_time +
-			  msecs_to_jiffies(wdev->cac_time_ms);
+		timeout = wdev->links[link_id].cac_start_time +
+			  msecs_to_jiffies(wdev->links[link_id].cac_time_ms);
 		WARN_ON(!time_after_eq(jiffies, timeout));
 		cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE);
 		memcpy(&rdev->cac_done_chandef, chandef,
@@ -1138,10 +1140,10 @@ void cfg80211_cac_event(struct net_device *netdev,
 		cfg80211_sched_dfs_chan_update(rdev);
 		fallthrough;
 	case NL80211_RADAR_CAC_ABORTED:
-		wdev->cac_started = false;
+		wdev->links[link_id].cac_started = false;
 		break;
 	case NL80211_RADAR_CAC_STARTED:
-		wdev->cac_started = true;
+		wdev->links[link_id].cac_started = true;
 		break;
 	default:
 		WARN_ON(1);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 7397a372c78e..9ab777e0bd4d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6066,7 +6066,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	if (!rdev->ops->start_ap)
 		return -EOPNOTSUPP;
 
-	if (wdev->cac_started)
+	if (wdev->links[link_id].cac_started)
 		return -EBUSY;
 
 	if (wdev->links[link_id].ap.beacon_interval)
@@ -9778,7 +9778,8 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev,
 		return ERR_PTR(-ENOMEM);
 
 	if (n_ssids)
-		request->ssids = (void *)&request->channels[n_channels];
+		request->ssids = (void *)request +
+			struct_size(request, channels, n_channels);
 	request->n_ssids = n_ssids;
 	if (ie_len) {
 		if (n_ssids)
@@ -10072,6 +10073,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int link_id = nl80211_link_id(info->attrs);
 	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_chan_def chandef;
 	enum nl80211_dfs_regions dfs_region;
@@ -10121,7 +10123,20 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 		goto unlock;
 	}
 
-	if (cfg80211_beaconing_iface_active(wdev) || wdev->cac_started) {
+	if (cfg80211_beaconing_iface_active(wdev)) {
+		/* During MLO other link(s) can beacon, only the current link
+		 * can not already beacon
+		 */
+		if (wdev->valid_links &&
+		    !wdev->links[link_id].ap.beacon_interval) {
+			/* nothing */
+		} else {
+			err = -EBUSY;
+			goto unlock;
+		}
+	}
+
+	if (wdev->links[link_id].cac_started) {
 		err = -EBUSY;
 		goto unlock;
 	}
@@ -10141,12 +10156,26 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	if (WARN_ON(!cac_time_ms))
 		cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
 
-	err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms);
+	err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms,
+					 link_id);
 	if (!err) {
-		wdev->links[0].ap.chandef = chandef;
-		wdev->cac_started = true;
-		wdev->cac_start_time = jiffies;
-		wdev->cac_time_ms = cac_time_ms;
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+			wdev->links[0].ap.chandef = chandef;
+			break;
+		case NL80211_IFTYPE_ADHOC:
+			wdev->u.ibss.chandef = chandef;
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			wdev->u.mesh.chandef = chandef;
+			break;
+		default:
+			break;
+		}
+		wdev->links[link_id].cac_started = true;
+		wdev->links[link_id].cac_start_time = jiffies;
+		wdev->links[link_id].cac_time_ms = cac_time_ms;
 	}
 unlock:
 	wiphy_unlock(wiphy);
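Radar/CAC reporting is now per-link. A sketch of the driver side (the wrapper function is invented; the cfg80211_cac_event() signature is the one added above), where a non-MLO driver simply passes link_id 0:

	/* Sketch: report CAC completion for the link it actually ran on. */
	static void example_driver_cac_done(struct net_device *netdev,
					    const struct cfg80211_chan_def *chandef,
					    unsigned int link_id)
	{
		cfg80211_cac_event(netdev, chandef, NL80211_RADAR_CAC_FINISHED,
				   GFP_KERNEL, link_id);
	}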
@@ -10494,17 +10523,21 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			     NL80211_BSS_CHAIN_SIGNAL))
 		goto nla_put_failure;
 
-	switch (rdev->wiphy.signal_type) {
-	case CFG80211_SIGNAL_TYPE_MBM:
-		if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM, res->signal))
-			goto nla_put_failure;
-		break;
-	case CFG80211_SIGNAL_TYPE_UNSPEC:
-		if (nla_put_u8(msg, NL80211_BSS_SIGNAL_UNSPEC, res->signal))
-			goto nla_put_failure;
-		break;
-	default:
-		break;
+	if (intbss->bss_source != BSS_SOURCE_STA_PROFILE) {
+		switch (rdev->wiphy.signal_type) {
+		case CFG80211_SIGNAL_TYPE_MBM:
+			if (nla_put_u32(msg, NL80211_BSS_SIGNAL_MBM,
+					res->signal))
+				goto nla_put_failure;
+			break;
+		case CFG80211_SIGNAL_TYPE_UNSPEC:
+			if (nla_put_u8(msg, NL80211_BSS_SIGNAL_UNSPEC,
+				       res->signal))
+				goto nla_put_failure;
+			break;
+		default:
+			break;
+		}
 	}
 
 	switch (wdev->iftype) {
@@ -16498,10 +16531,10 @@ nl80211_set_ttlm(struct sk_buff *skb, struct genl_info *info)
 	SELECTOR(__sel, NETDEV_UP_NOTMX, \
 		 NL80211_FLAG_NEED_NETDEV_UP | \
 		 NL80211_FLAG_NO_WIPHY_MTX) \
-	SELECTOR(__sel, NETDEV_UP_NOTMX_NOMLO, \
+	SELECTOR(__sel, NETDEV_UP_NOTMX_MLO, \
 		 NL80211_FLAG_NEED_NETDEV_UP | \
 		 NL80211_FLAG_NO_WIPHY_MTX | \
-		 NL80211_FLAG_MLO_UNSUPPORTED) \
+		 NL80211_FLAG_MLO_VALID_LINK_ID) \
 	SELECTOR(__sel, NETDEV_UP_CLEAR, \
 		 NL80211_FLAG_NEED_NETDEV_UP | \
 		 NL80211_FLAG_CLEAR_SKB) \
@@ -17396,7 +17429,7 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.flags = GENL_UNS_ADMIN_PERM,
 		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
 					 NL80211_FLAG_NO_WIPHY_MTX |
-					 NL80211_FLAG_MLO_UNSUPPORTED),
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_GET_PROTOCOL_FEATURES,
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index ec3f4aa1c807..f5adbf6b5c84 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1200,26 +1200,27 @@ static inline int
 rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
 			   struct net_device *dev,
 			   struct cfg80211_chan_def *chandef,
-			   u32 cac_time_ms)
+			   u32 cac_time_ms, int link_id)
 {
 	int ret = -EOPNOTSUPP;
 
 	trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef,
-					 cac_time_ms);
+					 cac_time_ms, link_id);
 	if (rdev->ops->start_radar_detection)
 		ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev,
-						       chandef, cac_time_ms);
+						       chandef, cac_time_ms,
+						       link_id);
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
 
 static inline void
 rdev_end_cac(struct cfg80211_registered_device *rdev,
-	     struct net_device *dev)
+	     struct net_device *dev, unsigned int link_id)
 {
-	trace_rdev_end_cac(&rdev->wiphy, dev);
+	trace_rdev_end_cac(&rdev->wiphy, dev, link_id);
 	if (rdev->ops->end_cac)
-		rdev->ops->end_cac(&rdev->wiphy, dev);
+		rdev->ops->end_cac(&rdev->wiphy, dev, link_id);
 	trace_rdev_return_void(&rdev->wiphy);
 }
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 4a27f3823e25..6489ba943a63 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -4229,6 +4229,8 @@ EXPORT_SYMBOL(regulatory_pre_cac_allowed);
 static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev)
 {
 	struct wireless_dev *wdev;
+	unsigned int link_id;
+
 	/* If we finished CAC or received radar, we should end any
 	 * CAC running on the same channels.
 	 * the check !cfg80211_chandef_dfs_usable contain 2 options:
@@ -4241,16 +4243,17 @@ static void cfg80211_check_and_end_cac(struct cfg80211_registered_device *rdev)
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		struct cfg80211_chan_def *chandef;
 
-		if (!wdev->cac_started)
-			continue;
+		for_each_valid_link(wdev, link_id) {
+			if (!wdev->links[link_id].cac_started)
+				continue;
 
-		/* FIXME: radar detection is tied to link 0 for now */
-		chandef = wdev_chandef(wdev, 0);
-		if (!chandef)
-			continue;
+			chandef = wdev_chandef(wdev, link_id);
+			if (!chandef)
+				continue;
 
-		if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef))
-			rdev_end_cac(rdev, wdev->netdev);
+			if (!cfg80211_chandef_dfs_usable(&rdev->wiphy, chandef))
+				rdev_end_cac(rdev, wdev->netdev, link_id);
+		}
 	}
 }
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 64eeed82d43d..59a90bf3c0d6 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1910,6 +1910,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
 	known->pub.bssid_index = new->pub.bssid_index;
 	known->pub.use_for &= new->pub.use_for;
 	known->pub.cannot_use_reasons = new->pub.cannot_use_reasons;
+	known->bss_source = new->bss_source;
 
 	return true;
 }
@@ -2008,10 +2009,10 @@ __cfg80211_bss_update(struct cfg80211_registered_device *rdev,
 	return found;
 
 free_ies:
-	ies = (void *)rcu_dereference(tmp->pub.beacon_ies);
+	ies = (void *)rcu_access_pointer(tmp->pub.beacon_ies);
 	if (ies)
 		kfree_rcu(ies, rcu_head);
-	ies = (void *)rcu_dereference(tmp->pub.proberesp_ies);
+	ies = (void *)rcu_access_pointer(tmp->pub.proberesp_ies);
 	if (ies)
 		kfree_rcu(ies, rcu_head);
 
@@ -2149,11 +2150,7 @@ struct cfg80211_inform_single_bss_data {
 	const u8 *ie;
 	size_t ielen;
 
-	enum {
-		BSS_SOURCE_DIRECT = 0,
-		BSS_SOURCE_MBSSID,
-		BSS_SOURCE_STA_PROFILE,
-	} bss_source;
+	enum bss_source_type bss_source;
 	/* Set if reporting bss_source != BSS_SOURCE_DIRECT */
 	struct cfg80211_bss *source_bss;
 	u8 max_bssid_indicator;
@@ -2268,6 +2265,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy,
 		       IEEE80211_MAX_CHAINS);
 	tmp.pub.use_for = data->use_for;
 	tmp.pub.cannot_use_reasons = data->cannot_use_reasons;
+	tmp.bss_source = data->bss_source;
 
 	switch (data->bss_source) {
 	case BSS_SOURCE_MBSSID:
@@ -2907,6 +2905,9 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 	struct element *reporter_rnr = NULL;
 	struct ieee80211_multi_link_elem *ml_elem;
 	struct cfg80211_mle *mle;
+	const struct element *ssid_elem;
+	const u8 *ssid = NULL;
+	size_t ssid_len = 0;
 	u16 control;
 	u8 ml_common_len;
 	u8 *new_ie = NULL;
@@ -2961,6 +2962,13 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 					      bss_change_count,
 					      gfp);
 
+	ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, tx_data->ie,
+				       tx_data->ielen);
+	if (ssid_elem) {
+		ssid = ssid_elem->data;
+		ssid_len = ssid_elem->datalen;
+	}
+
 	for (i = 0; i < ARRAY_SIZE(mle->sta_prof) && mle->sta_prof[i]; i++) {
 		const struct ieee80211_neighbor_ap_info *ap_info;
 		enum nl80211_band band;
@@ -3042,6 +3050,23 @@ cfg80211_parse_ml_elem_sta_data(struct wiphy *wiphy,
 		freq = ieee80211_channel_to_freq_khz(ap_info->channel, band);
 		data.channel = ieee80211_get_channel_khz(wiphy, freq);
 
+		/* Skip if BSS entry generated from MBSSID or DIRECT source
+		 * frame data available already.
+		 */
+		bss = cfg80211_get_bss(wiphy, data.channel, data.bssid, ssid,
+				       ssid_len, IEEE80211_BSS_TYPE_ANY,
+				       IEEE80211_PRIVACY_ANY);
+		if (bss) {
+			struct cfg80211_internal_bss *ibss = bss_from_pub(bss);
+
+			if (data.capability == bss->capability &&
+			    ibss->bss_source != BSS_SOURCE_STA_PROFILE) {
+				cfg80211_put_bss(wiphy, bss);
+				continue;
+			}
+			cfg80211_put_bss(wiphy, bss);
+		}
+
 		if (use_for == NL80211_BSS_USE_FOR_MLD_LINK &&
 		    !(wiphy->flags & WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY)) {
 			use_for = 0;
@@ -3467,8 +3492,8 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 		n_channels = ieee80211_get_num_supported_channels(wiphy);
 	}
 
-	creq = kzalloc(sizeof(*creq) + sizeof(struct cfg80211_ssid) +
-		       n_channels * sizeof(void *),
+	creq = kzalloc(struct_size(creq, channels, n_channels) +
+		       sizeof(struct cfg80211_ssid),
 		       GFP_ATOMIC);
 	if (!creq)
 		return -ENOMEM;
@@ -3476,7 +3501,7 @@ int cfg80211_wext_siwscan(struct net_device *dev,
 	creq->wiphy = wiphy;
 	creq->wdev = dev->ieee80211_ptr;
 	/* SSIDs come after channels */
-	creq->ssids = (void *)&creq->channels[n_channels];
+	creq->ssids = (void *)creq + struct_size(creq, channels, n_channels);
 	creq->n_channels = n_channels;
 	creq->n_ssids = 1;
 	creq->scan_start = jiffies;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index d9d7bf8bb5c1..431da30817a6 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -115,7 +115,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev)
 		n_channels = i;
 	}
 	request->n_channels = n_channels;
-	request->ssids = (void *)&request->channels[n_channels];
+	request->ssids = (void *)request +
+		struct_size(request, channels, n_channels);
 	request->n_ssids = 1;
 
 	memcpy(request->ssids[0].ssid, wdev->conn->params.ssid,
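All three call sites above use the same tail-allocation layout; a condensed sketch of the pattern (kernel context assumed): struct_size() computes the header plus the flexible channels[] array with overflow checking, and the SSID block is appended immediately after it.

	struct cfg80211_scan_request *req;

	req = kzalloc(struct_size(req, channels, n_channels) +
		      sizeof(struct cfg80211_ssid), GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	/* SSID storage starts exactly at the end of channels[n_channels] */
	req->ssids = (void *)req + struct_size(req, channels, n_channels);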
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 5c26f065bd68..97c21b627791 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -805,9 +805,22 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa,
 	TP_ARGS(wiphy, netdev)
 );
 
-DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac,
-	     TP_PROTO(struct wiphy *wiphy, struct net_device *netdev),
-	     TP_ARGS(wiphy, netdev)
+TRACE_EVENT(rdev_end_cac,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 unsigned int link_id),
+	TP_ARGS(wiphy, netdev, link_id),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		__field(unsigned int, link_id)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		__entry->link_id = link_id;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", link_id: %d",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->link_id)
 );
 
 DECLARE_EVENT_CLASS(station_add_change,
@@ -2652,24 +2665,26 @@ TRACE_EVENT(rdev_external_auth,
 TRACE_EVENT(rdev_start_radar_detection,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
 		 struct cfg80211_chan_def *chandef,
-		 u32 cac_time_ms),
-	TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
+		 u32 cac_time_ms, int link_id),
+	TP_ARGS(wiphy, netdev, chandef, cac_time_ms, link_id),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
 		NETDEV_ENTRY
 		CHAN_DEF_ENTRY
 		__field(u32, cac_time_ms)
+		__field(int, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
 		NETDEV_ASSIGN;
 		CHAN_DEF_ASSIGN(chandef);
 		__entry->cac_time_ms = cac_time_ms;
+		__entry->link_id = link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
-		  ", cac_time_ms=%u",
+		  ", cac_time_ms=%u, link_id=%d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
-		  __entry->cac_time_ms)
+		  __entry->cac_time_ms, __entry->link_id)
 );
 
 TRACE_EVENT(rdev_set_mcast_rate,
@@ -3483,18 +3498,21 @@ TRACE_EVENT(cfg80211_radar_event,
 );
 
 TRACE_EVENT(cfg80211_cac_event,
-	TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt),
-	TP_ARGS(netdev, evt),
+	TP_PROTO(struct net_device *netdev, enum nl80211_radar_event evt,
+		 unsigned int link_id),
+	TP_ARGS(netdev, evt, link_id),
 	TP_STRUCT__entry(
 		NETDEV_ENTRY
 		__field(enum nl80211_radar_event, evt)
+		__field(unsigned int, link_id)
 	),
 	TP_fast_assign(
 		NETDEV_ASSIGN;
 		__entry->evt = evt;
+		__entry->link_id = link_id;
 	),
-	TP_printk(NETDEV_PR_FMT ", event: %d",
-		  NETDEV_PR_ARG, __entry->evt)
+	TP_printk(NETDEV_PR_FMT ", event: %d, link_id=%u",
+		  NETDEV_PR_ARG, __entry->evt, __entry->link_id)
 );
 
 DECLARE_EVENT_CLASS(cfg80211_rx_evt,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 9a7c3adc8a3b..f49b55724f83 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -998,10 +998,10 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb,
 	 * Diffserv Service Classes no update is needed:
 	 * - Standard: DF
 	 * - Low Priority Data: CS1
-	 * - Multimedia Streaming: AF31, AF32, AF33
 	 * - Multimedia Conferencing: AF41, AF42, AF43
 	 * - Network Control Traffic: CS7
 	 * - Real-Time Interactive: CS4
+	 * - Signaling: CS5
 	 */
 	switch (dscp >> 2) {
 	case 10:
@@ -1026,9 +1026,11 @@ unsigned int cfg80211_classify8021d(struct sk_buff *skb,
 		/* Broadcasting video: CS3 */
 		ret = 4;
 		break;
-	case 40:
-		/* Signaling: CS5 */
-		ret = 5;
+	case 26:
+	case 28:
+	case 30:
+		/* Multimedia Streaming: AF31, AF32, AF33 */
+		ret = 4;
 		break;
 	case 44:
 		/* Voice Admit: VA */
@@ -2435,8 +2437,8 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
 		if (params->num_different_channels > c->num_different_channels)
 			continue;
 
-		limits = kmemdup(c->limits, sizeof(limits[0]) * c->n_limits,
-				 GFP_KERNEL);
+		limits = kmemdup_array(c->limits, c->n_limits, sizeof(*limits),
+				       GFP_KERNEL);
 		if (!limits)
 			return -ENOMEM;
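kmemdup_array() replaces the open-coded multiply; a short sketch of the pattern (element type per my reading of cfg80211_iter_combinations(), treat as illustrative): count and size are multiplied with overflow checking inside the helper.

	struct ieee80211_iface_limit *limits;

	limits = kmemdup_array(c->limits, c->n_limits, sizeof(*limits),
			       GFP_KERNEL);
	if (!limits)
		return -ENOMEM;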
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index c0e0204b9630..521a2938e50a 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -211,6 +211,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
 		goto err_unreg_pool;
 	}
 
+	if (dev_get_min_mp_channel_count(netdev)) {
+		err = -EBUSY;
+		goto err_unreg_pool;
+	}
+
 	bpf.command = XDP_SETUP_XSK_POOL;
 	bpf.xsk.pool = pool;
 	bpf.xsk.queue_id = queue_id;
@@ -623,20 +628,31 @@ static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u3
 	return nb_entries;
 }
 
-u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+static u32 xp_alloc_slow(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
+			 u32 max)
 {
-	u32 nb_entries1 = 0, nb_entries2;
+	int i;
 
-	if (unlikely(pool->dev && dma_dev_need_sync(pool->dev))) {
+	for (i = 0; i < max; i++) {
 		struct xdp_buff *buff;
 
-		/* Slow path */
 		buff = xp_alloc(pool);
-		if (buff)
-			*xdp = buff;
-		return !!buff;
+		if (unlikely(!buff))
+			return i;
+		*xdp = buff;
+		xdp++;
 	}
 
+	return max;
+}
+
+u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
+{
+	u32 nb_entries1 = 0, nb_entries2;
+
+	if (unlikely(pool->dev && dma_dev_need_sync(pool->dev)))
+		return xp_alloc_slow(pool, xdp, max);
+
 	if (unlikely(pool->free_list_cnt)) {
 		nb_entries1 = xp_alloc_reused(pool, xdp, max);
 		if (nb_entries1 == max)
@@ -656,9 +672,17 @@ EXPORT_SYMBOL(xp_alloc_batch);
 
 bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
 {
+	u32 req_count, avail_count;
+
 	if (pool->free_list_cnt >= count)
 		return true;
-	return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
+
+	req_count = count - pool->free_list_cnt;
+	avail_count = xskq_cons_nb_entries(pool->fq, req_count);
+	if (!avail_count)
+		pool->fq->queue_empty_descs++;
+
+	return avail_count >= req_count;
 }
 EXPORT_SYMBOL(xp_can_alloc);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 6f2d1621c992..406b20dfee8d 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -306,11 +306,6 @@ static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max)
 	return entries >= max ? max : entries;
 }
 
-static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
-{
-	return xskq_cons_nb_entries(q, cnt) >= cnt;
-}
-
 static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
 {
 	if (q->cached_prod == q->cached_cons)
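A hedged sketch of how a driver consumes the batch API whose slow path was just reworked (caller and helper invented): the call may return anywhere from zero up to the requested count, and the slow path now also fills batches instead of a single buffer per call.

	struct xdp_buff *bufs[64];
	u32 i, n;

	n = xp_alloc_batch(pool, bufs, ARRAY_SIZE(bufs));	/* 0..64 */
	for (i = 0; i < n; i++)
		example_post_rx(rxq, bufs[i]);	/* hypothetical driver helper */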
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 9a44d363ba62..f123b7c9ec82 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -328,12 +328,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		/* User explicitly requested packet offload mode and configured
 		 * policy in addition to the XFRM state. So be civil to users,
 		 * and return an error instead of taking fallback path.
-		 *
-		 * This WARN_ON() can be seen as a documentation for driver
-		 * authors to do not return -EOPNOTSUPP in packet offload mode.
 		 */
-		WARN_ON(err == -EOPNOTSUPP && is_packet_offload);
-		if (err != -EOPNOTSUPP || is_packet_offload) {
+		if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) {
 			NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state");
 			return err;
 		}
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index e50e4bf993fa..98f1e2b67c76 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -769,7 +769,7 @@ static int xfrmi_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 
-	dev->features |= NETIF_F_LLTX;
+	dev->lltx = true;
 	dev->features |= XFRMI_FEATURES;
 	dev->hw_features |= XFRMI_FEATURES;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c56c61b0c12e..914bac03b52a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,6 +45,7 @@
 #ifdef CONFIG_XFRM_ESPINTCP
 #include <net/espintcp.h>
 #endif
+#include <net/inet_dscp.h>
 
 #include "xfrm_hash.h"
 
@@ -109,7 +110,11 @@ struct xfrm_pol_inexact_node {
  * 4. saddr:any list from saddr tree
  *
  * This result set then needs to be searched for the policy with
- * the lowest priority. If two results have same prio, youngest one wins.
+ * the lowest priority. If two candidates have the same priority, the
+ * struct xfrm_policy pos member with the lower number is used.
+ *
+ * This replicates previous single-list-search algorithm which would
+ * return first matching policy in the (ordered-by-priority) list.
  */
 
 struct xfrm_pol_inexact_key {
@@ -196,8 +201,6 @@ xfrm_policy_inexact_lookup_rcu(struct net *net,
 static struct xfrm_policy *
 xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy,
 			bool excl);
-static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
-					    struct xfrm_policy *policy);
 
 static bool
 xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand,
@@ -410,7 +413,6 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 	if (policy) {
 		write_pnet(&policy->xp_net, net);
 		INIT_LIST_HEAD(&policy->walk.all);
-		INIT_HLIST_NODE(&policy->bydst_inexact_list);
 		INIT_HLIST_NODE(&policy->bydst);
 		INIT_HLIST_NODE(&policy->byidx);
 		rwlock_init(&policy->lock);
@@ -1228,26 +1230,31 @@ xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl)
 		return ERR_PTR(-EEXIST);
 	}
 
-	chain = &net->xfrm.policy_inexact[dir];
-	xfrm_policy_insert_inexact_list(chain, policy);
-
 	if (delpol)
 		__xfrm_policy_inexact_prune_bin(bin, false);
 
 	return delpol;
 }
 
+static bool xfrm_policy_is_dead_or_sk(const struct xfrm_policy *policy)
+{
+	int dir;
+
+	if (policy->walk.dead)
+		return true;
+
+	dir = xfrm_policy_id2dir(policy->index);
+	return dir >= XFRM_POLICY_MAX;
+}
+
 static void xfrm_hash_rebuild(struct work_struct *work)
 {
 	struct net *net = container_of(work, struct net,
 				       xfrm.policy_hthresh.work);
-	unsigned int hmask;
 	struct xfrm_policy *pol;
 	struct xfrm_policy *policy;
 	struct hlist_head *chain;
-	struct hlist_head *odst;
 	struct hlist_node *newpos;
-	int i;
 	int dir;
 	unsigned seq;
 	u8 lbits4, rbits4, lbits6, rbits6;
@@ -1274,13 +1281,10 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 		struct xfrm_pol_inexact_bin *bin;
 		u8 dbits, sbits;
 
-		if (policy->walk.dead)
+		if (xfrm_policy_is_dead_or_sk(policy))
 			continue;
 
 		dir = xfrm_policy_id2dir(policy->index);
-		if (dir >= XFRM_POLICY_MAX)
-			continue;
-
 		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
 			if (policy->family == AF_INET) {
 				dbits = rbits4;
@@ -1311,23 +1315,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 			goto out_unlock;
 	}
 
-	/* reset the bydst and inexact table in all directions */
 	for (dir = 0; dir < XFRM_POLICY_MAX; dir++) {
-		struct hlist_node *n;
-
-		hlist_for_each_entry_safe(policy, n,
-					  &net->xfrm.policy_inexact[dir],
-					  bydst_inexact_list) {
-			hlist_del_rcu(&policy->bydst);
-			hlist_del_init(&policy->bydst_inexact_list);
-		}
-
-		hmask = net->xfrm.policy_bydst[dir].hmask;
-		odst = net->xfrm.policy_bydst[dir].table;
-		for (i = hmask; i >= 0; i--) {
-			hlist_for_each_entry_safe(policy, n, odst + i, bydst)
-				hlist_del_rcu(&policy->bydst);
-		}
 		if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) {
 			/* dir out => dst = remote, src = local */
 			net->xfrm.policy_bydst[dir].dbits4 = rbits4;
@@ -1345,14 +1333,13 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 
 	/* re-insert all policies by order of creation */
 	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
-		if (policy->walk.dead)
+		if (xfrm_policy_is_dead_or_sk(policy))
 			continue;
-		dir = xfrm_policy_id2dir(policy->index);
-		if (dir >= XFRM_POLICY_MAX) {
-			/* skip socket policies */
-			continue;
-		}
+
+		hlist_del_rcu(&policy->bydst);
+
 		newpos = NULL;
+		dir = xfrm_policy_id2dir(policy->index);
 		chain = policy_hash_bysel(net, &policy->selector,
 					  policy->family, dir);
 
@@ -1519,42 +1506,6 @@ static const struct rhashtable_params xfrm_pol_inexact_params = {
 	.automatic_shrinking = true,
 };
 
-static void xfrm_policy_insert_inexact_list(struct hlist_head *chain,
-					    struct xfrm_policy *policy)
-{
-	struct xfrm_policy *pol, *delpol = NULL;
-	struct hlist_node *newpos = NULL;
-	int i = 0;
-
-	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
-		if (pol->type == policy->type &&
-		    pol->if_id == policy->if_id &&
-		    !selector_cmp(&pol->selector, &policy->selector) &&
-		    xfrm_policy_mark_match(&policy->mark, pol) &&
-		    xfrm_sec_ctx_match(pol->security, policy->security) &&
-		    !WARN_ON(delpol)) {
-			delpol = pol;
-			if (policy->priority > pol->priority)
-				continue;
-		} else if (policy->priority >= pol->priority) {
-			newpos = &pol->bydst_inexact_list;
-			continue;
-		}
-		if (delpol)
-			break;
-	}
-
-	if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET)
-		hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos);
-	else
-		hlist_add_head_rcu(&policy->bydst_inexact_list, chain);
-
-	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
-		pol->pos = i;
-		i++;
-	}
-}
-
 static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain,
 						   struct xfrm_policy *policy,
 						   bool excl)
@@ -2294,10 +2245,52 @@ out:
 	return pol;
 }
 
+static u32 xfrm_gen_pos_slow(struct net *net)
+{
+	struct xfrm_policy *policy;
+	u32 i = 0;
+
+	/* oldest entry is last in list */
+	list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) {
+		if (!xfrm_policy_is_dead_or_sk(policy))
+			policy->pos = ++i;
+	}
+
+	return i;
+}
+
+static u32 xfrm_gen_pos(struct net *net)
+{
+	const struct xfrm_policy *policy;
+	u32 i = 0;
+
+	/* most recently added policy is at the head of the list */
+	list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) {
+		if (xfrm_policy_is_dead_or_sk(policy))
+			continue;
+
+		if (policy->pos == UINT_MAX)
+			return xfrm_gen_pos_slow(net);
+
+		i = policy->pos + 1;
+		break;
+	}
+
+	return i;
+}
+
 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir)
 {
 	struct net *net = xp_net(pol);
 
+	switch (dir) {
+	case XFRM_POLICY_IN:
+	case XFRM_POLICY_FWD:
+	case XFRM_POLICY_OUT:
+		pol->pos = xfrm_gen_pos(net);
+		break;
+	}
+
 	list_add(&pol->walk.all, &net->xfrm.policy_all);
 	net->xfrm.policy_count[dir]++;
 	xfrm_pol_hold(pol);
@@ -2314,7 +2307,6 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol,
 	/* Socket policies are not hashed. */
 	if (!hlist_unhashed(&pol->bydst)) {
 		hlist_del_rcu(&pol->bydst);
-		hlist_del_init(&pol->bydst_inexact_list);
 		hlist_del(&pol->byidx);
 	}
@@ -2561,7 +2553,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
 static int xfrm_get_tos(const struct flowi *fl, int family)
 {
 	if (family == AF_INET)
-		return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos;
+		return fl->u.ip4.flowi4_tos & INET_DSCP_MASK;
 
 	return 0;
 }
@@ -4437,63 +4429,50 @@ EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete);
 #endif
 
 #ifdef CONFIG_XFRM_MIGRATE
-static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp,
-					const struct xfrm_selector *sel_tgt)
-{
-	if (sel_cmp->proto == IPSEC_ULPROTO_ANY) {
-		if (sel_tgt->family == sel_cmp->family &&
-		    xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr,
-				    sel_cmp->family) &&
-		    xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr,
-				    sel_cmp->family) &&
-		    sel_tgt->prefixlen_d == sel_cmp->prefixlen_d &&
-		    sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) {
-			return true;
-		}
-	} else {
-		if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) {
-			return true;
-		}
-	}
-	return false;
-}
-
 static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel,
 						    u8 dir, u8 type, struct net *net, u32 if_id)
 {
-	struct xfrm_policy *pol, *ret = NULL;
-	struct hlist_head *chain;
-	u32 priority = ~0U;
+	struct xfrm_policy *pol;
+	struct flowi fl;
 
-	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-	chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir);
-	hlist_for_each_entry(pol, chain, bydst) {
-		if ((if_id == 0 || pol->if_id == if_id) &&
-		    xfrm_migrate_selector_match(sel, &pol->selector) &&
-		    pol->type == type) {
-			ret = pol;
-			priority = ret->priority;
-			break;
-		}
-	}
-	chain = &net->xfrm.policy_inexact[dir];
-	hlist_for_each_entry(pol, chain, bydst_inexact_list) {
-		if ((pol->priority >= priority) && ret)
-			break;
+	memset(&fl, 0, sizeof(fl));
 
-		if ((if_id == 0 || pol->if_id == if_id) &&
-		    xfrm_migrate_selector_match(sel, &pol->selector) &&
-		    pol->type == type) {
-			ret = pol;
+	fl.flowi_proto = sel->proto;
+
+	switch (sel->family) {
+	case AF_INET:
+		fl.u.ip4.saddr = sel->saddr.a4;
+		fl.u.ip4.daddr = sel->daddr.a4;
+		if (sel->proto == IPSEC_ULPROTO_ANY)
 			break;
-		}
+		fl.u.flowi4_oif = sel->ifindex;
+		fl.u.ip4.fl4_sport = sel->sport;
+		fl.u.ip4.fl4_dport = sel->dport;
+		break;
+	case AF_INET6:
+		fl.u.ip6.saddr = sel->saddr.in6;
+		fl.u.ip6.daddr = sel->daddr.in6;
+		if (sel->proto == IPSEC_ULPROTO_ANY)
+			break;
+		fl.u.flowi6_oif = sel->ifindex;
+		fl.u.ip6.fl4_sport = sel->sport;
+		fl.u.ip6.fl4_dport = sel->dport;
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
 	}
 
-	xfrm_pol_hold(ret);
+	rcu_read_lock();
 
-	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
+	pol = xfrm_policy_lookup_bytype(net, type, &fl, sel->family, dir, if_id);
+	if (IS_ERR_OR_NULL(pol))
+		goto out_unlock;
 
-	return ret;
+	if (!xfrm_pol_hold_rcu(pol))
+		pol = NULL;
+out_unlock:
+	rcu_read_unlock();
+	return pol;
 }
 
 static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t)
@@ -4630,9 +4609,9 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 
 	/* Stage 1 - find policy */
 	pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id);
-	if (!pol) {
+	if (IS_ERR_OR_NULL(pol)) {
 		NL_SET_ERR_MSG(extack, "Target policy not found");
-		err = -ENOENT;
+		err = IS_ERR(pol) ? PTR_ERR(pol) : -ENOENT;
 		goto out;
 	}