Diffstat (limited to 'net')
227 files changed, 12673 insertions, 5725 deletions
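The bulk of the VLAN changes below thread a struct netlink_ext_ack * through the validation helpers so rtnetlink callers get a human-readable failure reason instead of a bare errno. As a minimal illustrative sketch (not part of the patch; the helper name is made up), the pattern in kernel C looks like this:

#include <linux/netdevice.h>
#include <linux/netlink.h>

/* Hypothetical example of the extack reporting pattern adopted by
 * vlan_check_real_dev() in the diff below: record a message in the
 * ack, then return an ordinary errno to the caller.
 */
static int example_check_real_dev(struct net_device *real_dev,
				  struct netlink_ext_ack *extack)
{
	if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
		NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
		return -EOPNOTSUPP;
	}
	return 0;
}

NL_SET_ERR_MSG_MOD() only records the string (prefixed with the module name) in the ack and tolerates a NULL extack, which is why the legacy ioctl path in the patch can keep passing NULL.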
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 5505ee6ebdbe..73a65789271b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -118,17 +118,21 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) } int vlan_check_real_dev(struct net_device *real_dev, - __be16 protocol, u16 vlan_id) + __be16 protocol, u16 vlan_id, + struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { pr_info("VLANs not supported on %s\n", name); + NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } - if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) + if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { + NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; + } return 0; } @@ -215,7 +219,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) if (vlan_id >= VLAN_VID_MASK) return -ERANGE; - err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id); + err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, + NULL); if (err < 0) return err; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index e23aac3e4d37..44df1c3df02d 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -109,7 +109,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask); void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, - __be16 protocol, u16 vlan_id); + __be16 protocol, u16 vlan_id, + struct netlink_ext_ack *extack); void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 236452ebbd9e..546af0e73ac3 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -215,7 +215,9 @@ int vlan_dev_set_egress_priority(const struct net_device *dev, return 0; } -/* Flags are defined in the vlan_flags enum in include/linux/if_vlan.h file. */ +/* Flags are defined in the vlan_flags enum in + * include/uapi/linux/if_vlan.h file. 
+ */ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 6689c0b272a7..9b60c1e399e2 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -47,14 +47,20 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], int err; if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { + NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + } + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { + NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; + } } - if (!data) + if (!data) { + NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; + } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { @@ -62,29 +68,38 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], case htons(ETH_P_8021AD): break; default: + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); - if (id >= VLAN_VID_MASK) + if (id >= VLAN_VID_MASK) { + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; + } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | - VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) + VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; + } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; + } err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; + } return 0; } @@ -126,14 +141,21 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, __be16 proto; int err; - if (!data[IFLA_VLAN_ID]) + if (!data[IFLA_VLAN_ID]) { + NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; + } - if (!tb[IFLA_LINK]) + if (!tb[IFLA_LINK]) { + NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; + } + real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); - if (!real_dev) + if (!real_dev) { + NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; + } if (data[IFLA_VLAN_PROTOCOL]) proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); @@ -146,7 +168,8 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; - err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id); + err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, + extack); if (err < 0) return err; diff --git a/net/9p/mod.c b/net/9p/mod.c index 6ab36aea7727..eb9777f05755 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -104,7 +104,7 @@ EXPORT_SYMBOL(v9fs_unregister_trans); /** * v9fs_get_trans_by_name - get transport with the matching name - * @name: string identifying transport + * @s: string identifying transport * */ struct p9_trans_module *v9fs_get_trans_by_name(char *s) diff --git a/net/Kconfig b/net/Kconfig index 0428f12c25c2..df8d45ef47d8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -59,6 +59,7 @@ source 
"net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "net/xdp/Kconfig" config INET bool "TCP/IP networking" @@ -407,6 +408,9 @@ config GRO_CELLS bool default n +config SOCK_VALIDATE_XMIT + bool + config NET_DEVLINK tristate "Network physical/parent device Netlink interface" help @@ -423,6 +427,9 @@ config MAY_USE_DEVLINK on MAY_USE_DEVLINK to ensure they do not cause link errors when devlink is a loadable module and the driver using it is built-in. +config PAGE_POOL + bool + endif # if NET # Used by archs to tell that they support BPF JIT compiler plus which flavour. diff --git a/net/Makefile b/net/Makefile index a6147c61b174..77aaddedbd29 100644 --- a/net/Makefile +++ b/net/Makefile @@ -85,3 +85,4 @@ obj-y += l3mdev/ endif obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ +obj-$(CONFIG_XDP_SOCKETS) += xdp/ diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 40d260f2bea5..b0ee9edaae35 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3422,6 +3422,37 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, return 0; } +int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, + const void *param) +{ + struct sk_buff *skb; + + if (hci_opcode_ogf(opcode) != 0x3f) { + /* A controller receiving a command shall respond with either + * a Command Status Event or a Command Complete Event. + * Therefore, all standard HCI commands must be sent via the + * standard API, using hci_send_cmd or hci_cmd_sync helpers. + * Some vendors do not comply with this rule for vendor-specific + * commands and do not return any event. We want to support + * unresponded commands for such cases only. + */ + bt_dev_err(hdev, "unresponded command not supported"); + return -EINVAL; + } + + skb = hci_prepare_cmd(hdev, opcode, plen, param); + if (!skb) { + bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", + opcode); + return -ENOMEM; + } + + hci_send_frame(hdev, skb); + + return 0; +} +EXPORT_SYMBOL(__hci_cmd_send); + /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 139707cd9d35..235b5aaab23d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4942,10 +4942,14 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb) struct hci_ev_le_advertising_info *ev = ptr; s8 rssi; - rssi = ev->data[ev->length]; - process_adv_report(hdev, ev->evt_type, &ev->bdaddr, - ev->bdaddr_type, NULL, 0, rssi, - ev->data, ev->length); + if (ev->length <= HCI_MAX_AD_LENGTH) { + rssi = ev->data[ev->length]; + process_adv_report(hdev, ev->evt_type, &ev->bdaddr, + ev->bdaddr_type, NULL, 0, rssi, + ev->data, ev->length); + } else { + bt_dev_err(hdev, "Dropping invalid advertising data"); + } ptr += sizeof(*ev) + ev->length + 1; } diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 66c0781773df..e44d34734834 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -122,7 +122,6 @@ void hci_req_sync_cancel(struct hci_dev *hdev, int err) struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param, u8 event, u32 timeout) { - DECLARE_WAITQUEUE(wait, current); struct hci_request req; struct sk_buff *skb; int err = 0; @@ -135,21 +134,14 @@ struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen, hdev->req_status = HCI_REQ_PEND; - 
add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run_skb(&req, hci_req_sync_complete); - if (err < 0) { - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); + if (err < 0) return ERR_PTR(err); - } - schedule_timeout(timeout); + err = wait_event_interruptible_timeout(hdev->req_wait_q, + hdev->req_status != HCI_REQ_PEND, timeout); - remove_wait_queue(&hdev->req_wait_q, &wait); - - if (signal_pending(current)) + if (err == -ERESTARTSYS) return ERR_PTR(-EINTR); switch (hdev->req_status) { @@ -197,7 +189,6 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, unsigned long opt, u32 timeout, u8 *hci_status) { struct hci_request req; - DECLARE_WAITQUEUE(wait, current); int err = 0; BT_DBG("%s start", hdev->name); @@ -213,16 +204,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, return err; } - add_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_INTERRUPTIBLE); - err = hci_req_run_skb(&req, hci_req_sync_complete); if (err < 0) { hdev->req_status = 0; - remove_wait_queue(&hdev->req_wait_q, &wait); - set_current_state(TASK_RUNNING); - /* ENODATA means the HCI request command queue is empty. * This can happen when a request with conditionals doesn't * trigger any commands to be sent. This is normal behavior @@ -240,11 +225,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, return err; } - schedule_timeout(timeout); - - remove_wait_queue(&hdev->req_wait_q, &wait); + err = wait_event_interruptible_timeout(hdev->req_wait_q, + hdev->req_status != HCI_REQ_PEND, timeout); - if (signal_pending(current)) + if (err == -ERESTARTSYS) return -EINTR; switch (hdev->req_status) { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2ced48662c1f..68c3578343b4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -170,7 +170,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.rxq = &rxqueue->xdp_rxq; retval = bpf_test_run(prog, &xdp, repeat, &duration); - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN) + if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || + xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); kfree(data); diff --git a/net/bridge/br.c b/net/bridge/br.c index 671d13c10f6f..b0a0b82e2d91 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -34,6 +34,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; + bool notified = false; bool changed_addr; int err; @@ -67,7 +68,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v break; case NETDEV_CHANGE: - br_port_carrier_check(p); + br_port_carrier_check(p, ¬ified); break; case NETDEV_FEAT_CHANGE: @@ -76,8 +77,10 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v case NETDEV_DOWN: spin_lock_bh(&br->lock); - if (br->dev->flags & IFF_UP) + if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); + notified = true; + } spin_unlock_bh(&br->lock); break; @@ -85,6 +88,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); + notified = true; spin_unlock_bh(&br->lock); } break; @@ -110,8 +114,8 @@ static int 
br_device_event(struct notifier_block *unused, unsigned long event, v } /* Events that may cause spanning tree to refresh */ - if (event == NETDEV_CHANGEADDR || event == NETDEV_UP || - event == NETDEV_CHANGE || event == NETDEV_DOWN) + if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || + event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; @@ -141,7 +145,7 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, - fdb_info->vid); + fdb_info->vid, false); if (err) { err = notifier_from_errno(err); break; @@ -152,7 +156,7 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, - fdb_info->vid); + fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index d9e69e4514be..b19e3104afd6 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -40,7 +40,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid); static void fdb_notify(struct net_bridge *br, - const struct net_bridge_fdb_entry *, int); + const struct net_bridge_fdb_entry *, int, bool); int __init br_fdb_init(void) { @@ -121,6 +121,28 @@ static struct net_bridge_fdb_entry *br_fdb_find(struct net_bridge *br, return fdb; } +struct net_device *br_fdb_find_port(const struct net_device *br_dev, + const unsigned char *addr, + __u16 vid) +{ + struct net_bridge_fdb_entry *f; + struct net_device *dev = NULL; + struct net_bridge *br; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(br_dev)) + return NULL; + + br = netdev_priv(br_dev); + f = br_fdb_find(br, addr, vid); + if (f && f->dst) + dev = f->dst->dev; + + return dev; +} +EXPORT_SYMBOL_GPL(br_fdb_find_port); + struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, const unsigned char *addr, __u16 vid) @@ -173,7 +195,8 @@ static void fdb_del_hw_addr(struct net_bridge *br, const unsigned char *addr) } } -static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) +static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f, + bool swdev_notify) { trace_fdb_delete(br, f); @@ -183,7 +206,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f) hlist_del_init_rcu(&f->fdb_node); rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode, br_fdb_rht_params); - fdb_notify(br, f, RTM_DELNEIGH); + fdb_notify(br, f, RTM_DELNEIGH, swdev_notify); call_rcu(&f->rcu, fdb_rcu_free); } @@ -219,7 +242,7 @@ static void fdb_delete_local(struct net_bridge *br, return; } - fdb_delete(br, f); + fdb_delete(br, f, true); } void br_fdb_find_delete_local(struct net_bridge *br, @@ -334,7 +357,7 @@ void br_fdb_cleanup(struct work_struct *work) } else { spin_lock_bh(&br->hash_lock); if (!hlist_unhashed(&f->fdb_node)) - fdb_delete(br, f); + fdb_delete(br, f, true); spin_unlock_bh(&br->hash_lock); } } @@ -354,7 +377,7 @@ void br_fdb_flush(struct net_bridge *br) spin_lock_bh(&br->hash_lock); hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) { if (!f->is_static) - fdb_delete(br, f); + fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } @@ -383,7 +406,7 @@ void br_fdb_delete_by_port(struct net_bridge *br, if (f->is_local) fdb_delete_local(br, p, f); else - 
fdb_delete(br, f); + fdb_delete(br, f, true); } spin_unlock_bh(&br->hash_lock); } @@ -509,7 +532,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return 0; br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", source ? source->dev->name : br->dev->name, addr, vid); - fdb_delete(br, fdb); + fdb_delete(br, fdb, true); } fdb = fdb_create(br, source, addr, vid, 1, 1); @@ -517,7 +540,7 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, return -ENOMEM; fdb_add_hw_addr(br, addr); - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, true); return 0; } @@ -572,7 +595,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, fdb->added_by_user = 1; if (unlikely(fdb_modified)) { trace_br_fdb_update(br, source, addr, vid, added_by_user); - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, true); } } } else { @@ -583,7 +606,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, fdb->added_by_user = 1; trace_br_fdb_update(br, source, addr, vid, added_by_user); - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, true); } /* else we lose race and someone else inserts * it first, don't bother updating @@ -665,13 +688,15 @@ static inline size_t fdb_nlmsg_size(void) } static void fdb_notify(struct net_bridge *br, - const struct net_bridge_fdb_entry *fdb, int type) + const struct net_bridge_fdb_entry *fdb, int type, + bool swdev_notify) { struct net *net = dev_net(br->dev); struct sk_buff *skb; int err = -ENOBUFS; - br_switchdev_fdb_notify(fdb, type); + if (swdev_notify) + br_switchdev_fdb_notify(fdb, type); skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) @@ -810,7 +835,7 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, fdb->used = jiffies; if (modified) { fdb->updated = jiffies; - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, true); } return 0; @@ -834,7 +859,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, rcu_read_unlock(); local_bh_enable(); } else if (ndm->ndm_flags & NTF_EXT_LEARNED) { - err = br_fdb_external_learn_add(br, p, addr, vid); + err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm->ndm_state, @@ -923,7 +948,7 @@ static int fdb_delete_by_addr_and_port(struct net_bridge *br, if (!fdb || fdb->dst != p) return -ENOENT; - fdb_delete(br, fdb); + fdb_delete(br, fdb, true); return 0; } @@ -1043,7 +1068,8 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + bool swdev_notify) { struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -1061,7 +1087,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, goto err_unlock; } fdb->added_by_external_learn = 1; - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { fdb->updated = jiffies; @@ -1080,7 +1106,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, } if (modified) - fdb_notify(br, fdb, RTM_NEWNEIGH); + fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } err_unlock: @@ -1090,7 +1116,8 @@ err_unlock: } int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, - 
const unsigned char *addr, u16 vid) + const unsigned char *addr, u16 vid, + bool swdev_notify) { struct net_bridge_fdb_entry *fdb; int err = 0; @@ -1099,7 +1126,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, fdb = br_fdb_find(br, addr, vid); if (fdb && fdb->added_by_external_learn) - fdb_delete(br, fdb); + fdb_delete(br, fdb, swdev_notify); else err = -ENOENT; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index b4eed113d2ec..7a7fd672ccf2 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -274,8 +274,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_port *port, *lport, *rport; lport = p ? p->port : NULL; - rport = rp ? hlist_entry(rp, struct net_bridge_port, rlist) : - NULL; + rport = hlist_entry_safe(rp, struct net_bridge_port, rlist); if ((unsigned long)lport > (unsigned long)rport) { port = lport; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 5bb6681fa91e..05e42d86882d 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -64,7 +64,7 @@ static int port_cost(struct net_device *dev) /* Check for port carrier transitions. */ -void br_port_carrier_check(struct net_bridge_port *p) +void br_port_carrier_check(struct net_bridge_port *p, bool *notified) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; @@ -73,16 +73,21 @@ void br_port_carrier_check(struct net_bridge_port *p) netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); + *notified = false; if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); if (netif_running(dev) && netif_oper_up(dev)) { - if (p->state == BR_STATE_DISABLED) + if (p->state == BR_STATE_DISABLED) { br_stp_enable_port(p); + *notified = true; + } } else { - if (p->state != BR_STATE_DISABLED) + if (p->state != BR_STATE_DISABLED) { br_stp_disable_port(p); + *notified = true; + } } spin_unlock_bh(&br->lock); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a7cb3ece5031..742f40aefdaf 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -553,9 +553,11 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + bool swdev_notify); void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid); @@ -573,7 +575,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig); /* br_if.c */ -void br_port_carrier_check(struct net_bridge_port *p); +void br_port_carrier_check(struct net_bridge_port *p, bool *notified); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, @@ -594,11 +596,22 @@ static inline bool br_rx_handler_check_rcu(const struct net_device *dev) return rcu_dereference(dev->rx_handler) == br_handle_frame; } +static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame; +} + 
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) { return br_rx_handler_check_rcu(dev) ? br_port_get_rcu(dev) : NULL; } +static inline struct net_bridge_port * +br_port_get_check_rtnl(const struct net_device *dev) +{ + return br_rx_handler_check_rtnl(dev) ? br_port_get_rtnl_rcu(dev) : NULL; +} + /* br_ioctl.c */ int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ee775f4ff76c..35474d49555d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -102,13 +102,15 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, static void br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, - u16 vid, struct net_device *dev) + u16 vid, struct net_device *dev, + bool added_by_user) { struct switchdev_notifier_fdb_info info; unsigned long notifier_type; info.addr = mac; info.vid = vid; + info.added_by_user = added_by_user; notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE; call_switchdev_notifiers(notifier_type, dev, &info.info); } @@ -116,19 +118,21 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { - if (!fdb->added_by_user || !fdb->dst) + if (!fdb->dst) return; switch (type) { case RTM_DELNEIGH: br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr, fdb->key.vlan_id, - fdb->dst->dev); + fdb->dst->dev, + fdb->added_by_user); break; case RTM_NEWNEIGH: br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr, fdb->key.vlan_id, - fdb->dst->dev); + fdb->dst->dev, + fdb->added_by_user); break; } } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 9896f4975353..dc832c0934c6 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1149,3 +1149,44 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, stats->tx_packets += txpackets; } } + +int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) +{ + struct net_bridge_vlan_group *vg; + + ASSERT_RTNL(); + if (netif_is_bridge_master(dev)) + vg = br_vlan_group(netdev_priv(dev)); + else + return -EINVAL; + + *p_pvid = br_get_pvid(vg); + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_pvid); + +int br_vlan_get_info(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + + ASSERT_RTNL(); + p = br_port_get_check_rtnl(dev); + if (p) + vg = nbp_vlan_group(p); + else if (netif_is_bridge_master(dev)) + vg = br_vlan_group(netdev_priv(dev)); + else + return -EINVAL; + + v = br_vlan_find(vg, vid); + if (!v) + return -ENOENT; + + p_vinfo->vid = vid; + p_vinfo->flags = v->flags; + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_info); diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index f212447794bd..9a0159aebe1a 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -8,13 +8,6 @@ menuconfig NF_TABLES_BRIDGE bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE - -config NFT_BRIDGE_META - tristate "Netfilter nf_table bridge meta support" - depends on NFT_META - help - Add support for bridge dedicated meta key. 
- config NFT_BRIDGE_REJECT tristate "Netfilter nf_tables bridge reject support" depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6 diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 4bc758dd4a8c..9b868861f21a 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # -obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # packet logging diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 28a4c3490359..b286ed5596c3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -101,7 +101,7 @@ ebt_do_match(struct ebt_entry_match *m, const struct sk_buff *skb, { par->match = m->u.match; par->matchinfo = m->data; - return m->u.match->match(skb, par) ? EBT_MATCH : EBT_NOMATCH; + return !m->u.match->match(skb, par); } static inline int @@ -177,6 +177,12 @@ struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) return (void *)entry + entry->next_offset; } +static inline const struct ebt_entry_target * +ebt_get_target_c(const struct ebt_entry *e) +{ + return ebt_get_target((struct ebt_entry *)e); +} + /* Do some firewalling */ unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, @@ -230,8 +236,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, */ EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar); - t = (struct ebt_entry_target *) - (((char *)point) + point->target_offset); + t = ebt_get_target_c(point); /* standard target */ if (!t->u.target->target) verdict = ((struct ebt_standard_target *)t)->verdict; @@ -343,6 +348,16 @@ find_table_lock(struct net *net, const char *name, int *error, "ebtable_", error, mutex); } +static inline void ebt_free_table_info(struct ebt_table_info *info) +{ + int i; + + if (info->chainstack) { + for_each_possible_cpu(i) + vfree(info->chainstack[i]); + vfree(info->chainstack); + } +} static inline int ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, unsigned int *cnt) @@ -627,7 +642,7 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt) return 1; EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, net, NULL); EBT_MATCH_ITERATE(e, ebt_cleanup_match, net, NULL); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); par.net = net; par.target = t->u.target; @@ -706,7 +721,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net, ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, &tgpar, &j); if (ret != 0) goto cleanup_watchers; - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target(e); gap = e->next_offset - e->target_offset; target = xt_request_find_target(NFPROTO_BRIDGE, t->u.name, 0); @@ -779,8 +794,7 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack if (pos == nentries) continue; } - t = (struct ebt_entry_target *) - (((char *)e) + e->target_offset); + t = ebt_get_target_c(e); if (strcmp(t->u.name, EBT_STANDARD_TARGET)) goto letscontinue; if (e->target_offset + sizeof(struct ebt_standard_target) > @@ -975,7 +989,7 @@ static void get_counters(const struct ebt_counter *oldcounters, static int do_replace_finish(struct net *net, struct ebt_replace *repl, struct ebt_table_info *newinfo) { - int ret, i; + int ret; struct ebt_counter *counterstmp = NULL; /* used to be able to unlock earlier */ struct ebt_table_info *table; @@ -1051,13 
+1065,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, ebt_cleanup_entry, net, NULL); vfree(table->entries); - if (table->chainstack) { - for_each_possible_cpu(i) - vfree(table->chainstack[i]); - vfree(table->chainstack); - } + ebt_free_table_info(table); vfree(table); - vfree(counterstmp); #ifdef CONFIG_AUDIT @@ -1078,11 +1087,7 @@ free_iterate: free_counterstmp: vfree(counterstmp); /* can be initialized in translate_table() */ - if (newinfo->chainstack) { - for_each_possible_cpu(i) - vfree(newinfo->chainstack[i]); - vfree(newinfo->chainstack); - } + ebt_free_table_info(newinfo); return ret; } @@ -1147,8 +1152,6 @@ free_newinfo: static void __ebt_unregister_table(struct net *net, struct ebt_table *table) { - int i; - mutex_lock(&ebt_mutex); list_del(&table->list); mutex_unlock(&ebt_mutex); @@ -1157,11 +1160,7 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) if (table->private->nentries) module_put(table->me); vfree(table->private->entries); - if (table->private->chainstack) { - for_each_possible_cpu(i) - vfree(table->private->chainstack[i]); - vfree(table->private->chainstack); - } + ebt_free_table_info(table->private); vfree(table->private); kfree(table); } @@ -1263,11 +1262,7 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: - if (newinfo->chainstack) { - for_each_possible_cpu(i) - vfree(newinfo->chainstack[i]); - vfree(newinfo->chainstack); - } + ebt_free_table_info(newinfo); vfree(newinfo->entries); free_newinfo: vfree(newinfo); @@ -1405,7 +1400,7 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, return -EFAULT; hlp = ubase + (((char *)e + e->target_offset) - base); - t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + t = ebt_get_target_c(e); ret = EBT_MATCH_ITERATE(e, ebt_match_to_user, base, ubase); if (ret != 0) @@ -1746,7 +1741,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, return ret; target_offset = e->target_offset - (origsize - *size); - t = (struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target(e); ret = compat_target_to_user(t, dstptr, size); if (ret) @@ -1794,7 +1789,7 @@ static int compat_calc_entry(const struct ebt_entry *e, EBT_MATCH_ITERATE(e, compat_calc_match, &off); EBT_WATCHER_ITERATE(e, compat_calc_watcher, &off); - t = (const struct ebt_entry_target *) ((char *) e + e->target_offset); + t = ebt_get_target_c(e); off += xt_compat_target_offset(t->u.target); off += ebt_compat_entry_padsize(); diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c deleted file mode 100644 index bb63c9aed55d..000000000000 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2014 Intel Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/netlink.h> -#include <linux/netfilter.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> -#include <net/netfilter/nft_meta.h> - -#include "../br_private.h" - -static void nft_meta_bridge_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - const struct nft_meta *priv = nft_expr_priv(expr); - const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); - u32 *dest = ®s->data[priv->dreg]; - const struct net_bridge_port *p; - - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - if (in == NULL || (p = br_port_get_rcu(in)) == NULL) - goto err; - break; - case NFT_META_BRI_OIFNAME: - if (out == NULL || (p = br_port_get_rcu(out)) == NULL) - goto err; - break; - default: - goto out; - } - - strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); - return; -out: - return nft_meta_get_eval(expr, regs, pkt); -err: - regs->verdict.code = NFT_BREAK; -} - -static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_meta *priv = nft_expr_priv(expr); - unsigned int len; - - priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); - switch (priv->key) { - case NFT_META_BRI_IIFNAME: - case NFT_META_BRI_OIFNAME: - len = IFNAMSIZ; - break; - default: - return nft_meta_get_init(ctx, expr, tb); - } - - priv->dreg = nft_parse_register(tb[NFTA_META_DREG]); - return nft_validate_register_store(ctx, priv->dreg, NULL, - NFT_DATA_VALUE, len); -} - -static struct nft_expr_type nft_meta_bridge_type; -static const struct nft_expr_ops nft_meta_bridge_get_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_bridge_get_eval, - .init = nft_meta_bridge_get_init, - .dump = nft_meta_get_dump, -}; - -static const struct nft_expr_ops nft_meta_bridge_set_ops = { - .type = &nft_meta_bridge_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), - .eval = nft_meta_set_eval, - .init = nft_meta_set_init, - .destroy = nft_meta_set_destroy, - .dump = nft_meta_set_dump, - .validate = nft_meta_set_validate, -}; - -static const struct nft_expr_ops * -nft_meta_bridge_select_ops(const struct nft_ctx *ctx, - const struct nlattr * const tb[]) -{ - if (tb[NFTA_META_KEY] == NULL) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG]) - return ERR_PTR(-EINVAL); - - if (tb[NFTA_META_DREG]) - return &nft_meta_bridge_get_ops; - - if (tb[NFTA_META_SREG]) - return &nft_meta_bridge_set_ops; - - return ERR_PTR(-EINVAL); -} - -static struct nft_expr_type nft_meta_bridge_type __read_mostly = { - .family = NFPROTO_BRIDGE, - .name = "meta", - .select_ops = nft_meta_bridge_select_ops, - .policy = nft_meta_policy, - .maxattr = NFTA_META_MAX, - .owner = THIS_MODULE, -}; - -static int __init nft_meta_bridge_module_init(void) -{ - return nft_register_expr(&nft_meta_bridge_type); -} - -static void __exit nft_meta_bridge_module_exit(void) -{ - nft_unregister_expr(&nft_meta_bridge_type); -} - -module_init(nft_meta_bridge_module_init); -module_exit(nft_meta_bridge_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>"); -MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta"); diff --git a/net/core/Makefile b/net/core/Makefile index 6dbbba8c57ae..7080417f8bc8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -14,6 +14,7 @@ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o 
netevent.o \ fib_notifier.o xdp.o obj-y += net-sysfs.o +obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/dev.c b/net/core/dev.c index 2af787e8b130..1844d9bc5714 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1285,6 +1285,7 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } +EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1586,7 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - }; + } #undef N return "UNKNOWN_NETDEV_EVENT"; } @@ -1754,38 +1755,38 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) EXPORT_SYMBOL(call_netdevice_notifiers); #ifdef CONFIG_NET_INGRESS -static struct static_key ingress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(ingress_needed_key); void net_inc_ingress_queue(void) { - static_key_slow_inc(&ingress_needed); + static_branch_inc(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_ingress_queue); void net_dec_ingress_queue(void) { - static_key_slow_dec(&ingress_needed); + static_branch_dec(&ingress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_ingress_queue); #endif #ifdef CONFIG_NET_EGRESS -static struct static_key egress_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(egress_needed_key); void net_inc_egress_queue(void) { - static_key_slow_inc(&egress_needed); + static_branch_inc(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_inc_egress_queue); void net_dec_egress_queue(void) { - static_key_slow_dec(&egress_needed); + static_branch_dec(&egress_needed_key); } EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static struct static_key netstamp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -1796,9 +1797,9 @@ static void netstamp_clear(struct work_struct *work) wanted = atomic_add_return(deferred, &netstamp_wanted); if (wanted > 0) - static_key_enable(&netstamp_needed); + static_branch_enable(&netstamp_needed_key); else - static_key_disable(&netstamp_needed); + static_branch_disable(&netstamp_needed_key); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif @@ -1818,7 +1819,7 @@ void net_enable_timestamp(void) atomic_inc(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else - static_key_slow_inc(&netstamp_needed); + static_branch_inc(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_enable_timestamp); @@ -1838,7 +1839,7 @@ void net_disable_timestamp(void) atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else - static_key_slow_dec(&netstamp_needed); + static_branch_dec(&netstamp_needed_key); #endif } EXPORT_SYMBOL(net_disable_timestamp); @@ -1846,15 +1847,15 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - if (static_key_false(&netstamp_needed)) + if (static_branch_unlikely(&netstamp_needed_key)) __net_timestamp(skb); } -#define net_timestamp_check(COND, SKB) \ - if (static_key_false(&netstamp_needed)) { \ - if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ - } \ +#define net_timestamp_check(COND, SKB) \ + if (static_branch_unlikely(&netstamp_needed_key)) { \ + if ((COND) && !(SKB)->tstamp) \ + __net_timestamp(SKB); \ + } \ bool 
is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { @@ -2614,17 +2615,16 @@ EXPORT_SYMBOL(netif_device_attach); * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) +static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb) { u32 hash; u16 qoffset = 0; - u16 qcount = num_tx_queues; + u16 qcount = dev->real_num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; + while (unlikely(hash >= qcount)) + hash -= qcount; return hash; } @@ -2637,7 +2637,6 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; } -EXPORT_SYMBOL(__skb_tx_hash); static void skb_warn_bad_offload(const struct sk_buff *skb) { @@ -3113,6 +3112,10 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device if (unlikely(!skb)) goto out_null; + skb = sk_validate_xmit_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + if (netif_needs_gso(skb, features)) { struct sk_buff *segs; @@ -3241,7 +3244,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_DROP; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; - __qdisc_run(q); + qdisc_run(q); } if (unlikely(to_free)) @@ -3529,7 +3532,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS - if (static_key_false(&egress_needed)) { + if (static_branch_unlikely(&egress_needed_key)) { skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; @@ -3624,6 +3627,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) } EXPORT_SYMBOL(dev_queue_xmit_accel); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + + local_bh_enable(); + + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + + return ret; +drop: + atomic_long_inc(&dev->tx_dropped); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(dev_direct_xmit); /************************************************************************* * Receiver routines @@ -3993,12 +4034,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; + void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; - struct xdp_buff xdp; - void *orig_data; int hlen, off; u32 mac_len; @@ -4033,31 +4074,42 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start 
= skb->data - skb_headroom(skb); - orig_data = xdp.data; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp->data_end; + orig_data = xdp->data; rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; + xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); - off = xdp.data - orig_data; + off = xdp->data - orig_data; if (off > 0) __skb_pull(skb, off); else if (off < 0) __skb_push(skb, -off); skb->mac_header += off; + /* check if bpf_xdp_adjust_tail was used. it can only "shrink" + * pckt. + */ + off = orig_data_end - xdp->data_end; + if (off != 0) { + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); + skb->len -= off; + + } + switch (act) { case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); break; case XDP_PASS: - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; @@ -4102,22 +4154,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) } EXPORT_SYMBOL_GPL(generic_xdp_tx); -static struct static_key generic_xdp_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + struct xdp_buff xdp; + u32 act; int err; + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, - xdp_prog); + &xdp, xdp_prog); if (err) goto out_redir; - /* fallthru to submit skb */ + break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; @@ -4140,7 +4194,7 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); @@ -4512,7 +4566,7 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS - if (static_key_false(&ingress_needed)) { + if (static_branch_unlikely(&ingress_needed_key)) { skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); if (!skb) goto out; @@ -4672,9 +4726,9 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) bpf_prog_put(old); if (old && !new) { - static_key_slow_dec(&generic_xdp_needed); + static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { - static_key_slow_inc(&generic_xdp_needed); + static_branch_inc(&generic_xdp_needed_key); dev_disable_lro(dev); dev_disable_gro_hw(dev); } @@ -4702,7 +4756,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_key_false(&generic_xdp_needed)) { + if (static_branch_unlikely(&generic_xdp_needed_key)) { int ret; preempt_disable(); @@ -7870,6 +7924,8 @@ int register_netdevice(struct net_device *dev) int ret; struct net *net = dev_net(dev); + BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE < + NETDEV_FEATURE_COUNT); BUG_ON(dev_boot_phase); ASSERT_RTNL(); diff --git a/net/core/devlink.c b/net/core/devlink.c index ad1317376798..5c8a40e1a01e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -453,6 +453,27 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static int devlink_nl_port_attrs_put(struct sk_buff *msg, + struct devlink_port *devlink_port) +{ + struct 
devlink_port_attrs *attrs = &devlink_port->attrs; + + if (!attrs->set) + return 0; + if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number)) + return -EMSGSIZE; + if (!attrs->split) + return 0; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER, + attrs->split_subport_number)) + return -EMSGSIZE; + return 0; +} + static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, @@ -492,9 +513,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, ibdev->name)) goto nla_put_failure; } - if (devlink_port->split && - nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, - devlink_port->split_group)) + if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -2971,19 +2990,64 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) EXPORT_SYMBOL_GPL(devlink_port_type_clear); /** - * devlink_port_split_set - Set port is split + * devlink_port_attrs_set - Set port attributes * * @devlink_port: devlink port - * @split_group: split group - identifies group split port is part of + * @flavour: flavour of the port + * @port_number: number of the port that is facing user, for example + * the front panel port number + * @split: indicates if this is split port + * @split_subport_number: if the port is split, this is the number + * of subport. */ -void devlink_port_split_set(struct devlink_port *devlink_port, - u32 split_group) -{ - devlink_port->split = true; - devlink_port->split_group = split_group; +void devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour, + u32 port_number, bool split, + u32 split_subport_number) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + + attrs->set = true; + attrs->flavour = flavour; + attrs->port_number = port_number; + attrs->split = split; + attrs->split_subport_number = split_subport_number; devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } -EXPORT_SYMBOL_GPL(devlink_port_split_set); +EXPORT_SYMBOL_GPL(devlink_port_attrs_set); + +int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, + char *name, size_t len) +{ + struct devlink_port_attrs *attrs = &devlink_port->attrs; + int n = 0; + + if (!attrs->set) + return -EOPNOTSUPP; + + switch (attrs->flavour) { + case DEVLINK_PORT_FLAVOUR_PHYSICAL: + if (!attrs->split) + n = snprintf(name, len, "p%u", attrs->port_number); + else + n = snprintf(name, len, "p%us%u", attrs->port_number, + attrs->split_subport_number); + break; + case DEVLINK_PORT_FLAVOUR_CPU: + case DEVLINK_PORT_FLAVOUR_DSA: + /* As CPU and DSA ports do not have a netdevice associated + * case should not ever happen. 
+ */ + WARN_ON(1); + return -EINVAL; + } + + if (n >= len) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/net/core/dst.c b/net/core/dst.c index 007aa0b08291..2d9b37f8944a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -58,6 +58,7 @@ const struct dst_metrics dst_default_metrics = { */ .refcnt = REFCOUNT_INIT(1), }; +EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index ba02f0dfe85c..c15075dc7572 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -92,6 +92,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", + [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -109,6 +110,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", + [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", }; static const char @@ -210,23 +212,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) return ret; } -static int phy_get_sset_count(struct phy_device *phydev) -{ - int ret; - - if (phydev->drv->get_sset_count && - phydev->drv->get_strings && - phydev->drv->get_stats) { - mutex_lock(&phydev->lock); - ret = phydev->drv->get_sset_count(phydev); - mutex_unlock(&phydev->lock); - - return ret; - } - - return -EOPNOTSUPP; -} - static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -243,12 +228,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); - if (sset == ETH_SS_PHY_STATS) { - if (dev->phydev) - return phy_get_sset_count(dev->phydev); - else - return -EOPNOTSUPP; - } + if (sset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + return phy_ethtool_get_sset_count(dev->phydev); if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); @@ -271,17 +253,10 @@ static void __ethtool_get_strings(struct net_device *dev, memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); - else if (stringset == ETH_SS_PHY_STATS) { - struct phy_device *phydev = dev->phydev; - - if (phydev) { - mutex_lock(&phydev->lock); - phydev->drv->get_strings(phydev, data); - mutex_unlock(&phydev->lock); - } else { - return; - } - } else + else if (stringset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + phy_ethtool_get_strings(dev->phydev, data); + else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } @@ -1998,15 +1973,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { - struct ethtool_stats stats; + const struct ethtool_ops *ops = 
dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + struct ethtool_stats stats; u64 *data; int ret, n_stats; - if (!phydev) + if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) return -EOPNOTSUPP; - n_stats = phy_get_sset_count(phydev); + if (dev->phydev && !ops->get_ethtool_phy_stats) + n_stats = phy_ethtool_get_sset_count(dev->phydev); + else + n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) @@ -2021,9 +2000,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (n_stats && !data) return -ENOMEM; - mutex_lock(&phydev->lock); - phydev->drv->get_stats(phydev, &stats, data); - mutex_unlock(&phydev->lock); + if (dev->phydev && !ops->get_ethtool_phy_stats) { + ret = phy_ethtool_get_stats(dev->phydev, &stats, data); + if (ret < 0) + return ret; + } else { + ops->get_ethtool_phy_stats(dev, &stats, data); + } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 33958f84c173..126ffc5bc630 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -387,247 +387,304 @@ unsigned int fib_rules_seq_read(struct net *net, int family) } EXPORT_SYMBOL_GPL(fib_rules_seq_read); -static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb, - struct fib_rules_ops *ops) -{ - int err = -EINVAL; - - if (frh->src_len) - if (tb[FRA_SRC] == NULL || - frh->src_len > (ops->addr_size * 8) || - nla_len(tb[FRA_SRC]) != ops->addr_size) - goto errout; - - if (frh->dst_len) - if (tb[FRA_DST] == NULL || - frh->dst_len > (ops->addr_size * 8) || - nla_len(tb[FRA_DST]) != ops->addr_size) - goto errout; - - err = 0; -errout: - return err; -} - -static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, - struct nlattr **tb, struct fib_rule *rule) +static struct fib_rule *rule_find(struct fib_rules_ops *ops, + struct fib_rule_hdr *frh, + struct nlattr **tb, + struct fib_rule *rule, + bool user_priority) { struct fib_rule *r; list_for_each_entry(r, &ops->rules_list, list) { - if (r->action != rule->action) + if (rule->action && r->action != rule->action) continue; - if (r->table != rule->table) + if (rule->table && r->table != rule->table) continue; - if (r->pref != rule->pref) + if (user_priority && r->pref != rule->pref) continue; - if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + if (rule->iifname[0] && + memcmp(r->iifname, rule->iifname, IFNAMSIZ)) continue; - if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + if (rule->oifname[0] && + memcmp(r->oifname, rule->oifname, IFNAMSIZ)) continue; - if (r->mark != rule->mark) + if (rule->mark && r->mark != rule->mark) continue; - if (r->mark_mask != rule->mark_mask) + if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; - if (r->tun_id != rule->tun_id) + if (rule->tun_id && r->tun_id != rule->tun_id) continue; if (r->fr_net != rule->fr_net) continue; - if (r->l3mdev != rule->l3mdev) + if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; - if (!uid_eq(r->uid_range.start, rule->uid_range.start) || - !uid_eq(r->uid_range.end, rule->uid_range.end)) + if (uid_range_set(&rule->uid_range) && + (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end))) continue; - if (r->ip_proto != rule->ip_proto) + if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; - if (!fib_rule_port_range_compare(&r->sport_range, + if (fib_rule_port_range_set(&rule->sport_range) && + 
!fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) continue; - if (!fib_rule_port_range_compare(&r->dport_range, + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; if (!ops->compare(r, frh, tb)) continue; - return 1; + return r; + } + + return NULL; +} + +#ifdef CONFIG_NET_L3_MASTER_DEV +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + nlrule->l3mdev = nla_get_u8(nla); + if (nlrule->l3mdev != 1) { + NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute"); + return -1; } + return 0; } +#else +static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel"); + return -1; +} +#endif -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct fib_rule **rule, + bool *user_priority) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *r, *last = NULL; - struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL, unresolved = 0; - - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) - goto errout; + struct fib_rule *nlrule = NULL; + int err = -EINVAL; - ops = lookup_rules_ops(net, frh->family); - if (ops == NULL) { - err = -EAFNOSUPPORT; - goto errout; + if (frh->src_len) + if (!tb[FRA_SRC] || + frh->src_len > (ops->addr_size * 8) || + nla_len(tb[FRA_SRC]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid source address"); + goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); - if (err < 0) - goto errout; - - err = validate_rulemsg(frh, tb, ops); - if (err < 0) - goto errout; + if (frh->dst_len) + if (!tb[FRA_DST] || + frh->dst_len > (ops->addr_size * 8) || + nla_len(tb[FRA_DST]) != ops->addr_size) { + NL_SET_ERR_MSG(extack, "Invalid dst address"); + goto errout; + } - rule = kzalloc(ops->rule_size, GFP_KERNEL); - if (rule == NULL) { + nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + if (!nlrule) { err = -ENOMEM; goto errout; } - refcount_set(&rule->refcnt, 1); - rule->fr_net = net; + refcount_set(&nlrule->refcnt, 1); + nlrule->fr_net = net; - rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) - : fib_default_rule_pref(ops); + if (tb[FRA_PRIORITY]) { + nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); + *user_priority = true; + } else { + nlrule->pref = fib_default_rule_pref(ops); + } - rule->proto = tb[FRA_PROTOCOL] ? + nlrule->proto = tb[FRA_PROTOCOL] ? 
nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; if (tb[FRA_IIFNAME]) { struct net_device *dev; - rule->iifindex = -1; - nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->iifname); + nlrule->iifindex = -1; + nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->iifname); if (dev) - rule->iifindex = dev->ifindex; + nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { struct net_device *dev; - rule->oifindex = -1; - nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, rule->oifname); + nlrule->oifindex = -1; + nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + dev = __dev_get_by_name(net, nlrule->oifname); if (dev) - rule->oifindex = dev->ifindex; + nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { - rule->mark = nla_get_u32(tb[FRA_FWMARK]); - if (rule->mark) + nlrule->mark = nla_get_u32(tb[FRA_FWMARK]); + if (nlrule->mark) /* compatibility: if the mark value is non-zero all bits * are compared unless a mask is explicitly specified. */ - rule->mark_mask = 0xFFFFFFFF; + nlrule->mark_mask = 0xFFFFFFFF; } if (tb[FRA_FWMASK]) - rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); if (tb[FRA_TUN_ID]) - rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); err = -EINVAL; - if (tb[FRA_L3MDEV]) { -#ifdef CONFIG_NET_L3_MASTER_DEV - rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); - if (rule->l3mdev != 1) -#endif - goto errout_free; - } + if (tb[FRA_L3MDEV] && + fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0) + goto errout_free; - rule->action = frh->action; - rule->flags = frh->flags; - rule->table = frh_get_table(frh, tb); + nlrule->action = frh->action; + nlrule->flags = frh->flags; + nlrule->table = frh_get_table(frh, tb); if (tb[FRA_SUPPRESS_PREFIXLEN]) - rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); + nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]); else - rule->suppress_prefixlen = -1; + nlrule->suppress_prefixlen = -1; if (tb[FRA_SUPPRESS_IFGROUP]) - rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); + nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]); else - rule->suppress_ifgroup = -1; + nlrule->suppress_ifgroup = -1; if (tb[FRA_GOTO]) { - if (rule->action != FR_ACT_GOTO) + if (nlrule->action != FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Unexpected goto"); goto errout_free; + } - rule->target = nla_get_u32(tb[FRA_GOTO]); + nlrule->target = nla_get_u32(tb[FRA_GOTO]); /* Backward jumps are prohibited to avoid endless loops */ - if (rule->target <= rule->pref) + if (nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); goto errout_free; - - list_for_each_entry(r, &ops->rules_list, list) { - if (r->pref == rule->target) { - RCU_INIT_POINTER(rule->ctarget, r); - break; - } } - - if (rcu_dereference_protected(rule->ctarget, 1) == NULL) - unresolved = 1; - } else if (rule->action == FR_ACT_GOTO) + } else if (nlrule->action == FR_ACT_GOTO) { + NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; + } - if (rule->l3mdev && rule->table) + if (nlrule->l3mdev && nlrule->table) { + NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive"); goto errout_free; + } if (tb[FRA_UID_RANGE]) { if (current_user_ns() != net->user_ns) { err = -EPERM; + NL_SET_ERR_MSG(extack, "No permission to set uid"); goto errout_free; } - rule->uid_range = 
nla_get_kuid_range(tb); + nlrule->uid_range = nla_get_kuid_range(tb); - if (!uid_range_set(&rule->uid_range) || - !uid_lte(rule->uid_range.start, rule->uid_range.end)) + if (!uid_range_set(&nlrule->uid_range) || + !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) { + NL_SET_ERR_MSG(extack, "Invalid uid range"); goto errout_free; + } } else { - rule->uid_range = fib_kuid_range_unset; + nlrule->uid_range = fib_kuid_range_unset; } if (tb[FRA_IP_PROTO]) - rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); if (tb[FRA_SPORT_RANGE]) { err = nla_get_port_range(tb[FRA_SPORT_RANGE], - &rule->sport_range); - if (err) + &nlrule->sport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; + } } if (tb[FRA_DPORT_RANGE]) { err = nla_get_port_range(tb[FRA_DPORT_RANGE], - &rule->dport_range); - if (err) + &nlrule->dport_range); + if (err) { + NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; + } } + *rule = nlrule; + + return 0; + +errout_free: + kfree(nlrule); +errout: + return err; +} + +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rules_ops *ops = NULL; + struct fib_rule *rule = NULL, *r, *last = NULL; + struct nlattr *tb[FRA_MAX + 1]; + int err = -EINVAL, unresolved = 0; + bool user_priority = false; + + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); + goto errout; + } + + ops = lookup_rules_ops(net, frh->family); + if (!ops) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); + goto errout; + } + + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); + goto errout; + } + + err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + if (err) + goto errout; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && - rule_exists(ops, frh, tb, rule)) { + rule_find(ops, frh, tb, rule, user_priority)) { err = -EEXIST; goto errout_free; } - err = ops->configure(rule, skb, frh, tb); + err = ops->configure(rule, skb, frh, tb, extack); if (err < 0) goto errout_free; @@ -637,6 +694,16 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout_free; list_for_each_entry(r, &ops->rules_list, list) { + if (r->pref == rule->target) { + RCU_INIT_POINTER(rule->ctarget, r); + break; + } + } + + if (rcu_dereference_protected(rule->ctarget, 1) == NULL) + unresolved = 1; + + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; last = r; @@ -690,171 +757,97 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rule_port_range sprange = {0, 0}; - struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *r; + struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - struct fib_kuid_range range; int err = -EINVAL; + bool user_priority = false; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) + if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; + } ops = lookup_rules_ops(net, frh->family); if (ops == NULL) { err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Rule family not supported"); goto errout; } err = nlmsg_parse(nlh, sizeof(*frh), 
tb, FRA_MAX, ops->policy, extack); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, "Error parsing msg"); goto errout; + } - err = validate_rulemsg(frh, tb, ops); - if (err < 0) + err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + if (err) goto errout; - if (tb[FRA_UID_RANGE]) { - range = nla_get_kuid_range(tb); - if (!uid_range_set(&range)) { - err = -EINVAL; - goto errout; - } - } else { - range = fib_kuid_range_unset; + rule = rule_find(ops, frh, tb, nlrule, user_priority); + if (!rule) { + err = -ENOENT; + goto errout; } - if (tb[FRA_SPORT_RANGE]) { - err = nla_get_port_range(tb[FRA_SPORT_RANGE], - &sprange); - if (err) - goto errout; + if (rule->flags & FIB_RULE_PERMANENT) { + err = -EPERM; + goto errout; } - if (tb[FRA_DPORT_RANGE]) { - err = nla_get_port_range(tb[FRA_DPORT_RANGE], - &dprange); + if (ops->delete) { + err = ops->delete(rule); if (err) goto errout; } - list_for_each_entry(rule, &ops->rules_list, list) { - if (tb[FRA_PROTOCOL] && - (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) - continue; - - if (frh->action && (frh->action != rule->action)) - continue; - - if (frh_get_table(frh, tb) && - (frh_get_table(frh, tb) != rule->table)) - continue; - - if (tb[FRA_PRIORITY] && - (rule->pref != nla_get_u32(tb[FRA_PRIORITY]))) - continue; - - if (tb[FRA_IIFNAME] && - nla_strcmp(tb[FRA_IIFNAME], rule->iifname)) - continue; - - if (tb[FRA_OIFNAME] && - nla_strcmp(tb[FRA_OIFNAME], rule->oifname)) - continue; - - if (tb[FRA_FWMARK] && - (rule->mark != nla_get_u32(tb[FRA_FWMARK]))) - continue; - - if (tb[FRA_FWMASK] && - (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) - continue; - - if (tb[FRA_TUN_ID] && - (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) - continue; - - if (tb[FRA_L3MDEV] && - (rule->l3mdev != nla_get_u8(tb[FRA_L3MDEV]))) - continue; - - if (uid_range_set(&range) && - (!uid_eq(rule->uid_range.start, range.start) || - !uid_eq(rule->uid_range.end, range.end))) - continue; - - if (tb[FRA_IP_PROTO] && - (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) - continue; - - if (fib_rule_port_range_set(&sprange) && - !fib_rule_port_range_compare(&rule->sport_range, &sprange)) - continue; - - if (fib_rule_port_range_set(&dprange) && - !fib_rule_port_range_compare(&rule->dport_range, &dprange)) - continue; - - if (!ops->compare(rule, frh, tb)) - continue; - - if (rule->flags & FIB_RULE_PERMANENT) { - err = -EPERM; - goto errout; - } - - if (ops->delete) { - err = ops->delete(rule); - if (err) - goto errout; - } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); - if (rule->tun_id) - ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); - list_del_rcu(&rule->list); - - if (rule->action == FR_ACT_GOTO) { - ops->nr_goto_rules--; - if (rtnl_dereference(rule->ctarget) == NULL) - ops->unresolved_rules--; - } + if (rule->action == FR_ACT_GOTO) { + ops->nr_goto_rules--; + if (rtnl_dereference(rule->ctarget) == NULL) + ops->unresolved_rules--; + } - /* - * Check if this rule is a target to any of them. If so, - * adjust to the next one with the same preference or - * disable them. As this operation is eventually very - * expensive, it is only performed if goto rules, except - * current if it is goto rule, have actually been added. 
- */ - if (ops->nr_goto_rules > 0) { - struct fib_rule *n; - - n = list_next_entry(rule, list); - if (&n->list == &ops->rules_list || n->pref != rule->pref) - n = NULL; - list_for_each_entry(r, &ops->rules_list, list) { - if (rtnl_dereference(r->ctarget) != rule) - continue; - rcu_assign_pointer(r->ctarget, n); - if (!n) - ops->unresolved_rules++; - } + /* + * Check if this rule is a target to any of them. If so, + * adjust to the next one with the same preference or + * disable them. As this operation is eventually very + * expensive, it is only performed if goto rules, except + * current if it is goto rule, have actually been added. + */ + if (ops->nr_goto_rules > 0) { + struct fib_rule *n; + + n = list_next_entry(rule, list); + if (&n->list == &ops->rules_list || n->pref != rule->pref) + n = NULL; + list_for_each_entry(r, &ops->rules_list, list) { + if (rtnl_dereference(r->ctarget) != rule) + continue; + rcu_assign_pointer(r->ctarget, n); + if (!n) + ops->unresolved_rules++; } - - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); - fib_rule_put(rule); - flush_route_cache(ops); - rules_ops_put(ops); - return 0; } - err = -ENOENT; + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, + NULL); + notify_rule_change(RTM_DELRULE, rule, ops, nlh, + NETLINK_CB(skb).portid); + fib_rule_put(rule); + flush_route_cache(ops); + rules_ops_put(ops); + kfree(nlrule); + return 0; + errout: + if (nlrule) + kfree(nlrule); rules_ops_put(ops); return err; } diff --git a/net/core/filter.c b/net/core/filter.c index 201ff36b17a8..51ea7ddb2d8d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -57,7 +57,13 @@ #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> +#include <net/xfrm.h> #include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/ip_fib.h> +#include <net/flow.h> +#include <net/arp.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -111,12 +117,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -136,7 +142,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -160,13 +166,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + 
+BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = __get_raw_cpu_id, + .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -316,16 +403,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - *insn = BPF_EMIT_CALL(__skb_get_nlattr); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); @@ -352,26 +439,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = 
BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -410,12 +558,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -458,6 +621,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -567,21 +735,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. 
*/ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -663,6 +841,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1024,6 +1204,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1045,7 +1226,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1064,7 +1246,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. 
Note, @@ -1512,6 +1695,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *ptr; + + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + ptr = skb_mac_header(skb) + offset; + break; + case BPF_HDR_START_NET: + ptr = skb_network_header(skb) + offset; + break; + default: + goto err_clear; + } + + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write @@ -1857,6 +2081,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -1866,9 +2117,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.key = key; tcb->bpf.flags = flags; - tcb->bpf.map = map; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; return SK_PASS; } @@ -1876,16 +2128,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - struct sock *sk = NULL; - if (tcb->bpf.map) { - sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - - tcb->bpf.key = 0; - tcb->bpf.map = NULL; - } - - return sk; + return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -1898,32 +2142,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) { /* If user passes invalid input drop the packet. 
*/ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->key = key; msg->flags = flags; - msg->map = map; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; return SK_PASS; } -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) { - struct sock *sk = NULL; + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; - if (msg->map) { - sk = __sock_map_lookup_elem(msg->map, msg->key); + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; - msg->key = 0; - msg->map = NULL; - } + return SK_PASS; +} - return sk; +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { @@ -2186,7 +2447,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2194,7 +2455,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2208,13 +2468,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { @@ -2699,8 +2958,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); - void *data_start = xdp->data_hard_start + metalen; + void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || @@ -2724,14 +2984,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_end = xdp->data_end + offset; + + /* only shrinking is allowed for now. 
*/ + if (unlikely(offset >= 0)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; - if (unlikely(meta < xdp->data_hard_start || + if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely((metalen & (sizeof(__u32) - 1)) || @@ -2756,13 +3041,18 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { + struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; dev->netdev_ops->ndo_xdp_flush(dev); @@ -2776,24 +3066,44 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { struct net_device *dev = fwd; + struct xdp_frame *xdpf; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + /* TODO: move to inside map code instead, for bulk support + * err = dev_map_enqueue(dev, xdp); + */ + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf); if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2812,6 +3122,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2826,6 +3139,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } @@ -2923,13 +3238,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex = 0; @@ -2951,6 +3267,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock 
*xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -2965,7 +3289,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -2973,7 +3297,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -2987,6 +3311,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); @@ -3060,7 +3385,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data) + func == bpf_msg_pull_data || + func == bpf_xdp_adjust_tail) return true; return false; @@ -3711,6 +4037,308 @@ static const struct bpf_func_proto bpf_bind_proto = { .arg3_type = ARG_CONST_SIZE, }; +#ifdef CONFIG_XFRM +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, + struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ + const struct sec_path *sp = skb_sec_path(skb); + const struct xfrm_state *x; + + if (!sp || unlikely(index >= sp->len || flags)) + goto err_clear; + + x = sp->xvec[index]; + + if (unlikely(size != sizeof(struct bpf_xfrm_state))) + goto err_clear; + + to->reqid = x->props.reqid; + to->spi = x->id.spi; + to->family = x->props.family; + if (to->family == AF_INET6) { + memcpy(to->remote_ipv6, x->props.saddr.a6, + sizeof(to->remote_ipv6)); + } else { + to->remote_ipv4 = x->props.saddr.a4; + } + + return 0; +err_clear: + memset(to, 0, size); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { + .func = bpf_skb_get_xfrm_state, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + 
fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return 0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowlabel; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + *dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. Can not use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif + } + return 0; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); +#endif + } + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3781,6 +4409,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -3798,6 +4428,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case 
BPF_FUNC_csum_diff: @@ -3852,6 +4484,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; +#ifdef CONFIG_XFRM + case BPF_FUNC_skb_get_xfrm_state: + return &bpf_skb_get_xfrm_state_proto; +#endif + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -3875,6 +4513,10 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; + case BPF_FUNC_xdp_adjust_tail: + return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -3919,6 +4561,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; + case BPF_FUNC_sock_hash_update: + return &bpf_sock_hash_update_proto; default: return bpf_base_func_proto(func_id); } @@ -3930,6 +4574,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_redirect_hash: + return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: @@ -3961,6 +4607,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; + case BPF_FUNC_sk_redirect_hash: + return &bpf_sk_redirect_hash_proto; default: return bpf_base_func_proto(func_id); } @@ -4221,6 +4869,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + /* We're guaranteed here that CTX is in R6. 
*/ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -4279,8 +4962,15 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (type == BPF_WRITE) + if (type == BPF_WRITE) { + if (bpf_prog_is_dev_bound(prog->aux)) { + switch (off) { + case offsetof(struct xdp_md, rx_queue_index): + return __is_valid_xdp_access(off, size); + } + } return false; + } switch (off) { case offsetof(struct xdp_md, data): @@ -5490,6 +6180,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5501,6 +6192,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d29f09bc5ff9..4fc1e84d77ec 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1253,7 +1253,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen) + const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; @@ -1314,9 +1314,9 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, */ u32 skb_get_poff(const struct sk_buff *skb) { - struct flow_keys keys; + struct flow_keys_basic keys; - if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) + if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); @@ -1403,7 +1403,7 @@ static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { }, }; -static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { +static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), @@ -1417,7 +1417,8 @@ static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); -struct flow_dissector flow_keys_buf_dissector __read_mostly; +struct flow_dissector flow_keys_basic_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { @@ -1427,9 +1428,9 @@ static int __init init_default_flow_dissectors(void) 
skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); - skb_flow_dissector_init(&flow_keys_buf_dissector, - flow_keys_buf_dissector_keys, - ARRAY_SIZE(flow_keys_buf_dissector_keys)); + skb_flow_dissector_init(&flow_keys_basic_dissector, + flow_keys_basic_dissector_keys, + ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ce519861be59..5afae29367c1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -820,7 +820,8 @@ static void neigh_periodic_work(struct work_struct *work) write_lock(&n->lock); state = n->nud_state; - if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || + (n->flags & NTF_EXT_LEARNED)) { write_unlock(&n->lock); goto next_elt; } @@ -1136,6 +1137,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (neigh->dead) goto out; + neigh_update_ext_learned(neigh, flags, ¬ify); + if (!(new & NUD_VALID)) { neigh_del_timer(neigh); if (old & NUD_CONNECTED) @@ -1781,6 +1784,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags &= ~NEIGH_UPDATE_F_OVERRIDE; } + if (ndm->ndm_flags & NTF_EXT_LEARNED) + flags |= NEIGH_UPDATE_F_EXT_LEARNED; + if (ndm->ndm_flags & NTF_USE) { neigh_event_send(neigh, NULL); err = 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c new file mode 100644 index 000000000000..68bf07206744 --- /dev/null +++ b/net/core/page_pool.c @@ -0,0 +1,317 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * page_pool.c + * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> + * Copyright (C) 2016 Red Hat, Inc. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> + +#include <net/page_pool.h> +#include <linux/dma-direction.h> +#include <linux/dma-mapping.h> +#include <linux/page-flags.h> +#include <linux/mm.h> /* for __put_page() */ + +static int page_pool_init(struct page_pool *pool, + const struct page_pool_params *params) +{ + unsigned int ring_qsize = 1024; /* Default */ + + memcpy(&pool->p, params, sizeof(pool->p)); + + /* Validate only known flags were used */ + if (pool->p.flags & ~(PP_FLAG_ALL)) + return -EINVAL; + + if (pool->p.pool_size) + ring_qsize = pool->p.pool_size; + + /* Sanity limit mem that can be pinned down */ + if (ring_qsize > 32768) + return -E2BIG; + + /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. + * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, + * which is the XDP_TX use-case. 
+ */ + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && + (pool->p.dma_dir != DMA_BIDIRECTIONAL)) + return -EINVAL; + + if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) + return -ENOMEM; + + return 0; +} + +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + struct page_pool *pool; + int err = 0; + + pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); + if (!pool) + return ERR_PTR(-ENOMEM); + + err = page_pool_init(pool, params); + if (err < 0) { + pr_warn("%s() gave up with errno %d\n", __func__, err); + kfree(pool); + return ERR_PTR(err); + } + return pool; +} +EXPORT_SYMBOL(page_pool_create); + +/* fast path */ +static struct page *__page_pool_get_cached(struct page_pool *pool) +{ + struct ptr_ring *r = &pool->ring; + struct page *page; + + /* Quicker fallback, avoid locks when ring is empty */ + if (__ptr_ring_empty(r)) + return NULL; + + /* Test for safe-context, caller should provide this guarantee */ + if (likely(in_serving_softirq())) { + if (likely(pool->alloc.count)) { + /* Fast-path */ + page = pool->alloc.cache[--pool->alloc.count]; + return page; + } + /* Slower-path: Alloc array empty, time to refill + * + * Open-coded bulk ptr_ring consumer. + * + * Discussion: the ring consumer lock is not really + * needed due to the softirq/NAPI protection, but + * later need the ability to reclaim pages on the + * ring. Thus, keeping the locks. + */ + spin_lock(&r->consumer_lock); + while ((page = __ptr_ring_consume(r))) { + if (pool->alloc.count == PP_ALLOC_CACHE_REFILL) + break; + pool->alloc.cache[pool->alloc.count++] = page; + } + spin_unlock(&r->consumer_lock); + return page; + } + + /* Slow-path: Get page from locked ring queue */ + page = ptr_ring_consume(&pool->ring); + return page; +} + +/* slow path */ +noinline +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t _gfp) +{ + struct page *page; + gfp_t gfp = _gfp; + dma_addr_t dma; + + /* We could always set __GFP_COMP, and avoid this branch, as + * prep_new_page() can handle order-0 with __GFP_COMP. + */ + if (pool->p.order) + gfp |= __GFP_COMP; + + /* FUTURE development: + * + * Current slow-path essentially falls back to single page + * allocations, which doesn't improve performance. This code + * need bulk allocation support from the page allocator code. + */ + + /* Cache was empty, do real allocation */ + page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); + if (!page) + return NULL; + + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + goto skip_dma_map; + + /* Setup DMA mapping: use page->private for DMA-addr + * This mapping is kept for lifetime of page, until leaving pool. + */ + dma = dma_map_page(pool->p.dev, page, 0, + (PAGE_SIZE << pool->p.order), + pool->p.dma_dir); + if (dma_mapping_error(pool->p.dev, dma)) { + put_page(page); + return NULL; + } + set_page_private(page, dma); /* page->private = dma; */ + +skip_dma_map: + /* When page just alloc'ed is should/must have refcnt 1. */ + return page; +} + +/* For using page_pool replace: alloc_pages() API calls, but provide + * synchronization guarantee for allocation side. 
+ */ +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + struct page *page; + + /* Fast-path: Get a page from cache */ + page = __page_pool_get_cached(pool); + if (page) + return page; + + /* Slow-path: cache empty, do real allocation */ + page = __page_pool_alloc_pages_slow(pool, gfp); + return page; +} +EXPORT_SYMBOL(page_pool_alloc_pages); + +/* Cleanup page_pool state from page */ +static void __page_pool_clean_page(struct page_pool *pool, + struct page *page) +{ + if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + return; + + /* DMA unmap */ + dma_unmap_page(pool->p.dev, page_private(page), + PAGE_SIZE << pool->p.order, pool->p.dma_dir); + set_page_private(page, 0); +} + +/* Return a page to the page allocator, cleaning up our state */ +static void __page_pool_return_page(struct page_pool *pool, struct page *page) +{ + __page_pool_clean_page(pool, page); + put_page(page); + /* An optimization would be to call __free_pages(page, pool->p.order) + * knowing page is not part of page-cache (thus avoiding a + * __page_cache_release() call). + */ +} + +static bool __page_pool_recycle_into_ring(struct page_pool *pool, + struct page *page) +{ + int ret; + /* BH protection not needed if current is serving softirq */ + if (in_serving_softirq()) + ret = ptr_ring_produce(&pool->ring, page); + else + ret = ptr_ring_produce_bh(&pool->ring, page); + + return (ret == 0) ? true : false; +} + +/* Only allow direct recycling in special circumstances, into the + * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. + * + * Caller must provide appropriate safe context. + */ +static bool __page_pool_recycle_direct(struct page *page, + struct page_pool *pool) +{ + if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) + return false; + + /* Caller MUST have verified/know (page_ref_count(page) == 1) */ + pool->alloc.cache[pool->alloc.count++] = page; + return true; +} + +void __page_pool_put_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + /* This allocator is optimized for the XDP mode that uses + * one-frame-per-page, but have fallbacks that act like the + * regular page allocator APIs. + * + * refcnt == 1 means page_pool owns page, and can recycle it. + */ + if (likely(page_ref_count(page) == 1)) { + /* Read barrier done in page_ref_count / READ_ONCE */ + + if (allow_direct && in_serving_softirq()) + if (__page_pool_recycle_direct(page, pool)) + return; + + if (!__page_pool_recycle_into_ring(pool, page)) { + /* Cache full, fallback to free pages */ + __page_pool_return_page(pool, page); + } + return; + } + /* Fallback/non-XDP mode: API user have elevated refcnt. + * + * Many drivers split up the page into fragments, and some + * want to keep doing this to save memory and do refcnt based + * recycling. Support this use case too, to ease drivers + * switching between XDP/non-XDP. + * + * In-case page_pool maintains the DMA mapping, API user must + * call page_pool_put_page once. In this elevated refcnt + * case, the DMA is unmapped/released, as driver is likely + * doing refcnt based recycle tricks, meaning another process + * will be invoking put_page. 
+ */ + __page_pool_clean_page(pool, page); + put_page(page); +} +EXPORT_SYMBOL(__page_pool_put_page); + +static void __page_pool_empty_ring(struct page_pool *pool) +{ + struct page *page; + + /* Empty recycle ring */ + while ((page = ptr_ring_consume(&pool->ring))) { + /* Verify the refcnt invariant of cached pages */ + if (!(page_ref_count(page) == 1)) + pr_crit("%s() page_pool refcnt %d violation\n", + __func__, page_ref_count(page)); + + __page_pool_return_page(pool, page); + } +} + +static void __page_pool_destroy_rcu(struct rcu_head *rcu) +{ + struct page_pool *pool; + + pool = container_of(rcu, struct page_pool, rcu); + + WARN(pool->alloc.count, "API usage violation"); + + __page_pool_empty_ring(pool); + ptr_ring_cleanup(&pool->ring, NULL); + kfree(pool); +} + +/* Cleanup and release resources */ +void page_pool_destroy(struct page_pool *pool) +{ + struct page *page; + + /* Empty alloc cache, assume caller made sure this is + * no-longer in use, and page_pool_alloc_pages() cannot be + * call concurrently. + */ + while (pool->alloc.count) { + page = pool->alloc.cache[--pool->alloc.count]; + __page_pool_return_page(pool, page); + } + + /* No more consumers should exist, but producers could still + * be in-flight. + */ + __page_pool_empty_ring(pool); + + /* An xdp_mem_allocator can still ref page_pool pointer */ + call_rcu(&pool->rcu, __page_pool_destroy_rcu); +} +EXPORT_SYMBOL(page_pool_destroy); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 45936922d7e2..80802546c279 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -785,13 +785,15 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { - .rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse), - .rta_used = dst->__use, - .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, }; + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; + ci.rta_clntref = atomic_read(&dst->__refcnt); + } if (expires) { unsigned long clock; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 345b51837ca8..c642304f178c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1305,7 +1305,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->inner_mac_header += off; } -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +void skb_copy_header(struct sk_buff *new, const struct sk_buff *old) { __copy_skb_header(new, old); @@ -1313,6 +1313,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +EXPORT_SYMBOL(skb_copy_header); static inline int skb_alloc_rx_flag(const struct sk_buff *skb) { @@ -1355,7 +1356,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); - copy_skb_header(n, skb); + skb_copy_header(n, skb); return n; } EXPORT_SYMBOL(skb_copy); @@ -1419,7 +1420,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, skb_clone_fraglist(n); } - copy_skb_header(n, skb); + skb_copy_header(n, skb); out: return n; } @@ -1599,7 +1600,7 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, skb->len + head_copy_len)); - copy_skb_header(n, skb); + skb_copy_header(n, skb); skb_headers_offset_update(n, 
newheadroom - oldheadroom); @@ -1839,6 +1840,20 @@ done: } EXPORT_SYMBOL(___pskb_trim); +/* Note : use pskb_trim_rcsum() instead of calling this directly + */ +int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + int delta = skb->len - len; + + skb->csum = csum_sub(skb->csum, + skb_checksum(skb, len, delta, 0)); + } + return __pskb_trim(skb, len); +} +EXPORT_SYMBOL(pskb_trim_rcsum_slow); + /** * __pskb_pull_tail - advance tail of skb header * @skb: buffer to reallocate @@ -4926,6 +4941,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen = tcp_hdrlen(skb); } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); + } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { + thlen = sizeof(struct udphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already diff --git a/net/core/sock.c b/net/core/sock.c index 3b6d02854e57..435a0ba85e52 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ - x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") @@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = { "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , + "rlock-AF_MAX" }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , @@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , + "wlock-AF_MAX" }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , @@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , + "elock-AF_MAX" }; /* @@ -323,8 +327,8 @@ EXPORT_SYMBOL(sysctl_optmem_max); int sysctl_tstamp_allow_data __read_mostly = 1; -struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL_GPL(memalloc_socks); +DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); +EXPORT_SYMBOL_GPL(memalloc_socks_key); /** * sk_set_memalloc - sets %SOCK_MEMALLOC @@ -338,7 +342,7 @@ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; - static_key_slow_inc(&memalloc_socks); + static_branch_inc(&memalloc_socks_key); } EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -346,7 +350,7 @@ void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); 
sk->sk_allocation &= ~__GFP_MEMALLOC; - static_key_slow_dec(&memalloc_socks); + static_branch_dec(&memalloc_socks_key); /* * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward @@ -905,7 +909,10 @@ set_rcvbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - sk->sk_rcvlowat = val ? : 1; + if (sock->ops->set_rcvlowat) + ret = sock->ops->set_rcvlowat(sk, val); + else + sk->sk_rcvlowat = val ? : 1; break; case SO_RCVTIMEO: diff --git a/net/core/xdp.c b/net/core/xdp.c index 097a0f74e004..bf6758f74339 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -5,6 +5,10 @@ */ #include <linux/types.h> #include <linux/mm.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/rhashtable.h> +#include <net/page_pool.h> #include <net/xdp.h> @@ -13,6 +17,104 @@ #define REG_STATE_UNREGISTERED 0x2 #define REG_STATE_UNUSED 0x3 +static DEFINE_IDA(mem_id_pool); +static DEFINE_MUTEX(mem_id_lock); +#define MEM_ID_MAX 0xFFFE +#define MEM_ID_MIN 1 +static int mem_id_next = MEM_ID_MIN; + +static bool mem_id_init; /* false */ +static struct rhashtable *mem_id_ht; + +struct xdp_mem_allocator { + struct xdp_mem_info mem; + union { + void *allocator; + struct page_pool *page_pool; + }; + struct rhash_head node; + struct rcu_head rcu; +}; + +static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed) +{ + const u32 *k = data; + const u32 key = *k; + + BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id) + != sizeof(u32)); + + /* Use cyclic increasing ID as direct hash key, see rht_bucket_index */ + return key << RHT_HASH_RESERVED_SPACE; +} + +static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct xdp_mem_allocator *xa = ptr; + u32 mem_id = *(u32 *)arg->key; + + return xa->mem.id != mem_id; +} + +static const struct rhashtable_params mem_id_rht_params = { + .nelem_hint = 64, + .head_offset = offsetof(struct xdp_mem_allocator, node), + .key_offset = offsetof(struct xdp_mem_allocator, mem.id), + .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id), + .max_size = MEM_ID_MAX, + .min_size = 8, + .automatic_shrinking = true, + .hashfn = xdp_mem_id_hashfn, + .obj_cmpfn = xdp_mem_id_cmp, +}; + +static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) +{ + struct xdp_mem_allocator *xa; + + xa = container_of(rcu, struct xdp_mem_allocator, rcu); + + /* Allow this ID to be reused */ + ida_simple_remove(&mem_id_pool, xa->mem.id); + + /* Notice, driver is expected to free the *allocator, + * e.g. page_pool, and MUST also use RCU free. 
+ */ + + /* Poison memory */ + xa->mem.id = 0xFFFF; + xa->mem.type = 0xF0F0; + xa->allocator = (void *)0xDEAD9001; + + kfree(xa); +} + +static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + struct xdp_mem_allocator *xa; + int id = xdp_rxq->mem.id; + int err; + + if (id == 0) + return; + + mutex_lock(&mem_id_lock); + + xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); + if (!xa) { + mutex_unlock(&mem_id_lock); + return; + } + + err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params); + WARN_ON(err); + + call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); + + mutex_unlock(&mem_id_lock); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) { /* Simplify driver cleanup code paths, allow unreg "unused" */ @@ -21,8 +123,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + __xdp_rxq_info_unreg_mem_model(xdp_rxq); + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; xdp_rxq->dev = NULL; + + /* Reset mem info to defaults */ + xdp_rxq->mem.id = 0; + xdp_rxq->mem.type = 0; } EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); @@ -71,3 +179,173 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) return (xdp_rxq->reg_state == REG_STATE_REGISTERED); } EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); + +static int __mem_id_init_hash_table(void) +{ + struct rhashtable *rht; + int ret; + + if (unlikely(mem_id_init)) + return 0; + + rht = kzalloc(sizeof(*rht), GFP_KERNEL); + if (!rht) + return -ENOMEM; + + ret = rhashtable_init(rht, &mem_id_rht_params); + if (ret < 0) { + kfree(rht); + return ret; + } + mem_id_ht = rht; + smp_mb(); /* mutex lock should provide enough pairing */ + mem_id_init = true; + + return 0; +} + +/* Allocate a cyclic ID that maps to allocator pointer. + * See: https://www.kernel.org/doc/html/latest/core-api/idr.html + * + * Caller must lock mem_id_lock. 
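The cyclic ID allocated below ends up in xdp_rxq->mem.id and is looked up again in xdp_return(); a driver only touches this machinery through xdp_rxq_info_reg_mem_model(). A sketch of that registration for the page_pool case, where the netdev, queue index and pool are assumed driver state:

	#include <net/page_pool.h>
	#include <net/xdp.h>

	/* Bind a page_pool to an RX queue so xdp_return_frame()/xdp_return_buff()
	 * can resolve mem.id back to this allocator via mem_id_ht.
	 */
	static int drv_setup_rxq_mem_model(struct net_device *netdev, u32 qid,
					   struct xdp_rxq_info *xdp_rxq,
					   struct page_pool *pool)
	{
		int err;

		err = xdp_rxq_info_reg(xdp_rxq, netdev, qid);
		if (err)
			return err;

		err = xdp_rxq_info_reg_mem_model(xdp_rxq, MEM_TYPE_PAGE_POOL, pool);
		if (err)
			xdp_rxq_info_unreg(xdp_rxq);

		return err;
	}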
+ */ +static int __mem_id_cyclic_get(gfp_t gfp) +{ + int retries = 1; + int id; + +again: + id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + if (id < 0) { + if (id == -ENOSPC) { + /* Cyclic allocator, reset next id */ + if (retries--) { + mem_id_next = MEM_ID_MIN; + goto again; + } + } + return id; /* errno */ + } + mem_id_next = id + 1; + + return id; +} + +static bool __is_supported_mem_type(enum xdp_mem_type type) +{ + if (type == MEM_TYPE_PAGE_POOL) + return is_page_pool_compiled_in(); + + if (type >= MEM_TYPE_MAX) + return false; + + return true; +} + +int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, + enum xdp_mem_type type, void *allocator) +{ + struct xdp_mem_allocator *xdp_alloc; + gfp_t gfp = GFP_KERNEL; + int id, errno, ret; + void *ptr; + + if (xdp_rxq->reg_state != REG_STATE_REGISTERED) { + WARN(1, "Missing register, driver bug"); + return -EFAULT; + } + + if (!__is_supported_mem_type(type)) + return -EOPNOTSUPP; + + xdp_rxq->mem.type = type; + + if (!allocator) { + if (type == MEM_TYPE_PAGE_POOL) + return -EINVAL; /* Setup time check page_pool req */ + return 0; + } + + /* Delay init of rhashtable to save memory if feature isn't used */ + if (!mem_id_init) { + mutex_lock(&mem_id_lock); + ret = __mem_id_init_hash_table(); + mutex_unlock(&mem_id_lock); + if (ret < 0) { + WARN_ON(1); + return ret; + } + } + + xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp); + if (!xdp_alloc) + return -ENOMEM; + + mutex_lock(&mem_id_lock); + id = __mem_id_cyclic_get(gfp); + if (id < 0) { + errno = id; + goto err; + } + xdp_rxq->mem.id = id; + xdp_alloc->mem = xdp_rxq->mem; + xdp_alloc->allocator = allocator; + + /* Insert allocator into ID lookup table */ + ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); + if (IS_ERR(ptr)) { + errno = PTR_ERR(ptr); + goto err; + } + + mutex_unlock(&mem_id_lock); + + return 0; +err: + mutex_unlock(&mem_id_lock); + kfree(xdp_alloc); + return errno; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); + +static void xdp_return(void *data, struct xdp_mem_info *mem) +{ + struct xdp_mem_allocator *xa; + struct page *page; + + switch (mem->type) { + case MEM_TYPE_PAGE_POOL: + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + page = virt_to_head_page(data); + if (xa) + page_pool_put_page(xa->page_pool, page); + else + put_page(page); + rcu_read_unlock(); + break; + case MEM_TYPE_PAGE_SHARED: + page_frag_free(data); + break; + case MEM_TYPE_PAGE_ORDER0: + page = virt_to_page(data); /* Assumes order0 page*/ + put_page(page); + break; + default: + /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + break; + } +} + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + xdp_return(xdpf->data, &xdpf->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + xdp_return(xdp->data, &xdp->rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index c795c3f509c9..72236695db3d 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -121,13 +121,16 @@ static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct dn_fib_rule *r = (struct dn_fib_rule *)rule; - if (frh->tos) + if (frh->tos) { + 
NL_SET_ERR_MSG(extack, "Invalid tos value"); goto errout; + } if (rule->table == RT_TABLE_UNSPEC) { if (rule->action == FR_ACT_TO_TBL) { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index bbf2c82cf7b2..4183e4ba27a5 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,7 +9,7 @@ config NET_DSA depends on HAVE_NET_DSA && MAY_USE_DEVLINK depends on BRIDGE || BRIDGE=n select NET_SWITCHDEV - select PHYLIB + select PHYLINK ---help--- Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 47725250b4ca..dc5d9af3dc80 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -272,7 +272,28 @@ static int dsa_port_setup(struct dsa_port *dp) case DSA_PORT_TYPE_UNUSED: break; case DSA_PORT_TYPE_CPU: + /* dp->index is used now as port_number. However + * CPU ports should have separate numbering + * independent from front panel port numbers. + */ + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_CPU, + dp->index, false, 0); + err = dsa_port_link_register_of(dp); + if (err) { + dev_err(ds->dev, "failed to setup link for port %d.%d\n", + ds->index, dp->index); + return err; + } + break; case DSA_PORT_TYPE_DSA: + /* dp->index is used now as port_number. However + * DSA ports should have separate numbering + * independent from front panel port numbers. + */ + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_DSA, + dp->index, false, 0); err = dsa_port_link_register_of(dp); if (err) { dev_err(ds->dev, "failed to setup link for port %d.%d\n", @@ -281,6 +302,9 @@ static int dsa_port_setup(struct dsa_port *dp) } break; case DSA_PORT_TYPE_USER: + devlink_port_attrs_set(&dp->devlink_port, + DEVLINK_PORT_FLAVOUR_PHYSICAL, + dp->index, false, 0); err = dsa_slave_create(dp); if (err) dev_err(ds->dev, "failed to create slave for port %d.%d\n", diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 053731473c99..3964c6f7a7c0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -75,15 +75,6 @@ struct dsa_slave_priv { /* DSA port data, such as switch, port index, etc. */ struct dsa_port *dp; - /* - * The phylib phy_device pointer for the PHY connected - * to this port. 
- */ - phy_interface_t phy_interface; - int old_link; - int old_pause; - int old_duplex; - #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif diff --git a/net/dsa/master.c b/net/dsa/master.c index 90e6df0351eb..c90ee3227dea 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, int port = cpu_dp->index; int count = 0; - if (ops && ops->get_sset_count && ops->get_ethtool_stats) { + if (ops->get_sset_count && ops->get_ethtool_stats) { count = ops->get_sset_count(dev, ETH_SS_STATS); ops->get_ethtool_stats(dev, stats, data); } @@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, ds->ops->get_ethtool_stats(ds, port, data + count); } +static void dsa_master_get_ethtool_phy_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; + int count = 0; + + if (dev->phydev && !ops->get_ethtool_phy_stats) { + count = phy_ethtool_get_sset_count(dev->phydev); + if (count >= 0) + phy_ethtool_get_stats(dev->phydev, stats, data); + } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) { + count = ops->get_sset_count(dev, ETH_SS_PHY_STATS); + ops->get_ethtool_phy_stats(dev, stats, data); + } + + if (count < 0) + count = 0; + + if (ds->ops->get_ethtool_phy_stats) + ds->ops->get_ethtool_phy_stats(ds, port, data + count); +} + static int dsa_master_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *cpu_dp = dev->dsa_ptr; @@ -38,11 +64,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) struct dsa_switch *ds = cpu_dp->ds; int count = 0; - if (ops && ops->get_sset_count) - count += ops->get_sset_count(dev, sset); + if (sset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) + count = phy_ethtool_get_sset_count(dev->phydev); + else if (ops->get_sset_count) + count = ops->get_sset_count(dev, sset); + + if (count < 0) + count = 0; - if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds, cpu_dp->index); + if (ds->ops->get_sset_count) + count += ds->ops->get_sset_count(ds, cpu_dp->index, sset); return count; } @@ -64,19 +96,28 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; - if (ops && ops->get_sset_count && ops->get_strings) { - mcount = ops->get_sset_count(dev, ETH_SS_STATS); + if (stringset == ETH_SS_PHY_STATS && dev->phydev && + !ops->get_ethtool_phy_stats) { + mcount = phy_ethtool_get_sset_count(dev->phydev); + if (mcount < 0) + mcount = 0; + else + phy_ethtool_get_strings(dev->phydev, data); + } else if (ops->get_sset_count && ops->get_strings) { + mcount = ops->get_sset_count(dev, stringset); + if (mcount < 0) + mcount = 0; ops->get_strings(dev, stringset, data); } - if (stringset == ETH_SS_STATS && ds->ops->get_strings) { + if (ds->ops->get_strings) { ndata = data + mcount * len; /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->ops->get_strings(ds, port, ndata); - count = ds->ops->get_sset_count(ds, port); + ds->ops->get_strings(ds, port, stringset, ndata); + count = ds->ops->get_sset_count(ds, port, stringset); for (i = 0; i < count; i++) { memmove(ndata + (i * len + 
sizeof(pfx)), ndata + i * len, len - sizeof(pfx)); @@ -102,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev) ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; ops->get_strings = dsa_master_get_strings; + ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats; dev->ethtool_ops = ops; diff --git a/net/dsa/port.c b/net/dsa/port.c index 7acc1169d75e..2413beb995be 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -273,25 +273,38 @@ int dsa_port_vlan_del(struct dsa_port *dp, return 0; } -static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) +static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) { - struct device_node *port_dn = dp->dn; struct device_node *phy_dn; - struct dsa_switch *ds = dp->ds; struct phy_device *phydev; - int port = dp->index; - int err = 0; - phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); + phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0); if (!phy_dn) - return 0; + return NULL; phydev = of_phy_find_device(phy_dn); if (!phydev) { - err = -EPROBE_DEFER; - goto err_put_of; + of_node_put(phy_dn); + return ERR_PTR(-EPROBE_DEFER); } + return phydev; +} + +static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) +{ + struct dsa_switch *ds = dp->ds; + struct phy_device *phydev; + int port = dp->index; + int err = 0; + + phydev = dsa_port_get_phy_device(dp); + if (!phydev) + return 0; + + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + if (enable) { err = genphy_config_init(phydev); if (err < 0) @@ -317,8 +330,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable) err_put_dev: put_device(&phydev->mdio.dev); -err_put_of: - of_node_put(phy_dn); return err; } @@ -372,3 +383,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp) else dsa_port_setup_phy_of(dp, false); } + +int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data) +{ + struct phy_device *phydev; + int ret = -EOPNOTSUPP; + + if (of_phy_is_fixed_link(dp->dn)) + return ret; + + phydev = dsa_port_get_phy_device(dp); + if (IS_ERR_OR_NULL(phydev)) + return ret; + + ret = phy_ethtool_get_strings(phydev, data); + put_device(&phydev->mdio.dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings); + +int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data) +{ + struct phy_device *phydev; + int ret = -EOPNOTSUPP; + + if (of_phy_is_fixed_link(dp->dn)) + return ret; + + phydev = dsa_port_get_phy_device(dp); + if (IS_ERR_OR_NULL(phydev)) + return ret; + + ret = phy_ethtool_get_stats(phydev, NULL, data); + put_device(&phydev->mdio.dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats); + +int dsa_port_get_phy_sset_count(struct dsa_port *dp) +{ + struct phy_device *phydev; + int ret = -EOPNOTSUPP; + + if (of_phy_is_fixed_link(dp->dn)) + return ret; + + phydev = dsa_port_get_phy_device(dp); + if (IS_ERR_OR_NULL(phydev)) + return ret; + + ret = phy_ethtool_get_sset_count(phydev); + put_device(&phydev->mdio.dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 18561af7a8f1..1e3b6a6d8a40 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -13,6 +13,7 @@ #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> +#include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> @@ -97,8 +98,7 @@ static int dsa_slave_open(struct net_device *dev) if (err) goto clear_promisc; - if (dev->phydev) - 
phy_start(dev->phydev); + phylink_start(dp->pl); return 0; @@ -120,8 +120,7 @@ static int dsa_slave_close(struct net_device *dev) struct net_device *master = dsa_slave_to_master(dev); struct dsa_port *dp = dsa_slave_to_port(dev); - if (dev->phydev) - phy_stop(dev->phydev); + phylink_stop(dp->pl); dsa_port_disable(dp, dev->phydev); @@ -272,10 +271,7 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (!dev->phydev) - return -ENODEV; - - return phy_mii_ioctl(dev->phydev, ifr, cmd); + return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, @@ -498,14 +494,11 @@ dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) ds->ops->get_regs(ds, dp->index, regs, _p); } -static u32 dsa_slave_get_link(struct net_device *dev) +static int dsa_slave_nway_reset(struct net_device *dev) { - if (!dev->phydev) - return -ENODEV; - - genphy_update_link(dev->phydev); + struct dsa_port *dp = dsa_slave_to_port(dev); - return dev->phydev->link; + return phylink_ethtool_nway_reset(dp->pl); } static int dsa_slave_get_eeprom_len(struct net_device *dev) @@ -560,7 +553,8 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) - ds->ops->get_strings(ds, dp->index, data + 4 * len); + ds->ops->get_strings(ds, dp->index, stringset, + data + 4 * len); } } @@ -605,7 +599,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) count = 4; if (ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds, dp->index); + count += ds->ops->get_sset_count(ds, dp->index, sset); return count; } @@ -618,6 +612,8 @@ static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_switch *ds = dp->ds; + phylink_ethtool_get_wol(dp->pl, w); + if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } @@ -628,6 +624,8 @@ static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; + phylink_ethtool_set_wol(dp->pl, w); + if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); @@ -651,13 +649,7 @@ static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) if (ret) return ret; - if (e->eee_enabled) { - ret = phy_init_eee(dev->phydev, 0); - if (ret) - return ret; - } - - return phy_ethtool_set_eee(dev->phydev, e); + return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) @@ -677,7 +669,23 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) if (ret) return ret; - return phy_ethtool_get_eee(dev->phydev, e); + return phylink_ethtool_get_eee(dp->pl, e); +} + +static int dsa_slave_get_link_ksettings(struct net_device *dev, + struct ethtool_link_ksettings *cmd) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_ksettings_get(dp->pl, cmd); +} + +static int dsa_slave_set_link_ksettings(struct net_device *dev, + const struct ethtool_link_ksettings *cmd) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return phylink_ethtool_ksettings_set(dp->pl, cmd); } #ifdef CONFIG_NET_POLL_CONTROLLER @@ -980,8 +988,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, - .nway_reset = phy_ethtool_nway_reset, 
- .get_link = dsa_slave_get_link, + .nway_reset = dsa_slave_nway_reset, + .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, .set_eeprom = dsa_slave_set_eeprom, @@ -992,8 +1000,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .get_link_ksettings = phy_ethtool_get_link_ksettings, - .set_link_ksettings = phy_ethtool_set_link_ksettings, + .get_link_ksettings = dsa_slave_get_link_ksettings, + .set_link_ksettings = dsa_slave_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, @@ -1052,56 +1060,122 @@ static struct device_type dsa_type = { .name = "dsa", }; -static void dsa_slave_adjust_link(struct net_device *dev) +static void dsa_slave_phylink_validate(struct net_device *dev, + unsigned long *supported, + struct phylink_link_state *state) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = dp->ds; - unsigned int status_changed = 0; - if (p->old_link != dev->phydev->link) { - status_changed = 1; - p->old_link = dev->phydev->link; - } + if (!ds->ops->phylink_validate) + return; - if (p->old_duplex != dev->phydev->duplex) { - status_changed = 1; - p->old_duplex = dev->phydev->duplex; - } + ds->ops->phylink_validate(ds, dp->index, supported, state); +} - if (p->old_pause != dev->phydev->pause) { - status_changed = 1; - p->old_pause = dev->phydev->pause; - } +static int dsa_slave_phylink_mac_link_state(struct net_device *dev, + struct phylink_link_state *state) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + /* Only called for SGMII and 802.3z */ + if (!ds->ops->phylink_mac_link_state) + return -EOPNOTSUPP; + + return ds->ops->phylink_mac_link_state(ds, dp->index, state); +} + +static void dsa_slave_phylink_mac_config(struct net_device *dev, + unsigned int mode, + const struct phylink_link_state *state) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_config) + return; + + ds->ops->phylink_mac_config(ds, dp->index, mode, state); +} - if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, dp->index, dev->phydev); +static void dsa_slave_phylink_mac_an_restart(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; - if (status_changed) - phy_print_status(dev->phydev); + if (!ds->ops->phylink_mac_an_restart) + return; + + ds->ops->phylink_mac_an_restart(ds, dp->index); } -static int dsa_slave_fixed_link_update(struct net_device *dev, - struct fixed_phy_status *status) +static void dsa_slave_phylink_mac_link_down(struct net_device *dev, + unsigned int mode, + phy_interface_t interface) { - struct dsa_switch *ds; - struct dsa_port *dp; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; - if (dev) { - dp = dsa_slave_to_port(dev); - ds = dp->ds; - if (ds->ops->fixed_link_update) - ds->ops->fixed_link_update(ds, dp->index, status); + if (!ds->ops->phylink_mac_link_down) { + if (ds->ops->adjust_link && dev->phydev) + ds->ops->adjust_link(ds, dp->index, dev->phydev); + return; } - return 0; + ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface); +} + +static void dsa_slave_phylink_mac_link_up(struct net_device *dev, + unsigned int mode, + phy_interface_t interface, + struct phy_device 
*phydev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->phylink_mac_link_up) { + if (ds->ops->adjust_link && dev->phydev) + ds->ops->adjust_link(ds, dp->index, dev->phydev); + return; + } + + ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev); +} + +static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = { + .validate = dsa_slave_phylink_validate, + .mac_link_state = dsa_slave_phylink_mac_link_state, + .mac_config = dsa_slave_phylink_mac_config, + .mac_an_restart = dsa_slave_phylink_mac_an_restart, + .mac_link_down = dsa_slave_phylink_mac_link_down, + .mac_link_up = dsa_slave_phylink_mac_link_up, +}; + +void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) +{ + const struct dsa_port *dp = dsa_to_port(ds, port); + + phylink_mac_change(dp->pl, up); +} +EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); + +static void dsa_slave_phylink_fixed_state(struct net_device *dev, + struct phylink_link_state *state) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + /* No need to check that this operation is valid, the callback would + * not be called if it was not. + */ + ds->ops->phylink_fixed_state(ds, dp->index, state); } /* slave device setup *******************************************************/ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); - struct dsa_slave_priv *p = netdev_priv(slave_dev); struct dsa_switch *ds = dp->ds; slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); @@ -1110,75 +1184,54 @@ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) return -ENODEV; } - /* Use already configured phy mode */ - if (p->phy_interface == PHY_INTERFACE_MODE_NA) - p->phy_interface = slave_dev->phydev->interface; - - return phy_connect_direct(slave_dev, slave_dev->phydev, - dsa_slave_adjust_link, p->phy_interface); + return phylink_connect_phy(dp->pl, slave_dev->phydev); } static int dsa_slave_phy_setup(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); - struct dsa_slave_priv *p = netdev_priv(slave_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; - struct device_node *phy_dn; - bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; - p->phy_interface = mode; - phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); - if (!phy_dn && of_phy_is_fixed_link(port_dn)) { - /* In the case of a fixed PHY, the DT node associated - * to the fixed PHY is the Port DT node - */ - ret = of_phy_register_fixed_link(port_dn); - if (ret) { - netdev_err(slave_dev, "failed to register fixed PHY: %d\n", ret); - return ret; - } - phy_is_fixed = true; - phy_dn = of_node_get(port_dn); + dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode, + &dsa_slave_phylink_mac_ops); + if (IS_ERR(dp->pl)) { + netdev_err(slave_dev, + "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); + return PTR_ERR(dp->pl); } + /* Register only if the switch provides such a callback, since this + * callback takes precedence over polling the link GPIO in PHYLINK + * (see phylink_get_fixed_state). 
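The dsa_slave_phylink_* adapters above only forward to optional dsa_switch_ops callbacks, falling back to the legacy adjust_link op when a driver does not provide them. A sketch of what a switch driver would add to opt in; the foo_ names and the register programming are hypothetical, while the signatures mirror the calls made by the adapters:

	#include <linux/phylink.h>
	#include <net/dsa.h>

	static void foo_phylink_validate(struct dsa_switch *ds, int port,
					 unsigned long *supported,
					 struct phylink_link_state *state)
	{
		/* clear link modes in 'supported' that this port cannot do */
	}

	static void foo_phylink_mac_config(struct dsa_switch *ds, int port,
					   unsigned int mode,
					   const struct phylink_link_state *state)
	{
		/* program the port MAC for state->interface/speed/duplex */
	}

	static void foo_phylink_mac_link_down(struct dsa_switch *ds, int port,
					      unsigned int mode,
					      phy_interface_t interface)
	{
		/* force the link down in hardware */
	}

	static void foo_phylink_mac_link_up(struct dsa_switch *ds, int port,
					    unsigned int mode,
					    phy_interface_t interface,
					    struct phy_device *phydev)
	{
		/* force the link up in hardware */
	}

	static const struct dsa_switch_ops foo_switch_ops = {
		/* ... existing ops ... */
		.phylink_validate	= foo_phylink_validate,
		.phylink_mac_config	= foo_phylink_mac_config,
		.phylink_mac_link_down	= foo_phylink_mac_link_down,
		.phylink_mac_link_up	= foo_phylink_mac_link_up,
	};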
+ */ + if (ds->ops->phylink_fixed_state) + phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); + if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); - if (phy_dn) { - slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); - of_node_put(phy_dn); - } - - if (slave_dev->phydev && phy_is_fixed) - fixed_phy_set_link_update(slave_dev->phydev, - dsa_slave_fixed_link_update); - - /* We could not connect to a designated PHY, so use the switch internal - * MDIO bus instead - */ - if (!slave_dev->phydev) { + ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); + if (ret == -ENODEV) { + /* We could not connect to a designated PHY or SFP, so use the + * switch internal MDIO bus instead + */ ret = dsa_slave_phy_connect(slave_dev, dp->index); if (ret) { - netdev_err(slave_dev, "failed to connect to port %d: %d\n", + netdev_err(slave_dev, + "failed to connect to port %d: %d\n", dp->index, ret); - if (phy_is_fixed) - of_phy_deregister_fixed_link(port_dn); + phylink_destroy(dp->pl); return ret; } } - phy_attached_info(slave_dev->phydev); - return 0; } @@ -1193,29 +1246,26 @@ static void dsa_slave_set_lockdep_class_one(struct net_device *dev, int dsa_slave_suspend(struct net_device *slave_dev) { - struct dsa_slave_priv *p = netdev_priv(slave_dev); + struct dsa_port *dp = dsa_slave_to_port(slave_dev); netif_device_detach(slave_dev); - if (slave_dev->phydev) { - phy_stop(slave_dev->phydev); - p->old_pause = -1; - p->old_link = -1; - p->old_duplex = -1; - phy_suspend(slave_dev->phydev); - } + rtnl_lock(); + phylink_stop(dp->pl); + rtnl_unlock(); return 0; } int dsa_slave_resume(struct net_device *slave_dev) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); + netif_device_attach(slave_dev); - if (slave_dev->phydev) { - phy_resume(slave_dev->phydev); - phy_start(slave_dev->phydev); - } + rtnl_lock(); + phylink_start(dp->pl); + rtnl_unlock(); return 0; } @@ -1280,11 +1330,6 @@ int dsa_slave_create(struct dsa_port *port) p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = cpu_dp->tag_ops->xmit; - - p->old_pause = -1; - p->old_link = -1; - p->old_duplex = -1; - port->slave = slave_dev; netif_carrier_off(slave_dev); @@ -1307,9 +1352,10 @@ int dsa_slave_create(struct dsa_port *port) return 0; out_phy: - phy_disconnect(slave_dev->phydev); - if (of_phy_is_fixed_link(port->dn)) - of_phy_deregister_fixed_link(port->dn); + rtnl_lock(); + phylink_disconnect_phy(p->dp->pl); + rtnl_unlock(); + phylink_destroy(p->dp->pl); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1321,17 +1367,15 @@ void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct device_node *port_dn = dp->dn; netif_carrier_off(slave_dev); - if (slave_dev->phydev) { - phy_disconnect(slave_dev->phydev); + rtnl_lock(); + phylink_disconnect_phy(dp->pl); + rtnl_unlock(); - if (of_phy_is_fixed_link(port_dn)) - of_phy_deregister_fixed_link(port_dn); - } dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); + phylink_destroy(dp->pl); free_percpu(p->stats64); free_netdev(slave_dev); } @@ -1394,6 +1438,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; + if (!fdb_info->added_by_user) + break; + err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb 
add failed err=%d\n", err); @@ -1405,6 +1452,9 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) case SWITCHDEV_FDB_DEL_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; + if (!fdb_info->added_by_user) + break; + err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb del failed err=%d\n", err); @@ -1457,8 +1507,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, switch (event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */ case SWITCHDEV_FDB_DEL_TO_DEVICE: - if (dsa_slave_switchdev_fdb_work_init(switchdev_work, - ptr)) + if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr)) goto err_fdb_work_init; dev_hold(dev); break; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index eaeba9b99a73..ee28440f57c5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -128,15 +128,15 @@ u32 eth_get_headlen(void *data, unsigned int len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; - struct flow_keys keys; + struct flow_keys_basic keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len, flags)) + if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto, + sizeof(*eth), len, flags)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a07b7dd06def..b379520f9133 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -13,7 +13,8 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ - inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o + inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ + metrics.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index eaed0367e669..b403499fdabe 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = { .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, - .mmap = sock_no_mmap, +#ifdef CONFIG_MMU + .mmap = tcp_mmap, +#endif .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, @@ -1006,6 +1008,7 @@ const struct proto_ops inet_stream_ops = { .compat_getsockopt = compat_sock_common_getsockopt, .compat_ioctl = inet_compat_ioctl, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 737d11bc8838..f8eb78d042a4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -213,14 +213,17 @@ static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int err = -EINVAL; struct fib4_rule *rule4 = (struct fib4_rule *) rule; - if (frh->tos & ~IPTOS_TOS_MASK) + if (frh->tos & ~IPTOS_TOS_MASK) { + NL_SET_ERR_MSG(extack, "Invalid tos"); goto errout; + } /* split local/main if they are not already split */ err = fib_unmerge(net); diff --git 
a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c27122f01b87..6608db23f54b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1019,47 +1019,8 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - - if (!cfg->fc_mx) - return 0; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (type > RTAX_MAX) - return -EINVAL; - - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - return -EINVAL; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - return -EINVAL; - fi->fib_metrics->metrics[type - 1] = val; - } - - if (ecn_ca) - fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - - return 0; + return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len, + fi->fib_metrics->metrics); } struct fib_info *fib_create_info(struct fib_config *cfg, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 881ac6d046f2..33a88e045efd 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -27,11 +27,6 @@ #include <net/sock_reuseport.h> #include <net/addrconf.h> -#ifdef INET_CSK_DEBUG -const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; -EXPORT_SYMBOL(inet_csk_timer_bug_msg); -#endif - #if IS_ENABLED(CONFIG_IPV6) /* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6 * only, and any IPv4 addresses if not IPv6 only diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f200b304f76c..2d8efeecf619 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -578,6 +578,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, int tunnel_hlen; int version; __be16 df; + int nhoff; + int thoff; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || @@ -605,6 +607,16 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, truncate = true; } + nhoff = skb_network_header(skb) - skb_mac_header(skb); + if (skb->protocol == htons(ETH_P_IP) && + (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) + truncate = true; + + thoff = skb_transport_header(skb) - skb_mac_header(skb); + if (skb->protocol == htons(ETH_P_IPV6) && + (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) + truncate = true; + if (version == 1) { erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), ntohl(md->u.index), truncate, true); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d54abc097800..af5a830ff6ad 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -878,11 +878,14 @@ static int __ip_append_data(struct sock *sk, struct rtable *rt = (struct rtable *)cork->dst; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; + bool paged; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; - mtu = cork->fragsize; + mtu = cork->gso_size ? 
IP_MAX_MTU : cork->fragsize; + paged = !!cork->gso_size; + if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -906,8 +909,8 @@ static int __ip_append_data(struct sock *sk, if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && - !(flags & MSG_MORE) && - !exthdrlen) + (!(flags & MSG_MORE) || cork->gso_size) && + (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; cork->length += length; @@ -933,6 +936,7 @@ static int __ip_append_data(struct sock *sk, unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; + unsigned int pagedlen = 0; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; @@ -953,8 +957,12 @@ alloc_new_skb: if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else + else if (!paged) alloclen = fraglen; + else { + alloclen = min_t(int, fraglen, MAX_HEADER); + pagedlen = fraglen - alloclen; + } alloclen += exthdrlen; @@ -998,7 +1006,7 @@ alloc_new_skb: /* * Find where to start putting bytes. */ - data = skb_put(skb, fraglen + exthdrlen); + data = skb_put(skb, fraglen + exthdrlen - pagedlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); @@ -1014,7 +1022,7 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } - copy = datalen - transhdrlen - fraggap; + copy = datalen - transhdrlen - fraggap - pagedlen; if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); @@ -1022,7 +1030,7 @@ alloc_new_skb: } offset += copy; - length -= datalen - fraggap; + length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; @@ -1136,6 +1144,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, *rtp = NULL; cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; + + cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; @@ -1215,7 +1225,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, return -EOPNOTSUPP; hh_len = LL_RESERVED_SPACE(rt->dst.dev); - mtu = cork->fragsize; + mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; fragheaderlen = sizeof(struct iphdr) + (opt ? 
opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; @@ -1471,9 +1481,8 @@ struct sk_buff *ip_make_skb(struct sock *sk, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags) + struct inet_cork *cork, unsigned int flags) { - struct inet_cork cork; struct sk_buff_head queue; int err; @@ -1482,22 +1491,22 @@ struct sk_buff *ip_make_skb(struct sock *sk, __skb_queue_head_init(&queue); - cork.flags = 0; - cork.addr = 0; - cork.opt = NULL; - err = ip_setup_cork(sk, &cork, ipc, rtp); + cork->flags = 0; + cork->addr = 0; + cork->opt = NULL; + err = ip_setup_cork(sk, cork, ipc, rtp); if (err) return ERR_PTR(err); - err = __ip_append_data(sk, fl4, &queue, &cork, + err = __ip_append_data(sk, fl4, &queue, cork, ¤t->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { - __ip_flush_pending_frames(sk, &queue, &cork); + __ip_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } - return __ip_make_skb(sk, fl4, &queue, &cork); + return __ip_make_skb(sk, fl4, &queue, cork); } /* @@ -1553,7 +1562,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, oif = skb->skb_iif; flowi4_init_output(&fl4, oif, - IP4_REPLY_MARK(net, skb->mark), + IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 2f39479be92f..dde671e97829 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -423,17 +423,17 @@ void __init ip_tunnel_core_init(void) lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } -struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt); EXPORT_SYMBOL(ip_tunnel_metadata_cnt); void ip_tunnel_need_metadata(void) { - static_key_slow_inc(&ip_tunnel_metadata_cnt); + static_branch_inc(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); void ip_tunnel_unneed_metadata(void) { - static_key_slow_dec(&ip_tunnel_metadata_cnt); + static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 43f620feb1c4..86c9f755de3d 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -28,6 +28,9 @@ * * Multiple Nameservers in /proc/net/pnp * -- Josef Siemes <jsiemes@web.de>, Aug 2002 + * + * NTP servers in /proc/net/ipconfig/ntp_servers + * -- Chris Novakovic <chris@chrisn.me.uk>, April 2018 */ #include <linux/types.h> @@ -93,6 +96,7 @@ #define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ #define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers - '3' from resolv.h */ +#define CONF_NTP_SERVERS_MAX 3 /* Maximum number of NTP servers */ #define NONE cpu_to_be32(INADDR_NONE) #define ANY cpu_to_be32(INADDR_ANY) @@ -152,6 +156,7 @@ static int ic_proto_used; /* Protocol used, if any */ #define ic_proto_used 0 #endif static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ +static __be32 ic_ntp_servers[CONF_NTP_SERVERS_MAX]; /* NTP server IP addresses */ static u8 ic_domain[64]; /* DNS (not NIS) domain name */ /* @@ -576,6 +581,15 @@ static inline void __init ic_nameservers_predef(void) ic_nameservers[i] = NONE; } +/* Predefine NTP servers */ +static inline void __init ic_ntp_servers_predef(void) +{ + int i; + + for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) + ic_ntp_servers[i] = NONE; +} + /* * 
DHCP/BOOTP support. */ @@ -671,6 +685,7 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d) 17, /* Boot path */ 26, /* MTU */ 40, /* NIS domain name */ + 42, /* NTP servers */ }; *e++ = 55; /* Parameter request list */ @@ -721,9 +736,11 @@ static void __init ic_bootp_init_ext(u8 *e) *e++ = 3; /* Default gateway request */ *e++ = 4; e += 4; - *e++ = 5; /* Name server request */ - *e++ = 8; - e += 8; +#if CONF_NAMESERVERS_MAX > 0 + *e++ = 6; /* (DNS) name server request */ + *e++ = 4 * CONF_NAMESERVERS_MAX; + e += 4 * CONF_NAMESERVERS_MAX; +#endif *e++ = 12; /* Host name request */ *e++ = 32; e += 32; @@ -748,7 +765,13 @@ static void __init ic_bootp_init_ext(u8 *e) */ static inline void __init ic_bootp_init(void) { + /* Re-initialise all name servers and NTP servers to NONE, in case any + * were set via the "ip=" or "nfsaddrs=" kernel command line parameters: + * any IP addresses specified there will already have been decoded but + * are no longer needed + */ ic_nameservers_predef(); + ic_ntp_servers_predef(); dev_add_pack(&bootp_packet_type); } @@ -912,6 +935,15 @@ static void __init ic_do_bootp_ext(u8 *ext) ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; + case 42: /* NTP servers */ + servers = *ext / 4; + if (servers > CONF_NTP_SERVERS_MAX) + servers = CONF_NTP_SERVERS_MAX; + for (i = 0; i < servers; i++) { + if (ic_ntp_servers[i] == NONE) + memcpy(&ic_ntp_servers[i], ext+1+4*i, 4); + } + break; } } @@ -1257,7 +1289,10 @@ static int __init ic_dynamic(void) #endif /* IPCONFIG_DYNAMIC */ #ifdef CONFIG_PROC_FS +/* proc_dir_entry for /proc/net/ipconfig */ +static struct proc_dir_entry *ipconfig_dir; +/* Name servers: */ static int pnp_seq_show(struct seq_file *seq, void *v) { int i; @@ -1294,6 +1329,62 @@ static const struct file_operations pnp_seq_fops = { .llseek = seq_lseek, .release = single_release, }; + +/* Create the /proc/net/ipconfig directory */ +static int __init ipconfig_proc_net_init(void) +{ + ipconfig_dir = proc_net_mkdir(&init_net, "ipconfig", init_net.proc_net); + if (!ipconfig_dir) + return -ENOMEM; + + return 0; +} + +/* Create a new file under /proc/net/ipconfig */ +static int ipconfig_proc_net_create(const char *name, + const struct file_operations *fops) +{ + char *pname; + struct proc_dir_entry *p; + + if (!ipconfig_dir) + return -ENOMEM; + + pname = kasprintf(GFP_KERNEL, "%s%s", "ipconfig/", name); + if (!pname) + return -ENOMEM; + + p = proc_create(pname, 0444, init_net.proc_net, fops); + kfree(pname); + if (!p) + return -ENOMEM; + + return 0; +} + +/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */ +static int ntp_servers_seq_show(struct seq_file *seq, void *v) +{ + int i; + + for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) { + if (ic_ntp_servers[i] != NONE) + seq_printf(seq, "%pI4\n", &ic_ntp_servers[i]); + } + return 0; +} + +static int ntp_servers_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ntp_servers_seq_show, NULL); +} + +static const struct file_operations ntp_servers_seq_fops = { + .open = ntp_servers_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* CONFIG_PROC_FS */ /* @@ -1368,8 +1459,20 @@ static int __init ip_auto_config(void) int err; unsigned int i; + /* Initialise all name servers and NTP servers to NONE (but only if the + * "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded, + * otherwise we'll overwrite the IP addresses specified there) + */ + if (ic_set_manually == 0) { + ic_nameservers_predef(); + 
ic_ntp_servers_predef(); + } + #ifdef CONFIG_PROC_FS proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops); + + if (ipconfig_proc_net_init() == 0) + ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) @@ -1481,16 +1584,32 @@ static int __init ip_auto_config(void) &ic_servaddr, &root_server_addr, root_server_path); if (ic_dev_mtu) pr_cont(", mtu=%d", ic_dev_mtu); - for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + /* Name servers (if any): */ + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { if (ic_nameservers[i] != NONE) { - pr_cont(" nameserver%u=%pI4", - i, &ic_nameservers[i]); - break; + if (i == 0) + pr_info(" nameserver%u=%pI4", + i, &ic_nameservers[i]); + else + pr_cont(", nameserver%u=%pI4", + i, &ic_nameservers[i]); } - for (i++; i < CONF_NAMESERVERS_MAX; i++) - if (ic_nameservers[i] != NONE) - pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); - pr_cont("\n"); + if (i + 1 == CONF_NAMESERVERS_MAX) + pr_cont("\n"); + } + /* NTP servers (if any): */ + for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) { + if (ic_ntp_servers[i] != NONE) { + if (i == 0) + pr_info(" ntpserver%u=%pI4", + i, &ic_ntp_servers[i]); + else + pr_cont(", ntpserver%u=%pI4", + i, &ic_ntp_servers[i]); + } + if (i + 1 == CONF_NTP_SERVERS_MAX) + pr_cont("\n"); + } #endif /* !SILENT */ /* @@ -1588,7 +1707,9 @@ static int __init ip_auto_config_setup(char *addrs) return 1; } + /* Initialise all name servers and NTP servers to NONE */ ic_nameservers_predef(); + ic_ntp_servers_predef(); /* Parse string for static IP assignment. */ ip = addrs; @@ -1647,6 +1768,13 @@ static int __init ip_auto_config_setup(char *addrs) ic_nameservers[1] = NONE; } break; + case 9: + if (CONF_NTP_SERVERS_MAX >= 1) { + ic_ntp_servers[0] = in_aton(ip); + if (ic_ntp_servers[0] == ANY) + ic_ntp_servers[0] = NONE; + } + break; } } ip = cp; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2fb4de3f7f66..38e092eafc97 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -201,7 +201,8 @@ static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = { }; static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c new file mode 100644 index 000000000000..5121c6475e6b --- /dev/null +++ b/net/ipv4/metrics.c @@ -0,0 +1,53 @@ +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/types.h> +#include <net/ip.h> +#include <net/net_namespace.h> +#include <net/tcp.h> + +int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len, + u32 *metrics) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!fc_mx) + return 0; + + nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + metrics[type - 1] = val; + } + + if (ecn_ca) + metrics[RTAX_FEATURES - 1] 
|= DST_FEATURE_ECN_CA; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_metrics_convert); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e85f35b89c49..1ef37e2e2679 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -301,7 +301,7 @@ ipt_do_table(struct sk_buff *skb, counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); - t = ipt_get_target(e); + t = ipt_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index a03e4e7ef5f9..ce1512b02cb2 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -47,7 +47,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par) static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0f7255cc65ee..529d89ec31e8 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -33,8 +33,7 @@ static const struct xt_table nf_nat_ipv4_table = { static unsigned int iptable_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { return ipt_do_table(skb, state, state->net->ipv4.nat_table); } diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 0cd46bffa469..e1e56d7123d2 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -2,265 +2,12 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> -#include <linux/rhashtable.h> -#include <linux/ip.h> -#include <linux/netdevice.h> -#include <net/ip.h> -#include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> -/* For layer 4 checksum field offset. 
*/ -#include <linux/tcp.h> -#include <linux/udp.h> - -static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) -{ - struct tcphdr *tcph; - - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - - tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); - - return 0; -} - -static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) -{ - struct udphdr *udph; - - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - - udph = (void *)(skb_network_header(skb) + thoff); - if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace4(&udph->check, skb, addr, - new_addr, true); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - - return 0; -} - -static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, - unsigned int thoff, __be32 addr, - __be32 new_addr) -{ - switch (iph->protocol) { - case IPPROTO_TCP: - if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - case IPPROTO_UDP: - if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - } - - return 0; -} - -static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - __be32 addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = iph->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; - iph->saddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = iph->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; - iph->daddr = new_addr; - break; - default: - return -1; - } - csum_replace4(&iph->check, addr, new_addr); - - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); -} - -static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - __be32 addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = iph->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; - iph->daddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = iph->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; - iph->saddr = new_addr; - break; - default: - return -1; - } - csum_replace4(&iph->check, addr, new_addr); - - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); -} - -static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, - enum flow_offload_tuple_dir dir) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int thoff = iph->ihl * 4; - - if (flow->flags & FLOW_OFFLOAD_SNAT && - (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && - (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - - return 0; -} - -static bool ip_has_options(unsigned int thoff) -{ - return thoff != sizeof(struct iphdr); -} - -static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) -{ - struct flow_ports *ports; - unsigned int thoff; - struct iphdr *iph; - - if 
(!pskb_may_pull(skb, sizeof(*iph))) - return -1; - - iph = ip_hdr(skb); - thoff = iph->ihl * 4; - - if (ip_is_fragment(iph) || - unlikely(ip_has_options(thoff))) - return -1; - - if (iph->protocol != IPPROTO_TCP && - iph->protocol != IPPROTO_UDP) - return -1; - - thoff = iph->ihl * 4; - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) - return -1; - - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); - - tuple->src_v4.s_addr = iph->saddr; - tuple->dst_v4.s_addr = iph->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; - tuple->l3proto = AF_INET; - tuple->l4proto = iph->protocol; - tuple->iifidx = dev->ifindex; - - return 0; -} - -/* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) -{ - if (skb->len <= mtu) - return false; - - if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) - return false; - - if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) - return false; - - return true; -} - -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt) -{ - u32 mtu; - - mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - -unsigned int -nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - const struct rtable *rt; - struct iphdr *iph; - __be32 nexthop; - - if (skb->protocol != htons(ETH_P_IP)) - return NF_ACCEPT; - - if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; - - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - - dir = tuplehash->tuple.dir; - flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) - return NF_ACCEPT; - - if (skb_try_make_writable(skb, sizeof(*iph))) - return NF_DROP; - - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ip(flow, skb, dir) < 0) - return NF_DROP; - - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; - iph = ip_hdr(skb); - ip_decrease_ttl(iph); - - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - - return NF_STOLEN; -} -EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); static struct nf_flowtable_type flowtable_ipv4 = { .family = NFPROTO_IPV4, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ip_hook, .owner = THIS_MODULE, diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index ac8342dcb55e..4e6b53ab6c33 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -395,7 +395,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, static void ip_nat_q931_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; if (this->tuple.src.u3.ip != 0) { /* Only accept calls from GK */ nf_nat_follow_master(new, this); @@ -497,7 +497,7 @@ static int nat_q931(struct 
sk_buff *skb, struct nf_conn *ct, static void ip_nat_callforwarding_expect(struct nf_conn *new, struct nf_conntrack_expect *this) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(new->status & IPS_NAT_DONE_MASK); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index f7ff6a364d7b..325e02956bf5 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -63,7 +63,7 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, #endif /* CONFIG_XFRM */ static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); @@ -143,7 +143,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); @@ -246,8 +246,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -285,7 +284,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(priv, skb, state, ct); + ret = do_chain(priv, skb, state); if (ret != NF_ACCEPT) return ret; @@ -326,8 +325,7 @@ nf_nat_ipv4_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; @@ -346,8 +344,7 @@ nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -383,8 +380,7 @@ nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index 0c366aad89cb..f538c5001547 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -24,13 +24,13 @@ unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 8a69363b4884..5d259a12e25f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -48,7 +48,7 @@ static void pptp_nat_expected(struct 
nf_conn *ct, struct nf_conntrack_tuple t = {}; const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; - struct nf_nat_range range; + struct nf_nat_range2 range; struct nf_conn_nat *nat; nat = nf_ct_nat_ext_add(ct); diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index edf05002d674..00fda6331ce5 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -41,7 +41,7 @@ MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); static void gre_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 7b98baa13ede..6d7cf1d79baf 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -30,7 +30,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, static void icmp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index b5464a3f253b..285baccfbdea 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -28,8 +28,7 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index f18677277119..f1193e1e928a 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -21,7 +21,7 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index a058de677e94..6c1ff89a60fa 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -296,6 +296,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPDelivered", LINUX_MIB_TCPDELIVERED), + SNMP_MIB_ITEM("TCPDeliveredCE", LINUX_MIB_TCPDELIVEREDCE), + SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4b195bac8ac0..d2eed3ddcb0a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,6 +46,7 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +static int comp_sack_nr_max = 255; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1152,6 +1153,22 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one, }, { + .procname = "tcp_comp_sack_delay_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_delay_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + 
.proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "tcp_comp_sack_nr", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &comp_sack_nr_max, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c9d00ef54dec..0a2ea0bbf867 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1702,6 +1702,139 @@ int tcp_peek_len(struct socket *sock) } EXPORT_SYMBOL(tcp_peek_len); +/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ +int tcp_set_rcvlowat(struct sock *sk, int val) +{ + sk->sk_rcvlowat = val ? : 1; + + /* Check if we need to signal EPOLLIN right now */ + tcp_data_ready(sk); + + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) + return 0; + + /* val comes from user space and might be close to INT_MAX */ + val <<= 1; + if (val < 0) + val = INT_MAX; + + val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + if (val > sk->sk_rcvbuf) { + sk->sk_rcvbuf = val; + tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + } + return 0; +} +EXPORT_SYMBOL(tcp_set_rcvlowat); + +#ifdef CONFIG_MMU +static const struct vm_operations_struct tcp_vm_ops = { +}; + +int tcp_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); + + /* Instruct vm_insert_page() to not down_read(mmap_sem) */ + vma->vm_flags |= VM_MIXEDMAP; + + vma->vm_ops = &tcp_vm_ops; + return 0; +} +EXPORT_SYMBOL(tcp_mmap); + +static int tcp_zerocopy_receive(struct sock *sk, + struct tcp_zerocopy_receive *zc) +{ + unsigned long address = (unsigned long)zc->address; + const skb_frag_t *frags = NULL; + u32 length = 0, seq, offset; + struct vm_area_struct *vma; + struct sk_buff *skb = NULL; + struct tcp_sock *tp; + int ret; + + if (address & (PAGE_SIZE - 1) || address != zc->address) + return -EINVAL; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + + sock_rps_record_flow(sk); + + down_read(¤t->mm->mmap_sem); + + ret = -EINVAL; + vma = find_vma(current->mm, address); + if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) + goto out; + zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); + + tp = tcp_sk(sk); + seq = tp->copied_seq; + zc->length = min_t(u32, zc->length, tcp_inq(sk)); + zc->length &= ~(PAGE_SIZE - 1); + + zap_page_range(vma, address, zc->length); + + zc->recv_skip_hint = 0; + ret = 0; + while (length + PAGE_SIZE <= zc->length) { + if (zc->recv_skip_hint < PAGE_SIZE) { + if (skb) { + skb = skb->next; + offset = seq - TCP_SKB_CB(skb)->seq; + } else { + skb = tcp_recv_skb(sk, seq, &offset); + } + + zc->recv_skip_hint = skb->len - offset; + offset -= skb_headlen(skb); + if ((int)offset < 0 || skb_has_frag_list(skb)) + break; + frags = skb_shinfo(skb)->frags; + while (offset) { + if (frags->size > offset) + goto out; + offset -= frags->size; + frags++; + } + } + if (frags->size != PAGE_SIZE || frags->page_offset) + break; + ret = vm_insert_page(vma, address + length, + skb_frag_page(frags)); + if (ret) + break; + length += PAGE_SIZE; + seq += PAGE_SIZE; + zc->recv_skip_hint -= PAGE_SIZE; + frags++; + } +out: + up_read(¤t->mm->mmap_sem); + if (length) { + tp->copied_seq = seq; + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. 
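
The tcp_set_rcvlowat() and tcp_data_ready() additions to net/ipv4/tcp.c above make SO_RCVLOWAT meaningful for TCP: wakeups are deferred until the watermark worth of data is queued, and sk_rcvbuf may be grown to fit it. A minimal userspace sketch of exercising the option; the helper name and the 64 KB threshold are illustrative, not part of the patch:

#include <sys/socket.h>

/* Illustrative helper: defer EPOLLIN/recvmsg wakeups until ~64 KB are
 * queued; the kernel may also grow the receive buffer to honour the hint.
 */
static int set_tcp_rcvlowat(int fd)
{
	int lowat = 64 * 1024;	/* illustrative watermark */

	return setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
}
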
*/ + tcp_recv_skb(sk, seq, &offset); + tcp_cleanup_rbuf(sk, length); + ret = 0; + if (length == zc->length) + zc->recv_skip_hint = 0; + } else { + if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) + ret = -EIO; + } + zc->length = length; + return ret; +} +#endif + static void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping *tss) { @@ -1757,6 +1890,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } } +static int tcp_inq_hint(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 copied_seq = READ_ONCE(tp->copied_seq); + u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); + int inq; + + inq = rcv_nxt - copied_seq; + if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { + lock_sock(sk); + inq = tp->rcv_nxt - tp->copied_seq; + release_sock(sk); + } + return inq; +} + /* * This routine copies from a sock struct into the user buffer. * @@ -1773,13 +1922,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, u32 peek_seq; u32 *seq; unsigned long used; - int err; + int err, inq; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; struct scm_timestamping tss; bool has_tss = false; + bool has_cmsg; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); @@ -1794,6 +1944,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (sk->sk_state == TCP_LISTEN) goto out; + has_cmsg = tp->recvmsg_inq; timeo = sock_rcvtimeo(sk, nonblock); /* Urgent data needs to be handled specially. */ @@ -1980,6 +2131,7 @@ skip_copy: if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); has_tss = true; + has_cmsg = true; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; @@ -1999,13 +2151,20 @@ skip_copy: * on connected socket. I was just happy when found this 8) --ANK */ - if (has_tss) - tcp_recv_timestamp(msg, sk, &tss); - /* Clean up data we have read: This will do ACK frames. 
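
tcp_mmap() and tcp_zerocopy_receive() above implement TCP receive zerocopy: the socket is mmap()ed read-only and full pages of received payload are mapped into that region instead of being copied. A rough userspace sketch follows, assuming the uapi struct tcp_zerocopy_receive and the TCP_ZEROCOPY_RECEIVE getsockopt (wired up later in this same file) are visible via <linux/tcp.h>; the helper name is illustrative and error/page-size handling is elided:

#include <sys/mman.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>	/* struct tcp_zerocopy_receive, TCP_ZEROCOPY_RECEIVE */

static ssize_t tcp_zerocopy_chunk(int fd, size_t chunk)
{
	struct tcp_zerocopy_receive zc = { 0 };
	socklen_t len = sizeof(zc);
	void *addr;

	/* Read-only, non-executable mapping backed by the socket (tcp_mmap). */
	addr = mmap(NULL, chunk, PROT_READ, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		return -1;

	zc.address = (__u64)(unsigned long)addr;
	zc.length = chunk;	/* expected to be a multiple of the page size */
	if (getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, &zc, &len) < 0)
		return -1;

	/* zc.length bytes of payload are now mapped at addr; zc.recv_skip_hint
	 * bytes (partial pages, headers) should be drained with a normal recv().
	 */
	return zc.length;
}
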
*/ tcp_cleanup_rbuf(sk, copied); release_sock(sk); + + if (has_cmsg) { + if (has_tss) + tcp_recv_timestamp(msg, sk, &tss); + if (tp->recvmsg_inq) { + inq = tcp_inq_hint(sk); + put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); + } + } + return copied; out: @@ -2422,6 +2581,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->window_clamp = 0; + tp->delivered_ce = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); @@ -2435,6 +2595,7 @@ int tcp_disconnect(struct sock *sk, int flags) dst_release(sk->sk_rx_dst); sk->sk_rx_dst = NULL; tcp_saved_syn_free(tp); + tp->compressed_ack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); @@ -2873,6 +3034,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->notsent_lowat = val; sk->sk_write_space(sk); break; + case TCP_INQ: + if (val > 1 || val < 0) + err = -EINVAL; + else + tp->recvmsg_inq = val; + break; default: err = -ENOPROTOOPT; break; @@ -3031,6 +3198,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; + info->tcpi_delivered = tp->delivered; + info->tcpi_delivered_ce = tp->delivered_ce; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -3044,7 +3213,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 5 * nla_total_size(sizeof(u32)) + + 7 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3075,9 +3244,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); + nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); + nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); + return stats; } @@ -3293,6 +3465,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; break; + case TCP_INQ: + val = tp->recvmsg_inq; + break; case TCP_SAVE_SYN: val = tp->save_syn; break; @@ -3329,6 +3504,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level, } return 0; } +#ifdef CONFIG_MMU + case TCP_ZEROCOPY_RECEIVE: { + struct tcp_zerocopy_receive zc; + int err; + + if (get_user(len, optlen)) + return -EFAULT; + if (len != sizeof(zc)) + return -EINVAL; + if (copy_from_user(&zc, optval, len)) + return -EFAULT; + lock_sock(sk); + err = tcp_zerocopy_receive(sk, &zc); + release_sock(sk); + if (!err && copy_to_user(optval, &zc, len)) + err = -EFAULT; + return err; + } +#endif default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e51c644484dc..aebb29ab2fdf 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -111,6 +111,25 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define REXMIT_LOST 1 /* retransmit packets marked lost */ #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ +#if IS_ENABLED(CONFIG_TLS_DEVICE) +static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled); + +void clean_acked_data_enable(struct inet_connection_sock *icsk, + void (*cad)(struct sock *sk, u32 ack_seq)) +{ + 
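
The TCP_INQ socket option and TCP_CM_INQ control message added above let an application learn how many unread bytes remain after each recvmsg() without issuing a separate ioctl(SIOCINQ). A hedged userspace sketch, assuming TCP_INQ/TCP_CM_INQ come from the uapi <linux/tcp.h> and using IPPROTO_TCP as the cmsg level (same value as the kernel's SOL_TCP); the helper name is illustrative:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/tcp.h>	/* TCP_INQ, TCP_CM_INQ */

static ssize_t recv_with_inq(int fd, void *buf, size_t len, int *inq)
{
	char control[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;
	int one = 1;
	ssize_t ret;

	/* Normally enabled once per socket; shown inline for brevity. */
	setsockopt(fd, IPPROTO_TCP, TCP_INQ, &one, sizeof(one));

	ret = recvmsg(fd, &msg, 0);
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
		if (cm->cmsg_level == IPPROTO_TCP && cm->cmsg_type == TCP_CM_INQ)
			memcpy(inq, CMSG_DATA(cm), sizeof(*inq));
	return ret;
}
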
icsk->icsk_clean_acked = cad; + static_branch_inc(&clean_acked_data_enabled); +} +EXPORT_SYMBOL_GPL(clean_acked_data_enable); + +void clean_acked_data_disable(struct inet_connection_sock *icsk) +{ + static_branch_dec(&clean_acked_data_enabled); + icsk->icsk_clean_acked = NULL; +} +EXPORT_SYMBOL_GPL(clean_acked_data_disable); +#endif + static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, unsigned int len) { @@ -582,6 +601,8 @@ void tcp_rcv_space_adjust(struct sock *sk) u32 copied; int time; + trace_tcp_rcv_space_adjust(sk); + tcp_mstamp_refresh(tp); time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) @@ -1896,19 +1917,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp) tp->undo_retrans = tp->retrans_out ? : -1; } -/* Enter Loss state. If we detect SACK reneging, forget all SACK information +static bool tcp_is_rack(const struct sock *sk) +{ + return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; +} + +/* If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ +static void tcp_timeout_mark_lost(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb, *head; + bool is_reneg; /* is receiver reneging on SACKs? */ + + head = tcp_rtx_queue_head(sk); + is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); + if (is_reneg) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + tp->sacked_out = 0; + /* Mark SACK reneging until we recover from this loss event. */ + tp->is_sack_reneg = 1; + } else if (tcp_is_reno(tp)) { + tcp_reset_reno_sack(tp); + } + + skb = head; + skb_rbtree_walk_from(skb) { + if (is_reneg) + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + else if (tcp_is_rack(sk) && skb != head && + tcp_rack_skb_timeout(tp, skb, 0) > 0) + continue; /* Don't mark recently sent ones lost yet */ + tcp_mark_skb_lost(sk, skb); + } + tcp_verify_left_out(tp); + tcp_clear_all_retrans_hints(tp); +} + +/* Enter Loss state. */ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; - bool is_reneg; /* is receiver reneging on SACKs? */ - bool mark_lost; + + tcp_timeout_mark_lost(sk); /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1920,40 +1976,10 @@ void tcp_enter_loss(struct sock *sk) tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } - tp->snd_cwnd = 1; + tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; - tp->retrans_out = 0; - tp->lost_out = 0; - - if (tcp_is_reno(tp)) - tcp_reset_reno_sack(tp); - - skb = tcp_rtx_queue_head(sk); - is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); - if (is_reneg) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); - tp->sacked_out = 0; - /* Mark SACK reneging until we recover from this loss event. 
*/ - tp->is_sack_reneg = 1; - } - tcp_clear_all_retrans_hints(tp); - - skb_rbtree_walk_from(skb) { - mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - is_reneg); - if (mark_lost) - tcp_sum_lost(tp, skb); - TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (mark_lost) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - } - } - tcp_verify_left_out(tp); - /* Timeout in disordered state after receiving substantial DUPACKs * suggests that the degree of reordering is over-estimated. */ @@ -2120,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) return true; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_dupack_heuristics(tp) > tp->reordering) + if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering) return true; return false; @@ -2197,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1, 1); - } else { + if (tcp_is_sack(tp)) { int sacked_upto = tp->sacked_out - tp->reordering; if (sacked_upto >= 0) tcp_mark_head_lost(sk, sacked_upto, 0); @@ -2697,12 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una) return false; } -static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) +static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); - /* Use RACK to detect loss */ - if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + if (tcp_rtx_queue_empty(sk)) + return; + + if (unlikely(tcp_is_reno(tp))) { + tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); + } else if (tcp_is_rack(sk)) { u32 prior_retrans = tp->retrans_out; tcp_rack_mark_lost(sk); @@ -2798,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tcp_try_keep_open(sk); return; } - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; @@ -2819,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - tcp_rack_identify_loss(sk, ack_flag); + tcp_identify_packet_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -2841,7 +2869,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, fast_rexmit = 1; } - if (do_lost) + if (!tcp_is_rack(sk) && do_lost) tcp_update_scoreboard(sk, fast_rexmit); *rexmit = REXMIT_LOST; } @@ -3496,6 +3524,22 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) tcp_xmit_retransmit_queue(sk); } +/* Returns the number of packets newly acked or sacked by the current ACK */ +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +{ + const struct net *net = sock_net(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 delivered; + + delivered = tp->delivered - prior_delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); + if (flag & FLAG_ECE) { + tp->delivered_ce += delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + } + return delivered; +} + /* This routine deals with incoming acks, but not outgoing ones. 
*/ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3542,6 +3586,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; + +#if IS_ENABLED(CONFIG_TLS_DEVICE) + if (static_branch_unlikely(&clean_acked_data_enabled)) + if (icsk->icsk_clean_acked) + icsk->icsk_clean_acked(sk, ack); +#endif } prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; @@ -3619,7 +3669,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ + delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3629,9 +3679,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ - if (flag & FLAG_DSACKING_ACK) + if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); + } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3655,6 +3707,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag, &rexmit); + tcp_newly_delivered(sk, delivered, flag); tcp_xmit_recovery(sk, rexmit); } @@ -4196,6 +4249,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { + if (tp->compressed_ack) + tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -4573,6 +4628,17 @@ err: } +void tcp_data_ready(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + int avail = tp->rcv_nxt - tp->copied_seq; + + if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) + return; + + sk->sk_data_ready(sk); +} + static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -4630,7 +4696,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } @@ -4651,8 +4717,6 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(sk); - if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", @@ -5019,23 +5083,48 @@ static inline void tcp_data_snd_check(struct sock *sk) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); + unsigned long rtt, delay; /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). Or... + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. 
*/ - __tcp_select_window(sk) >= tp->rcv_wnd) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd)) || /* We ACK each frame or... */ - tcp_in_quickack_mode(sk) || - /* We have out of order data. */ - (ofo_possible && !RB_EMPTY_ROOT(&tp->out_of_order_queue))) { - /* Then ack it now */ + tcp_in_quickack_mode(sk)) { +send_now: tcp_send_ack(sk); - } else { - /* Else, send delayed ack. */ + return; + } + + if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { tcp_send_delayed_ack(sk); + return; } + + if (!tcp_is_sack(tp) || + tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + goto send_now; + tp->compressed_ack++; + + if (hrtimer_is_queued(&tp->compressed_ack_timer)) + return; + + /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ + + rtt = tp->rcv_rtt_est.rtt_us; + if (tp->srtt_us && tp->srtt_us < rtt) + rtt = tp->srtt_us; + + delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + rtt * (NSEC_PER_USEC >> 3)/20); + sock_hold(sk); + hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) @@ -5428,7 +5517,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, no_ack: if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk); + tcp_data_ready(sk); return; } } @@ -5550,9 +5639,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, return true; } tp->syn_data_acked = tp->syn_data; - if (tp->syn_data_acked) - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVE); + if (tp->syn_data_acked) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + /* SYN-data is counted as two separate packets in tcp_ack() */ + if (tp->delivered > 1) + --tp->delivered; + } tcp_fastopen_add_skb(sk, synack); @@ -5884,6 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } switch (sk->sk_state) { case TCP_SYN_RECV: + tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ if (!tp->srtt_us) tcp_synack_rtt_meas(sk, req); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f70586b50838..adbdb503db0c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) struct sock *sk1 = NULL; #endif struct net *net; + struct sock *ctl_sk; /* Never send a reset in response to a reset. */ if (th->rst) @@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) arg.tos = ip_hdr(skb)->tos; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); local_bh_disable(); - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); + if (sk) + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); local_bh_enable(); @@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk, } rep; struct net *net = sock_net(sk); struct ip_reply_arg arg; + struct sock *ctl_sk; memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk, arg.tos = tos; arg.uid = sock_net_uid(net, sk_fullsock(sk) ? 
sk : NULL); local_bh_disable(); - ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk); + if (sk) + ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + ip_send_unicast_reply(ctl_sk, skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ctl_sk->sk_mark = 0; __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); local_bh_enable(); } @@ -2560,6 +2572,8 @@ static int __net_init tcp_sk_init(struct net *net) init_net.ipv4.sysctl_tcp_wmem, sizeof(init_net.ipv4.sysctl_tcp_wmem)); } + net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 57b5468b5139..f867658b4b30 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct inet_sock *inet = inet_sk(sk); tw->tw_transparent = inet->transparent; + tw->tw_mark = sk->sk_mark; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d07e34f8e309..8e08b409c71e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -162,6 +162,15 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* Account for an ACK we sent. */ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { + struct tcp_sock *tp = tcp_sk(sk); + + if (unlikely(tp->compressed_ack)) { + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack); + tp->compressed_ack = 0; + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + } tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } @@ -229,11 +238,9 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, } } - if (mss > (1 << *rcv_wscale)) { - if (!init_rcv_wnd) /* Use default unless specified otherwise */ - init_rcv_wnd = tcp_default_init_rwnd(mss); - *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); - } + if (!init_rcv_wnd) /* Use default unless specified otherwise */ + init_rcv_wnd = tcp_default_init_rwnd(mss); + *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss); /* Set the clamp no higher than max representable value */ (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); @@ -585,14 +592,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, unsigned int remaining = MAX_TCP_OPTION_SPACE; struct tcp_fastopen_request *fastopen = tp->fastopen_req; + *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - *md5 = tp->af_specific->md5_lookup(sk, sk); - if (*md5) { - opts->options |= OPTION_MD5; - remaining -= TCPOLEN_MD5SIG_ALIGNED; + if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + } } -#else - *md5 = NULL; #endif /* We always get an MSS option. 
The option bytes which will be seen in @@ -720,14 +728,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->options = 0; + *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - *md5 = tp->af_specific->md5_lookup(sk, sk); - if (unlikely(*md5)) { - opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + if (unlikely(rcu_access_pointer(tp->md5sig_info))) { + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } } -#else - *md5 = NULL; #endif if (likely(tp->rx_opt.tstamp_ok)) { @@ -772,7 +781,7 @@ struct tsq_tasklet { }; static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); -static void tcp_tsq_handler(struct sock *sk) +static void tcp_tsq_write(struct sock *sk) { if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | @@ -789,6 +798,16 @@ static void tcp_tsq_handler(struct sock *sk) 0, GFP_ATOMIC); } } + +static void tcp_tsq_handler(struct sock *sk) +{ + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + tcp_tsq_write(sk); + else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) + sock_hold(sk); + bh_unlock_sock(sk); +} /* * One tasklet per cpu tries to send more skbs. * We run in tasklet context but need to disable irqs when @@ -816,16 +835,7 @@ static void tcp_tasklet_func(unsigned long data) smp_mb__before_atomic(); clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags); - if (!sk->sk_lock.owned && - test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) { - bh_lock_sock(sk); - if (!sock_owned_by_user(sk)) { - clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); - tcp_tsq_handler(sk); - } - bh_unlock_sock(sk); - } - + tcp_tsq_handler(sk); sk_free(sk); } } @@ -853,9 +863,10 @@ void tcp_release_cb(struct sock *sk) nflags = flags & ~TCP_DEFERRED_ALL; } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); - if (flags & TCPF_TSQ_DEFERRED) - tcp_tsq_handler(sk); - + if (flags & TCPF_TSQ_DEFERRED) { + tcp_tsq_write(sk); + __sock_put(sk); + } /* Here begins the tricky part : * We are called from release_sock() with : * 1) BH disabled @@ -929,7 +940,7 @@ void tcp_wfree(struct sk_buff *skb) if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; - nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; + nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); if (nval != oval) continue; @@ -948,37 +959,17 @@ out: sk_free(sk); } -/* Note: Called under hard irq. - * We can not call TCP stack right away. +/* Note: Called under soft irq. + * We can call TCP stack right away, unless socket is owned by user. 
*/ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) { struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer); struct sock *sk = (struct sock *)tp; - unsigned long nval, oval; - for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { - struct tsq_tasklet *tsq; - bool empty; - - if (oval & TSQF_QUEUED) - break; - - nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED; - nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); - if (nval != oval) - continue; + tcp_tsq_handler(sk); + sock_put(sk); - if (!refcount_inc_not_zero(&sk->sk_wmem_alloc)) - break; - /* queue this socket to tasklet queue */ - tsq = this_cpu_ptr(&tsq_tasklet); - empty = list_empty(&tsq->head); - list_add(&tp->tsq_node, &tsq->head); - if (empty) - tasklet_schedule(&tsq->tasklet); - break; - } return HRTIMER_NORESTART; } @@ -1011,7 +1002,8 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) do_div(len_ns, rate); hrtimer_start(&tcp_sk(sk)->pacing_timer, ktime_add_ns(ktime_get(), len_ns), - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); + sock_hold(sk); } static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) @@ -1078,7 +1070,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() - * which holds one reference to sk_wmem_alloc. + * which holds one reference to sk. * * TODO: Ideally, in-flight pure ACK packets should not matter here. * One way to get this would be to set skb->truesize = 2 on them. @@ -2185,7 +2177,7 @@ static int tcp_mtu_probe(struct sock *sk) static bool tcp_pacing_check(const struct sock *sk) { return tcp_needs_internal_pacing(sk) && - hrtimer_active(&tcp_sk(sk)->pacing_timer); + hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); } /* TCP Small Queues : @@ -2365,8 +2357,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb, limit, mss_now, gfp))) break; - if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) - clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 3a81720ac0c4..71593e4400ab 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -2,7 +2,7 @@ #include <linux/tcp.h> #include <net/tcp.h> -static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } +static u32 tcp_rack_reo_wnd(const struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->rack.reord) { + /* If reordering has not been observed, be aggressive during + * the recovery or starting the recovery by DUPACK threshold. + */ + if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery) + return 0; + + if (tp->sacked_out >= tp->reordering && + !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH)) + return 0; + } + + /* To be more reordering resilient, allow min_rtt/4 settling delay. + * Use min_rtt instead of the smoothed RTT because reordering is + * often a path property and less related to queuing or delayed ACKs. + * Upon receiving DSACKs, linearly increase the window up to the + * smoothed RTT. 
+ */ + return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps, + tp->srtt_us >> 3); +} + +s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd) +{ + return tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); +} + /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): * * Marks a packet lost, if some packet sent later has been (s)acked. @@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - u32 min_rtt = tcp_min_rtt(tp); struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; - /* To be more reordering resilient, allow min_rtt/4 settling delay - * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed - * RTT because reordering is often a path property and less related - * to queuing or delayed ACKs. - */ - reo_wnd = 1000; - if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) && - min_rtt != ~0U) { - reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd); - reo_wnd = min(reo_wnd, tp->srtt_us >> 3); - } - + reo_wnd = tcp_rack_reo_wnd(sk); list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); @@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) /* A packet is lost if it has not been s/acked beyond * the recent RTT plus the reordering window. */ - remaining = tp->rack.rtt_us + reo_wnd - - tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd); if (remaining <= 0) { - tcp_rack_mark_skb_lost(sk, skb); + tcp_mark_skb_lost(sk, skb); list_del_init(&skb->tcp_tsorted_anchor); } else { /* Record maximum wait time */ @@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) tp->rack.reo_wnd_steps = 1; } } + +/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits + * the next unacked packet upon receiving + * a) three or more DUPACKs to start the fast recovery + * b) an ACK acknowledging new data during the fast recovery. 
+ */ +void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced) +{ + const u8 state = inet_csk(sk)->icsk_ca_state; + struct tcp_sock *tp = tcp_sk(sk); + + if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) || + (state == TCP_CA_Recovery && snd_una_advanced)) { + struct sk_buff *skb = tcp_rtx_queue_head(sk); + u32 mss; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) + return; + + mss = tcp_skb_mss(skb); + if (tcp_skb_pcount(skb) > 1 && skb->len > mss) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + mss, mss, GFP_ATOMIC); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + } +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f7d944855f8e..3b3611729928 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -708,11 +708,36 @@ out: sock_put(sk); } +static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) +{ + struct tcp_sock *tp = container_of(timer, struct tcp_sock, compressed_ack_timer); + struct sock *sk = (struct sock *)tp; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + if (tp->compressed_ack) + tcp_send_ack(sk); + } else { + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + + sock_put(sk); + + return HRTIMER_NORESTART; +} + void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + + hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); + tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b61a770884fa..ff4d4ba67735 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, } EXPORT_SYMBOL(udp_set_csum); -static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, + struct inet_cork *cork) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); @@ -777,6 +778,26 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->len = htons(len); uh->check = 0; + if (cork->gso_size) { + const int hlen = skb_network_header_len(skb) + + sizeof(struct udphdr); + + if (hlen + cork->gso_size > cork->fragsize) + return -EINVAL; + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + return -EINVAL; + if (sk->sk_no_check_tx) + return -EINVAL; + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + return -EIO; + + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh), + cork->gso_size); + goto csum_partial; + } + if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); @@ -786,6 +807,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ +csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; @@ -828,7 +850,7 @@ int udp_push_pending_frames(struct sock *sk) if (!skb) goto out; - err = udp_send_skb(skb, fl4); + err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; @@ -837,6 +859,43 @@ out: } EXPORT_SYMBOL(udp_push_pending_frames); +static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) +{ + switch (cmsg->cmsg_type) { + case UDP_SEGMENT: + 
if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) + return -EINVAL; + *gso_size = *(__u16 *)CMSG_DATA(cmsg); + return 0; + default: + return -EINVAL; + } +} + +int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) +{ + struct cmsghdr *cmsg; + bool need_ip = false; + int err; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_UDP) { + need_ip = true; + continue; + } + + err = __udp_cmsg_send(cmsg, gso_size); + if (err) + return err; + } + + return need_ip; +} +EXPORT_SYMBOL_GPL(udp_cmsg_send); + int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); @@ -922,10 +981,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; + ipc.gso_size = up->gso_size; if (msg->msg_controllen) { - err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); - if (unlikely(err)) { + err = udp_cmsg_send(sk, msg, &ipc.gso_size); + if (err > 0) + err = ip_cmsg_send(sk, msg, &ipc, + sk->sk_family == AF_INET6); + if (unlikely(err < 0)) { kfree(ipc.opt); return err; } @@ -1032,12 +1095,14 @@ back_from_confirm: /* Lockless fast path for the non-corking case. */ if (!corkreq) { + struct inet_cork cork; + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, - msg->msg_flags); + &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) - err = udp_send_skb(skb, fl4); + err = udp_send_skb(skb, fl4, &cork); goto out; } @@ -1813,10 +1878,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static struct static_key udp_encap_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { - static_key_enable(&udp_encap_needed); + static_branch_enable(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); @@ -1840,7 +1905,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; nf_reset(skb); - if (static_key_false(&udp_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -2303,7 +2368,7 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_key_false(&udp_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) @@ -2368,6 +2433,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->no_check6_rx = valbool; break; + case UDP_SEGMENT: + if (val < 0 || val > USHRT_MAX) + return -EINVAL; + up->gso_size = val; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -2458,6 +2529,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->no_check6_rx; break; + case UDP_SEGMENT: + val = up->gso_size; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). 
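
udp_cmsg_send() and the UDP_SEGMENT get/setsockopt handling above add UDP GSO on the transmit path: one large send is split by the stack into gso_size-byte datagrams. A sketch of the per-call cmsg form, assuming UDP_SEGMENT is visible via the uapi <linux/udp.h> and using IPPROTO_UDP as the level (same value as the kernel's SOL_UDP); the helper name is illustrative, and the per-socket setsockopt(UDP_SEGMENT) form behaves the same:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/udp.h>	/* UDP_SEGMENT */

static ssize_t send_udp_gso(int fd, const void *buf, size_t len, uint16_t gso_size)
{
	char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	/* One 16-bit UDP_SEGMENT cmsg, exactly CMSG_LEN(sizeof(__u16)) long,
	 * as required by __udp_cmsg_send() above.
	 */
	cm->cmsg_level = IPPROTO_UDP;
	cm->cmsg_type = UDP_SEGMENT;
	cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
	memcpy(CMSG_DATA(cm), &gso_size, sizeof(gso_size));

	return sendmsg(fd, &msg, 0);
}
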
*/ case UDPLITE_SEND_CSCOV: diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index ea6e6e7df0ee..92dc9e5a7ff3 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,6 +187,102 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, + netdev_features_t features) +{ + struct sock *sk = gso_skb->sk; + unsigned int sum_truesize = 0; + struct sk_buff *segs, *seg; + struct udphdr *uh; + unsigned int mss; + bool copy_dtor; + __sum16 check; + __be16 newlen; + + mss = skb_shinfo(gso_skb)->gso_size; + if (gso_skb->len <= sizeof(*uh) + mss) + return ERR_PTR(-EINVAL); + + skb_pull(gso_skb, sizeof(*uh)); + + /* clear destructor to avoid skb_segment assigning it to tail */ + copy_dtor = gso_skb->destructor == sock_wfree; + if (copy_dtor) + gso_skb->destructor = NULL; + + segs = skb_segment(gso_skb, features); + if (unlikely(IS_ERR_OR_NULL(segs))) { + if (copy_dtor) + gso_skb->destructor = sock_wfree; + return segs; + } + + /* GSO partial and frag_list segmentation only requires splitting + * the frame into an MSS multiple and possibly a remainder, both + * cases return a GSO skb. So update the mss now. + */ + if (skb_is_gso(segs)) + mss *= skb_shinfo(segs)->gso_segs; + + seg = segs; + uh = udp_hdr(seg); + + /* compute checksum adjustment based on old length versus new */ + newlen = htons(sizeof(*uh) + mss); + check = csum16_add(csum16_sub(uh->check, uh->len), newlen); + + for (;;) { + if (copy_dtor) { + seg->destructor = sock_wfree; + seg->sk = sk; + sum_truesize += seg->truesize; + } + + if (!seg->next) + break; + + uh->len = newlen; + uh->check = check; + + if (seg->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(seg, ~check); + else + uh->check = gso_make_checksum(seg, ~check) ? : + CSUM_MANGLED_0; + + seg = seg->next; + uh = udp_hdr(seg); + } + + /* last packet can be partial gso_size, account for that in checksum */ + newlen = htons(skb_tail_pointer(seg) - skb_transport_header(seg) + + seg->data_len); + check = csum16_add(csum16_sub(uh->check, uh->len), newlen); + + uh->len = newlen; + uh->check = check; + + if (seg->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(seg, ~check); + else + uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0; + + /* update refcount for the packet */ + if (copy_dtor) { + int delta = sum_truesize - gso_skb->truesize; + + /* In some pathological cases, delta can be negative. 
+ * We need to either use refcount_add() or refcount_sub_and_test() + */ + if (likely(delta >= 0)) + refcount_add(delta, &sk->sk_wmem_alloc); + else + WARN_ON_ONCE(refcount_sub_and_test(-delta, &sk->sk_wmem_alloc)); + } + return segs; +} +EXPORT_SYMBOL_GPL(__udp_gso_segment); + static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { @@ -203,12 +299,15 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } - if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4))) goto out; if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + return __udp_gso_segment(skb, features); + mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 78cef00c9596..fbfd71a2d9c8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -170,7 +170,7 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event); static int addrconf_ifdown(struct net_device *dev, int how); -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags); @@ -916,7 +916,6 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) pr_warn("Freeing alive inet6 address %p\n", ifp); return; } - ip6_rt_put(ifp->rt); kfree_rcu(ifp, rcu); } @@ -995,7 +994,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC; struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; - struct rt6_info *rt = NULL; + struct fib6_info *f6i = NULL; int err = 0; int addr_type = ipv6_addr_type(addr); @@ -1037,16 +1036,16 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } - rt = addrconf_dst_alloc(idev, addr, false); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; + f6i = addrconf_f6i_alloc(net, idev, addr, false, gfp_flags); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); + f6i = NULL; goto out; } if (net->ipv6.devconf_all->disable_policy || idev->cnf.disable_policy) - rt->dst.flags |= DST_NOPOLICY; + f6i->dst_nopolicy = true; neigh_parms_data_state_setall(idev->nd_parms); @@ -1068,7 +1067,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa->cstamp = ifa->tstamp = jiffies; ifa->tokenized = false; - ifa->rt = rt; + ifa->rt = f6i; ifa->idev = idev; in6_dev_hold(idev); @@ -1102,8 +1101,8 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, inet6addr_notifier_call_chain(NETDEV_UP, ifa); out: if (unlikely(err < 0)) { - if (rt) - ip6_rt_put(rt); + fib6_info_release(f6i); + if (ifa) { if (ifa->idev) in6_dev_put(ifa->idev); @@ -1179,19 +1178,19 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires) static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt) { - struct rt6_info *rt; + struct fib6_info *f6i; - rt = addrconf_get_prefix_route(&ifp->addr, + f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, 0, RTF_GATEWAY | RTF_DEFAULT); - if (rt) { + if (f6i) { if (del_rt) - ip6_del_rt(rt); + ip6_del_rt(dev_net(ifp->idev->dev), f6i); else { - if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_set_expires(rt, expires); - ip6_rt_put(rt); + if (!(f6i->fib6_flags & RTF_EXPIRES)) + fib6_set_expires(f6i, expires); + 
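		/* addrconf_get_prefix_route() now hands back a fib6_info with a
		 * reference taken via fib6_info_hold(), so the non-delete path
		 * balances it with fib6_info_release() instead of ip6_rt_put().
		 */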
fib6_info_release(f6i); } } } @@ -2320,7 +2319,7 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad static void addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, - unsigned long expires, u32 flags) + unsigned long expires, u32 flags, gfp_t gfp_flags) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX, @@ -2331,6 +2330,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, .fc_flags = RTF_UP | flags, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, + .fc_type = RTN_UNICAST, }; cfg.fc_dst = *pfx; @@ -2344,17 +2344,17 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, cfg.fc_flags |= RTF_NONEXTHOP; #endif - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, gfp_flags, NULL); } -static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, +static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, int plen, const struct net_device *dev, u32 flags, u32 noflags) { struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX; @@ -2368,14 +2368,13 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != dev->ifindex) + if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex) continue; - if ((rt->rt6i_flags & flags) != flags) + if ((rt->fib6_flags & flags) != flags) continue; - if ((rt->rt6i_flags & noflags) != 0) + if ((rt->fib6_flags & noflags) != 0) continue; - if (!dst_hold_safe(&rt->dst)) - rt = NULL; + fib6_info_hold(rt); break; } out: @@ -2394,12 +2393,13 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, + .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); } static struct inet6_dev *addrconf_add_dev(struct net_device *dev) @@ -2529,7 +2529,6 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, if (IS_ERR_OR_NULL(ifp)) return -1; - update_lft = 0; create = 1; spin_lock_bh(&ifp->lock); ifp->flags |= IFA_F_MANAGETEMPADDR; @@ -2551,7 +2550,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!update_lft && !create && stored_lft) { + if (!create && stored_lft) { const u32 minimum_lft = min_t(u32, stored_lft, MIN_VALID_LIFETIME); valid_lft = max(valid_lft, minimum_lft); @@ -2642,7 +2641,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) */ if (pinfo->onlink) { - struct rt6_info *rt; + struct fib6_info *rt; unsigned long rt_expires; /* Avoid arithmetic overflow. 
Really, we could @@ -2667,13 +2666,13 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ - rt6_set_expires(rt, jiffies + rt_expires); + fib6_set_expires(rt, jiffies + rt_expires); } else { - rt6_clean_expires(rt); + fib6_clean_expires(rt); } } else if (valid_lft) { clock_t expires = 0; @@ -2684,9 +2683,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) expires = jiffies_to_clock_t(rt_expires); } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, expires, flags); + dev, expires, flags, GFP_ATOMIC); } - ip6_rt_put(rt); + fib6_info_release(rt); } /* Try to figure out our local address for this prefix */ @@ -2899,9 +2898,14 @@ static int inet6_addr_add(struct net *net, int ifindex, if (!IS_ERR(ifp)) { if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - expires, flags); + expires, flags, GFP_KERNEL); } + /* Send a netlink notification if DAD is enabled and + * optimistic flag is not set + */ + if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD))) + ipv6_ifa_notify(0, ifp); /* * Note that section 3.1 of RFC 4429 indicates * that the Optimistic flag should not be set for @@ -3047,7 +3051,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) if (addr.s6_addr32[3]) { add_addr(idev, &addr, plen, scope); - addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags); + addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags, + GFP_ATOMIC); return; } @@ -3072,7 +3077,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) add_addr(idev, &addr, plen, flag); addrconf_prefix_route(&addr, plen, idev->dev, 0, - pflags); + pflags, GFP_ATOMIC); } } } @@ -3112,7 +3117,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, ifp = ipv6_add_addr(idev, addr, NULL, 64, IFA_LINK, addr_flags, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME, true, NULL); if (!IS_ERR(ifp)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, 0, 0); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev, + 0, 0, GFP_ATOMIC); addrconf_dad_start(ifp); in6_ifa_put(ifp); } @@ -3227,7 +3233,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) addrconf_add_linklocal(idev, &addr, IFA_F_STABLE_PRIVACY); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_EUI64: /* addrconf_add_linklocal also adds a prefix_route and we @@ -3237,7 +3244,8 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0) addrconf_add_linklocal(idev, &addr, 0); else if (prefix_route) - addrconf_prefix_route(&addr, 64, idev->dev, 0, 0); + addrconf_prefix_route(&addr, 64, idev->dev, + 0, 0, GFP_KERNEL); break; case IN6_ADDR_GEN_MODE_NONE: default: @@ -3329,32 +3337,34 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -static int fixup_permanent_addr(struct inet6_dev *idev, +static int fixup_permanent_addr(struct net *net, + struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - /* !rt6i_node means the host route was removed from the + /* !fib6_node means the host route was removed from the * FIB, for example, if 'lo' device is taken down. In that * case regenerate the host route. 
*/ - if (!ifp->rt || !ifp->rt->rt6i_node) { - struct rt6_info *rt, *prev; + if (!ifp->rt || !ifp->rt->fib6_node) { + struct fib6_info *f6i, *prev; - rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (IS_ERR(rt)) - return PTR_ERR(rt); + f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false, + GFP_ATOMIC); + if (IS_ERR(f6i)) + return PTR_ERR(f6i); /* ifp->rt can be accessed outside of rtnl */ spin_lock(&ifp->lock); prev = ifp->rt; - ifp->rt = rt; + ifp->rt = f6i; spin_unlock(&ifp->lock); - ip6_rt_put(prev); + fib6_info_release(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { addrconf_prefix_route(&ifp->addr, ifp->prefix_len, - idev->dev, 0, 0); + idev->dev, 0, 0, GFP_ATOMIC); } if (ifp->state == INET6_IFADDR_STATE_PREDAD) @@ -3363,7 +3373,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, return 0; } -static void addrconf_permanent_addr(struct net_device *dev) +static void addrconf_permanent_addr(struct net *net, struct net_device *dev) { struct inet6_ifaddr *ifp, *tmp; struct inet6_dev *idev; @@ -3376,7 +3386,7 @@ static void addrconf_permanent_addr(struct net_device *dev) list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) { if ((ifp->flags & IFA_F_PERMANENT) && - fixup_permanent_addr(idev, ifp) < 0) { + fixup_permanent_addr(net, idev, ifp) < 0) { write_unlock_bh(&idev->lock); in6_ifa_hold(ifp); ipv6_del_addr(ifp); @@ -3445,7 +3455,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (event == NETDEV_UP) { /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); + addrconf_permanent_addr(net, dev); if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ @@ -3612,8 +3622,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; - int _keep_addr; - bool keep_addr; + bool keep_addr = false; int state, i; ASSERT_RTNL(); @@ -3639,15 +3648,18 @@ static int addrconf_ifdown(struct net_device *dev, int how) } - /* aggregate the system setting and interface setting */ - _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; - if (!_keep_addr) - _keep_addr = idev->cnf.keep_addr_on_down; - /* combine the user config with event to determine if permanent * addresses are to be removed from address hash table */ - keep_addr = !(how || _keep_addr <= 0 || idev->cnf.disable_ipv6); + if (!how && !idev->cnf.disable_ipv6) { + /* aggregate the system setting and interface setting */ + int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down; + + if (!_keep_addr) + _keep_addr = idev->cnf.keep_addr_on_down; + + keep_addr = (_keep_addr > 0); + } /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { @@ -3697,13 +3709,8 @@ restart: write_lock_bh(&idev->lock); } - /* re-combine the user config with event to determine if permanent - * addresses are to be removed from the interface list - */ - keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); - list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; bool keep; addrconf_del_dad_work(ifa); @@ -3731,7 +3738,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -3849,6 +3856,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) struct inet6_dev *idev = ifp->idev; struct net_device *dev = idev->dev; bool bump_id, notify = false; + struct net *net; addrconf_join_solict(dev, 
&ifp->addr); @@ -3859,8 +3867,9 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) if (ifp->state == INET6_IFADDR_STATE_DEAD) goto out; + net = dev_net(dev); if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + (net->ipv6.devconf_all->accept_dad < 1 && idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { @@ -3896,8 +3905,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Frames right away */ if (ifp->flags & IFA_F_OPTIMISTIC) { - ip6_ins_rt(ifp->rt); - if (ipv6_use_optimistic_addr(dev_net(dev), idev)) { + ip6_ins_rt(net, ifp->rt); + if (ipv6_use_optimistic_addr(net, idev)) { /* Because optimistic nodes can use this address, * notify listeners. If DAD fails, RTM_DELADDR is sent. */ @@ -4563,8 +4572,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, ipv6_ifa_notify(0, ifp); if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) { - addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - expires, flags); + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, + ifp->idev->dev, expires, flags, + GFP_KERNEL); } else if (had_prefixroute) { enum cleanup_prefix_rt_t action; unsigned long rt_expires; @@ -4804,9 +4814,10 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, u32 portid, u32 seq, int event, unsigned int flags) { + struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); + int ifindex = dev ? dev->ifindex : 1; struct nlmsghdr *nlh; u8 scope = RT_SCOPE_UNIVERSE; - int ifindex = ifaca->aca_idev->dev->ifindex; if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; @@ -5029,14 +5040,6 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) struct net *net = dev_net(ifa->idev->dev); int err = -ENOBUFS; - /* Don't send DELADDR notification for TENTATIVE address, - * since NEWADDR notification is sent only after removing - * TENTATIVE flag, if DAD has not failed. 
- */ - if (ifa->flags & IFA_F_TENTATIVE && !(ifa->flags & IFA_F_DADFAILED) && - event == RTM_DELADDR) - return; - skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; @@ -5607,29 +5610,30 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!rcu_access_pointer(ifp->rt->rt6i_node)) - ip6_ins_rt(ifp->rt); + if (!rcu_access_pointer(ifp->rt->fib6_node)) + ip6_ins_rt(net, ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); if (!ipv6_addr_any(&ifp->peer_addr)) addrconf_prefix_route(&ifp->peer_addr, 128, - ifp->idev->dev, 0, 0); + ifp->idev->dev, 0, 0, + GFP_ATOMIC); break; case RTM_DELADDR: if (ifp->idev->cnf.forwarding) addrconf_leave_anycast(ifp); addrconf_leave_solict(ifp->idev, &ifp->addr); if (!ipv6_addr_any(&ifp->peer_addr)) { - struct rt6_info *rt; + struct fib6_info *rt; rt = addrconf_get_prefix_route(&ifp->peer_addr, 128, ifp->idev->dev, 0, 0); if (rt) - ip6_del_rt(rt); + ip6_del_rt(net, rt); } if (ifp->rt) { - if (dst_hold_safe(&ifp->rt->dst)) - ip6_del_rt(ifp->rt); + ip6_del_rt(net, ifp->rt); + ifp->rt = NULL; } rt_genid_bump_ipv6(net); break; @@ -5976,11 +5980,11 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct rt6_info *rt = ifa->rt; + struct fib6_info *rt = ifa->rt; int cpu; rcu_read_lock(); - addrconf_set_nopolicy(ifa->rt, val); + ifa->rt->dst_nopolicy = val ? true : false; if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 32b564dfd02a..2fe754fd4f5e 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, return -EAFNOSUPPORT; } +static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) +{ + return f6i; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { - .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .fib6_get_table = eafnosupport_fib6_get_table, + .fib6_table_lookup = eafnosupport_fib6_table_lookup, + .fib6_lookup = eafnosupport_fib6_lookup, + .fib6_multipath_select = eafnosupport_fib6_multipath_select, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8da0b513f188..50de8b0d4f70 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,33 +273,8 @@ out_rcu_unlock: goto out; } - -/* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -{ - struct sock *sk = sock->sk; - int err = 0; - - /* If the socket has its own bind function then use it. 
*/ - if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - /* BPF prog is run before any checks are done so that if the prog - * changes context in a wrong way it will be caught. - */ - err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); - if (err) - return err; - - return __inet6_bind(sk, uaddr, addr_len, false, true); -} -EXPORT_SYMBOL(inet6_bind); - -int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) +static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -444,6 +419,30 @@ out_unlock: goto out; } +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + int inet6_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -579,7 +578,9 @@ const struct proto_ops inet6_stream_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = inet_recvmsg, /* ok */ - .mmap = sock_no_mmap, +#ifdef CONFIG_MMU + .mmap = tcp_mmap, +#endif .sendpage = inet_sendpage, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, @@ -590,6 +591,7 @@ const struct proto_ops inet6_stream_ops = { .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif + .set_rcvlowat = tcp_set_rcvlowat, }; const struct proto_ops inet6_dgram_ops = { @@ -887,7 +889,11 @@ static struct pernet_operations inet6_net_ops = { static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, - .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_dst_lookup = ip6_dst_lookup, + .fib6_get_table = fib6_get_table, + .fib6_table_lookup = fib6_table_lookup, + .fib6_lookup = fib6_lookup, + .fib6_multipath_select = fib6_multipath_select, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index bbcabbba9bd8..0250d199e527 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -212,16 +212,14 @@ static void aca_get(struct ifacaddr6 *aca) static void aca_put(struct ifacaddr6 *ac) { if (refcount_dec_and_test(&ac->aca_refcnt)) { - in6_dev_put(ac->aca_idev); - dst_release(&ac->aca_rt->dst); + fib6_info_release(ac->aca_rt); kfree(ac); } } -static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, +static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, const struct in6_addr *addr) { - struct inet6_dev *idev = rt->rt6i_idev; struct ifacaddr6 *aca; aca = kzalloc(sizeof(*aca), GFP_ATOMIC); @@ -229,9 +227,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, return NULL; aca->aca_addr = *addr; - in6_dev_hold(idev); - aca->aca_idev = idev; - aca->aca_rt = rt; + fib6_info_hold(f6i); + aca->aca_rt = f6i; 
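	/* the anycast entry owns the fib6_info reference taken just above;
	 * aca_put() drops it again via fib6_info_release().
	 */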
aca->aca_users = 1; /* aca_tstamp should be updated upon changes */ aca->aca_cstamp = aca->aca_tstamp = jiffies; @@ -246,7 +243,8 @@ static struct ifacaddr6 *aca_alloc(struct rt6_info *rt, int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) { struct ifacaddr6 *aca; - struct rt6_info *rt; + struct fib6_info *f6i; + struct net *net; int err; ASSERT_RTNL(); @@ -265,14 +263,15 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) } } - rt = addrconf_dst_alloc(idev, addr, true); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); + net = dev_net(idev->dev); + f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC); + if (IS_ERR(f6i)) { + err = PTR_ERR(f6i); goto out; } - aca = aca_alloc(rt, addr); + aca = aca_alloc(f6i, addr); if (!aca) { - ip6_rt_put(rt); + fib6_info_release(f6i); err = -ENOMEM; goto out; } @@ -286,7 +285,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) aca_get(aca); write_unlock_bh(&idev->lock); - ip6_ins_rt(rt); + ip6_ins_rt(net, f6i); addrconf_join_solict(idev->dev, &aca->aca_addr); @@ -328,8 +327,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); return 0; @@ -356,8 +354,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - dst_hold(&aca->aca_rt->dst); - ip6_del_rt(aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt); aca_put(aca); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index bc68eb661970..5bc2bf3733ab 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -280,6 +280,7 @@ static const struct tlvtype_proc tlvprocdestopt_lst[] = { static int ipv6_destopt_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; @@ -291,7 +292,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); @@ -319,8 +320,7 @@ fail_and_free: return 1; } - __IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } @@ -416,8 +416,7 @@ looped_back: } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -456,8 +455,7 @@ looped_back: if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -481,10 +479,10 @@ looped_back: /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; struct in6_addr daddr; - struct inet6_dev *idev; int n, i; struct 
ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; @@ -498,8 +496,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -508,8 +505,7 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -527,7 +523,7 @@ looped_back: * processed by own */ if (!addr) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -553,8 +549,7 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -572,8 +567,7 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -609,14 +603,12 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -627,8 +619,7 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -647,8 +638,7 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -663,7 +653,7 @@ looped_back: return -1; unknown_rh: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -755,34 +745,31 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct net *net = ipv6_skb_net(skb); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 
*)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - __IP6_INC_STATS(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index b643f5ce6c80..ae365df8abf7 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -161,7 +161,7 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv); * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * - * Note that *offset is used as input/output parameter. an if it is not zero, + * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index df113c7b5fc8..f590446595d8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net) return fib_rules_seq_read(net, AF_INET6); } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + struct fib6_info *f6i; + int err; + + if (net->ipv6.fib6_has_custom_rules) { + struct fib_lookup_arg arg = { + .lookup_ptr = fib6_table_lookup, + .lookup_data = &oif, + .flags = FIB_LOOKUP_NOREF, + }; + + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + + err = fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + if (err) + return ERR_PTR(err); + + f6i = arg.result ? : net->ipv6.fib6_null_entry; + } else { + f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, + oif, fl6, flags); + if (!f6i || f6i == net->ipv6.fib6_null_entry) + f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, + oif, fl6, flags); + } + + return f6i; +} + struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) @@ -96,8 +129,73 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, return &net->ipv6.ip6_null_entry->dst; } -static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, + struct flowi6 *flp6, const struct net_device *dev) +{ + struct fib6_rule *r = (struct fib6_rule *)rule; + + /* If we need to find a source address for this traffic, + * we check the result if it meets requirement of the rule. 
+ */ + if ((rule->flags & FIB_RULE_FIND_SADDR) && + r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { + struct in6_addr saddr; + + if (ipv6_dev_get_saddr(net, dev, &flp6->daddr, + rt6_flags2srcprefs(flags), &saddr)) + return -EAGAIN; + + if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) + return -EAGAIN; + + flp6->saddr = saddr; + } + + return 0; +} + +static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowi6 *flp6 = &flp->u.ip6; + struct net *net = rule->fr_net; + struct fib6_table *table; + struct fib6_info *f6i; + int err = -EAGAIN, *oif; + u32 tb_id; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); + if (!table) + return -EAGAIN; + + oif = (int *)arg->lookup_data; + f6i = fib6_table_lookup(net, table, *oif, flp6, flags); + if (f6i != net->ipv6.fib6_null_entry) { + err = fib6_rule_saddr(net, rule, flags, flp6, + fib6_info_nh_dev(f6i)); + + if (likely(!err)) + arg->result = f6i; + } + + return err; +} + +static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) { struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; @@ -134,27 +232,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, rt = lookup(net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { - struct fib6_rule *r = (struct fib6_rule *)rule; - - /* - * If we need to find a source address for this traffic, - * we check the result if it meets requirement of the rule. 
- */ - if ((rule->flags & FIB_RULE_FIND_SADDR) && - r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { - struct in6_addr saddr; - - if (ipv6_dev_get_saddr(net, - ip6_dst_idev(&rt->dst)->dev, - &flp6->daddr, - rt6_flags2srcprefs(flags), - &saddr)) - goto again; - if (!ipv6_prefix_equal(&saddr, &r->src.addr, - r->src.plen)) - goto again; - flp6->saddr = saddr; - } + err = fib6_rule_saddr(net, rule, flags, flp6, + ip6_dst_idev(&rt->dst)->dev); + + if (err == -EAGAIN) + goto again; + err = rt->dst.error; if (err != -EAGAIN) goto out; @@ -172,6 +255,15 @@ out: return err; } +static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + if (arg->lookup_ptr == fib6_table_lookup) + return fib6_rule_action_alt(rule, flp, flags, arg); + + return __fib6_rule_action(rule, flp, flags, arg); +} + static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; @@ -245,15 +337,18 @@ static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = { static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, - struct nlattr **tb) + struct nlattr **tb, + struct netlink_ext_ack *extack) { int err = -EINVAL; struct net *net = sock_net(skb->sk); struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { - if (rule->table == RT6_TABLE_UNSPEC) + if (rule->table == RT6_TABLE_UNSPEC) { + NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; + } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index deab2db6692e..d1dc6017f5a6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -43,7 +43,7 @@ static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; - int (*func)(struct rt6_info *, void *arg); + int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; }; @@ -54,7 +54,7 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, @@ -105,13 +105,12 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; -void fib6_update_sernum(struct rt6_info *rt) +void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { - struct net *net = dev_net(rt->dst.dev); struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) fn->fn_sernum = fib6_new_sernum(net); } @@ -146,6 +145,69 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +{ + struct fib6_info *f6i; + + f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (!f6i) + return NULL; + + f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!f6i->rt6i_pcpu) { + kfree(f6i); + return NULL; + } + + INIT_LIST_HEAD(&f6i->fib6_siblings); + f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; + + atomic_inc(&f6i->fib6_ref); + + return f6i; +} + +void fib6_info_destroy(struct fib6_info *f6i) +{ + struct rt6_exception_bucket *bucket; + struct dst_metrics *m; + + WARN_ON(f6i->fib6_node); + + bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); + if (bucket) 
{ + f6i->rt6i_exception_bucket = NULL; + kfree(bucket); + } + + if (f6i->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + } + + if (f6i->fib6_nh.nh_dev) + dev_put(f6i->fib6_nh.nh_dev); + + m = f6i->fib6_metrics; + if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt)) + kfree(m); + + kfree(f6i); +} +EXPORT_SYMBOL_GPL(fib6_info_destroy); + static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; @@ -176,28 +238,6 @@ static void node_free(struct net *net, struct fib6_node *fn) net->ipv6.rt6_stats->fib_nodes--; } -void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) -{ - int cpu; - - if (!non_pcpu_rt->rt6i_pcpu) - return; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } -} -EXPORT_SYMBOL_GPL(rt6_free_pcpu); - static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); @@ -232,7 +272,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -314,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, return &rt->dst; } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags); +} + static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); @@ -340,7 +387,7 @@ unsigned int fib6_tables_seq_read(struct net *net) static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, - struct rt6_info *rt) + struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, @@ -351,7 +398,7 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, static int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, - struct rt6_info *rt, + struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { @@ -359,7 +406,7 @@ static int call_fib6_entry_notifiers(struct net *net, .rt = rt, }; - rt->rt6i_table->fib_seq++; + rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } @@ -368,16 +415,16 @@ struct fib6_dump_arg { struct notifier_block *nb; }; -static void fib6_rt_dump(struct rt6_info *rt, struct fib6_dump_arg *arg) +static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { - if (rt == arg->net->ipv6.ip6_null_entry) + if (rt == arg->net->ipv6.fib6_null_entry) return; call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt); } static int fib6_node_dump(struct fib6_walker *w) { - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); @@ -426,7 +473,7 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb) static int 
fib6_dump_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); @@ -441,10 +488,10 @@ static int fib6_dump_node(struct fib6_walker *w) * last sibling of this route (no need to dump the * sibling routes again) */ - if (rt->rt6i_nsiblings) - rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, - rt6i_siblings); + if (rt->fib6_nsiblings) + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, + fib6_siblings); } w->leaf = NULL; return 0; @@ -579,6 +626,24 @@ out: return res; } +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) +{ + if (!f6i) + return; + + if (f6i->fib6_metrics == &dst_default_metrics) { + struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + + if (!p) + return; + + refcount_set(&p->refcnt, 1); + f6i->fib6_metrics = p; + } + + f6i->fib6_metrics->metrics[metric - 1] = val; +} + /* * Routing Table * @@ -608,7 +673,7 @@ static struct fib6_node *fib6_add_1(struct net *net, fn = root; do { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); @@ -637,11 +702,11 @@ static struct fib6_node *fib6_add_1(struct net *net, /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); - rt6_release(leaf); + fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == - net->ipv6.ip6_null_entry) { + net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } @@ -750,7 +815,7 @@ insert_above: RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; atomic_inc(&rcu_dereference_protected(in->leaf, - lockdep_is_held(&table->tb6_lock))->rt6i_ref); + lockdep_is_held(&table->tb6_lock))->fib6_ref); /* update parent pointer */ if (dir) @@ -802,44 +867,37 @@ insert_above: return ln; } -static void fib6_copy_metrics(u32 *mp, const struct mx6_config *mxc) +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) { - int i; - - for (i = 0; i < RTAX_MAX; i++) { - if (test_bit(i, mxc->mx_valid)) - mp[i] = mxc->mx[i]; - } -} - -static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) -{ - if (!mxc->mx) - return 0; - - if (dst->flags & DST_HOST) { - u32 *mp = dst_metrics_write_ptr(dst); + int cpu; - if (unlikely(!mp)) - return -ENOMEM; + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; - fib6_copy_metrics(mp, mxc); - } else { - dst_init_metrics(dst, mxc->mx, false); + ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + struct fib6_info *from; - /* We've stolen mx now. */ - mxc->mx = NULL; + from = rcu_dereference_protected(pcpu_rt->from, + lockdep_is_held(&table->tb6_lock)); + rcu_assign_pointer(pcpu_rt->from, NULL); + fib6_info_release(from); + } } - - return 0; } -static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, +static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; - if (atomic_read(&rt->rt6i_ref) != 1) { + if (atomic_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. 
It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes @@ -847,18 +905,22 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_leaf; + struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); - atomic_inc(&new_leaf->rt6i_ref); + atomic_inc(&new_leaf->fib6_ref); + rcu_assign_pointer(fn->leaf, new_leaf); - rt6_release(rt); + fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } + + if (rt->rt6i_pcpu) + fib6_drop_pcpu_from(rt, table); } } @@ -866,15 +928,15 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * Insert routing information in a node. */ -static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, +static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack) { - struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct rt6_info *iter = NULL; - struct rt6_info __rcu **ins; - struct rt6_info __rcu **fallback_ins = NULL; + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + struct fib6_info *iter = NULL; + struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -890,13 +952,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; for (iter = leaf; iter; - iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock))) { + iter = rcu_dereference_protected(iter->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ - if (iter->rt6i_metric == rt->rt6i_metric) { + if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ @@ -916,15 +978,15 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, } if (rt6_duplicate_nexthop(iter, rt)) { - if (rt->rt6i_nsiblings) - rt->rt6i_nsiblings = 0; - if (!(iter->rt6i_flags & RTF_EXPIRES)) + if (rt->fib6_nsiblings) + rt->fib6_nsiblings = 0; + if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); + if (!(rt->fib6_flags & RTF_EXPIRES)) + fib6_clean_expires(iter); else - rt6_set_expires(iter, rt->dst.expires); - iter->rt6i_pmtu = rt->rt6i_pmtu; + fib6_set_expires(iter, rt->expires); + fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, @@ -940,21 +1002,21 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - rt->rt6i_nsiblings++; + rt->fib6_nsiblings++; } - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; next_iter: - ins = &iter->rt6_next; + ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + 
lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } @@ -963,34 +1025,34 @@ next_iter: fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (rt->rt6i_nsiblings) { - unsigned int rt6i_nsiblings; - struct rt6_info *sibling, *temp_sibling; + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; + struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; while (sibling) { - if (sibling->rt6i_metric == rt->rt6i_metric && + if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->rt6i_siblings, - &sibling->rt6i_siblings); + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } - sibling = rcu_dereference_protected(sibling->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - rt6i_nsiblings = 0; + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->rt6i_siblings, rt6i_siblings) { - sibling->rt6i_nsiblings++; - BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); - rt6i_nsiblings++; + &rt->fib6_siblings, fib6_siblings) { + sibling->fib6_nsiblings++; + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } @@ -1003,9 +1065,6 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, @@ -1013,9 +1072,9 @@ add: if (err) return err; - rcu_assign_pointer(rt->rt6_next, iter); - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->fib6_next, iter); + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -1036,19 +1095,15 @@ add: return -ENOENT; } - err = fib6_commit_metrics(&rt->dst, mxc); - if (err) - return err; - err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; - atomic_inc(&rt->rt6i_ref); - rcu_assign_pointer(rt->rt6i_node, fn); - rt->rt6_next = iter->rt6_next; + atomic_inc(&rt->fib6_ref); + rcu_assign_pointer(rt->fib6_node, fn); + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); @@ -1056,35 +1111,35 @@ add: info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } - nsiblings = iter->rt6i_nsiblings; - iter->rt6i_node = NULL; + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - ins = &rt->rt6_next; + ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { - *ins = iter->rt6_next; - iter->rt6i_node = NULL; + *ins = iter->fib6_next; + 
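				/* iter is unlinked from fn->leaf; clear its node
				 * pointer, then fib6_purge_rt() and
				 * fib6_info_release() below drop its references.
				 */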
iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - rt6_release(iter); + fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { - ins = &iter->rt6_next; + ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1093,10 +1148,10 @@ add: return 0; } -static void fib6_start_gc(struct net *net, struct rt6_info *rt) +static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -1108,22 +1163,22 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } -static void __fib6_update_sernum_upto_root(struct rt6_info *rt, +static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { - struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in rt6_get_cookie_safe() */ smp_wmb(); while (fn) { fn->fn_sernum = sernum; fn = rcu_dereference_protected(fn->parent, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); } } -void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } @@ -1135,22 +1190,16 @@ void fib6_update_sernum_upto_root(struct net *net, struct rt6_info *rt) * Need to own table->tb6_lock */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, - struct nl_info *info, struct mx6_config *mxc, - struct netlink_ext_ack *extack) +int fib6_add(struct fib6_node *root, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack) { - struct fib6_table *table = rt->rt6i_table; + struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; int sernum = fib6_new_sernum(info->nl_net); - if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) - return -EINVAL; - if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; @@ -1161,8 +1210,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, - &rt->rt6i_dst.addr, rt->rt6i_dst.plen, - offsetof(struct rt6_info, rt6i_dst), allow_create, + &rt->fib6_dst.addr, rt->fib6_dst.plen, + offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); @@ -1173,7 +1222,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, pn = fn; #ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen) { + if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { @@ -1194,16 +1243,16 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!sfn) goto failure; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref); 
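			/* publish fib6_null_entry (reference taken above) as
			 * the leaf of the new subtree root.
			 */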
rcu_assign_pointer(sfn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1221,8 +1270,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), - &rt->rt6i_src.addr, rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { @@ -1235,9 +1284,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, - info->nl_net->ipv6.ip6_null_entry); + info->nl_net->ipv6.fib6_null_entry); } else { - atomic_inc(&rt->rt6i_ref); + atomic_inc(&rt->fib6_ref); rcu_assign_pointer(fn->leaf, rt); } } @@ -1245,7 +1294,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, mxc, extack); + err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); @@ -1259,13 +1308,13 @@ out: * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { - struct rt6_info *pn_leaf = + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); - atomic_dec(&rt->rt6i_ref); + fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, @@ -1274,10 +1323,10 @@ out: if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = - info->nl_net->ipv6.ip6_null_entry; + info->nl_net->ipv6.fib6_null_entry; } #endif - atomic_inc(&pn_leaf->rt6i_ref); + fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } @@ -1299,10 +1348,6 @@ failure: (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); return err; } @@ -1312,12 +1357,12 @@ failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; -static struct fib6_node *fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -1350,7 +1395,7 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) @@ -1362,7 +1407,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(subtree, args + 1); + sfn = fib6_node_lookup_1(subtree, + args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1384,18 +1430,19 @@ backtrack: /* 
called with rcu_read_lock() held */ -struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, - const struct in6_addr *saddr) +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct rt6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct rt6_info, rt6i_src), + .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif @@ -1404,7 +1451,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad } }; - fn = fib6_lookup_1(root, daddr ? args : args + 1); + fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; @@ -1431,7 +1478,7 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ @@ -1480,7 +1527,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst), + offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES @@ -1491,7 +1538,7 @@ struct fib6_node *fib6_locate(struct fib6_node *root, if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src), + offsetof(struct fib6_info, fib6_src), exact_match); } } @@ -1510,14 +1557,14 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, +static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, @@ -1554,7 +1601,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, /* Set fn->leaf to null_entry for root node. 
*/ if (fn->fn_flags & RTN_TL_ROOT) { - rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } @@ -1569,11 +1616,11 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); - struct rt6_info *new_fn_leaf; + struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; @@ -1599,10 +1646,10 @@ static struct fib6_node *fib6_repair_tree(struct net *net, #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); - new_fn_leaf = net->ipv6.ip6_null_entry; + new_fn_leaf = net->ipv6.fib6_null_entry; } #endif - atomic_inc(&new_fn_leaf->rt6i_ref); + fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } @@ -1658,26 +1705,24 @@ static struct fib6_node *fib6_repair_tree(struct net *net, return pn; RCU_INIT_POINTER(pn->leaf, NULL); - rt6_release(pn_leaf); + fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, - struct rt6_info __rcu **rtp, struct nl_info *info) + struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = rcu_dereference_protected(*rtp, + struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); - WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); - /* Unlink it */ - *rtp = rt->rt6_next; - rt->rt6i_node = NULL; + *rtp = rt->fib6_next; + rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; @@ -1689,14 +1734,14 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fn->rr_ptr = NULL; /* Remove this entry from other siblings */ - if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + if (rt->fib6_nsiblings) { + struct fib6_info *sibling, *next_sibling; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) - sibling->rt6i_nsiblings--; - rt->rt6i_nsiblings = 0; - list_del_init(&rt->rt6i_siblings); + &rt->fib6_siblings, fib6_siblings) + sibling->fib6_nsiblings--; + rt->fib6_nsiblings = 0; + list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } @@ -1705,7 +1750,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rcu_dereference_protected(rt->rt6_next, + w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; @@ -1730,46 +1775,36 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); - rt6_release(rt); + fib6_info_release(rt); } /* Need to own table->tb6_lock */ -int fib6_del(struct rt6_info *rt, struct nl_info *info) +int fib6_del(struct fib6_info *rt, struct nl_info *info) { - struct fib6_node 
*fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); - struct fib6_table *table = rt->rt6i_table; + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + struct fib6_table *table = rt->fib6_table; struct net *net = info->nl_net; - struct rt6_info __rcu **rtp; - struct rt6_info __rcu **rtp_next; + struct fib6_info __rcu **rtp; + struct fib6_info __rcu **rtp_next; -#if RT6_DEBUG >= 2 - if (rt->dst.obsolete > 0) { - WARN_ON(fn); - return -ENOENT; - } -#endif - if (!fn || rt == net->ipv6.ip6_null_entry) + if (!fn || rt == net->ipv6.fib6_null_entry) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - /* remove cached dst from exception table */ - if (rt->rt6i_flags & RTF_CACHE) - return rt6_remove_exception_rt(rt); - /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { - struct rt6_info *cur = rcu_dereference_protected(*rtp, + struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { fib6_del_route(table, fn, rtp, info); return 0; } - rtp_next = &cur->rt6_next; + rtp_next = &cur->fib6_next; } return -ENOENT; } @@ -1907,7 +1942,7 @@ static int fib6_walk(struct net *net, struct fib6_walker *w) static int fib6_clean_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, @@ -1932,17 +1967,17 @@ static int fib6_clean_node(struct fib6_walker *w) #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, - rcu_access_pointer(rt->rt6i_node), + rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { - if (WARN_ON(!rt->rt6i_nsiblings)) + if (WARN_ON(!rt->fib6_nsiblings)) continue; - rt = list_last_entry(&rt->rt6i_siblings, - struct rt6_info, rt6i_siblings); + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); @@ -1961,7 +1996,7 @@ static int fib6_clean_node(struct fib6_walker *w) */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, - int (*func)(struct rt6_info *, void *arg), + int (*func)(struct fib6_info *, void *arg), int sernum, void *arg) { struct fib6_cleaner c; @@ -1979,7 +2014,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, } static void __fib6_clean_all(struct net *net, - int (*func)(struct rt6_info *, void *), + int (*func)(struct fib6_info *, void *), int sernum, void *arg) { struct fib6_table *table; @@ -1999,7 +2034,7 @@ static void __fib6_clean_all(struct net *net, rcu_read_unlock(); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); @@ -2016,7 +2051,7 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct rt6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; @@ -2026,8 +2061,8 @@ static int fib6_age(struct rt6_info *rt, void *arg) * Routes are expired even if they are in use. 
*/ - if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { - if (time_after(now, rt->dst.expires)) { + if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { + if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } @@ -2110,7 +2145,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2122,7 +2157,7 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, - net->ipv6.ip6_null_entry); + net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2220,25 +2255,26 @@ struct ipv6_route_iter { static int ipv6_route_seq_show(struct seq_file *seq, void *v) { - struct rt6_info *rt = v; + struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + const struct net_device *dev; - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES - seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->rt6i_flags & RTF_GATEWAY) - seq_printf(seq, "%pi6", &rt->rt6i_gateway); + if (rt->fib6_flags & RTF_GATEWAY) + seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw); else seq_puts(seq, "00000000000000000000000000000000"); + dev = rt->fib6_nh.nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); + rt->fib6_metric, atomic_read(&rt->fib6_ref), 0, + rt->fib6_flags, dev ? 
dev->name : ""); iter->w.leaf = NULL; return 0; } @@ -2252,7 +2288,7 @@ static int ipv6_route_yield(struct fib6_walker *w) do { iter->w.leaf = rcu_dereference_protected( - iter->w.leaf->rt6_next, + iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) @@ -2311,14 +2347,14 @@ static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; - struct rt6_info *n; + struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (!v) goto iter_table; - n = rcu_dereference_bh(((struct rt6_info *)v)->rt6_next); + n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next); if (n) { ++*pos; return n; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 458de353f5d9..c8cf2fdbb13b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -848,7 +848,7 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) } /** - * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * @@ -937,6 +937,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct flowi6 fl6; int err = -EINVAL; __u32 mtu; + int nhoff; + int thoff; if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) goto tx_err; @@ -949,6 +951,16 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, truncate = true; } + nhoff = skb_network_header(skb) - skb_mac_header(skb); + if (skb->protocol == htons(ETH_P_IP) && + (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) + truncate = true; + + thoff = skb_transport_header(skb) - skb_mac_header(skb); + if (skb->protocol == htons(ETH_P_IPV6) && + (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) + truncate = true; + if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen)) goto tx_err; @@ -1376,6 +1388,7 @@ static void ip6gre_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); } @@ -1443,11 +1456,12 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) return -ENOMEM; ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } + if (ret) + goto cleanup_alloc_pcpu_stats; + + ret = gro_cells_init(&tunnel->gro_cells, dev); + if (ret) + goto cleanup_dst_cache_init; t_hlen = ip6gre_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; @@ -1463,6 +1477,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) ip6gre_tnl_init_features(dev); return 0; + +cleanup_dst_cache_init: + dst_cache_destroy(&tunnel->dst_cache); +cleanup_alloc_pcpu_stats: + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; } static int ip6gre_tunnel_init(struct net_device *dev) @@ -1822,11 +1843,12 @@ static int ip6erspan_tap_init(struct net_device *dev) return -ENOMEM; ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } + if (ret) + goto cleanup_alloc_pcpu_stats; + + ret = gro_cells_init(&tunnel->gro_cells, dev); + if (ret) + goto cleanup_dst_cache_init; t_hlen = ip6erspan_calc_hlen(tunnel); dev->mtu = ETH_DATA_LEN - t_hlen; @@ -1839,6 +1861,13 @@ static int ip6erspan_tap_init(struct net_device *dev) ip6erspan_tnl_link_config(tunnel, 
1); return 0; + +cleanup_dst_cache_init: + dst_cache_destroy(&tunnel->dst_cache); +cleanup_alloc_pcpu_stats: + free_percpu(dev->tstats); + dev->tstats = NULL; + return ret; } static const struct net_device_ops ip6erspan_netdev_ops = { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 9ee208a348f5..f08d34491ece 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -336,7 +336,7 @@ int ip6_mc_input(struct sk_buff *skb) bool deliver; __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); hdr = ipv6_hdr(skb); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 4a87f9428ca5..5b3f2f89ef41 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -88,9 +88,11 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) - udpfrag = proto == IPPROTO_UDP && encap; + udpfrag = proto == IPPROTO_UDP && encap && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); else - udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + udpfrag = proto == IPPROTO_UDP && !skb->encapsulation && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet6_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7b6d1689087b..60b0d1652448 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } -unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) -{ - unsigned int mtu; - struct inet6_dev *idev; - - if (dst_metric_locked(dst, RTAX_MTU)) { - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - return mtu; - } - - mtu = IPV6_MIN_MTU; - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - - return mtu; -} -EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward); - static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) @@ -425,6 +403,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); @@ -444,8 +423,7 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -476,8 +454,7 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -490,15 +467,13 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -554,8 +529,7 @@ int 
ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); @@ -579,7 +553,7 @@ int ip6_forward(struct sk_buff *skb) ip6_forward_finish); error: - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; @@ -966,15 +940,21 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, * that's why we try it again later. */ if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { + struct fib6_info *from; struct rt6_info *rt; bool had_dst = *dst != NULL; if (!had_dst) *dst = ip6_route_output(net, sk, fl6); rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; - err = ip6_route_get_saddr(net, rt, &fl6->daddr, + + rcu_read_lock(); + from = rt ? rcu_dereference(rt->from) : NULL; + err = ip6_route_get_saddr(net, from, &fl6->daddr, sk ? inet6_sk(sk)->srcprefs : 0, &fl6->saddr); + rcu_read_unlock(); + if (err) goto out_err_release; @@ -1238,6 +1218,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (mtu < IPV6_MIN_MTU) return -EINVAL; cork->base.fragsize = mtu; + cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0; + if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; cork->base.length = 0; @@ -1272,6 +1254,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; + bool paged; skb = skb_peek_tail(queue); if (!skb) { @@ -1279,7 +1262,8 @@ static int __ip6_append_data(struct sock *sk, dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; } - mtu = cork->fragsize; + paged = !!cork->gso_size; + mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1327,7 +1311,7 @@ emsgsize: if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && headersize == sizeof(struct ipv6hdr) && length <= mtu - headersize && - !(flags & MSG_MORE) && + (!(flags & MSG_MORE) || cork->gso_size) && rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) csummode = CHECKSUM_PARTIAL; @@ -1370,6 +1354,7 @@ emsgsize: unsigned int fraglen; unsigned int fraggap; unsigned int alloclen; + unsigned int pagedlen = 0; alloc_new_skb: /* There's no room in the current skb */ if (skb) @@ -1392,11 +1377,17 @@ alloc_new_skb: if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? 
mtu : maxfraglen) - fragheaderlen) datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; - else - alloclen = datalen + fragheaderlen; + else if (!paged) + alloclen = fraglen; + else { + alloclen = min_t(int, fraglen, MAX_HEADER); + pagedlen = fraglen - alloclen; + } alloclen += dst_exthdrlen; @@ -1418,7 +1409,7 @@ alloc_new_skb: */ alloclen += sizeof(struct frag_hdr); - copy = datalen - transhdrlen - fraggap; + copy = datalen - transhdrlen - fraggap - pagedlen; if (copy < 0) { err = -EINVAL; goto error; @@ -1457,7 +1448,7 @@ alloc_new_skb: /* * Find where to start putting bytes */ - data = skb_put(skb, fraglen); + data = skb_put(skb, fraglen - pagedlen); skb_set_network_header(skb, exthdrlen); data += fragheaderlen; skb->transport_header = (skb->network_header + @@ -1480,7 +1471,7 @@ alloc_new_skb: } offset += copy; - length -= datalen - fraggap; + length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; @@ -1754,9 +1745,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk, void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, + struct inet_cork_full *cork, const struct sockcm_cookie *sockc) { - struct inet_cork_full cork; struct inet6_cork v6_cork; struct sk_buff_head queue; int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); @@ -1767,27 +1758,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk, __skb_queue_head_init(&queue); - cork.base.flags = 0; - cork.base.addr = 0; - cork.base.opt = NULL; - cork.base.dst = NULL; + cork->base.flags = 0; + cork->base.addr = 0; + cork->base.opt = NULL; + cork->base.dst = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); + err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); if (err) { - ip6_cork_release(&cork, &v6_cork); + ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; - err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, + err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, flags, ipc6, sockc); if (err) { - __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); + __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); } - return __ip6_make_skb(sk, &queue, &cork, &v6_cork); + return __ip6_make_skb(sk, &queue, cork, &v6_cork); } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index ca957dd93a29..b7f28deddaea 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -743,7 +743,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) } /** - * vti6_tnl_ioctl - configure vti6 tunnels from userspace + * vti6_ioctl - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: parameters passed from userspace * @cmd: command to be performed diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 298fd8b6ed17..20a419ee8000 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -180,7 +180,8 @@ static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = { }; static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, - struct fib_rule_hdr *frh, struct nlattr **tb) + struct fib_rule_hdr *frh, struct nlattr **tb, + struct netlink_ext_ack *extack) { return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
9de4dfb126ba..9ac5366064e3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1155,7 +1155,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); struct neighbour *neigh = NULL; struct inet6_dev *in6_dev; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; + struct net *net; int lifetime; struct ndisc_options ndopts; int optlen; @@ -1253,9 +1254,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) /* Do not accept RA with source-addr found on local machine unless * accept_ra_from_local is set to true. */ + net = dev_net(in6_dev->dev); if (!in6_dev->cnf.accept_ra_from_local && - ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, - in6_dev->dev, 0)) { + ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { ND_PRINTK(2, info, "RA from local address detected on dev: %s: default router ignored\n", skb->dev->name); @@ -1272,20 +1273,22 @@ static void ndisc_router_discovery(struct sk_buff *skb) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - rt = rt6_get_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev); + rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); if (rt) { - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } } if (rt && lifetime == 0) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -1294,7 +1297,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (!rt && lifetime) { ND_PRINTK(3, info, "RA: adding default router\n"); - rt = rt6_add_dflt_router(&ipv6_hdr(skb)->saddr, skb->dev, pref); + rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, + skb->dev, pref); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", @@ -1302,28 +1306,29 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = dst_neigh_lookup(&rt->dst, &ipv6_hdr(skb)->saddr); + neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw, + rt->fib6_nh.nh_dev, NULL, + &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, "RA: %s got default router without neighbour\n", __func__); - ip6_rt_put(rt); + fib6_info_release(rt); return; } neigh->flags |= NTF_ROUTER; } else if (rt) { - rt->rt6i_flags = (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); } if (rt) - rt6_set_expires(rt, jiffies + (HZ * lifetime)); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); + fib6_metric_set(rt, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } @@ -1475,10 +1480,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { in6_dev->cnf.mtu6 = mtu; - - if (rt) - dst_metric_set(&rt->dst, RTAX_MTU, mtu); - + fib6_metric_set(rt, RTAX_MTU, mtu); rt6_mtu_change(skb->dev, mtu); } } @@ -1497,7 +1499,7 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: - ip6_rt_put(rt); + fib6_info_release(rt); if (neigh) 
neigh_release(neigh); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 97f79dc943d7..3dc9af775ce2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -529,7 +529,6 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .family = NFPROTO_IPV6, }; - t = ip6t_get_target(e); return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c index 92c0047e7e33..491f808e356a 100644 --- a/net/ipv6/netfilter/ip6t_MASQUERADE.c +++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c @@ -29,7 +29,7 @@ masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c index 33719d5560c8..1059894a6f4c 100644 --- a/net/ipv6/netfilter/ip6t_srh.c +++ b/net/ipv6/netfilter/ip6t_srh.c @@ -117,6 +117,130 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) return true; } +static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) +{ + int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0; + const struct ip6t_srh1 *srhinfo = par->matchinfo; + struct in6_addr *psid, *nsid, *lsid; + struct in6_addr _psid, _nsid, _lsid; + struct ipv6_sr_hdr *srh; + struct ipv6_sr_hdr _srh; + + if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) + return false; + srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); + if (!srh) + return false; + + hdrlen = ipv6_optlen(srh); + if (skb->len - srhoff < hdrlen) + return false; + + if (srh->type != IPV6_SRCRT_TYPE_4) + return false; + + if (srh->segments_left > srh->first_segment) + return false; + + /* Next Header matching */ + if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, + !(srh->nexthdr == srhinfo->next_hdr))) + return false; + + /* Header Extension Length matching */ + if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, + !(srh->hdrlen == srhinfo->hdr_len))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, + !(srh->hdrlen > srhinfo->hdr_len))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, + !(srh->hdrlen < srhinfo->hdr_len))) + return false; + + /* Segments Left matching */ + if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, + !(srh->segments_left == srhinfo->segs_left))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, + !(srh->segments_left > srhinfo->segs_left))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, + !(srh->segments_left < srhinfo->segs_left))) + return false; + + /** + * Last Entry matching + * Last_Entry field was introduced in revision 6 of the SRH draft. 
+ * It was called First_Segment in the previous revision + */ + if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, + !(srh->first_segment == srhinfo->last_entry))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, + !(srh->first_segment > srhinfo->last_entry))) + return false; + if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, + !(srh->first_segment < srhinfo->last_entry))) + return false; + + /** + * Tag matchig + * Tag field was introduced in revision 6 of the SRH draft + */ + if (srhinfo->mt_flags & IP6T_SRH_TAG) + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, + !(srh->tag == srhinfo->tag))) + return false; + + /* Previous SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_PSID) { + if (srh->segments_left == srh->first_segment) + return false; + psidoff = srhoff + sizeof(struct ipv6_sr_hdr) + + ((srh->segments_left + 1) * sizeof(struct in6_addr)); + psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID, + ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk, + &srhinfo->psid_addr))) + return false; + } + + /* Next SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_NSID) { + if (srh->segments_left == 0) + return false; + nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) + + ((srh->segments_left - 1) * sizeof(struct in6_addr)); + nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID, + ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk, + &srhinfo->nsid_addr))) + return false; + } + + /* Last SID matching */ + if (srhinfo->mt_flags & IP6T_SRH_LSID) { + lsidoff = srhoff + sizeof(struct ipv6_sr_hdr); + lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid); + if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID, + ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk, + &srhinfo->lsid_addr))) + return false; + } + return true; +} + static int srh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; @@ -136,23 +260,54 @@ static int srh_mt6_check(const struct xt_mtchk_param *par) return 0; } -static struct xt_match srh_mt6_reg __read_mostly = { - .name = "srh", - .family = NFPROTO_IPV6, - .match = srh_mt6, - .matchsize = sizeof(struct ip6t_srh), - .checkentry = srh_mt6_check, - .me = THIS_MODULE, +static int srh1_mt6_check(const struct xt_mtchk_param *par) +{ + const struct ip6t_srh1 *srhinfo = par->matchinfo; + + if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { + pr_info_ratelimited("unknown srh match flags %X\n", + srhinfo->mt_flags); + return -EINVAL; + } + + if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { + pr_info_ratelimited("unknown srh invflags %X\n", + srhinfo->mt_invflags); + return -EINVAL; + } + + return 0; +} + +static struct xt_match srh_mt6_reg[] __read_mostly = { + { + .name = "srh", + .revision = 0, + .family = NFPROTO_IPV6, + .match = srh_mt6, + .matchsize = sizeof(struct ip6t_srh), + .checkentry = srh_mt6_check, + .me = THIS_MODULE, + }, + { + .name = "srh", + .revision = 1, + .family = NFPROTO_IPV6, + .match = srh1_mt6, + .matchsize = sizeof(struct ip6t_srh1), + .checkentry = srh1_mt6_check, + .me = THIS_MODULE, + } }; static int __init srh_mt6_init(void) { - return xt_register_match(&srh_mt6_reg); + return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } static void __exit srh_mt6_exit(void) { - xt_unregister_match(&srh_mt6_reg); + xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } 
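/* Editor's note: illustrative sketch only, not part of this patch.  It
 * restates the offset arithmetic used by the revision-1 'srh' match above:
 * starting from the byte offset of the routing header, the previous, next
 * and last SIDs sit at multiples of sizeof(struct in6_addr) past the fixed
 * SRH header, indexed by segments_left.  The types and names below are
 * simplified stand-ins for the kernel ones, and the caller is assumed to
 * have already validated segments_left against the segment list bounds.
 */
#include <stddef.h>
#include <stdio.h>

#define IN6_ADDR_LEN	16	/* sizeof(struct in6_addr) */
#define SRH_FIXED_LEN	8	/* fixed part of the segment routing header */

/* Offset of the SID that was already visited (index segments_left + 1). */
static size_t srh_prev_sid_off(size_t srhoff, unsigned int segments_left)
{
	return srhoff + SRH_FIXED_LEN +
	       ((size_t)segments_left + 1) * IN6_ADDR_LEN;
}

/* Offset of the SID that will be visited next (index segments_left - 1). */
static size_t srh_next_sid_off(size_t srhoff, unsigned int segments_left)
{
	return srhoff + SRH_FIXED_LEN +
	       ((size_t)segments_left - 1) * IN6_ADDR_LEN;
}

/* Offset of the last SID of the path, which is always list entry 0. */
static size_t srh_last_sid_off(size_t srhoff)
{
	return srhoff + SRH_FIXED_LEN;
}

int main(void)
{
	size_t srhoff = 40;		/* e.g. right after the IPv6 header */
	unsigned int segments_left = 2;

	printf("prev SID at %zu, next SID at %zu, last SID at %zu\n",
	       srh_prev_sid_off(srhoff, segments_left),
	       srh_next_sid_off(srhoff, segments_left),
	       srh_last_sid_off(srhoff));
	return 0;
}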
module_init(srh_mt6_init); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 47306e45a80a..2bf554e18af8 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -35,8 +35,7 @@ static const struct xt_table nf_nat_ipv6_table = { static unsigned int ip6table_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat); } diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index 207cb35569b1..c511d206bf9b 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -3,256 +3,12 @@ #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> -#include <linux/ipv6.h> -#include <linux/netdevice.h> -#include <net/ipv6.h> -#include <net/ip6_route.h> -#include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> -/* For layer 4 checksum field offset. */ -#include <linux/tcp.h> -#include <linux/udp.h> - -static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) -{ - struct tcphdr *tcph; - - if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || - skb_try_make_writable(skb, thoff + sizeof(*tcph))) - return -1; - - tcph = (void *)(skb_network_header(skb) + thoff); - inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, - new_addr->s6_addr32, true); - - return 0; -} - -static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) -{ - struct udphdr *udph; - - if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || - skb_try_make_writable(skb, thoff + sizeof(*udph))) - return -1; - - udph = (void *)(skb_network_header(skb) + thoff); - if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, - new_addr->s6_addr32, true); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - - return 0; -} - -static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, struct in6_addr *addr, - struct in6_addr *new_addr) -{ - switch (ip6h->nexthdr) { - case IPPROTO_TCP: - if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - case IPPROTO_UDP: - if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; - break; - } - - return 0; -} - -static int nf_flow_snat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - struct in6_addr addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = ip6h->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; - ip6h->saddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = ip6h->daddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; - ip6h->daddr = new_addr; - break; - default: - return -1; - } - - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); -} - -static int nf_flow_dnat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) -{ - struct in6_addr addr, new_addr; - - switch (dir) { - case FLOW_OFFLOAD_DIR_ORIGINAL: - addr = ip6h->daddr; - new_addr = 
flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; - ip6h->daddr = new_addr; - break; - case FLOW_OFFLOAD_DIR_REPLY: - addr = ip6h->saddr; - new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; - ip6h->saddr = new_addr; - break; - default: - return -1; - } - - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); -} - -static int nf_flow_nat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, - enum flow_offload_tuple_dir dir) -{ - struct ipv6hdr *ip6h = ipv6_hdr(skb); - unsigned int thoff = sizeof(*ip6h); - - if (flow->flags & FLOW_OFFLOAD_SNAT && - (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - if (flow->flags & FLOW_OFFLOAD_DNAT && - (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - - return 0; -} - -static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple) -{ - struct flow_ports *ports; - struct ipv6hdr *ip6h; - unsigned int thoff; - - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -1; - - ip6h = ipv6_hdr(skb); - - if (ip6h->nexthdr != IPPROTO_TCP && - ip6h->nexthdr != IPPROTO_UDP) - return -1; - - thoff = sizeof(*ip6h); - if (!pskb_may_pull(skb, thoff + sizeof(*ports))) - return -1; - - ports = (struct flow_ports *)(skb_network_header(skb) + thoff); - - tuple->src_v6 = ip6h->saddr; - tuple->dst_v6 = ip6h->daddr; - tuple->src_port = ports->source; - tuple->dst_port = ports->dest; - tuple->l3proto = AF_INET6; - tuple->l4proto = ip6h->nexthdr; - tuple->iifidx = dev->ifindex; - - return 0; -} - -/* Based on ip_exceeds_mtu(). */ -static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) -{ - if (skb->len <= mtu) - return false; - - if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) - return false; - - return true; -} - -static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt) -{ - u32 mtu; - - mtu = ip6_dst_mtu_forward(&rt->dst); - if (__nf_flow_exceeds_mtu(skb, mtu)) - return true; - - return false; -} - -unsigned int -nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct flow_offload_tuple_rhash *tuplehash; - struct nf_flowtable *flow_table = priv; - struct flow_offload_tuple tuple = {}; - enum flow_offload_tuple_dir dir; - struct flow_offload *flow; - struct net_device *outdev; - struct in6_addr *nexthop; - struct ipv6hdr *ip6h; - struct rt6_info *rt; - - if (skb->protocol != htons(ETH_P_IPV6)) - return NF_ACCEPT; - - if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) - return NF_ACCEPT; - - tuplehash = flow_offload_lookup(flow_table, &tuple); - if (tuplehash == NULL) - return NF_ACCEPT; - - outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); - if (!outdev) - return NF_ACCEPT; - - dir = tuplehash->tuple.dir; - flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - if (unlikely(nf_flow_exceeds_mtu(skb, rt))) - return NF_ACCEPT; - - if (skb_try_make_writable(skb, sizeof(*ip6h))) - return NF_DROP; - - if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && - nf_flow_nat_ipv6(flow, skb, dir) < 0) - return NF_DROP; - - flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; - ip6h = ipv6_hdr(skb); - ip6h->hop_limit--; - - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); - 
neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - - return NF_STOLEN; -} -EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); static struct nf_flowtable_type flowtable_ipv6 = { .family = NFPROTO_IPV6, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_ipv6_hook, .owner = THIS_MODULE, diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 6b7f075f811f..f1582b6f9588 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -62,7 +62,7 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, #endif static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; @@ -151,7 +151,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], @@ -257,8 +257,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -303,7 +302,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, if (!nf_nat_initialized(ct, maniptype)) { unsigned int ret; - ret = do_chain(priv, skb, state, ct); + ret = do_chain(priv, skb, state); if (ret != NF_ACCEPT) return ret; @@ -343,8 +342,7 @@ nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { unsigned int ret; struct in6_addr daddr = ipv6_hdr(skb)->daddr; @@ -363,8 +361,7 @@ nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { #ifdef CONFIG_XFRM const struct nf_conn *ct; @@ -400,8 +397,7 @@ nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state, unsigned int (*do_chain)(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct)) + const struct nf_hook_state *state)) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 98f61fcb9108..9dfc2b90c362 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -26,14 +26,14 @@ static atomic_t v6_worker_count; unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED 
|| diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 57593b00c5b4..d9bf42ba44fa 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -32,7 +32,7 @@ icmpv6_in_range(const struct nf_conntrack_tuple *tuple, static void icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 3557b114446c..100a6bd1046a 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -26,8 +26,7 @@ static unsigned int nft_nat_do_chain(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state, - struct nf_conn *ct) + const struct nf_hook_state *state) { struct nft_pktinfo pkt; diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index 4146536e9c15..dd0122f3cffe 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -22,7 +22,7 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_masq *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); range.flags = priv->flags; diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c index a27e424f690d..74269865acc8 100644 --- a/net/ipv6/netfilter/nft_redir_ipv6.c +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -22,7 +22,7 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_redir *priv = nft_expr_priv(expr); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_proto_min) { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 4979610287e2..b939b94e7e91 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -163,7 +163,8 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr, int nhoff) + struct frag_hdr *fhdr, int nhoff, + u32 *prob_offset) { struct sk_buff *prev, *next; struct net_device *dev; @@ -179,11 +180,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - ((u8 *)&fhdr->frag_off - - skb_network_header(skb))); + *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); return -1; } @@ -214,10 +211,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. -DaveM */ - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - offsetof(struct ipv6hdr, payload_len)); + *prob_offset = offsetof(struct ipv6hdr, payload_len); return -1; } if (end > fq->q.len) { @@ -519,15 +513,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb) iif = skb->dev ? 
skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { + u32 prob_offset = 0; int ret; spin_lock(&fq->q.lock); fq->iif = iif; - ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); + ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, + &prob_offset); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); + if (prob_offset) { + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); + } return ret; } @@ -536,7 +537,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return -1; fail_hdr: - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f4d61736c41a..cc24ed3bc334 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -78,7 +78,6 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -97,25 +96,24 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); -static void rt6_dst_from_metrics_check(struct rt6_info *rt); -static int rt6_score_route(struct rt6_info *rt, int oif, int strict); -static size_t rt6_nlmsg_size(struct rt6_info *rt); -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_score_route(struct fib6_info *rt, int oif, int strict); +static size_t rt6_nlmsg_size(struct fib6_info *rt); +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct fib6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); @@ -184,29 +182,10 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } } -static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) -{ - return dst_metrics_write_ptr(&rt->from->dst); -} - -static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) -{ - struct rt6_info *rt = (struct rt6_info *)dst; - - if (rt->rt6i_flags & RTF_PCPU) - return rt6_pcpu_cow_metrics(rt); - else if (rt->rt6i_flags & RTF_CACHE) - return NULL; - else - return dst_cow_metrics_generic(dst, old); -} - -static inline const void *choose_neigh_daddr(struct rt6_info *rt, +static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { - struct 
in6_addr *p = &rt->rt6i_gateway; - if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) @@ -214,18 +193,27 @@ static inline const void *choose_neigh_daddr(struct rt6_info *rt, return daddr; } -static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, - struct sk_buff *skb, - const void *daddr) +struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, + struct net_device *dev, + struct sk_buff *skb, + const void *daddr) { - struct rt6_info *rt = (struct rt6_info *) dst; struct neighbour *n; - daddr = choose_neigh_daddr(rt, skb, daddr); - n = __ipv6_neigh_lookup(dst->dev, daddr); + daddr = choose_neigh_daddr(gw, skb, daddr); + n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; - return neigh_create(&nd_tbl, daddr, dst->dev); + return neigh_create(&nd_tbl, daddr, dev); +} + +static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, + struct sk_buff *skb, + const void *daddr) +{ + const struct rt6_info *rt = container_of(dst, struct rt6_info, dst); + + return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) @@ -233,7 +221,7 @@ static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) struct net_device *dev = dst->dev; struct rt6_info *rt = (struct rt6_info *)dst; - daddr = choose_neigh_daddr(rt, NULL, daddr); + daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) @@ -250,7 +238,7 @@ static struct dst_ops ip6_dst_ops_template = { .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, - .cow_metrics = ipv6_cow_metrics, + .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, @@ -258,7 +246,7 @@ static struct dst_ops ip6_dst_ops_template = { .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; @@ -288,13 +276,22 @@ static struct dst_ops ip6_dst_blackhole_ops = { .update_pmtu = ip6_rt_blackhole_update_pmtu, .redirect = ip6_rt_blackhole_redirect, .cow_metrics = dst_cow_metrics_generic, - .neigh_lookup = ip6_neigh_lookup, + .neigh_lookup = ip6_dst_neigh_lookup, }; static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; +static const struct fib6_info fib6_null_entry_template = { + .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), + .fib6_protocol = RTPROT_KERNEL, + .fib6_metric = ~(u32)0, + .fib6_ref = ATOMIC_INIT(1), + .fib6_type = RTN_UNREACHABLE, + .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, +}; + static const struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), @@ -305,9 +302,6 @@ static const struct rt6_info ip6_null_entry_template = { .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -322,9 +316,6 @@ static const struct rt6_info ip6_prohibit_entry_template = { .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), }; static const struct rt6_info ip6_blk_hole_entry_template = { @@ -337,9 +328,6 @@ static const struct rt6_info ip6_blk_hole_entry_template = { .output = dst_discard_out, }, 
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), - .rt6i_protocol = RTPROT_KERNEL, - .rt6i_metric = ~(u32) 0, - .rt6i_ref = ATOMIC_INIT(1), }; #endif @@ -349,14 +337,12 @@ static void rt6_info_init(struct rt6_info *rt) struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - INIT_LIST_HEAD(&rt->rt6i_siblings); INIT_LIST_HEAD(&rt->rt6i_uncached); } /* allocate dst with ip6_dst_ops */ -static struct rt6_info *__ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); @@ -368,34 +354,15 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } - -struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) -{ - struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); - - if (rt) { - rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (!rt->rt6i_pcpu) { - dst_release_immediate(&rt->dst); - return NULL; - } - } - - return rt; -} EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from; struct inet6_dev *idev; dst_destroy_metrics_generic(dst); - free_percpu(rt->rt6i_pcpu); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; @@ -403,14 +370,12 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); - if (bucket) { - rt->rt6i_exception_bucket = NULL; - kfree(bucket); - } - rt->from = NULL; - dst_release(&from->dst); + rcu_read_lock(); + from = rcu_dereference(rt->from); + rcu_assign_pointer(rt->from, NULL); + fib6_info_release(from); + rcu_read_unlock(); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -440,23 +405,27 @@ static bool __rt6_check_expired(const struct rt6_info *rt) static bool rt6_check_expired(const struct rt6_info *rt) { + struct fib6_info *from; + + from = rcu_dereference(rt->from); + if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, rt->dst.expires)) return true; - } else if (rt->from) { + } else if (from) { return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || - rt6_check_expired(rt->from); + fib6_check_expired(from); } return false; } -static struct rt6_info *rt6_multipath_select(const struct net *net, - struct rt6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict) { - struct rt6_info *sibling, *next_sibling; + struct fib6_info *sibling, *next_sibling; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. 
@@ -464,12 +433,15 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, if (!fl6->mp_hash) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) + if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound)) return match; - list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings, - rt6i_siblings) { - if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound)) + list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, + fib6_siblings) { + int nh_upper_bound; + + nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound); + if (fl6->mp_hash > nh_upper_bound) continue; if (rt6_score_route(sibling, oif, strict) < 0) break; @@ -484,38 +456,27 @@ static struct rt6_info *rt6_multipath_select(const struct net *net, * Route lookup. rcu_read_lock() should be held. */ -static inline struct rt6_info *rt6_device_match(struct net *net, - struct rt6_info *rt, +static inline struct fib6_info *rt6_device_match(struct net *net, + struct fib6_info *rt, const struct in6_addr *saddr, int oif, int flags) { - struct rt6_info *local = NULL; - struct rt6_info *sprt; + struct fib6_info *sprt; - if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD)) + if (!oif && ipv6_addr_any(saddr) && + !(rt->fib6_nh.nh_flags & RTNH_F_DEAD)) return rt; - for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) { - struct net_device *dev = sprt->dst.dev; + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) { + const struct net_device *dev = sprt->fib6_nh.nh_dev; - if (sprt->rt6i_nh_flags & RTNH_F_DEAD) + if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; if (oif) { if (dev->ifindex == oif) return sprt; - if (dev->flags & IFF_LOOPBACK) { - if (!sprt->rt6i_idev || - sprt->rt6i_idev->dev->ifindex != oif) { - if (flags & RT6_LOOKUP_F_IFACE) - continue; - if (local && - local->rt6i_idev->dev->ifindex == oif) - continue; - } - local = sprt; - } } else { if (ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE)) @@ -523,15 +484,10 @@ static inline struct rt6_info *rt6_device_match(struct net *net, } } - if (oif) { - if (local) - return local; + if (oif && flags & RT6_LOOKUP_F_IFACE) + return net->ipv6.fib6_null_entry; - if (flags & RT6_LOOKUP_F_IFACE) - return net->ipv6.ip6_null_entry; - } - - return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt; + return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -553,10 +509,13 @@ static void rt6_probe_deferred(struct work_struct *w) kfree(work); } -static void rt6_probe(struct rt6_info *rt) +static void rt6_probe(struct fib6_info *rt) { struct __rt6_probe_work *work; + const struct in6_addr *nh_gw; struct neighbour *neigh; + struct net_device *dev; + /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it @@ -565,20 +524,25 @@ static void rt6_probe(struct rt6_info *rt) * Router Reachability Probe MUST be rate-limited * to no more than one per minute. 
*/ - if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) + if (!rt || !(rt->fib6_flags & RTF_GATEWAY)) return; + + nh_gw = &rt->fib6_nh.nh_gw; + dev = rt->fib6_nh.nh_dev; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { + struct inet6_dev *idev; + if (neigh->nud_state & NUD_VALID) goto out; + idev = __in6_dev_get(dev); work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, - neigh->updated + - rt->rt6i_idev->cnf.rtr_probe_interval)) { + neigh->updated + idev->cnf.rtr_probe_interval)) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); @@ -590,9 +554,9 @@ static void rt6_probe(struct rt6_info *rt) if (work) { INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; + work->target = *nh_gw; + dev_hold(dev); + work->dev = dev; schedule_work(&work->work); } @@ -600,7 +564,7 @@ out: rcu_read_unlock_bh(); } #else -static inline void rt6_probe(struct rt6_info *rt) +static inline void rt6_probe(struct fib6_info *rt) { } #endif @@ -608,28 +572,27 @@ static inline void rt6_probe(struct rt6_info *rt) /* * Default Router Selection (RFC 2461 6.3.6) */ -static inline int rt6_check_dev(struct rt6_info *rt, int oif) +static inline int rt6_check_dev(struct fib6_info *rt, int oif) { - struct net_device *dev = rt->dst.dev; + const struct net_device *dev = rt->fib6_nh.nh_dev; + if (!oif || dev->ifindex == oif) return 2; - if ((dev->flags & IFF_LOOPBACK) && - rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif) - return 1; return 0; } -static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) +static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt) { - struct neighbour *neigh; enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; + struct neighbour *neigh; - if (rt->rt6i_flags & RTF_NONEXTHOP || - !(rt->rt6i_flags & RTF_GATEWAY)) + if (rt->fib6_flags & RTF_NONEXTHOP || + !(rt->fib6_flags & RTF_GATEWAY)) return RT6_NUD_SUCCEED; rcu_read_lock_bh(); - neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); + neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev, + &rt->fib6_nh.nh_gw); if (neigh) { read_lock(&neigh->lock); if (neigh->nud_state & NUD_VALID) @@ -650,8 +613,7 @@ static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt) return ret; } -static int rt6_score_route(struct rt6_info *rt, int oif, - int strict) +static int rt6_score_route(struct fib6_info *rt, int oif, int strict) { int m; @@ -659,7 +621,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif, if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF - m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2; + m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2; #endif if (strict & RT6_LOOKUP_F_REACHABLE) { int n = rt6_check_neigh(rt); @@ -669,23 +631,37 @@ static int rt6_score_route(struct rt6_info *rt, int oif, return m; } -static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, - int *mpri, struct rt6_info *match, +/* called with rc_read_lock held */ +static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i) +{ + const struct net_device *dev = fib6_info_nh_dev(f6i); + bool rc = false; + + if (dev) { + const struct inet6_dev *idev = __in6_dev_get(dev); + + rc = !!idev->cnf.ignore_routes_with_linkdown; + } + + return rc; +} + +static struct fib6_info *find_match(struct 
fib6_info *rt, int oif, int strict, + int *mpri, struct fib6_info *match, bool *do_rr) { int m; bool match_do_rr = false; - struct inet6_dev *idev = rt->rt6i_idev; - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) goto out; - if (idev->cnf.ignore_routes_with_linkdown && - rt->rt6i_nh_flags & RTNH_F_LINKDOWN && + if (fib6_ignore_linkdown(rt) && + rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; - if (rt6_check_expired(rt)) + if (fib6_check_expired(rt)) goto out; m = rt6_score_route(rt, oif, strict); @@ -709,19 +685,19 @@ out: return match; } -static struct rt6_info *find_rr_leaf(struct fib6_node *fn, - struct rt6_info *leaf, - struct rt6_info *rr_head, +static struct fib6_info *find_rr_leaf(struct fib6_node *fn, + struct fib6_info *leaf, + struct fib6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match, *cont; + struct fib6_info *rt, *match, *cont; int mpri = -1; match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -730,8 +706,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, } for (rt = leaf; rt && rt != rr_head; - rt = rcu_dereference(rt->rt6_next)) { - if (rt->rt6i_metric != metric) { + rt = rcu_dereference(rt->fib6_next)) { + if (rt->fib6_metric != metric) { cont = rt; break; } @@ -742,22 +718,22 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next)) + for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } -static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, +static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn, int oif, int strict) { - struct rt6_info *leaf = rcu_dereference(fn->leaf); - struct rt6_info *match, *rt0; + struct fib6_info *leaf = rcu_dereference(fn->leaf); + struct fib6_info *match, *rt0; bool do_rr = false; int key_plen; - if (!leaf || leaf == net->ipv6.ip6_null_entry) - return net->ipv6.ip6_null_entry; + if (!leaf || leaf == net->ipv6.fib6_null_entry) + return net->ipv6.fib6_null_entry; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) @@ -768,39 +744,39 @@ static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) 
*/ - key_plen = rt0->rt6i_dst.plen; + key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES - if (rt0->rt6i_src.plen) - key_plen = rt0->rt6i_src.plen; + if (rt0->fib6_src.plen) + key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; - match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, + match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict, &do_rr); if (do_rr) { - struct rt6_info *next = rcu_dereference(rt0->rt6_next); + struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ - if (!next || next->rt6i_metric != rt0->rt6i_metric) + if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { - spin_lock_bh(&leaf->rt6i_table->tb6_lock); + spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ - if (next->rt6i_node) + if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); - spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } - return match ? match : net->ipv6.ip6_null_entry; + return match ? match : net->ipv6.fib6_null_entry; } -static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt) { - return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); + return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -812,7 +788,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct in6_addr prefix_buf, *prefix; unsigned int pref; unsigned long lifetime; - struct rt6_info *rt; + struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; @@ -850,13 +826,13 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, } if (rinfo->prefix_len == 0) - rt = rt6_get_dflt_router(gwaddr, dev); + rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(rt); + ip6_del_rt(net, rt); rt = NULL; } @@ -864,21 +840,162 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) - rt->rt6i_flags = RTF_ROUTEINFO | - (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); + rt->fib6_flags = RTF_ROUTEINFO | + (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { if (!addrconf_finite_timeout(lifetime)) - rt6_clean_expires(rt); + fib6_clean_expires(rt); else - rt6_set_expires(rt, jiffies + HZ * lifetime); + fib6_set_expires(rt, jiffies + HZ * lifetime); - ip6_rt_put(rt); + fib6_info_release(rt); } return 0; } #endif +/* + * Misc support functions + */ + +/* called with rcu_lock held */ +static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt) +{ + struct net_device *dev = rt->fib6_nh.nh_dev; + + if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { + /* for copies of local routes, dst->dev needs to be the + * device if it is a master device, the master device if + * device is enslaved, and the loopback as the default + */ + if (netif_is_l3_slave(dev) && + !rt6_need_strict(&rt->fib6_dst.addr)) + dev = l3mdev_master_dev_rcu(dev); + else if (!netif_is_l3_master(dev)) + dev = dev_net(dev)->loopback_dev; + /* last case is netif_is_l3_master(dev) is true in which + * case we want dev returned to be dev + */ + } + + return dev; +} + +static const int fib6_prop[RTN_MAX + 1] = { + [RTN_UNSPEC] = 0, + [RTN_UNICAST] = 0, + [RTN_LOCAL] = 
0, + [RTN_BROADCAST] = 0, + [RTN_ANYCAST] = 0, + [RTN_MULTICAST] = 0, + [RTN_BLACKHOLE] = -EINVAL, + [RTN_UNREACHABLE] = -EHOSTUNREACH, + [RTN_PROHIBIT] = -EACCES, + [RTN_THROW] = -EAGAIN, + [RTN_NAT] = -EINVAL, + [RTN_XRESOLVE] = -EINVAL, +}; + +static int ip6_rt_type_to_error(u8 fib6_type) +{ + return fib6_prop[fib6_type]; +} + +static unsigned short fib6_info_dst_flags(struct fib6_info *rt) +{ + unsigned short flags = 0; + + if (rt->dst_nocount) + flags |= DST_NOCOUNT; + if (rt->dst_nopolicy) + flags |= DST_NOPOLICY; + if (rt->dst_host) + flags |= DST_HOST; + + return flags; +} + +static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort) +{ + rt->dst.error = ip6_rt_type_to_error(ort->fib6_type); + + switch (ort->fib6_type) { + case RTN_BLACKHOLE: + rt->dst.output = dst_discard_out; + rt->dst.input = dst_discard; + break; + case RTN_PROHIBIT: + rt->dst.output = ip6_pkt_prohibit_out; + rt->dst.input = ip6_pkt_prohibit; + break; + case RTN_THROW: + case RTN_UNREACHABLE: + default: + rt->dst.output = ip6_pkt_discard_out; + rt->dst.input = ip6_pkt_discard; + break; + } +} + +static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort) +{ + rt->dst.flags |= fib6_info_dst_flags(ort); + + if (ort->fib6_flags & RTF_REJECT) { + ip6_rt_init_dst_reject(rt, ort); + return; + } + + rt->dst.error = 0; + rt->dst.output = ip6_output; + + if (ort->fib6_type == RTN_LOCAL) { + rt->dst.input = ip6_input; + } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { + rt->dst.input = ip6_mc_input; + } else { + rt->dst.input = ip6_forward; + } + + if (ort->fib6_nh.nh_lwtstate) { + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); + lwtunnel_set_redirect(&rt->dst); + } + + rt->dst.lastuse = jiffies; +} + +static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) +{ + rt->rt6i_flags &= ~RTF_EXPIRES; + fib6_info_hold(from); + rcu_assign_pointer(rt->from, from); + dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } +} + +static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort) +{ + struct net_device *dev = fib6_info_nh_dev(ort); + + ip6_rt_init_dst(rt, ort); + + rt->rt6i_dst = ort->fib6_dst; + rt->rt6i_idev = dev ? 
in6_dev_get(dev) : NULL; + rt->rt6i_gateway = ort->fib6_nh.nh_gw; + rt->rt6i_flags = ort->fib6_flags; + rt6_set_from(rt, ort); +#ifdef CONFIG_IPV6_SUBTREES + rt->rt6i_src = ort->fib6_src; +#endif + rt->rt6i_prefsrc = ort->fib6_prefsrc; + rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate); +} + static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { @@ -889,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) - fn = fib6_lookup(sn, NULL, saddr); + fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -914,50 +1031,74 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, return false; } +/* called with rcu_lock held */ +static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt) +{ + unsigned short flags = fib6_info_dst_flags(rt); + struct net_device *dev = rt->fib6_nh.nh_dev; + struct rt6_info *nrt; + + nrt = ip6_dst_alloc(dev_net(dev), dev, flags); + if (nrt) + ip6_rt_copy_init(nrt, rt); + + return nrt; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { - struct rt6_info *rt, *rt_cache; + struct fib6_info *f6i; struct fib6_node *fn; + struct rt6_info *rt; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) flags &= ~RT6_LOOKUP_F_IFACE; rcu_read_lock(); - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = rcu_dereference(fn->leaf); - if (!rt) { - rt = net->ipv6.ip6_null_entry; + f6i = rcu_dereference(fn->leaf); + if (!f6i) { + f6i = net->ipv6.fib6_null_entry; } else { - rt = rt6_device_match(net, rt, &fl6->saddr, + f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, - skb, flags); + if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) + f6i = fib6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, + flags); } - if (rt == net->ipv6.ip6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } - /* Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) - rt = rt_cache; - if (ip6_hold_safe(net, &rt, true)) - dst_use_noref(&rt->dst, jiffies); + trace_fib6_table_lookup(net, f6i, table, fl6); - rcu_read_unlock(); + /* Search through exception table */ + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + } else if (f6i == net->ipv6.fib6_null_entry) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = ip6_create_rt_rcu(f6i); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } + } - trace_fib6_table_lookup(net, rt, table, fl6); + rcu_read_unlock(); return rt; - } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, @@ -999,55 +1140,28 @@ EXPORT_SYMBOL(rt6_lookup); * Caller must hold dst before calling it. 
*/ -static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, - struct mx6_config *mxc, +static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; struct fib6_table *table; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); - err = fib6_add(&table->tb6_root, rt, info, mxc, extack); + err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; } -int ip6_ins_rt(struct rt6_info *rt) -{ - struct nl_info info = { .nl_net = dev_net(rt->dst.dev), }; - struct mx6_config mxc = { .mx = NULL, }; - - /* Hold dst to account for the reference from the fib6 tree */ - dst_hold(&rt->dst); - return __ip6_ins_rt(rt, &info, &mxc, NULL); -} - -/* called with rcu_lock held */ -static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt) +int ip6_ins_rt(struct net *net, struct fib6_info *rt) { - struct net_device *dev = rt->dst.dev; - - if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) { - /* for copies of local routes, dst->dev needs to be the - * device if it is a master device, the master device if - * device is enslaved, and the loopback as the default - */ - if (netif_is_l3_slave(dev) && - !rt6_need_strict(&rt->rt6i_dst.addr)) - dev = l3mdev_master_dev_rcu(dev); - else if (!netif_is_l3_master(dev)) - dev = dev_net(dev)->loopback_dev; - /* last case is netif_is_l3_master(dev) is true in which - * case we want dev returned to be dev - */ - } + struct nl_info info = { .nl_net = net, }; - return dev; + return __ip6_ins_rt(rt, &info, NULL); } -static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, +static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1058,26 +1172,20 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, * Clone the route. 
*/ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - - rcu_read_lock(); dev = ip6_rt_get_dev_rcu(ort); - rt = __ip6_dst_alloc(dev_net(dev), dev, 0); - rcu_read_unlock(); + rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) return NULL; ip6_rt_copy_init(rt, ort); rt->rt6i_flags |= RTF_CACHE; - rt->rt6i_metric = 0; rt->dst.flags |= DST_HOST; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(ort)) { - if (ort->rt6i_dst.plen != 128 && - ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) + if (ort->fib6_dst.plen != 128 && + ipv6_addr_equal(&ort->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { @@ -1090,45 +1198,44 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, return rt; } -static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt) { + unsigned short flags = fib6_info_dst_flags(rt); struct net_device *dev; struct rt6_info *pcpu_rt; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(rt); - pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); rcu_read_unlock(); if (!pcpu_rt) return NULL; ip6_rt_copy_init(pcpu_rt, rt); - pcpu_rt->rt6i_protocol = rt->rt6i_protocol; pcpu_rt->rt6i_flags |= RTF_PCPU; return pcpu_rt; } /* It should be called with rcu_read_lock() acquired */ -static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt) { struct rt6_info *pcpu_rt, **p; p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) - rt6_dst_from_metrics_check(pcpu_rt); + if (pcpu_rt) + ip6_hold_safe(NULL, &pcpu_rt, false); return pcpu_rt; } -static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) +static struct rt6_info *rt6_make_pcpu_route(struct net *net, + struct fib6_info *rt) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); if (!pcpu_rt) { - struct net *net = dev_net(rt->dst.dev); - dst_hold(&net->ipv6.ip6_null_entry->dst); return net->ipv6.ip6_null_entry; } @@ -1138,7 +1245,6 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); - rt6_dst_from_metrics_check(pcpu_rt); return pcpu_rt; } @@ -1158,9 +1264,8 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, return; net = dev_net(rt6_ex->rt6i->dst.dev); - rt6_ex->rt6i->rt6i_node = NULL; hlist_del_rcu(&rt6_ex->hlist); - rt6_release(rt6_ex->rt6i); + dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; @@ -1268,20 +1373,36 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, return NULL; } +static unsigned int fib6_mtu(const struct fib6_info *rt) +{ + unsigned int mtu; + + if (rt->fib6_pmtu) { + mtu = rt->fib6_pmtu; + } else { + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev; + + rcu_read_lock(); + idev = __in6_dev_get(dev); + mtu = idev->cnf.mtu6; + rcu_read_unlock(); + } + + mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); + + return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu); +} + static int rt6_insert_exception(struct rt6_info *nrt, - struct rt6_info *ort) + struct fib6_info *ort) { - struct net *net = dev_net(ort->dst.dev); + struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; int err = 0; - /* ort 
can't be a cache or pcpu route */ - if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) - ort = ort->from; - WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); - spin_lock_bh(&rt6_exception_lock); if (ort->exception_bucket_flushed) { @@ -1308,19 +1429,19 @@ static int rt6_insert_exception(struct rt6_info *nrt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (ort->rt6i_src.plen) + if (ort->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* Update rt6i_prefsrc as it could be changed * in rt6_remove_prefsrc() */ - nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + nrt->rt6i_prefsrc = ort->fib6_prefsrc; /* rt6_mtu_change() might lower mtu on ort. * Only insert this exception route if its mtu * is less than ort's mtu value. */ - if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) { err = -EINVAL; goto out; } @@ -1337,8 +1458,6 @@ static int rt6_insert_exception(struct rt6_info *nrt, } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; - atomic_inc(&nrt->rt6i_ref); - nrt->rt6i_node = ort->rt6i_node; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; @@ -1351,16 +1470,16 @@ out: /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { - spin_lock_bh(&ort->rt6i_table->tb6_lock); - fib6_update_sernum(ort); - spin_unlock_bh(&ort->rt6i_table->tb6_lock); + spin_lock_bh(&ort->fib6_table->tb6_lock); + fib6_update_sernum(net, ort); + spin_unlock_bh(&ort->fib6_table->tb6_lock); fib6_force_start_gc(net); } return err; } -void rt6_flush_exceptions(struct rt6_info *rt) +void rt6_flush_exceptions(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1390,7 +1509,7 @@ out: /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ -static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, +static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt, struct in6_addr *daddr, struct in6_addr *saddr) { @@ -1408,7 +1527,7 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (rt->rt6i_src.plen) + if (rt->fib6_src.plen) src_key = saddr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); @@ -1420,14 +1539,15 @@ static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, } /* Remove the passed in cached rt from the hash table that contains it */ -int rt6_remove_exception_rt(struct rt6_info *rt) +static int rt6_remove_exception_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; + struct fib6_info *from; int err; + from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; @@ -1445,7 +1565,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. 
*/ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1468,7 +1588,7 @@ int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct rt6_info *from = rt->from; + struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; @@ -1486,7 +1606,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->rt6i_src.plen) + if (from->fib6_src.plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, @@ -1498,7 +1618,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) rcu_read_unlock(); } -static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1540,7 +1660,7 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct rt6_info *rt, int mtu) + struct fib6_info *rt, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1557,12 +1677,12 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected - * route), the metrics of its rt->dst.from have already + * route), the metrics of its rt->from have already * been updated. */ - if (entry->rt6i_pmtu && + if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) - entry->rt6i_pmtu = mtu; + dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } @@ -1570,7 +1690,7 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct rt6_info *rt, +static void rt6_exceptions_clean_tohost(struct fib6_info *rt, struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; @@ -1649,7 +1769,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct rt6_info *rt, +void rt6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, unsigned long now) { @@ -1680,32 +1800,22 @@ void rt6_age_exceptions(struct rt6_info *rt, rcu_read_unlock_bh(); } -struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, - const struct sk_buff *skb, int flags) +/* must be called with rcu lock held */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *rt_cache; - int strict = 0; + struct fib6_info *f6i; - strict |= flags & RT6_LOOKUP_F_IFACE; - strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; - if (net->ipv6.devconf_all->forwarding == 0) - strict |= RT6_LOOKUP_F_REACHABLE; - - rcu_read_lock(); - - fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) oif = 0; redo_rt6_select: - rt = rt6_select(net, fn, oif, strict); - if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); - if (rt == net->ipv6.ip6_null_entry) { + f6i = 
rt6_select(net, fn, oif, strict); + if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; @@ -1717,45 +1827,57 @@ redo_rt6_select: } } - /*Search through exception table */ - rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); - if (rt_cache) - rt = rt_cache; + trace_fib6_table_lookup(net, f6i, table, fl6); - if (rt == net->ipv6.ip6_null_entry) { + return f6i; +} + +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) +{ + struct fib6_info *f6i; + struct rt6_info *rt; + int strict = 0; + + strict |= flags & RT6_LOOKUP_F_IFACE; + strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; + + rcu_read_lock(); + + f6i = fib6_table_lookup(net, table, oif, fl6, strict); + if (f6i->fib6_nsiblings) + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); + + if (f6i == net->ipv6.fib6_null_entry) { + rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table, fl6); return rt; - } else if (rt->rt6i_flags & RTF_CACHE) { - if (ip6_hold_safe(net, &rt, true)) { + } + + /*Search through exception table */ + rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); + if (rt) { + if (ip6_hold_safe(net, &rt, true)) dst_use_noref(&rt->dst, jiffies); - rt6_dst_from_metrics_check(rt); - } + rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && - !(rt->rt6i_flags & RTF_GATEWAY))) { + !(f6i->fib6_flags & RTF_GATEWAY))) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - if (ip6_hold_safe(net, &rt, true)) { - dst_use_noref(&rt->dst, jiffies); - } else { - rcu_read_unlock(); - uncached_rt = rt; - goto uncached_rt_out; - } - rcu_read_unlock(); + uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL); - uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); - dst_release(&rt->dst); + rcu_read_unlock(); if (uncached_rt) { /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() @@ -1768,36 +1890,21 @@ redo_rt6_select: dst_hold(&uncached_rt->dst); } -uncached_rt_out: - trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; - } else { /* Get a percpu copy */ struct rt6_info *pcpu_rt; - dst_use_noref(&rt->dst, jiffies); local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(rt); - - if (!pcpu_rt) { - /* atomic_inc_not_zero() is needed when using rcu */ - if (atomic_inc_not_zero(&rt->rt6i_ref)) { - /* No dst_hold() on rt is needed because grabbing - * rt->rt6i_ref makes sure rt can't be released. 
- */ - pcpu_rt = rt6_make_pcpu_route(rt); - rt6_release(rt); - } else { - /* rt is already removed from tree */ - pcpu_rt = net->ipv6.ip6_null_entry; - dst_hold(&pcpu_rt->dst); - } - } + pcpu_rt = rt6_get_pcpu_route(f6i); + + if (!pcpu_rt) + pcpu_rt = rt6_make_pcpu_route(net, f6i); + local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table, fl6); + return pcpu_rt; } } @@ -2020,7 +2127,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; - rt->rt6i_metric = 0; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES @@ -2036,18 +2142,27 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ -static void rt6_dst_from_metrics_check(struct rt6_info *rt) +static bool fib6_check(struct fib6_info *f6i, u32 cookie) { - if (rt->from && - dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst)) - dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true); + u32 rt_cookie = 0; + + if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) + return false; + + if (fib6_check_expired(f6i)) + return false; + + return true; } -static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { u32 rt_cookie = 0; - if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) + if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || + rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -2056,11 +2171,13 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) return &rt->dst; } -static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, + struct fib6_info *from, + u32 cookie) { if (!__rt6_check_expired(rt) && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && - rt6_check(rt->from, cookie)) + fib6_check(from, cookie)) return &rt->dst; else return NULL; @@ -2068,22 +2185,30 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { + struct dst_entry *dst_ret; + struct fib6_info *from; struct rt6_info *rt; - rt = (struct rt6_info *) dst; + rt = container_of(dst, struct rt6_info, dst); + + rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. 
*/ - rt6_dst_from_metrics_check(rt); + from = rcu_dereference(rt->from); - if (rt->rt6i_flags & RTF_PCPU || - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)) - return rt6_dst_from_check(rt, cookie); + if (from && (rt->rt6i_flags & RTF_PCPU || + unlikely(!list_empty(&rt->rt6i_uncached)))) + dst_ret = rt6_dst_from_check(rt, from, cookie); else - return rt6_check(rt, cookie); + dst_ret = rt6_check(rt, from, cookie); + + rcu_read_unlock(); + + return dst_ret; } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -2092,10 +2217,12 @@ static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) if (rt) { if (rt->rt6i_flags & RTF_CACHE) { + rcu_read_lock(); if (rt6_check_expired(rt)) { - ip6_del_rt(rt); + rt6_remove_exception_rt(rt); dst = NULL; } + rcu_read_unlock(); } else { dst_release(dst); dst = NULL; @@ -2112,35 +2239,60 @@ static void ip6_link_failure(struct sk_buff *skb) rt = (struct rt6_info *) skb_dst(skb); if (rt) { + rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) - ip6_del_rt(rt); + rt6_remove_exception_rt(rt); } else { + struct fib6_info *from; struct fib6_node *fn; - rcu_read_lock(); - fn = rcu_dereference(rt->rt6i_node); - if (fn && (rt->rt6i_flags & RTF_DEFAULT)) - fn->fn_sernum = -1; - rcu_read_unlock(); + from = rcu_dereference(rt->from); + if (from) { + fn = rcu_dereference(from->fib6_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + } } + rcu_read_unlock(); + } +} + +static void rt6_update_expires(struct rt6_info *rt0, int timeout) +{ + if (!(rt0->rt6i_flags & RTF_EXPIRES)) { + struct fib6_info *from; + + rcu_read_lock(); + from = rcu_dereference(rt0->from); + if (from) + rt0->dst.expires = from->expires; + rcu_read_unlock(); } + + dst_set_expires(&rt0->dst, timeout); + rt0->rt6i_flags |= RTF_EXPIRES; } static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); + dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt->rt6i_pmtu = mtu; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { + bool from_set; + + rcu_read_lock(); + from_set = !!rcu_dereference(rt->from); + rcu_read_unlock(); + return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || - rcu_access_pointer(rt->rt6i_node)); + (rt->rt6i_flags & RTF_PCPU || from_set); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -2176,14 +2328,18 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { + struct fib6_info *from; struct rt6_info *nrt6; - nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + rcu_read_lock(); + from = rcu_dereference(rt6->from); + nrt6 = ip6_rt_cache_alloc(from, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - if (rt6_insert_exception(nrt6, rt6)) + if (rt6_insert_exception(nrt6, from)) dst_release_immediate(&nrt6->dst); } + rcu_read_unlock(); } } @@ -2264,7 +2420,8 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt, *rt_cache; + struct rt6_info *ret = NULL, *rt_cache; + struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -2278,32 +2435,32 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, */ rcu_read_lock(); - fn = fib6_lookup(&table->tb6_root, 
&fl6->daddr, &fl6->saddr); + fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) continue; - if (rt6_check_expired(rt)) + if (fib6_check_expired(rt)) continue; - if (rt->dst.error) + if (rt->fib6_flags & RTF_REJECT) break; - if (!(rt->rt6i_flags & RTF_GATEWAY)) + if (!(rt->fib6_flags & RTF_GATEWAY)) continue; - if (fl6->flowi6_oif != rt->dst.dev->ifindex) + if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex) continue; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) { rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); if (rt_cache && ipv6_addr_equal(&rdfl->gateway, &rt_cache->rt6i_gateway)) { - rt = rt_cache; + ret = rt_cache; break; } continue; @@ -2312,25 +2469,28 @@ restart: } if (!rt) - rt = net->ipv6.ip6_null_entry; - else if (rt->dst.error) { - rt = net->ipv6.ip6_null_entry; + rt = net->ipv6.fib6_null_entry; + else if (rt->fib6_flags & RTF_REJECT) { + ret = net->ipv6.ip6_null_entry; goto out; } - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } out: - ip6_hold_safe(net, &rt, true); + if (ret) + dst_hold(&ret->dst); + else + ret = ip6_create_rt_rcu(rt); rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table, fl6); - return rt; + return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, @@ -2422,12 +2582,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { - const struct rt6_info *rt = (const struct rt6_info *)dst; - unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - - if (mtu) - goto out; + unsigned int mtu; mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) @@ -2511,60 +2667,22 @@ out: return entries > rt_max_size; } -static int ip6_convert_metrics(struct mx6_config *mxc, - const struct fib6_config *cfg) +static int ip6_convert_metrics(struct net *net, struct fib6_info *rt, + struct fib6_config *cfg) { - struct net *net = cfg->fc_nlinfo.nl_net; - bool ecn_ca = false; - struct nlattr *nla; - int remaining; - u32 *mp; + struct dst_metrics *p; if (!cfg->fc_mx) return 0; - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); - if (unlikely(!mp)) + p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL); + if (unlikely(!p)) return -ENOMEM; - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - u32 val; - - if (!type) - continue; - if (unlikely(type > RTAX_MAX)) - goto err; + refcount_set(&p->refcnt, 1); + rt->fib6_metrics = p; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_HOPLIMIT && val > 255) - val = 255; - if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) - goto err; - - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); - } - - if (ecn_ca) { - __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); - mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; - } - - mxc->mx = mp; - return 0; - err: - kfree(mp); - return -EINVAL; + return ip_metrics_convert(net, cfg->fc_mx, 
cfg->fc_mx_len, p->metrics); } static struct rt6_info *ip6_nh_lookup_table(struct net *net, @@ -2750,11 +2868,12 @@ out: return err; } -static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, +static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, + gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; struct fib6_table *table; @@ -2773,6 +2892,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -2831,35 +2955,30 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!table) goto out; - rt = ip6_dst_alloc(net, NULL, - (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT); + err = -ENOMEM; + rt = fib6_info_alloc(gfp_flags); + if (!rt) + goto out; - if (!rt) { - err = -ENOMEM; + if (cfg->fc_flags & RTF_ADDRCONF) + rt->dst_nocount = true; + + err = ip6_convert_metrics(net, rt, cfg); + if (err < 0) goto out; - } if (cfg->fc_flags & RTF_EXPIRES) - rt6_set_expires(rt, jiffies + + fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); else - rt6_clean_expires(rt); + fib6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->rt6i_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; addr_type = ipv6_addr_type(&cfg->fc_dst); - if (addr_type & IPV6_ADDR_MULTICAST) - rt->dst.input = ip6_mc_input; - else if (cfg->fc_flags & RTF_LOCAL) - rt->dst.input = ip6_input; - else - rt->dst.input = ip6_forward; - - rt->dst.output = ip6_output; - if (cfg->fc_encap) { struct lwtunnel_state *lwtstate; @@ -2868,22 +2987,23 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, &lwtstate, extack); if (err) goto out; - rt->dst.lwtstate = lwtstate_get(lwtstate); - lwtunnel_set_redirect(&rt->dst); + rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate); } - ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); - rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) - rt->dst.flags |= DST_HOST; + ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); + rt->fib6_dst.plen = cfg->fc_dst_len; + if (rt->fib6_dst.plen == 128) + rt->dst_host = true; #ifdef CONFIG_IPV6_SUBTREES - ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); - rt->rt6i_src.plen = cfg->fc_src_len; + ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); + rt->fib6_src.plen = cfg->fc_src_len; #endif - rt->rt6i_metric = cfg->fc_metric; - rt->rt6i_nh_weight = 1; + rt->fib6_metric = cfg->fc_metric; + rt->fib6_nh.nh_weight = 1; + + rt->fib6_type = cfg->fc_type; /* We cannot add true routes via loopback here, they would result in kernel looping; promote them to reject routes @@ -2906,28 +3026,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } } - rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; - switch (cfg->fc_type) { - case RTN_BLACKHOLE: - rt->dst.error = -EINVAL; - rt->dst.output = dst_discard_out; - rt->dst.input = dst_discard; - break; - case RTN_PROHIBIT: - rt->dst.error = -EACCES; - rt->dst.output = ip6_pkt_prohibit_out; - rt->dst.input = ip6_pkt_prohibit; - break; - case RTN_THROW: - case RTN_UNREACHABLE: - default: - rt->dst.error = 
(cfg->fc_type == RTN_THROW) ? -EAGAIN - : (cfg->fc_type == RTN_UNREACHABLE) - ? -EHOSTUNREACH : -ENETUNREACH; - rt->dst.output = ip6_pkt_discard_out; - rt->dst.input = ip6_pkt_discard; - break; - } + rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP; goto install_route; } @@ -2936,7 +3035,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; - rt->rt6i_gateway = cfg->fc_gateway; + rt->fib6_nh.nh_gw = cfg->fc_gateway; } err = -ENODEV; @@ -2961,96 +3060,82 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, err = -EINVAL; goto out; } - rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; - rt->rt6i_prefsrc.plen = 128; + rt->fib6_prefsrc.addr = cfg->fc_prefsrc; + rt->fib6_prefsrc.plen = 128; } else - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; - rt->rt6i_flags = cfg->fc_flags; + rt->fib6_flags = cfg->fc_flags; install_route: - if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) && + if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; - rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); - rt->dst.dev = dev; - rt->rt6i_idev = idev; - rt->rt6i_table = table; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK); + rt->fib6_nh.nh_dev = dev; + rt->fib6_table = table; cfg->fc_nlinfo.nl_net = dev_net(dev); + if (idev) + in6_dev_put(idev); + return rt; out: if (dev) dev_put(dev); if (idev) in6_dev_put(idev); - if (rt) - dst_release_immediate(&rt->dst); + fib6_info_release(rt); return ERR_PTR(err); } -int ip6_route_add(struct fib6_config *cfg, +int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { - struct mx6_config mxc = { .mx = NULL, }; - struct rt6_info *rt; + struct fib6_info *rt; int err; - rt = ip6_route_info_create(cfg, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto out; - } - - err = ip6_convert_metrics(&mxc, cfg); - if (err) - goto out; - - err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); + rt = ip6_route_info_create(cfg, gfp_flags, extack); + if (IS_ERR(rt)) + return PTR_ERR(rt); - kfree(mxc.mx); - - return err; -out: - if (rt) - dst_release_immediate(&rt->dst); + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); + fib6_info_release(rt); return err; } -static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) +static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { - int err; + struct net *net = info->nl_net; struct fib6_table *table; - struct net *net = dev_net(rt->dst.dev); + int err; - if (rt == net->ipv6.ip6_null_entry) { + if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: - ip6_rt_put(rt); + fib6_info_release(rt); return err; } -int ip6_del_rt(struct rt6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt) { - struct nl_info info = { - .nl_net = dev_net(rt->dst.dev), - }; + struct nl_info info = { .nl_net = net }; + return __ip6_del_rt(rt, &info); } -static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; @@ -3058,20 +3143,20 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) struct fib6_table *table; int err = -ENOENT; - if (rt == 
net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) goto out_put; - table = rt->rt6i_table; + table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); - if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { - struct rt6_info *sibling, *next_sibling; + if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { + struct fib6_info *sibling, *next_sibling; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; - if (rt6_fill_node(net, skb, rt, + if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); @@ -3081,8 +3166,8 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) } list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, - rt6i_siblings) { + &rt->fib6_siblings, + fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; @@ -3093,7 +3178,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: - ip6_rt_put(rt); + fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, @@ -3102,11 +3187,28 @@ out_put: return err; } +static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +{ + int rc = -ESRCH; + + if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) + goto out; + + if (cfg->fc_flags & RTF_GATEWAY && + !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + goto out; + if (dst_hold_safe(&rt->dst)) + rc = rt6_remove_exception_rt(rt); +out: + return rc; +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt, *rt_cache; + struct rt6_info *rt_cache; struct fib6_table *table; + struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; @@ -3126,25 +3228,31 @@ static int ip6_route_del(struct fib6_config *cfg, if (fn) { for_each_fib6_node_rt_rcu(fn) { if (cfg->fc_flags & RTF_CACHE) { + int rc; + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, &cfg->fc_src); - if (!rt_cache) - continue; - rt = rt_cache; + if (rt_cache) { + rc = ip6_del_cached_rt(rt_cache, cfg); + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; + } + } + continue; } if (cfg->fc_ifindex && - (!rt->dst.dev || - rt->dst.dev->ifindex != cfg->fc_ifindex)) + (!rt->fib6_nh.nh_dev || + rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && - !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) + !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) + if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; - if (!dst_hold_safe(&rt->dst)) - break; + fib6_info_hold(rt); rcu_read_unlock(); /* if gateway was specified only delete the one hop */ @@ -3166,6 +3274,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; + struct fib6_info *from; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; @@ -3247,7 +3356,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); - nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); + rcu_read_lock(); + from = rcu_dereference(rt->from); 
+ fib6_info_hold(from); + rcu_read_unlock(); + + nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL); if (!nrt) goto out; @@ -3255,14 +3369,13 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; - nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* No need to remove rt from the exception table if rt is * a cached route because rt6_insert_exception() will * takes care of it */ - if (rt6_insert_exception(nrt, rt)) { + if (rt6_insert_exception(nrt, from)) { dst_release_immediate(&nrt->dst); goto out; } @@ -3274,47 +3387,12 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: + fib6_info_release(from); neigh_release(neigh); } -/* - * Misc support functions - */ - -static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) -{ - BUG_ON(from->from); - - rt->rt6i_flags &= ~RTF_EXPIRES; - dst_hold(&from->dst); - rt->from = from; - dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); -} - -static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) -{ - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->rt6i_dst = ort->rt6i_dst; - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - rt->rt6i_gateway = ort->rt6i_gateway; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = ort->rt6i_metric; -#ifdef CONFIG_IPV6_SUBTREES - rt->rt6i_src = ort->rt6i_src; -#endif - rt->rt6i_prefsrc = ort->rt6i_prefsrc; - rt->rt6i_table = ort->rt6i_table; - rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); -} - #ifdef CONFIG_IPV6_ROUTE_INFO -static struct rt6_info *rt6_get_route_info(struct net *net, +static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) @@ -3322,7 +3400,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; - struct rt6_info *rt = NULL; + struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); @@ -3335,13 +3413,13 @@ static struct rt6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->dst.dev->ifindex != ifindex) + if (rt->fib6_nh.nh_dev->ifindex != ifindex) continue; - if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) + if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; - if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr)) continue; - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); break; } out: @@ -3349,7 +3427,7 @@ out: return rt; } -static struct rt6_info *rt6_add_route_info(struct net *net, +static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, @@ -3362,6 +3440,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -3375,36 +3454,39 @@ static struct rt6_info *rt6_add_route_info(struct net *net, if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; - ip6_route_add(&cfg, NULL); + ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif -struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) +struct fib6_info *rt6_get_dflt_router(struct net *net, + const struct in6_addr *addr, + struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT; - struct rt6_info *rt; + struct fib6_info *rt; struct fib6_table *table; - table = fib6_get_table(dev_net(dev), tb_id); + table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (dev == rt->dst.dev && - ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && - ipv6_addr_equal(&rt->rt6i_gateway, addr)) + if (dev == rt->fib6_nh.nh_dev && + ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && + ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr)) break; } if (rt) - ip6_hold_safe(NULL, &rt, false); + fib6_info_hold(rt); rcu_read_unlock(); return rt; } -struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, +struct fib6_info *rt6_add_dflt_router(struct net *net, + const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { @@ -3415,14 +3497,15 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, + .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, - .fc_nlinfo.nl_net = dev_net(dev), + .fc_nlinfo.nl_net = net, }; cfg.fc_gateway = *gwaddr; - if (!ip6_route_add(&cfg, NULL)) { + if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); @@ -3430,24 +3513,25 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } - return rt6_get_dflt_router(gwaddr, dev); + return rt6_get_dflt_router(net, gwaddr, dev); } -static void __rt6_purge_dflt_routers(struct fib6_table *table) +static void __rt6_purge_dflt_routers(struct net *net, + struct fib6_table *table) { - struct rt6_info *rt; + struct fib6_info *rt; restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && - (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - if (dst_hold_safe(&rt->dst)) { - rcu_read_unlock(); - ip6_del_rt(rt); - } else { - rcu_read_unlock(); - } + struct net_device *dev = fib6_info_nh_dev(rt); + struct inet6_dev *idev = dev ? 
__in6_dev_get(dev) : NULL; + + if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && + (!idev || idev->cnf.accept_ra != 2)) { + fib6_info_hold(rt); + rcu_read_unlock(); + ip6_del_rt(net, rt); goto restart; } } @@ -3468,7 +3552,7 @@ void rt6_purge_dflt_routers(struct net *net) head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) - __rt6_purge_dflt_routers(table); + __rt6_purge_dflt_routers(net, table); } } @@ -3489,6 +3573,7 @@ static void rtmsg_to_fib6_config(struct net *net, cfg->fc_dst_len = rtmsg->rtmsg_dst_len; cfg->fc_src_len = rtmsg->rtmsg_src_len; cfg->fc_flags = rtmsg->rtmsg_flags; + cfg->fc_type = rtmsg->rtmsg_type; cfg->fc_nlinfo.nl_net = net; @@ -3518,7 +3603,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); switch (cmd) { case SIOCADDRT: - err = ip6_route_add(&cfg, NULL); + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); @@ -3546,7 +3631,8 @@ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { - IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IP6_INC_STATS(dev_net(dst->dev), + __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INADDRERRORS); break; } @@ -3587,40 +3673,40 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff * Allocate a dst for local (unicast / anycast) address. */ -struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, - const struct in6_addr *addr, - bool anycast) +struct fib6_info *addrconf_f6i_alloc(struct net *net, + struct inet6_dev *idev, + const struct in6_addr *addr, + bool anycast, gfp_t gfp_flags) { u32 tb_id; - struct net *net = dev_net(idev->dev); struct net_device *dev = idev->dev; - struct rt6_info *rt; + struct fib6_info *f6i; - rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); - if (!rt) + f6i = fib6_info_alloc(gfp_flags); + if (!f6i) return ERR_PTR(-ENOMEM); - in6_dev_hold(idev); - - rt->dst.flags |= DST_HOST; - rt->dst.input = ip6_input; - rt->dst.output = ip6_output; - rt->rt6i_idev = idev; - - rt->rt6i_protocol = RTPROT_KERNEL; - rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; - if (anycast) - rt->rt6i_flags |= RTF_ANYCAST; - else - rt->rt6i_flags |= RTF_LOCAL; + f6i->dst_nocount = true; + f6i->dst_host = true; + f6i->fib6_protocol = RTPROT_KERNEL; + f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP; + if (anycast) { + f6i->fib6_type = RTN_ANYCAST; + f6i->fib6_flags |= RTF_ANYCAST; + } else { + f6i->fib6_type = RTN_LOCAL; + f6i->fib6_flags |= RTF_LOCAL; + } - rt->rt6i_gateway = *addr; - rt->rt6i_dst.addr = *addr; - rt->rt6i_dst.plen = 128; + f6i->fib6_nh.nh_gw = *addr; + dev_hold(dev); + f6i->fib6_nh.nh_dev = dev; + f6i->fib6_dst.addr = *addr; + f6i->fib6_dst.plen = 128; tb_id = l3mdev_fib_table(idev->dev) ? 
: RT6_TABLE_LOCAL; - rt->rt6i_table = fib6_get_table(net, tb_id); + f6i->fib6_table = fib6_get_table(net, tb_id); - return rt; + return f6i; } /* remove deleted ip from prefsrc entries */ @@ -3630,18 +3716,18 @@ struct arg_dev_net_ip { struct in6_addr *addr; }; -static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) +static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->dst.dev == dev || !dev) && - rt != net->ipv6.ip6_null_entry && - ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + if (((void *)rt->fib6_nh.nh_dev == dev || !dev) && + rt != net->ipv6.fib6_null_entry && + ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ - rt->rt6i_prefsrc.plen = 0; + rt->fib6_prefsrc.plen = 0; /* need to update cache as well */ rt6_exceptions_remove_prefsrc(rt); spin_unlock_bh(&rt6_exception_lock); @@ -3663,12 +3749,12 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) /* Remove routers and update dst entries when gateway turn into host. */ -static int fib6_clean_tohost(struct rt6_info *rt, void *arg) +static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) { return -1; } @@ -3694,85 +3780,85 @@ struct arg_netdev_event { }; }; -static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt) +static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; struct fib6_node *fn; - fn = rcu_dereference_protected(rt->rt6i_node, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { - if (iter->rt6i_metric == rt->rt6i_metric && + if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; - iter = rcu_dereference_protected(iter->rt6_next, - lockdep_is_held(&rt->rt6i_table->tb6_lock)); + iter = rcu_dereference_protected(iter->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } -static bool rt6_is_dead(const struct rt6_info *rt) +static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD || - (rt->rt6i_nh_flags & RTNH_F_LINKDOWN && - rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD || + (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN && + fib6_ignore_linkdown(rt))) return true; return false; } -static int rt6_multipath_total_weight(const struct rt6_info *rt) +static int rt6_multipath_total_weight(const struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) - total += rt->rt6i_nh_weight; + total += rt->fib6_nh.nh_weight; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) { + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->rt6i_nh_weight; + total += 
iter->fib6_nh.nh_weight; } return total; } -static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total) +static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->rt6i_nh_weight; + *weight += rt->fib6_nh.nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->rt6i_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound); } -static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total) +static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { - struct rt6_info *iter; + struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } -void rt6_multipath_rebalance(struct rt6_info *rt) +void rt6_multipath_rebalance(struct fib6_info *rt) { - struct rt6_info *first; + struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ - if (!rt->rt6i_nsiblings || rt->should_flush) + if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to @@ -3787,14 +3873,14 @@ void rt6_multipath_rebalance(struct rt6_info *rt) rt6_multipath_upper_bound_set(first, total); } -static int fib6_ifup(struct rt6_info *rt, void *p_arg) +static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; - const struct net *net = dev_net(arg->dev); + struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) { - rt->rt6i_nh_flags &= ~arg->nh_flags; - fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt); + if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) { + rt->fib6_nh.nh_flags &= ~arg->nh_flags; + fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -3816,95 +3902,96 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } -static bool rt6_multipath_uses_dev(const struct rt6_info *rt, +static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { - struct rt6_info *iter; + struct fib6_info *iter; - if (rt->dst.dev == dev) + if (rt->fib6_nh.nh_dev == dev) return true; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) + if (iter->fib6_nh.nh_dev == dev) return true; return false; } -static void rt6_multipath_flush(struct rt6_info *rt) +static void rt6_multipath_flush(struct fib6_info *rt) { - struct rt6_info *iter; + struct fib6_info *iter; rt->should_flush = 1; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } -static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt, +static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { - struct rt6_info *iter; + struct fib6_info *iter; unsigned int dead = 0; - if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_dev == down_dev || + rt->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; - list_for_each_entry(iter, 
&rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == down_dev || - iter->rt6i_nh_flags & RTNH_F_DEAD) + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) + if (iter->fib6_nh.nh_dev == down_dev || + iter->fib6_nh.nh_flags & RTNH_F_DEAD) dead++; return dead; } -static void rt6_multipath_nh_flags_set(struct rt6_info *rt, +static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned int nh_flags) { - struct rt6_info *iter; + struct fib6_info *iter; - if (rt->dst.dev == dev) - rt->rt6i_nh_flags |= nh_flags; - list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) - if (iter->dst.dev == dev) - iter->rt6i_nh_flags |= nh_flags; + if (rt->fib6_nh.nh_dev == dev) + rt->fib6_nh.nh_flags |= nh_flags; + list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) + if (iter->fib6_nh.nh_dev == dev) + iter->fib6_nh.nh_flags |= nh_flags; } /* called with write lock held for table with rt */ -static int fib6_ifdown(struct rt6_info *rt, void *p_arg) +static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; - const struct net *net = dev_net(dev); + struct net *net = dev_net(dev); - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->dst.dev == dev ? -1 : 0; + return rt->fib6_nh.nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; - if (!rt->rt6i_nsiblings) - return rt->dst.dev == dev ? -1 : 0; + if (!rt->fib6_nsiblings) + return rt->fib6_nh.nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); - if (rt->rt6i_nsiblings + 1 == count) { + if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); - fib6_update_sernum(rt); + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: - if (rt->dst.dev != dev || - rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) + if (rt->fib6_nh.nh_dev != dev || + rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->rt6i_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -3936,7 +4023,7 @@ struct rt6_mtu_change_arg { unsigned int mtu; }; -static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -3956,12 +4043,15 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) Since RFC 1981 doesn't include administrative MTU increase update PMTU increase is a MUST. (i.e. 
jumbo frame) */ - if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->fib6_nh.nh_dev == arg->dev && + !fib6_metric_locked(rt, RTAX_MTU)) { + u32 mtu = rt->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(rt, RTAX_MTU, arg->mtu); + spin_lock_bh(&rt6_exception_lock); - if (dst_metric_raw(&rt->dst, RTAX_MTU) && - rt6_mtu_change_route_allowed(idev, rt, arg->mtu)) - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); rt6_exceptions_update_pmtu(idev, rt, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } @@ -4122,9 +4212,8 @@ errout: } struct rt6_nh { - struct rt6_info *rt6_info; + struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct mx6_config mxc; struct list_head next; }; @@ -4139,23 +4228,25 @@ static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) } } -static int ip6_route_info_append(struct list_head *rt6_nh_list, - struct rt6_info *rt, struct fib6_config *r_cfg) +static int ip6_route_info_append(struct net *net, + struct list_head *rt6_nh_list, + struct fib6_info *rt, + struct fib6_config *r_cfg) { struct rt6_nh *nh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { - /* check if rt6_info already exists */ - if (rt6_duplicate_nexthop(nh->rt6_info, rt)) + /* check if fib6_info already exists */ + if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; - nh->rt6_info = rt; - err = ip6_convert_metrics(&nh->mxc, r_cfg); + nh->fib6_info = rt; + err = ip6_convert_metrics(net, rt, r_cfg); if (err) { kfree(nh); return err; @@ -4166,8 +4257,8 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, return 0; } -static void ip6_route_mpath_notify(struct rt6_info *rt, - struct rt6_info *rt_last, +static void ip6_route_mpath_notify(struct fib6_info *rt, + struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { @@ -4177,10 +4268,10 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, * nexthop. 
Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ - if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { - rt = list_first_entry(&rt_last->rt6i_siblings, - struct rt6_info, - rt6i_siblings); + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { + rt = list_first_entry(&rt_last->fib6_siblings, + struct fib6_info, + fib6_siblings); } if (rt) @@ -4190,11 +4281,11 @@ static void ip6_route_mpath_notify(struct rt6_info *rt, static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_notif = NULL, *rt_last = NULL; + struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct rt6_info *rt; + struct fib6_info *rt; struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; __u16 nlflags; @@ -4214,7 +4305,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * rt6_info structs per nexthop + * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); @@ -4237,18 +4328,19 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); - rt = ip6_route_info_create(&r_cfg, extack); + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto cleanup; } - rt->rt6i_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); + err = ip6_route_info_append(info->nl_net, &rt6_nh_list, + rt, &r_cfg); if (err) { - dst_release_immediate(&rt->dst); + fib6_info_release(rt); goto cleanup; } @@ -4263,14 +4355,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { - rt_last = nh->rt6_info; - err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); + rt_last = nh->fib6_info; + err = __ip6_ins_rt(nh->fib6_info, info, extack); + fib6_info_release(nh->fib6_info); + /* save reference to first route for notification */ if (!rt_notif && !err) - rt_notif = nh->rt6_info; + rt_notif = nh->fib6_info; - /* nh->rt6_info is used or freed at this point, reset to NULL*/ - nh->rt6_info = NULL; + /* nh->fib6_info is used or freed at this point, reset to NULL*/ + nh->fib6_info = NULL; if (err) { if (replace && nhn) ip6_print_replace_route_err(&rt6_nh_list); @@ -4311,9 +4405,8 @@ add_errout: cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { - if (nh->rt6_info) - dst_release_immediate(&nh->rt6_info->dst); - kfree(nh->mxc.mx); + if (nh->fib6_info) + fib6_info_release(nh->fib6_info); list_del(&nh->next); kfree(nh); } @@ -4390,20 +4483,20 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else - return ip6_route_add(&cfg, extack); + return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct rt6_info *rt) +static size_t rt6_nlmsg_size(struct fib6_info *rt) { int nexthop_len = 0; - if (rt->rt6i_nsiblings) { + if (rt->fib6_nsiblings) { nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->dst.lwtstate); + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate); - 
nexthop_len *= rt->rt6i_nsiblings; + nexthop_len *= rt->fib6_nsiblings; } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4419,38 +4512,41 @@ static size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->dst.lwtstate) + + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate) + nexthop_len; } -static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, +static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt, unsigned int *flags, bool skip_oif) { - if (rt->rt6i_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh.nh_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; - if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) { + if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; - if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + + rcu_read_lock(); + if (fib6_ignore_linkdown(rt)) *flags |= RTNH_F_DEAD; + rcu_read_unlock(); } - if (rt->rt6i_flags & RTF_GATEWAY) { - if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) + if (rt->fib6_flags & RTF_GATEWAY) { + if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0) goto nla_put_failure; } - *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK); - if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) + *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK); + if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop struct */ - if (!skip_oif && rt->dst.dev && - nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) + if (!skip_oif && rt->fib6_nh.nh_dev && + nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex)) goto nla_put_failure; - if (rt->dst.lwtstate && - lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) + if (rt->fib6_nh.nh_lwtstate && + lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0) goto nla_put_failure; return 0; @@ -4460,8 +4556,9 @@ nla_put_failure: } /* add multipath next hop */ -static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) +static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt) { + const struct net_device *dev = rt->fib6_nh.nh_dev; struct rtnexthop *rtnh; unsigned int flags = 0; @@ -4469,8 +4566,8 @@ static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) if (!rtnh) goto nla_put_failure; - rtnh->rtnh_hops = rt->rt6i_nh_weight - 1; - rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; + rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1; + rtnh->rtnh_ifindex = dev ? 
dev->ifindex : 0; if (rt6_nexthop_info(skb, rt, &flags, true) < 0) goto nla_put_failure; @@ -4486,16 +4583,16 @@ nla_put_failure: return -EMSGSIZE; } -static int rt6_fill_node(struct net *net, - struct sk_buff *skb, struct rt6_info *rt, - struct in6_addr *dst, struct in6_addr *src, +static int rt6_fill_node(struct net *net, struct sk_buff *skb, + struct fib6_info *rt, struct dst_entry *dst, + struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { - u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; - long expires; + long expires = 0; + u32 *pmetrics; u32 table; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); @@ -4504,53 +4601,31 @@ static int rt6_fill_node(struct net *net, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; - rtm->rtm_dst_len = rt->rt6i_dst.plen; - rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_dst_len = rt->fib6_dst.plen; + rtm->rtm_src_len = rt->fib6_src.plen; rtm->rtm_tos = 0; - if (rt->rt6i_table) - table = rt->rt6i_table->tb6_id; + if (rt->fib6_table) + table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; - if (rt->rt6i_flags & RTF_REJECT) { - switch (rt->dst.error) { - case -EINVAL: - rtm->rtm_type = RTN_BLACKHOLE; - break; - case -EACCES: - rtm->rtm_type = RTN_PROHIBIT; - break; - case -EAGAIN: - rtm->rtm_type = RTN_THROW; - break; - default: - rtm->rtm_type = RTN_UNREACHABLE; - break; - } - } - else if (rt->rt6i_flags & RTF_LOCAL) - rtm->rtm_type = RTN_LOCAL; - else if (rt->rt6i_flags & RTF_ANYCAST) - rtm->rtm_type = RTN_ANYCAST; - else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) - rtm->rtm_type = RTN_LOCAL; - else - rtm->rtm_type = RTN_UNICAST; + + rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - rtm->rtm_protocol = rt->rt6i_protocol; + rtm->rtm_protocol = rt->fib6_protocol; - if (rt->rt6i_flags & RTF_CACHE) + if (rt->fib6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; - if (dst) { - if (nla_put_in6_addr(skb, RTA_DST, dst)) + if (dest) { + if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) - if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) + if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { @@ -4558,12 +4633,12 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && - nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) + nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE - if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { + if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) @@ -4574,34 +4649,32 @@ static int rt6_fill_node(struct net *net, #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; - } else if (dst) { + } else if (dest) { struct in6_addr saddr_buf; - if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && + if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } - if (rt->rt6i_prefsrc.plen) { + if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; - saddr_buf = rt->rt6i_prefsrc.addr; + saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } - 
memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt6i_pmtu) - metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; - if (rtnetlink_put_metrics(skb, metrics) < 0) + pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; + if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; - if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) + if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ - if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + if (rt->fib6_nsiblings) { + struct fib6_info *sibling, *next_sibling; struct nlattr *mp; mp = nla_nest_start(skb, RTA_MULTIPATH); @@ -4612,7 +4685,7 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) { + &rt->fib6_siblings, fib6_siblings) { if (rt6_add_nexthop(skb, sibling) < 0) goto nla_put_failure; } @@ -4623,12 +4696,15 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; + if (rt->fib6_flags & RTF_EXPIRES) { + expires = dst ? dst->expires : rt->expires; + expires -= jiffies; + } - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) + if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; - if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) + if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags))) goto nla_put_failure; @@ -4640,12 +4716,12 @@ nla_put_failure: return -EMSGSIZE; } -int rt6_dump_route(struct rt6_info *rt, void *p_arg) +int rt6_dump_route(struct fib6_info *rt, void *p_arg) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct net *net = arg->net; - if (rt == net->ipv6.ip6_null_entry) + if (rt == net->ipv6.fib6_null_entry) return 0; if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { @@ -4653,16 +4729,15 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) /* user wants prefix routes only */ if (rtm->rtm_flags & RTM_F_PREFIX && - !(rt->rt6i_flags & RTF_PREFIX_RT)) { + !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return 1; } } - return rt6_fill_node(net, - arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, - NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, - NLM_F_MULTI); + return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, NLM_F_MULTI); } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, @@ -4671,6 +4746,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; + struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; @@ -4759,14 +4835,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } - if (fibmatch && rt->from) { - struct rt6_info *ort = rt->from; - - dst_hold(&ort->dst); - ip6_rt_put(rt); - rt = ort; - } - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); @@ -4775,14 +4843,21 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, } skb_dst_set(skb, &rt->dst); + + rcu_read_lock(); + from = rcu_dereference(rt->from); + if (fibmatch) - err = rt6_fill_node(net, skb, rt, NULL, NULL, 
iif, + err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else - err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, - RTM_NEWROUTE, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0); + err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, + &fl6.saddr, iif, RTM_NEWROUTE, + NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + 0); + rcu_read_unlock(); + if (err < 0) { kfree_skb(skb); goto errout; @@ -4793,7 +4868,7 @@ errout: return err; } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct sk_buff *skb; @@ -4808,8 +4883,8 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, if (!skb) goto errout; - err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, - event, info->portid, seq, nlm_flags); + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + event, info->portid, seq, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -4834,6 +4909,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { + net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5030,11 +5106,17 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; + net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry), + GFP_KERNEL); + if (!net->ipv6.fib6_null_entry) + goto out_ip6_dst_entries; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_entries; + goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); @@ -5081,6 +5163,8 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_fib6_null_entry: + kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: @@ -5089,6 +5173,7 @@ out_ip6_dst_ops: static void __net_exit ip6_route_net_exit(struct net *net) { + kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); @@ -5159,6 +5244,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ + init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 5fe139484919..eab39bd91548 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -91,6 +91,24 @@ static void set_tun_src(struct net *net, struct net_device *dev, rcu_read_unlock(); } +/* Compute flowlabel for outer IPv6 header */ +static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, + struct ipv6hdr *inner_hdr) +{ + int do_flowlabel = net->ipv6.sysctl.seg6_flowlabel; + __be32 
flowlabel = 0; + u32 hash; + + if (do_flowlabel > 0) { + hash = skb_get_hash(skb); + rol32(hash, 16); + flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; + } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { + flowlabel = ip6_flowlabel(inner_hdr); + } + return flowlabel; +} + /* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) { @@ -99,6 +117,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; + __be32 flowlabel; hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); @@ -108,6 +127,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return err; inner_hdr = ipv6_hdr(skb); + flowlabel = seg6_make_flowlabel(net, skb, inner_hdr); skb_push(skb, tot_len); skb_reset_network_header(skb); @@ -121,10 +141,10 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) if (skb->protocol == htons(ETH_P_IPV6)) { ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)), - ip6_flowlabel(inner_hdr)); + flowlabel); hdr->hop_limit = inner_hdr->hop_limit; } else { - ip6_flow_hdr(hdr, 0, 0); + ip6_flow_hdr(hdr, 0, flowlabel); hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb)); } diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 6fbdef630152..e15cd37024fd 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -152,6 +152,13 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "seg6_flowlabel", + .data = &init_net.ipv6.sysctl.seg6_flowlabel, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -217,6 +224,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, + ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6d664d83cd16..7d47c2b550a9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 unsigned int tot_len = sizeof(struct tcphdr); struct dst_entry *dst; __be32 *topt; + __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); + if (sk) + mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ea0730028e5d..2839c1bd1e58 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -546,10 +546,10 @@ static __inline__ void udpv6_err(struct sk_buff *skb, __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); } -static struct static_key udpv6_encap_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void) { - static_key_enable(&udpv6_encap_needed); + static_branch_enable(&udpv6_encap_needed_key); } EXPORT_SYMBOL(udpv6_encap_enable); @@ -561,7 +561,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto drop; - if (static_key_false(&udpv6_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, * Sending */ -static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6) +static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, + struct inet_cork *cork) { struct sock *sk = skb->sk; struct udphdr *uh; @@ -1042,12 +1043,31 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6) uh->len = htons(len); uh->check = 0; + if (cork->gso_size) { + const int hlen = skb_network_header_len(skb) + + sizeof(struct udphdr); + + if (hlen + cork->gso_size > cork->fragsize) + return -EINVAL; + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + return -EINVAL; + if (udp_sk(sk)->no_check6_tx) + return -EINVAL; + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + return -EIO; + + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + goto csum_partial; + } + if (is_udplite) csum = udplite_csum(skb); else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ +csum_partial: udp6_hwcsum_outgoing(sk, skb, &fl6->saddr, &fl6->daddr, len); goto send; } else @@ -1093,7 +1113,7 @@ static int udp_v6_push_pending_frames(struct sock *sk) if (!skb) goto out; - err = udp_v6_send_skb(skb, &fl6); + err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base); out: up->len = 0; @@ -1127,6 +1147,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc6.hlimit = -1; ipc6.tclass = -1; ipc6.dontfrag = -1; + ipc6.gso_size = up->gso_size; sockc.tsflags = sk->sk_tsflags; /* destination address check */ @@ -1259,7 +1280,10 @@ do_udp_sendmsg: opt->tot_len = sizeof(*opt); ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); + err = udp_cmsg_send(sk, msg, &ipc6.gso_size); + if (err > 0) + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, + &ipc6, &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1324,15 +1348,16 @@ back_from_confirm: /* Lockless fast path for the non-corking case */ if (!corkreq) { + struct inet_cork_full cork; struct sk_buff *skb; skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, &sockc); + msg->msg_flags, &cork, &sockc); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) - err = udp_v6_send_skb(skb, &fl6); + err = udp_v6_send_skb(skb, &fl6, &cork.base); goto out; } @@ -1402,7 +1427,7 @@ void udpv6_destroy_sock(struct sock *sk) udp_v6_flush_pending_frames(sk); release_sock(sk); - if 
(static_key_false(&udpv6_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 2a04dc9c781b..03a2ff3fe1e6 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,12 +42,15 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, const struct ipv6hdr *ipv6h; struct udphdr *uh; - if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP)) + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4))) goto out; if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + return __udp_gso_segment(skb, features); + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot * do checksum of UDP packets sent as multiple IP fragments. */ diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 416fe67271a9..2cff209d0fc1 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -107,8 +107,6 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, * it was magically lost, so this code needs audit */ xdst->u.rt6.rt6i_flags = rt->rt6i_flags & (RTF_ANYCAST | RTF_LOCAL); - xdst->u.rt6.rt6i_metric = rt->rt6i_metric; - xdst->u.rt6.rt6i_node = rt->rt6i_node; xdst->route_cookie = rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 16f434791763..5bdca3d5d6b7 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -60,11 +60,9 @@ xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, static int __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) { - int i; + int count[XFRM_MAX_DEPTH] = { }; int class[XFRM_MAX_DEPTH]; - int count[maxclass]; - - memset(count, 0, sizeof(count)); + int i; for (i = 0; i < n; i++) { int c; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 7f1e842ef05a..e87686f7d63c 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -57,6 +57,10 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd) static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd) { + /* Drop reference taken during previous invocation */ + if (pd->session) + l2tp_session_dec_refcount(pd->session); + pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; @@ -105,11 +109,16 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v) if (!pd || pd == SEQ_START_TOKEN) return; - /* Drop reference taken by last invocation of l2tp_dfs_next_tunnel() */ + /* Drop reference taken by last invocation of l2tp_dfs_next_session() + * or l2tp_dfs_next_tunnel(). 
+ */ + if (pd->session) { + l2tp_session_dec_refcount(pd->session); + pd->session = NULL; + } if (pd->tunnel) { l2tp_tunnel_dec_refcount(pd->tunnel); pd->tunnel = NULL; - pd->session = NULL; } } @@ -250,13 +259,10 @@ static int l2tp_dfs_seq_show(struct seq_file *m, void *v) goto out; } - /* Show the tunnel or session context */ - if (!pd->session) { + if (!pd->session) l2tp_dfs_seq_tunnel_show(m, pd->tunnel); - } else { + else l2tp_dfs_seq_session_show(m, pd->session); - l2tp_session_dec_refcount(pd->session); - } out: return 0; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 1fd9e145076a..f951c768dcf2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1576,6 +1576,10 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { + /* Drop reference taken during previous invocation */ + if (pd->session) + l2tp_session_dec_refcount(pd->session); + pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx); pd->session_idx++; @@ -1624,11 +1628,16 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v) if (!pd || pd == SEQ_START_TOKEN) return; - /* Drop reference taken by last invocation of pppol2tp_next_tunnel() */ + /* Drop reference taken by last invocation of pppol2tp_next_session() + * or pppol2tp_next_tunnel(). + */ + if (pd->session) { + l2tp_session_dec_refcount(pd->session); + pd->session = NULL; + } if (pd->tunnel) { l2tp_tunnel_dec_refcount(pd->tunnel); pd->tunnel = NULL; - pd->session = NULL; } } @@ -1723,14 +1732,10 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v) goto out; } - /* Show the tunnel or session context. - */ - if (!pd->session) { + if (!pd->session) pppol2tp_seq_tunnel_show(m, pd->tunnel); - } else { + else pppol2tp_seq_session_show(m, pd->session); - l2tp_session_dec_refcount(pd->session); - } out: return 0; diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 8da84312cd3b..8055e3965cef 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -68,15 +68,6 @@ enum { NCSI_MODE_MAX }; -enum { - NCSI_FILTER_BASE = 0, - NCSI_FILTER_VLAN = 0, - NCSI_FILTER_UC, - NCSI_FILTER_MC, - NCSI_FILTER_MIXED, - NCSI_FILTER_MAX -}; - struct ncsi_channel_version { u32 version; /* Supported BCD encoded NCSI version */ u32 alpha2; /* Supported BCD encoded NCSI version */ @@ -98,11 +89,18 @@ struct ncsi_channel_mode { u32 data[8]; /* Data entries */ }; -struct ncsi_channel_filter { - u32 index; /* Index of channel filters */ - u32 total; /* Total entries in the filter table */ - u64 bitmap; /* Bitmap of valid entries */ - u32 data[]; /* Data for the valid entries */ +struct ncsi_channel_mac_filter { + u8 n_uc; + u8 n_mc; + u8 n_mixed; + u64 bitmap; + unsigned char *addrs; +}; + +struct ncsi_channel_vlan_filter { + u8 n_vids; + u64 bitmap; + u16 *vids; }; struct ncsi_channel_stats { @@ -186,7 +184,9 @@ struct ncsi_channel { struct ncsi_channel_version version; struct ncsi_channel_cap caps[NCSI_CAP_MAX]; struct ncsi_channel_mode modes[NCSI_MODE_MAX]; - struct ncsi_channel_filter *filters[NCSI_FILTER_MAX]; + /* Filtering Settings */ + struct ncsi_channel_mac_filter mac_filter; + struct ncsi_channel_vlan_filter vlan_filter; struct ncsi_channel_stats stats; struct { struct timer_list timer; @@ -320,10 +320,6 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ -u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index); -int ncsi_find_filter(struct ncsi_channel *nc, int 
table, void *data); -int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); -int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); void ncsi_start_channel_monitor(struct ncsi_channel *nc); void ncsi_stop_channel_monitor(struct ncsi_channel *nc); struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index c3695ba0cf94..5561e221b71f 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -27,125 +27,6 @@ LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); -static inline int ncsi_filter_size(int table) -{ - int sizes[] = { 2, 6, 6, 6 }; - - BUILD_BUG_ON(ARRAY_SIZE(sizes) != NCSI_FILTER_MAX); - if (table < NCSI_FILTER_BASE || table >= NCSI_FILTER_MAX) - return -EINVAL; - - return sizes[table]; -} - -u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) -{ - struct ncsi_channel_filter *ncf; - int size; - - ncf = nc->filters[table]; - if (!ncf) - return NULL; - - size = ncsi_filter_size(table); - if (size < 0) - return NULL; - - return ncf->data + size * index; -} - -/* Find the first active filter in a filter table that matches the given - * data parameter. If data is NULL, this returns the first active filter. - */ -int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data) -{ - struct ncsi_channel_filter *ncf; - void *bitmap; - int index, size; - unsigned long flags; - - ncf = nc->filters[table]; - if (!ncf) - return -ENXIO; - - size = ncsi_filter_size(table); - if (size < 0) - return size; - - spin_lock_irqsave(&nc->lock, flags); - bitmap = (void *)&ncf->bitmap; - index = -1; - while ((index = find_next_bit(bitmap, ncf->total, index + 1)) - < ncf->total) { - if (!data || !memcmp(ncf->data + size * index, data, size)) { - spin_unlock_irqrestore(&nc->lock, flags); - return index; - } - } - spin_unlock_irqrestore(&nc->lock, flags); - - return -ENOENT; -} - -int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data) -{ - struct ncsi_channel_filter *ncf; - int index, size; - void *bitmap; - unsigned long flags; - - size = ncsi_filter_size(table); - if (size < 0) - return size; - - index = ncsi_find_filter(nc, table, data); - if (index >= 0) - return index; - - ncf = nc->filters[table]; - if (!ncf) - return -ENODEV; - - spin_lock_irqsave(&nc->lock, flags); - bitmap = (void *)&ncf->bitmap; - do { - index = find_next_zero_bit(bitmap, ncf->total, 0); - if (index >= ncf->total) { - spin_unlock_irqrestore(&nc->lock, flags); - return -ENOSPC; - } - } while (test_and_set_bit(index, bitmap)); - - memcpy(ncf->data + size * index, data, size); - spin_unlock_irqrestore(&nc->lock, flags); - - return index; -} - -int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index) -{ - struct ncsi_channel_filter *ncf; - int size; - void *bitmap; - unsigned long flags; - - size = ncsi_filter_size(table); - if (size < 0) - return size; - - ncf = nc->filters[table]; - if (!ncf || index >= ncf->total) - return -ENODEV; - - spin_lock_irqsave(&nc->lock, flags); - bitmap = (void *)&ncf->bitmap; - if (test_and_clear_bit(index, bitmap)) - memset(ncf->data + size * index, 0, size); - spin_unlock_irqrestore(&nc->lock, flags); - - return 0; -} - static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) { struct ncsi_dev *nd = &ndp->ndev; @@ -339,20 +220,13 @@ struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) static void ncsi_remove_channel(struct ncsi_channel *nc) { struct ncsi_package *np = nc->package; - struct 
ncsi_channel_filter *ncf; unsigned long flags; - int i; - /* Release filters */ spin_lock_irqsave(&nc->lock, flags); - for (i = 0; i < NCSI_FILTER_MAX; i++) { - ncf = nc->filters[i]; - if (!ncf) - continue; - nc->filters[i] = NULL; - kfree(ncf); - } + /* Release filters */ + kfree(nc->mac_filter.addrs); + kfree(nc->vlan_filter.vids); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); @@ -670,32 +544,26 @@ error: static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, struct ncsi_cmd_arg *nca) { + struct ncsi_channel_vlan_filter *ncf; + unsigned long flags; + void *bitmap; int index; - u32 *data; u16 vid; - index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, NULL); - if (index < 0) { - /* Filter table empty */ - return -1; - } + ncf = &nc->vlan_filter; + bitmap = &ncf->bitmap; - data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, index); - if (!data) { - netdev_err(ndp->ndev.dev, - "NCSI: failed to retrieve filter %d\n", index); - /* Set the VLAN id to 0 - this will still disable the entry in - * the filter table, but we won't know what it was. - */ - vid = 0; - } else { - vid = *(u16 *)data; + spin_lock_irqsave(&nc->lock, flags); + index = find_next_bit(bitmap, ncf->n_vids, 0); + if (index >= ncf->n_vids) { + spin_unlock_irqrestore(&nc->lock, flags); + return -1; } + vid = ncf->vids[index]; - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: removed vlan tag %u at index %d\n", - vid, index + 1); - ncsi_remove_filter(nc, NCSI_FILTER_VLAN, index); + clear_bit(index, bitmap); + ncf->vids[index] = 0; + spin_unlock_irqrestore(&nc->lock, flags); nca->type = NCSI_PKT_CMD_SVF; nca->words[1] = vid; @@ -711,45 +579,55 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, struct ncsi_cmd_arg *nca) { + struct ncsi_channel_vlan_filter *ncf; struct vlan_vid *vlan = NULL; - int index = 0; + unsigned long flags; + int i, index; + void *bitmap; + u16 vid; + if (list_empty(&ndp->vlan_vids)) + return -1; + + ncf = &nc->vlan_filter; + bitmap = &ncf->bitmap; + + spin_lock_irqsave(&nc->lock, flags); + + rcu_read_lock(); list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { - index = ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); - if (index < 0) { - /* New tag to add */ - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: new vlan id to set: %u\n", - vlan->vid); + vid = vlan->vid; + for (i = 0; i < ncf->n_vids; i++) + if (ncf->vids[i] == vid) { + vid = 0; + break; + } + if (vid) break; - } - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "vid %u already at filter pos %d\n", - vlan->vid, index); } + rcu_read_unlock(); - if (!vlan || index >= 0) { - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "no vlan ids left to set\n"); + if (!vid) { + /* No VLAN ID is not set */ + spin_unlock_irqrestore(&nc->lock, flags); return -1; } - index = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan->vid); - if (index < 0) { + index = find_next_zero_bit(bitmap, ncf->n_vids, 0); + if (index < 0 || index >= ncf->n_vids) { netdev_err(ndp->ndev.dev, - "Failed to add new VLAN tag, error %d\n", index); - if (index == -ENOSPC) - netdev_err(ndp->ndev.dev, - "Channel %u already has all VLAN filters set\n", - nc->id); + "Channel %u already has all VLAN filters set\n", + nc->id); + spin_unlock_irqrestore(&nc->lock, flags); return -1; } - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: set vid %u in packet, index %u\n", - vlan->vid, index + 1); + ncf->vids[index] = vid; + set_bit(index, bitmap); + 
spin_unlock_irqrestore(&nc->lock, flags); + nca->type = NCSI_PKT_CMD_SVF; - nca->words[1] = vlan->vid; + nca->words[1] = vid; /* HW filter index starts at 1 */ nca->bytes[6] = index + 1; nca->bytes[7] = 0x01; diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 8d7e849d4825..b09ef77bf4cd 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -58,10 +58,9 @@ static int ncsi_write_channel_info(struct sk_buff *skb, struct ncsi_dev_priv *ndp, struct ncsi_channel *nc) { - struct nlattr *vid_nest; - struct ncsi_channel_filter *ncf; + struct ncsi_channel_vlan_filter *ncf; struct ncsi_channel_mode *m; - u32 *data; + struct nlattr *vid_nest; int i; nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); @@ -79,18 +78,13 @@ static int ncsi_write_channel_info(struct sk_buff *skb, vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); if (!vid_nest) return -ENOMEM; - ncf = nc->filters[NCSI_FILTER_VLAN]; + ncf = &nc->vlan_filter; i = -1; - if (ncf) { - while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total, - i + 1)) < ncf->total) { - data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i); - /* Uninitialised channels will have 'zero' vlan ids */ - if (!data || !*data) - continue; + while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids, + i + 1)) < ncf->n_vids) { + if (ncf->vids[i]) nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, - *(u16 *)data); - } + ncf->vids[i]); } nla_nest_end(skb, vid_nest); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index efd933ff5570..a6b7c7d5c829 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -334,9 +334,9 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr) struct ncsi_rsp_pkt *rsp; struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; - struct ncsi_channel_filter *ncf; - unsigned short vlan; - int ret; + struct ncsi_channel_vlan_filter *ncf; + unsigned long flags; + void *bitmap; /* Find the package and channel */ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); @@ -346,22 +346,23 @@ static int ncsi_rsp_handler_svf(struct ncsi_request *nr) return -ENODEV; cmd = (struct ncsi_cmd_svf_pkt *)skb_network_header(nr->cmd); - ncf = nc->filters[NCSI_FILTER_VLAN]; - if (!ncf) - return -ENOENT; - if (cmd->index >= ncf->total) + ncf = &nc->vlan_filter; + if (cmd->index == 0 || cmd->index > ncf->n_vids) return -ERANGE; - /* Add or remove the VLAN filter */ + /* Add or remove the VLAN filter. Remember HW indexes from 1 */ + spin_lock_irqsave(&nc->lock, flags); + bitmap = &ncf->bitmap; if (!(cmd->enable & 0x1)) { - /* HW indexes from 1 */ - ret = ncsi_remove_filter(nc, NCSI_FILTER_VLAN, cmd->index - 1); + if (test_and_clear_bit(cmd->index - 1, bitmap)) + ncf->vids[cmd->index - 1] = 0; } else { - vlan = ntohs(cmd->vlan); - ret = ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + set_bit(cmd->index - 1, bitmap); + ncf->vids[cmd->index - 1] = ntohs(cmd->vlan); } + spin_unlock_irqrestore(&nc->lock, flags); - return ret; + return 0; } static int ncsi_rsp_handler_ev(struct ncsi_request *nr) @@ -422,8 +423,12 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) struct ncsi_rsp_pkt *rsp; struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; - struct ncsi_channel_filter *ncf; + struct ncsi_channel_mac_filter *ncf; + unsigned long flags; void *bitmap; + bool enabled; + int index; + /* Find the package and channel */ rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); @@ -436,31 +441,24 @@ static int ncsi_rsp_handler_sma(struct ncsi_request *nr) * isn't supported yet. 
*/ cmd = (struct ncsi_cmd_sma_pkt *)skb_network_header(nr->cmd); - switch (cmd->at_e >> 5) { - case 0x0: /* UC address */ - ncf = nc->filters[NCSI_FILTER_UC]; - break; - case 0x1: /* MC address */ - ncf = nc->filters[NCSI_FILTER_MC]; - break; - default: - return -EINVAL; - } + enabled = cmd->at_e & 0x1; + ncf = &nc->mac_filter; + bitmap = &ncf->bitmap; - /* Sanity check on the filter */ - if (!ncf) - return -ENOENT; - else if (cmd->index >= ncf->total) + if (cmd->index == 0 || + cmd->index > ncf->n_uc + ncf->n_mc + ncf->n_mixed) return -ERANGE; - bitmap = &ncf->bitmap; - if (cmd->at_e & 0x1) { - set_bit(cmd->index, bitmap); - memcpy(ncf->data + 6 * cmd->index, cmd->mac, 6); + index = (cmd->index - 1) * ETH_ALEN; + spin_lock_irqsave(&nc->lock, flags); + if (enabled) { + set_bit(cmd->index - 1, bitmap); + memcpy(&ncf->addrs[index], cmd->mac, ETH_ALEN); } else { - clear_bit(cmd->index, bitmap); - memset(ncf->data + 6 * cmd->index, 0, 6); + clear_bit(cmd->index - 1, bitmap); + memset(&ncf->addrs[index], 0, ETH_ALEN); } + spin_unlock_irqrestore(&nc->lock, flags); return 0; } @@ -631,9 +629,7 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) struct ncsi_rsp_gc_pkt *rsp; struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_channel *nc; - struct ncsi_channel_filter *ncf; - size_t size, entry_size; - int cnt, i; + size_t size; /* Find the channel */ rsp = (struct ncsi_rsp_gc_pkt *)skb_network_header(nr->rsp); @@ -655,64 +651,40 @@ static int ncsi_rsp_handler_gc(struct ncsi_request *nr) nc->caps[NCSI_CAP_VLAN].cap = rsp->vlan_mode & NCSI_CAP_VLAN_MASK; - /* Build filters */ - for (i = 0; i < NCSI_FILTER_MAX; i++) { - switch (i) { - case NCSI_FILTER_VLAN: - cnt = rsp->vlan_cnt; - entry_size = 2; - break; - case NCSI_FILTER_MIXED: - cnt = rsp->mixed_cnt; - entry_size = 6; - break; - case NCSI_FILTER_MC: - cnt = rsp->mc_cnt; - entry_size = 6; - break; - case NCSI_FILTER_UC: - cnt = rsp->uc_cnt; - entry_size = 6; - break; - default: - continue; - } - - if (!cnt || nc->filters[i]) - continue; - - size = sizeof(*ncf) + cnt * entry_size; - ncf = kzalloc(size, GFP_ATOMIC); - if (!ncf) { - pr_warn("%s: Cannot alloc filter table (%d)\n", - __func__, i); - return -ENOMEM; - } - - ncf->index = i; - ncf->total = cnt; - if (i == NCSI_FILTER_VLAN) { - /* Set VLAN filters active so they are cleared in - * first configuration state - */ - ncf->bitmap = U64_MAX; - } else { - ncf->bitmap = 0x0ul; - } - nc->filters[i] = ncf; - } + size = (rsp->uc_cnt + rsp->mc_cnt + rsp->mixed_cnt) * ETH_ALEN; + nc->mac_filter.addrs = kzalloc(size, GFP_KERNEL); + if (!nc->mac_filter.addrs) + return -ENOMEM; + nc->mac_filter.n_uc = rsp->uc_cnt; + nc->mac_filter.n_mc = rsp->mc_cnt; + nc->mac_filter.n_mixed = rsp->mixed_cnt; + + nc->vlan_filter.vids = kcalloc(rsp->vlan_cnt, + sizeof(*nc->vlan_filter.vids), + GFP_KERNEL); + if (!nc->vlan_filter.vids) + return -ENOMEM; + /* Set VLAN filters active so they are cleared in the first + * configuration state + */ + nc->vlan_filter.bitmap = U64_MAX; + nc->vlan_filter.n_vids = rsp->vlan_cnt; return 0; } static int ncsi_rsp_handler_gp(struct ncsi_request *nr) { - struct ncsi_rsp_gp_pkt *rsp; + struct ncsi_channel_vlan_filter *ncvf; + struct ncsi_channel_mac_filter *ncmf; struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_rsp_gp_pkt *rsp; struct ncsi_channel *nc; - unsigned short enable, vlan; + unsigned short enable; unsigned char *pdata; - int table, i; + unsigned long flags; + void *bitmap; + int i; /* Find the channel */ rsp = (struct ncsi_rsp_gp_pkt *)skb_network_header(nr->rsp); @@ 
-746,36 +718,33 @@ static int ncsi_rsp_handler_gp(struct ncsi_request *nr) /* MAC addresses filter table */ pdata = (unsigned char *)rsp + 48; enable = rsp->mac_enable; + ncmf = &nc->mac_filter; + spin_lock_irqsave(&nc->lock, flags); + bitmap = &ncmf->bitmap; for (i = 0; i < rsp->mac_cnt; i++, pdata += 6) { - if (i >= (nc->filters[NCSI_FILTER_UC]->total + - nc->filters[NCSI_FILTER_MC]->total)) - table = NCSI_FILTER_MIXED; - else if (i >= nc->filters[NCSI_FILTER_UC]->total) - table = NCSI_FILTER_MC; - else - table = NCSI_FILTER_UC; - if (!(enable & (0x1 << i))) - continue; - - if (ncsi_find_filter(nc, table, pdata) >= 0) - continue; + clear_bit(i, bitmap); + else + set_bit(i, bitmap); - ncsi_add_filter(nc, table, pdata); + memcpy(&ncmf->addrs[i * ETH_ALEN], pdata, ETH_ALEN); } + spin_unlock_irqrestore(&nc->lock, flags); /* VLAN filter table */ enable = ntohs(rsp->vlan_enable); + ncvf = &nc->vlan_filter; + bitmap = &ncvf->bitmap; + spin_lock_irqsave(&nc->lock, flags); for (i = 0; i < rsp->vlan_cnt; i++, pdata += 2) { if (!(enable & (0x1 << i))) - continue; - - vlan = ntohs(*(__be16 *)pdata); - if (ncsi_find_filter(nc, NCSI_FILTER_VLAN, &vlan) >= 0) - continue; + clear_bit(i, bitmap); + else + set_bit(i, bitmap); - ncsi_add_filter(nc, NCSI_FILTER_VLAN, &vlan); + ncvf->vids[i] = ntohs(*(__be16 *)pdata); } + spin_unlock_irqrestore(&nc->lock, flags); return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 44d8a55e9721..e57c9d479503 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -444,6 +444,9 @@ config NETFILTER_SYNPROXY endif # NF_CONNTRACK +config NF_OSF + tristate 'Passive OS fingerprint infrastructure' + config NF_TABLES select NETFILTER_NETLINK tristate "Netfilter nf_tables support" @@ -474,24 +477,6 @@ config NF_TABLES_NETDEV help This option enables support for the "netdev" table. -config NFT_EXTHDR - tristate "Netfilter nf_tables exthdr module" - help - This option adds the "exthdr" expression that you can use to match - IPv6 extension headers and tcp options. - -config NFT_META - tristate "Netfilter nf_tables meta module" - help - This option adds the "meta" expression that you can use to match and - to set packet metainformation such as the packet mark. - -config NFT_RT - tristate "Netfilter nf_tables routing module" - help - This option adds the "rt" expression that you can use to match - packet routing information such as the packet nexthop. - config NFT_NUMGEN tristate "Netfilter nf_tables number generator module" help @@ -667,8 +652,7 @@ endif # NF_TABLES config NF_FLOW_TABLE_INET tristate "Netfilter flow table mixed IPv4/IPv6 module" - depends on NF_FLOW_TABLE_IPV4 - depends on NF_FLOW_TABLE_IPV6 + depends on NF_FLOW_TABLE help This option adds the flow table mixed IPv4/IPv6 support. 
@@ -1378,6 +1362,7 @@ config NETFILTER_XT_MATCH_NFACCT config NETFILTER_XT_MATCH_OSF tristate '"osf" Passive OS fingerprint match' depends on NETFILTER_ADVANCED && NETFILTER_NETLINK + select NF_OSF help This option selects the Passive OS Fingerprinting match module that allows to passively match the remote operating system by diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index fd32bd2c9521..1aa710b5d384 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -76,13 +76,10 @@ obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ - nft_dynset.o + nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o obj-$(CONFIG_NF_TABLES) += nf_tables.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o -obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o -obj-$(CONFIG_NFT_META) += nft_meta.o -obj-$(CONFIG_NFT_RT) += nft_rt.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o @@ -104,6 +101,7 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o +obj-$(CONFIG_NF_OSF) += nf_osf.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o @@ -111,6 +109,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o +nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o + obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o # generic X tables diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index b32fb0dbe237..05dc1b77e466 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -225,6 +225,25 @@ config IP_VS_SH If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_MH + tristate "maglev hashing scheduling" + ---help--- + The maglev consistent hashing scheduling algorithm provides + Google's Maglev hashing algorithm as an IPVS scheduler. It assigns + network connections to the servers through looking up a statically + assigned special hash table called the lookup table. Maglev hashing + assigns a preference list of all the lookup table positions + to each destination. + + Through this operation, Maglev hashing gives an almost equal + share of the lookup table to each of the destinations and provides + minimal disruption by using the lookup table. When the set of + destinations changes, a connection will likely be sent to the same + destination as it was before. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_SED tristate "shortest expected delay scheduling" ---help--- @@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS needs to be large enough to effectively fit all the destinations multiplied by their respective weights. +comment 'IPVS MH scheduler' + +config IP_VS_MH_TAB_INDEX + int "IPVS maglev hashing table index of size (the prime numbers)" + range 8 17 + default 12 + ---help--- + The maglev hashing scheduler maps source IPs to destinations + stored in a hash table. This table is assigned by a preference + list of the positions to each destination until all slots in + the table are filled.
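The population step those two help texts describe can be sketched in a few lines of user-space C. This is a toy rendering only — the table size, destination names and string hash are invented, and the kernel module instead derives offset/skip from hsiphash keys and lets each destination take a weighted number of turns:

#include <stdio.h>
#include <string.h>

#define TAB_SIZE 13			/* small prime for the demo */
#define N_DEST   3

struct dest_setup {
	unsigned int offset;		/* first preferred slot */
	unsigned int skip;		/* step between preferences */
	unsigned int perm;		/* next slot to try */
};

static unsigned int toy_hash(const char *s, unsigned int seed)
{
	unsigned int h = seed;

	while (*s)
		h = h * 31 + (unsigned char)*s++;
	return h;
}

int main(void)
{
	const char *dests[N_DEST] = { "10.0.0.1", "10.0.0.2", "10.0.0.3" };
	struct dest_setup ds[N_DEST];
	int table[TAB_SIZE];
	int i, n = 0;

	memset(table, -1, sizeof(table));
	for (i = 0; i < N_DEST; i++) {
		ds[i].offset = toy_hash(dests[i], 1) % TAB_SIZE;
		ds[i].skip   = toy_hash(dests[i], 2) % (TAB_SIZE - 1) + 1;
		ds[i].perm   = ds[i].offset;
	}

	/* Round-robin over destinations; each claims its next preferred
	 * free slot until every slot is owned by some destination.
	 */
	while (n < TAB_SIZE) {
		for (i = 0; i < N_DEST && n < TAB_SIZE; i++) {
			unsigned int c = ds[i].perm;

			while (table[c] >= 0)
				c = (c + ds[i].skip) % TAB_SIZE;
			table[c] = i;
			ds[i].perm = (c + ds[i].skip) % TAB_SIZE;
			n++;
		}
	}

	for (i = 0; i < TAB_SIZE; i++)
		printf("slot %2d -> %s\n", i, dests[table[i]]);
	return 0;
}

In the real scheduler a destination's turns value is its last weight divided by the gcd of all weights (right-shifted to keep the numbers small), so heavier servers own proportionally more slots; once the module is loaded the scheduler is chosen per virtual service in the usual way, for example ipvsadm -A -t 192.0.2.1:80 -s mh (address made up here).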
The index determines the prime for size of + the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749, + 65521 or 131071. When using weights to allow destinations to + receive more connections, the table is assigned an amount + proportional to the weights specified. The table needs to be large + enough to effectively fit all the destinations multiplied by their + respective weights. + comment 'IPVS application helper' config IP_VS_FTP diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index c552993fa4b9..bfce2677fda2 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f36098887ad0..d4f68d0f7df7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; + /* keep the last_weight with latest non-0 weight */ + if (add || udest->weight != 0) + atomic_set(&dest->last_weight, udest->weight); + /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f798f8e83b..07459e71d907 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -43,6 +43,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/hash.h> #include <net/ip_vs.h> @@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 3057e453bf31..b9f375e6dc93 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -48,6 +48,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/jiffies.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS); } @@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 92adc04557ed..542c4949937a 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -47,6 +47,7 @@ #include <linux/jiffies.h> #include <linux/list.h> #include <linux/slab.h> +#include <linux/hash.h> /* for sysctl */ #include <linux/fs.h> @@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (ntohl(addr_fold)*2654435761UL) & 
IP_VS_LBLCR_TAB_MASK; + return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS); } @@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->counter = 1; tbl->dead = false; tbl->svc = svc; + atomic_set(&tbl->entries, 0); /* * Hook periodic timer for garbage collection diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c new file mode 100644 index 000000000000..0f795b186eb3 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0 +/* IPVS: Maglev Hashing scheduling module + * + * Authors: Inju Song <inju.song@navercorp.com> + * + */ + +/* The mh algorithm assigns a preference list of all the lookup + * table positions to each destination and populates the table with + * the most-preferred position of destinations. Then it selects a + * destination with the hash key of the source IP address by looking + * up the lookup table. + * + * The algorithm is detailed in: + * [3.4 Consistent Hashing] +https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/ip.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> + +#include <net/ip_vs.h> + +#include <linux/siphash.h> +#include <linux/bitops.h> +#include <linux/gcd.h> + +#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */ +#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */ + +struct ip_vs_mh_lookup { + struct ip_vs_dest __rcu *dest; /* real server (cache) */ +}; + +struct ip_vs_mh_dest_setup { + unsigned int offset; /* starting offset */ + unsigned int skip; /* skip */ + unsigned int perm; /* next_offset */ + int turns; /* weight / gcd() and rshift */ +}; + +/* Available prime numbers for MH table */ +static int primes[] = {251, 509, 1021, 2039, 4093, + 8191, 16381, 32749, 65521, 131071}; + +/* For IPVS MH entry hash table */ +#ifndef CONFIG_IP_VS_MH_TAB_INDEX +#define CONFIG_IP_VS_MH_TAB_INDEX 12 +#endif +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2) +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8) +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX] + +struct ip_vs_mh_state { + struct rcu_head rcu_head; + struct ip_vs_mh_lookup *lookup; + struct ip_vs_mh_dest_setup *dest_setup; + hsiphash_key_t hash1, hash2; + int gcd; + int rshift; +}; + +static inline void generate_hash_secret(hsiphash_key_t *hash1, + hsiphash_key_t *hash2) +{ + hash1->key[0] = 2654435761UL; + hash1->key[1] = 2654435761UL; + + hash2->key[0] = 2654446892UL; + hash2->key[1] = 2654446892UL; +} + +/* Helper function to determine if server is unavailable */ +static inline bool is_unavailable(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->weight) <= 0 || + dest->flags & IP_VS_DEST_F_OVERLOAD; +} + +/* Returns hash value for IPVS MH entry */ +static inline unsigned int +ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr, + __be16 port, hsiphash_key_t *key, unsigned int offset) +{ + unsigned int v; + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0] ^ addr->ip6[1] ^ + addr->ip6[2] ^ addr->ip6[3]; +#endif + v = (offset + ntohs(port) + ntohl(addr_fold)); + return hsiphash(&v, sizeof(v), key); +} + +/* Reset all the hash buckets of the specified table.
*/ +static void ip_vs_mh_reset(struct ip_vs_mh_state *s) +{ + int i; + struct ip_vs_mh_lookup *l; + struct ip_vs_dest *dest; + + l = &s->lookup[0]; + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) { + dest = rcu_dereference_protected(l->dest, 1); + if (dest) { + ip_vs_dest_put(dest); + RCU_INIT_POINTER(l->dest, NULL); + } + l++; + } +} + +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest; + int lw; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * permutation for the dests. + */ + if (s->gcd < 1) + return 0; + + /* Set dest_setup for the dests permutation */ + p = &svc->destinations; + ds = &s->dest_setup[0]; + while ((p = p->next) != &svc->destinations) { + dest = list_entry(p, struct ip_vs_dest, n_list); + + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash1, 0) % + IP_VS_MH_TAB_SIZE; + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr, + dest->port, &s->hash2, 0) % + (IP_VS_MH_TAB_SIZE - 1) + 1; + ds->perm = ds->offset; + + lw = atomic_read(&dest->last_weight); + ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0); + ds++; + } + + return 0; +} + +static int ip_vs_mh_populate(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int n, c, dt_count; + unsigned long *table; + struct list_head *p; + struct ip_vs_mh_dest_setup *ds; + struct ip_vs_dest *dest, *new_dest; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, skip + * the population for the dests and reset lookup table. + */ + if (s->gcd < 1) { + ip_vs_mh_reset(s); + return 0; + } + + table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE), + sizeof(unsigned long), GFP_KERNEL); + if (!table) + return -ENOMEM; + + p = &svc->destinations; + n = 0; + dt_count = 0; + while (n < IP_VS_MH_TAB_SIZE) { + if (p == &svc->destinations) + p = p->next; + + ds = &s->dest_setup[0]; + while (p != &svc->destinations) { + /* Ignore added server with zero weight */ + if (ds->turns < 1) { + p = p->next; + ds++; + continue; + } + + c = ds->perm; + while (test_bit(c, table)) { + /* Add skip, mod IP_VS_MH_TAB_SIZE */ + ds->perm += ds->skip; + if (ds->perm >= IP_VS_MH_TAB_SIZE) + ds->perm -= IP_VS_MH_TAB_SIZE; + c = ds->perm; + } + + __set_bit(c, table); + + dest = rcu_dereference_protected(s->lookup[c].dest, 1); + new_dest = list_entry(p, struct ip_vs_dest, n_list); + if (dest != new_dest) { + if (dest) + ip_vs_dest_put(dest); + ip_vs_dest_hold(new_dest); + RCU_INIT_POINTER(s->lookup[c].dest, new_dest); + } + + if (++n == IP_VS_MH_TAB_SIZE) + goto out; + + if (++dt_count >= ds->turns) { + dt_count = 0; + p = p->next; + ds++; + } + } + } + +out: + kfree(table); + return 0; +} + +/* Get ip_vs_dest associated with supplied parameters. */ +static inline struct ip_vs_dest * +ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) + % IP_VS_MH_TAB_SIZE; + struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest); + + return (!dest || is_unavailable(dest)) ? 
NULL : dest; +} + +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */ +static inline struct ip_vs_dest * +ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s, + const union nf_inet_addr *addr, __be16 port) +{ + unsigned int offset, roffset; + unsigned int hash, ihash; + struct ip_vs_dest *dest; + + /* First try the dest it's supposed to go to */ + ihash = ip_vs_mh_hashkey(svc->af, addr, port, + &s->hash1, 0) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[ihash].dest); + if (!dest) + return NULL; + if (!is_unavailable(dest)) + return dest; + + IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); + + /* If the original dest is unavailable, loop around the table + * starting from ihash to find a new dest + */ + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) { + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE; + hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, + roffset) % IP_VS_MH_TAB_SIZE; + dest = rcu_dereference(s->lookup[hash].dest); + if (!dest) + break; + if (!is_unavailable(dest)) + return dest; + IP_VS_DBG_BUF(6, + "MH: selected unavailable server %s:%u (offset %u), reselecting", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), roffset); + } + + return NULL; +} + +/* Assign all the hash buckets of the specified table with the service. */ +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s, + struct ip_vs_service *svc) +{ + int ret; + + if (svc->num_dests > IP_VS_MH_TAB_SIZE) + return -EINVAL; + + if (svc->num_dests >= 1) { + s->dest_setup = kcalloc(svc->num_dests, + sizeof(struct ip_vs_mh_dest_setup), + GFP_KERNEL); + if (!s->dest_setup) + return -ENOMEM; + } + + ip_vs_mh_permutate(s, svc); + + ret = ip_vs_mh_populate(s, svc); + if (ret < 0) + goto out; + + IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n", + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port)); + +out: + if (svc->num_dests >= 1) { + kfree(s->dest_setup); + s->dest_setup = NULL; + } + return ret; +} + +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->last_weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g; +} + +/* To avoid assigning huge weight for the MH table, + * calculate shift value with gcd. + */ +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd) +{ + struct ip_vs_dest *dest; + int new_weight, weight = 0; + int mw, shift; + + /* If gcd is smaller then 1, number of dests or + * all last_weight of dests are zero. So, return + * shift value as zero. + */ + if (gcd < 1) + return 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + new_weight = atomic_read(&dest->last_weight); + if (new_weight > weight) + weight = new_weight; + } + + /* Because gcd is greater than zero, + * the maximum weight and gcd are always greater than zero + */ + mw = weight / gcd; + + /* shift = occupied bits of weight/gcd - MH highest bits */ + shift = fls(mw) - IP_VS_MH_TAB_BITS; + return (shift >= 0) ? 
shift : 0; +} + +static void ip_vs_mh_state_free(struct rcu_head *head) +{ + struct ip_vs_mh_state *s; + + s = container_of(head, struct ip_vs_mh_state, rcu_head); + kfree(s->lookup); + kfree(s); +} + +static int ip_vs_mh_init_svc(struct ip_vs_service *svc) +{ + int ret; + struct ip_vs_mh_state *s; + + /* Allocate the MH table for this service */ + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup), + GFP_KERNEL); + if (!s->lookup) { + kfree(s); + return -ENOMEM; + } + + generate_hash_secret(&s->hash1, &s->hash2); + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + IP_VS_DBG(6, + "MH lookup table (memory=%zdbytes) allocated for current service\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); + + /* Assign the lookup table with current dests */ + ret = ip_vs_mh_reassign(s, svc); + if (ret < 0) { + ip_vs_mh_reset(s); + ip_vs_mh_state_free(&s->rcu_head); + return ret; + } + + /* No more failures, attach state */ + svc->sched_data = s; + return 0; +} + +static void ip_vs_mh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + /* Got to clean up lookup entry here */ + ip_vs_mh_reset(s); + + call_rcu(&s->rcu_head, ip_vs_mh_state_free); + IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n", + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE); +} + +static int ip_vs_mh_dest_changed(struct ip_vs_service *svc, + struct ip_vs_dest *dest) +{ + struct ip_vs_mh_state *s = svc->sched_data; + + s->gcd = ip_vs_mh_gcd_weight(svc); + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd); + + /* Assign the lookup table with the updated service */ + return ip_vs_mh_reassign(s, svc); +} + +/* Helper function to get port number */ +static inline __be16 +ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph) +{ + __be16 _ports[2], *ports; + + /* At this point we know that we have a valid packet of some kind. + * Because ICMP packets are only guaranteed to have the first 8 + * bytes, let's just grab the ports. Fortunately they're in the + * same position for all three of the protocols we care about. + */ + switch (iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + ports = skb_header_pointer(skb, iph->len, sizeof(_ports), + &_ports); + if (unlikely(!ports)) + return 0; + + if (likely(!ip_vs_iph_inverse(iph))) + return ports[0]; + else + return ports[1]; + default: + return 0; + } +} + +/* Maglev Hashing scheduling */ +static struct ip_vs_dest * +ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_mh_state *s; + __be16 port = 0; + const union nf_inet_addr *hash_addr; + + hash_addr = ip_vs_iph_inverse(iph) ? 
&iph->daddr : &iph->saddr; + + IP_VS_DBG(6, "%s : Scheduling...\n", __func__); + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT) + port = ip_vs_mh_get_port(skb, iph); + + s = (struct ip_vs_mh_state *)svc->sched_data; + + if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK) + dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port); + else + dest = ip_vs_mh_get(svc, s, hash_addr, port); + + if (!dest) { + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; + } + + IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n", + IP_VS_DBG_ADDR(svc->af, hash_addr), + ntohs(port), + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); + + return dest; +} + +/* IPVS MH Scheduler structure */ +static struct ip_vs_scheduler ip_vs_mh_scheduler = { + .name = "mh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list), + .init_service = ip_vs_mh_init_svc, + .done_service = ip_vs_mh_done_svc, + .add_dest = ip_vs_mh_dest_changed, + .del_dest = ip_vs_mh_dest_changed, + .upd_dest = ip_vs_mh_dest_changed, + .schedule = ip_vs_mh_schedule, +}; + +static int __init ip_vs_mh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_mh_scheduler); +} + +static void __exit ip_vs_mh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_mh_scheduler); + rcu_barrier(); +} + +module_init(ip_vs_mh_init); +module_exit(ip_vs_mh_cleanup); +MODULE_DESCRIPTION("Maglev hashing ipvs scheduler"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>"); diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index bcd9b7bde4ee..569631d2b2a1 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -436,7 +436,7 @@ static bool tcp_state_active(int state) return tcp_state_active_table[state]; } -static struct tcp_states_t tcp_states [] = { +static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, @@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = { /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t tcp_states_dos [] = { +static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 16aaac6eedc9..1e01c782583a 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif - return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) & + return (offset + hash_32(ntohs(port) + ntohl(addr_fold), + IP_VS_SH_TAB_BITS)) & IP_VS_SH_TAB_MASK; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4527921b1c3a..ba0a0fd045c8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs, /* check and decrement ttl */ if (ipv6_hdr(skb)->hop_limit <= 1) { + struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); + /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - __IP6_INC_STATS(net, ip6_dst_idev(dst), - 
IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return false; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 41ff04ee2554..605441727008 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -186,6 +186,7 @@ unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_t nf_conntrack_generation __read_mostly; static unsigned int nf_conntrack_hash_rnd __read_mostly; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index f0e9a7511e1a..a11c304fb771 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -566,8 +566,7 @@ static const struct nf_conntrack_expect_policy ftp_exp_policy = { .timeout = 5 * 60, }; -/* don't make this __exit, since it's called from __init ! */ -static void nf_conntrack_ftp_fini(void) +static void __exit nf_conntrack_ftp_fini(void) { nf_conntrack_helpers_unregister(ftp, ports_c * 2); kfree(ftp_buffer); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 5523acce9d69..4099f4d79bae 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -232,8 +232,6 @@ static int help(struct sk_buff *skb, unsigned int protoff, static struct nf_conntrack_helper irc[MAX_PORTS] __read_mostly; static struct nf_conntrack_expect_policy irc_exp_policy; -static void nf_conntrack_irc_fini(void); - static int __init nf_conntrack_irc_init(void) { int i, ret; @@ -276,9 +274,7 @@ static int __init nf_conntrack_irc_init(void) return 0; } -/* This function is intentionally _NOT_ defined as __exit, because - * it is needed by the init function */ -static void nf_conntrack_irc_fini(void) +static void __exit nf_conntrack_irc_fini(void) { nf_conntrack_helpers_unregister(irc, ports_c); kfree(irc_buffer); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4c1d0c5bc268..d807b8770be3 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2205,6 +2205,9 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; + if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max))) + goto nla_put_failure; + nlmsg_end(skb, nlh); return skb->len; diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index ae457f39d5ce..5072ff96ab33 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -173,8 +173,7 @@ static const struct nf_conntrack_expect_policy sane_exp_policy = { .timeout = 5 * 60, }; -/* don't make this __exit, since it's called from __init ! 
*/ -static void nf_conntrack_sane_fini(void) +static void __exit nf_conntrack_sane_fini(void) { nf_conntrack_helpers_unregister(sane, ports_c * 2); kfree(sane_buffer); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 908e51e2dc2b..c8d2b6688a2a 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1617,7 +1617,7 @@ static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1 }, }; -static void nf_conntrack_sip_fini(void) +static void __exit nf_conntrack_sip_fini(void) { nf_conntrack_helpers_unregister(sip, ports_c * 4); } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 0ec6779fd5d9..548b673b3625 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -104,7 +104,7 @@ static const struct nf_conntrack_expect_policy tftp_exp_policy = { .timeout = 5 * 60, }; -static void nf_conntrack_tftp_fini(void) +static void __exit nf_conntrack_tftp_fini(void) { nf_conntrack_helpers_unregister(tftp, ports_c * 2); } diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c index ec410cae9307..eb0d1658ac05 100644 --- a/net/netfilter/nf_flow_table.c +++ b/net/netfilter/nf_flow_table_core.c @@ -4,6 +4,8 @@ #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> +#include <net/ip.h> +#include <net/ip6_route.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack.h> @@ -16,6 +18,43 @@ struct flow_offload_entry { struct rcu_head rcu_head; }; +static DEFINE_MUTEX(flowtable_lock); +static LIST_HEAD(flowtables); + +static void +flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct, + struct nf_flow_route *route, + enum flow_offload_tuple_dir dir) +{ + struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; + struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple; + struct dst_entry *dst = route->tuple[dir].dst; + + ft->dir = dir; + + switch (ctt->src.l3num) { + case NFPROTO_IPV4: + ft->src_v4 = ctt->src.u3.in; + ft->dst_v4 = ctt->dst.u3.in; + ft->mtu = ip_dst_mtu_maybe_forward(dst, true); + break; + case NFPROTO_IPV6: + ft->src_v6 = ctt->src.u3.in6; + ft->dst_v6 = ctt->dst.u3.in6; + ft->mtu = ip6_dst_mtu_forward(dst); + break; + } + + ft->l3proto = ctt->src.l3num; + ft->l4proto = ctt->dst.protonum; + ft->src_port = ctt->src.u.tcp.port; + ft->dst_port = ctt->dst.u.tcp.port; + + ft->iifidx = route->tuple[dir].ifindex; + ft->oifidx = route->tuple[!dir].ifindex; + ft->dst_cache = dst; +} + struct flow_offload * flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) { @@ -40,69 +79,12 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route) entry->ct = ct; - switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) { - case NFPROTO_IPV4: - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in; - break; - case NFPROTO_IPV6: - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 = - 
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6; - break; - } - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port = - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port = - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir = - FLOW_OFFLOAD_DIR_ORIGINAL; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir = - FLOW_OFFLOAD_DIR_REPLY; - - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx = - route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex; - flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx = - route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex; + flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL); + flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY); if (ct->status & IPS_SRC_NAT) flow->flags |= FLOW_OFFLOAD_SNAT; - else if (ct->status & IPS_DST_NAT) + if (ct->status & IPS_DST_NAT) flow->flags |= FLOW_OFFLOAD_DNAT; return flow; @@ -118,6 +100,43 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +{ + tcp->state = TCP_CONNTRACK_ESTABLISHED; + tcp->seen[0].td_maxwin = 0; + tcp->seen[1].td_maxwin = 0; +} + +static void flow_offload_fixup_ct_state(struct nf_conn *ct) +{ + const struct nf_conntrack_l4proto *l4proto; + struct net *net = nf_ct_net(ct); + unsigned int *timeouts; + unsigned int timeout; + int l4num; + + l4num = nf_ct_protonum(ct); + if (l4num == IPPROTO_TCP) + flow_offload_fixup_tcp(&ct->proto.tcp); + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), l4num); + if (!l4proto) + return; + + timeouts = l4proto->get_timeouts(net); + if (!timeouts) + return; + + if (l4num == IPPROTO_TCP) + timeout = timeouts[TCP_CONNTRACK_ESTABLISHED]; + else if (l4num == IPPROTO_UDP) + timeout = timeouts[UDP_CT_REPLIED]; + else + return; + + ct->timeout = nfct_time_stamp + timeout; +} + void flow_offload_free(struct flow_offload *flow) { struct flow_offload_entry *e; @@ -125,17 +144,46 @@ void flow_offload_free(struct flow_offload *flow) dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); 
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); e = container_of(flow, struct flow_offload_entry, flow); - nf_ct_delete(e->ct, 0, 0); + if (flow->flags & FLOW_OFFLOAD_DYING) + nf_ct_delete(e->ct, 0, 0); nf_ct_put(e->ct); kfree_rcu(e, rcu_head); } EXPORT_SYMBOL_GPL(flow_offload_free); -void flow_offload_dead(struct flow_offload *flow) +static u32 flow_offload_hash(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple *tuple = data; + + return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) +{ + const struct flow_offload_tuple_rhash *tuplehash = data; + + return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); +} + +static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) { - flow->flags |= FLOW_OFFLOAD_DYING; + const struct flow_offload_tuple *tuple = arg->key; + const struct flow_offload_tuple_rhash *x = ptr; + + if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) + return 1; + + return 0; } -EXPORT_SYMBOL_GPL(flow_offload_dead); + +static const struct rhashtable_params nf_flow_offload_rhash_params = { + .head_offset = offsetof(struct flow_offload_tuple_rhash, node), + .hashfn = flow_offload_hash, + .obj_hashfn = flow_offload_hash_obj, + .obj_cmpfn = flow_offload_hash_cmp, + .automatic_shrinking = true, +}; int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -143,10 +191,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_insert_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); return 0; } EXPORT_SYMBOL_GPL(flow_offload_add); @@ -154,22 +202,51 @@ EXPORT_SYMBOL_GPL(flow_offload_add); static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { + struct flow_offload_entry *e; + rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); rhashtable_remove_fast(&flow_table->rhashtable, &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - *flow_table->type->params); + nf_flow_offload_rhash_params); + + e = container_of(flow, struct flow_offload_entry, flow); + clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); flow_offload_free(flow); } +void flow_offload_teardown(struct flow_offload *flow) +{ + struct flow_offload_entry *e; + + flow->flags |= FLOW_OFFLOAD_TEARDOWN; + + e = container_of(flow, struct flow_offload_entry, flow); + flow_offload_fixup_ct_state(e->ct); +} +EXPORT_SYMBOL_GPL(flow_offload_teardown); + struct flow_offload_tuple_rhash * flow_offload_lookup(struct nf_flowtable *flow_table, struct flow_offload_tuple *tuple) { - return rhashtable_lookup_fast(&flow_table->rhashtable, tuple, - *flow_table->type->params); + struct flow_offload_tuple_rhash *tuplehash; + struct flow_offload *flow; + int dir; + + tuplehash = rhashtable_lookup_fast(&flow_table->rhashtable, tuple, + nf_flow_offload_rhash_params); + if (!tuplehash) + return NULL; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN)) + return NULL; + + return tuplehash; } 
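The lookup above relies on the rhashtable parameters hashing and comparing only the leading bytes of the tuple, up to its dir member, so both directions of one flow live in the same table and are keyed purely by addresses, ports and protocol. A toy, user-space illustration of that prefix-key idea (simplified struct and hash, not the kernel types):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_tuple {
	uint32_t src_ip, dst_ip;
	uint16_t src_port, dst_port;
	uint8_t  l4proto;
	/* members below are not part of the lookup key */
	uint8_t  dir;
};

#define TOY_KEY_LEN offsetof(struct toy_tuple, dir)

static uint32_t toy_hash(const void *key, size_t len, uint32_t seed)
{
	const uint8_t *p = key;
	uint32_t h = seed;

	while (len--)
		h = (h ^ *p++) * 16777619u;	/* FNV-style stand-in for jhash */
	return h;
}

static int toy_key_equal(const struct toy_tuple *a, const struct toy_tuple *b)
{
	return memcmp(a, b, TOY_KEY_LEN) == 0;
}

int main(void)
{
	struct toy_tuple orig = { 0x0a000001, 0x0a000002, 1234, 80, 6, 0 };
	struct toy_tuple key  = orig;

	key.dir = 1;	/* direction differs, key bytes do not */
	printf("hash(orig)=%u hash(key)=%u equal=%d\n",
	       (unsigned int)toy_hash(&orig, TOY_KEY_LEN, 0),
	       (unsigned int)toy_hash(&key, TOY_KEY_LEN, 0),
	       toy_key_equal(&orig, &key));
	return 0;
}

After the hash hit, the real code still rejects entries whose flow is marked DYING or TEARDOWN, as shown above.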
EXPORT_SYMBOL_GPL(flow_offload_lookup); @@ -216,11 +293,6 @@ static inline bool nf_flow_has_expired(const struct flow_offload *flow) return (__s32)(flow->timeout - (u32)jiffies) <= 0; } -static inline bool nf_flow_is_dying(const struct flow_offload *flow) -{ - return flow->flags & FLOW_OFFLOAD_DYING; -} - static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) { struct flow_offload_tuple_rhash *tuplehash; @@ -248,7 +320,8 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table) flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); if (nf_flow_has_expired(flow) || - nf_flow_is_dying(flow)) + (flow->flags & (FLOW_OFFLOAD_DYING | + FLOW_OFFLOAD_TEARDOWN))) flow_offload_del(flow_table, flow); } out: @@ -258,7 +331,7 @@ out: return 1; } -void nf_flow_offload_work_gc(struct work_struct *work) +static void nf_flow_offload_work_gc(struct work_struct *work) { struct nf_flowtable *flow_table; @@ -266,42 +339,6 @@ void nf_flow_offload_work_gc(struct work_struct *work) nf_flow_offload_gc_step(flow_table); queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } -EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc); - -static u32 flow_offload_hash(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple *tuple = data; - - return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) -{ - const struct flow_offload_tuple_rhash *tuplehash = data; - - return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed); -} - -static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, - const void *ptr) -{ - const struct flow_offload_tuple *tuple = arg->key; - const struct flow_offload_tuple_rhash *x = ptr; - - if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir))) - return 1; - - return 0; -} - -const struct rhashtable_params nf_flow_offload_rhash_params = { - .head_offset = offsetof(struct flow_offload_tuple_rhash, node), - .hashfn = flow_offload_hash, - .obj_hashfn = flow_offload_hash_obj, - .obj_cmpfn = flow_offload_hash_cmp, - .automatic_shrinking = true, -}; -EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params); static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, __be16 port, __be16 new_port) @@ -419,33 +456,69 @@ int nf_flow_dnat_port(const struct flow_offload *flow, } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); +int nf_flow_table_init(struct nf_flowtable *flowtable) +{ + int err; + + INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); + + err = rhashtable_init(&flowtable->rhashtable, + &nf_flow_offload_rhash_params); + if (err < 0) + return err; + + queue_delayed_work(system_power_efficient_wq, + &flowtable->gc_work, HZ); + + mutex_lock(&flowtable_lock); + list_add(&flowtable->list, &flowtables); + mutex_unlock(&flowtable_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_flow_table_init); + static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) { struct net_device *dev = data; - if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex) + if (!dev) { + flow_offload_teardown(flow); return; + } - flow_offload_dead(flow); + if (flow->tuplehash[0].tuple.iifidx == dev->ifindex || + flow->tuplehash[1].tuple.iifidx == dev->ifindex) + flow_offload_dead(flow); } static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - void *data) + struct net_device *dev) { - nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data); + nf_flow_table_iterate(flowtable, 
nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); } void nf_flow_table_cleanup(struct net *net, struct net_device *dev) { - nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev); + struct nf_flowtable *flowtable; + + mutex_lock(&flowtable_lock); + list_for_each_entry(flowtable, &flowtables, list) + nf_flow_table_iterate_cleanup(flowtable, dev); + mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); void nf_flow_table_free(struct nf_flowtable *flow_table) { + mutex_lock(&flowtable_lock); + list_del(&flow_table->list); + mutex_unlock(&flowtable_lock); + cancel_delayed_work_sync(&flow_table->gc_work); nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); WARN_ON(!nf_flow_offload_gc_step(flow_table)); + rhashtable_destroy(&flow_table->rhashtable); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index 375a1881d93d..99771aa7e7ea 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, static struct nf_flowtable_type flowtable_inet = { .family = NFPROTO_INET, - .params = &nf_flow_offload_rhash_params, - .gc = nf_flow_offload_work_gc, + .init = nf_flow_table_init, .free = nf_flow_table_free, .hook = nf_flow_offload_inet_hook, .owner = THIS_MODULE, diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c new file mode 100644 index 000000000000..82451b7e0acb --- /dev/null +++ b/net/netfilter/nf_flow_table_ip.c @@ -0,0 +1,487 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/rhashtable.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/netdevice.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <net/neighbour.h> +#include <net/netfilter/nf_flow_table.h> +/* For layer 4 checksum field offset. 
*/ +#include <linux/tcp.h> +#include <linux/udp.h> + +static int nf_flow_state_check(struct flow_offload *flow, int proto, + struct sk_buff *skb, unsigned int thoff) +{ + struct tcphdr *tcph; + + if (proto != IPPROTO_TCP) + return 0; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + if (unlikely(tcph->fin || tcph->rst)) { + flow_offload_teardown(flow); + return -1; + } + + return 0; +} + +static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); + + return 0; +} + +static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&udph->check, skb, addr, + new_addr, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, + struct iphdr *iph, unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + __be32 addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = iph->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; + iph->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = iph->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; + iph->saddr = new_addr; + break; + default: + return -1; + } + csum_replace4(&iph->check, addr, new_addr); + + return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); +} + +static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, enum flow_offload_tuple_dir dir) +{ + struct iphdr *iph = ip_hdr(skb); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || + nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, 
dir) < 0 || + nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) + return -1; + + return 0; +} + +static bool ip_has_options(unsigned int thoff) +{ + return thoff != sizeof(struct iphdr); +} + +static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + unsigned int thoff; + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + thoff = iph->ihl * 4; + + if (ip_is_fragment(iph) || + unlikely(ip_has_options(thoff))) + return -1; + + if (iph->protocol != IPPROTO_TCP && + iph->protocol != IPPROTO_UDP) + return -1; + + thoff = iph->ihl * 4; + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v4.s_addr = iph->saddr; + tuple->dst_v4.s_addr = iph->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET; + tuple->l4proto = iph->protocol; + tuple->iifidx = dev->ifindex; + + return 0; +} + +/* Based on ip_exceeds_mtu(). */ +static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu) + return false; + + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) + return false; + + return true; +} + +unsigned int +nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + const struct rtable *rt; + unsigned int thoff; + struct iphdr *iph; + __be32 nexthop; + + if (skb->protocol != htons(ETH_P_IP)) + return NF_ACCEPT; + + if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && + (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*iph))) + return NF_DROP; + + thoff = ip_hdr(skb)->ihl * 4; + if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) + return NF_ACCEPT; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ip(flow, skb, thoff, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); + +static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct tcphdr *tcph; + + if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) || + skb_try_make_writable(skb, thoff + sizeof(*tcph))) + return -1; + + tcph = (void *)(skb_network_header(skb) + thoff); + inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + + return 0; +} + +static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, 
+ struct in6_addr *addr, + struct in6_addr *new_addr) +{ + struct udphdr *udph; + + if (!pskb_may_pull(skb, thoff + sizeof(*udph)) || + skb_try_make_writable(skb, thoff + sizeof(*udph))) + return -1; + + udph = (void *)(skb_network_header(skb) + thoff); + if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, + new_addr->s6_addr32, true); + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + return 0; +} + +static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) +{ + switch (ip6h->nexthdr) { + case IPPROTO_TCP: + if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + case IPPROTO_UDP: + if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) + return NF_DROP; + break; + } + + return 0; +} + +static int nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; + ip6h->daddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) +{ + struct in6_addr addr, new_addr; + + switch (dir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + addr = ip6h->daddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; + ip6h->daddr = new_addr; + break; + case FLOW_OFFLOAD_DIR_REPLY: + addr = ip6h->saddr; + new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; + ip6h->saddr = new_addr; + break; + default: + return -1; + } + + return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); +} + +static int nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + unsigned int thoff = sizeof(*ip6h); + + if (flow->flags & FLOW_OFFLOAD_SNAT && + (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + if (flow->flags & FLOW_OFFLOAD_DNAT && + (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) + return -1; + + return 0; +} + +static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, + struct flow_offload_tuple *tuple) +{ + struct flow_ports *ports; + struct ipv6hdr *ip6h; + unsigned int thoff; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_TCP && + ip6h->nexthdr != IPPROTO_UDP) + return -1; + + thoff = sizeof(*ip6h); + if (!pskb_may_pull(skb, thoff + sizeof(*ports))) + return -1; + + ports = (struct flow_ports *)(skb_network_header(skb) + thoff); + + tuple->src_v6 = ip6h->saddr; + tuple->dst_v6 = ip6h->daddr; + tuple->src_port = ports->source; + tuple->dst_port = ports->dest; + tuple->l3proto = AF_INET6; + tuple->l4proto = ip6h->nexthdr; + tuple->iifidx = dev->ifindex; + + return 0; +} + +unsigned int 
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct flow_offload_tuple_rhash *tuplehash; + struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple tuple = {}; + enum flow_offload_tuple_dir dir; + struct flow_offload *flow; + struct net_device *outdev; + struct in6_addr *nexthop; + struct ipv6hdr *ip6h; + struct rt6_info *rt; + + if (skb->protocol != htons(ETH_P_IPV6)) + return NF_ACCEPT; + + if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0) + return NF_ACCEPT; + + tuplehash = flow_offload_lookup(flow_table, &tuple); + if (tuplehash == NULL) + return NF_ACCEPT; + + outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx); + if (!outdev) + return NF_ACCEPT; + + dir = tuplehash->tuple.dir; + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; + + if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + return NF_ACCEPT; + + if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb, + sizeof(*ip6h))) + return NF_ACCEPT; + + if (skb_try_make_writable(skb, sizeof(*ip6h))) + return NF_DROP; + + if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) && + nf_flow_nat_ipv6(flow, skb, dir) < 0) + return NF_DROP; + + flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT; + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + + return NF_STOLEN; +} +EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 617693ff9f4c..37b3c9913b08 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(nf_nat_used_tuple); static int in_range(const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. 
@@ -194,7 +194,7 @@ find_appropriate_src(struct net *net, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, - const struct nf_nat_range *range) + const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, tuple); const struct nf_conn *ct; @@ -224,7 +224,7 @@ find_appropriate_src(struct net *net, static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -298,7 +298,7 @@ find_best_ips_proto(const struct nf_conntrack_zone *zone, static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { @@ -349,9 +349,10 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { - if (l4proto->in_range(tuple, maniptype, - &range->min_proto, - &range->max_proto) && + if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && + l4proto->in_range(tuple, maniptype, + &range->min_proto, + &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) goto out; @@ -360,7 +361,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, } } - /* Last change: get protocol to try to obtain unique tuple. */ + /* Last chance: get protocol to try to obtain unique tuple. */ l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct); out: rcu_read_unlock(); @@ -381,7 +382,7 @@ EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); @@ -459,7 +460,7 @@ __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) (manip == NF_NAT_MANIP_SRC ? 
ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); - struct nf_nat_range range = { + struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, @@ -702,7 +703,7 @@ static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, - struct nf_nat_range *range) + struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; const struct nf_nat_l4proto *l4proto; @@ -730,7 +731,7 @@ static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { static int nfnetlink_parse_nat(const struct nlattr *nat, - const struct nf_conn *ct, struct nf_nat_range *range, + const struct nf_conn *ct, struct nf_nat_range2 *range, const struct nf_nat_l3proto *l3proto) { struct nlattr *tb[CTA_NAT_MAX+1]; @@ -758,7 +759,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { - struct nf_nat_range range; + struct nf_nat_range2 range; const struct nf_nat_l3proto *l3proto; int err; diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 607a373379b4..99606baedda4 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -191,7 +191,7 @@ EXPORT_SYMBOL(nf_nat_mangle_udp_packet); void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c index 7d7466dbf663..5d849d835561 100644 --- a/net/netfilter/nf_nat_proto_common.c +++ b/net/netfilter/nf_nat_proto_common.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_in_range); void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, u16 *rover) @@ -83,6 +83,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, : tuple->src.u.all); } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) { off = prandom_u32(); + } else if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) { + off = (ntohs(*portptr) - ntohs(range->base_proto.all)); } else { off = *rover; } @@ -91,7 +93,8 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto, *portptr = htons(min + off % range_size); if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; - if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) + if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL| + NF_NAT_RANGE_PROTO_OFFSET))) *rover = off; return; } @@ -100,7 +103,7 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple); #if IS_ENABLED(CONFIG_NF_CT_NETLINK) int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], - struct nf_nat_range *range) + struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index 269fcd5dc34c..67ea0d83aa5a 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -23,7 +23,7 @@ static u_int16_t dccp_port_rover; static void dccp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type 
maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c index c57ee3240b1d..1c5d9b65fbba 100644 --- a/net/netfilter/nf_nat_proto_sctp.c +++ b/net/netfilter/nf_nat_proto_sctp.c @@ -17,7 +17,7 @@ static u_int16_t nf_sctp_port_rover; static void sctp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 4f8820fc5148..f15fcd475f98 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -23,7 +23,7 @@ static u16 tcp_port_rover; static void tcp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index edd4a77dc09a..5790f70a83b2 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -22,7 +22,7 @@ static u16 udp_port_rover; static void udp_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { @@ -100,7 +100,7 @@ static bool udplite_manip_pkt(struct sk_buff *skb, static void udplite_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_proto_unknown.c b/net/netfilter/nf_nat_proto_unknown.c index 6e494d584412..c5db3e251232 100644 --- a/net/netfilter/nf_nat_proto_unknown.c +++ b/net/netfilter/nf_nat_proto_unknown.c @@ -27,7 +27,7 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, static void unknown_unique_tuple(const struct nf_nat_l3proto *l3proto, struct nf_conntrack_tuple *tuple, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 25b06b959118..7c4bb0a773ca 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -36,7 +36,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be32 newdst; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(hooknum != NF_INET_PRE_ROUTING && hooknum != NF_INET_LOCAL_OUT); @@ -82,10 +82,10 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; unsigned int -nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, +nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, unsigned int hooknum) { - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; struct in6_addr newdst; enum ip_conntrack_info ctinfo; struct nf_conn *ct; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index 791fac4fd745..1f3086074981 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -316,7 +316,7 @@ static void nf_nat_sip_seq_adjust(struct sk_buff *skb, unsigned int protoff, static void 
nf_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) { - struct nf_nat_range range; + struct nf_nat_range2 range; /* This must be a fresh one. */ BUG_ON(ct->status & IPS_NAT_DONE_MASK); diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c new file mode 100644 index 000000000000..5ba5c7bef2f9 --- /dev/null +++ b/net/netfilter/nf_osf.c @@ -0,0 +1,218 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/kernel.h> + +#include <linux/capability.h> +#include <linux/if.h> +#include <linux/inetdevice.h> +#include <linux/ip.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/tcp.h> + +#include <linux/netfilter/nfnetlink.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_log.h> +#include <linux/netfilter/nf_osf.h> + +static inline int nf_osf_ttl(const struct sk_buff *skb, + const struct nf_osf_info *info, + unsigned char f_ttl) +{ + const struct iphdr *ip = ip_hdr(skb); + + if (info->flags & NF_OSF_TTL) { + if (info->ttl == NF_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (info->ttl == NF_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + else { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + int ret = 0; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; + } + } + endfor_ifa(in_dev); + + return ret; + } + } + + return ip->ttl == f_ttl; +} + +bool +nf_osf_match(const struct sk_buff *skb, u_int8_t family, + int hooknum, struct net_device *in, struct net_device *out, + const struct nf_osf_info *info, struct net *net, + const struct list_head *nf_osf_fingers) +{ + const unsigned char *optp = NULL, *_optp = NULL; + unsigned int optsize = 0, check_WSS = 0; + int fmatch = FMATCH_WRONG, fcount = 0; + const struct iphdr *ip = ip_hdr(skb); + const struct nf_osf_user_finger *f; + unsigned char opts[MAX_IPOPTLEN]; + const struct nf_osf_finger *kf; + u16 window, totlen, mss = 0; + const struct tcphdr *tcp; + struct tcphdr _tcph; + bool df; + + tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); + if (!tcp) + return false; + + if (!tcp->syn) + return false; + + totlen = ntohs(ip->tot_len); + df = ntohs(ip->frag_off) & IP_DF; + window = ntohs(tcp->window); + + if (tcp->doff * 4 > sizeof(struct tcphdr)) { + optsize = tcp->doff * 4 - sizeof(struct tcphdr); + + _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + + sizeof(struct tcphdr), optsize, opts); + } + + list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) { + int foptsize, optnum; + + f = &kf->finger; + + if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) + continue; + + optp = _optp; + fmatch = FMATCH_WRONG; + + if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl)) + continue; + + /* + * Should not happen if userspace parser was written correctly. 
+ */ + if (f->wss.wc >= OSF_WSS_MAX) + continue; + + /* Check options */ + + foptsize = 0; + for (optnum = 0; optnum < f->opt_num; ++optnum) + foptsize += f->opt[optnum].length; + + if (foptsize > MAX_IPOPTLEN || + optsize > MAX_IPOPTLEN || + optsize != foptsize) + continue; + + check_WSS = f->wss.wc; + + for (optnum = 0; optnum < f->opt_num; ++optnum) { + if (f->opt[optnum].kind == (*optp)) { + __u32 len = f->opt[optnum].length; + const __u8 *optend = optp + len; + + fmatch = FMATCH_OK; + + switch (*optp) { + case OSFOPT_MSS: + mss = optp[3]; + mss <<= 8; + mss |= optp[2]; + + mss = ntohs((__force __be16)mss); + break; + case OSFOPT_TS: + break; + } + + optp = optend; + } else + fmatch = FMATCH_OPT_WRONG; + + if (fmatch != FMATCH_OK) + break; + } + + if (fmatch != FMATCH_OPT_WRONG) { + fmatch = FMATCH_WRONG; + + switch (check_WSS) { + case OSF_WSS_PLAIN: + if (f->wss.val == 0 || window == f->wss.val) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MSS: + /* + * Some smart modems decrease mangle MSS to + * SMART_MSS_2, so we check standard, decreased + * and the one provided in the fingerprint MSS + * values. + */ +#define SMART_MSS_1 1460 +#define SMART_MSS_2 1448 + if (window == f->wss.val * mss || + window == f->wss.val * SMART_MSS_1 || + window == f->wss.val * SMART_MSS_2) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MTU: + if (window == f->wss.val * (mss + 40) || + window == f->wss.val * (SMART_MSS_1 + 40) || + window == f->wss.val * (SMART_MSS_2 + 40)) + fmatch = FMATCH_OK; + break; + case OSF_WSS_MODULO: + if ((window % f->wss.val) == 0) + fmatch = FMATCH_OK; + break; + } + } + + if (fmatch != FMATCH_OK) + continue; + + fcount++; + + if (info->flags & NF_OSF_LOG) + nf_log_packet(net, family, hooknum, skb, + in, out, NULL, + "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", + f->genre, f->version, f->subtype, + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest), + f->ttl - ip->ttl); + + if ((info->flags & NF_OSF_LOG) && + info->loglevel == NF_OSF_LOGLEVEL_FIRST) + break; + } + + if (!fcount && (info->flags & NF_OSF_LOG)) + nf_log_packet(net, family, hooknum, skb, in, out, NULL, + "Remote OS is not known: %pI4:%u -> %pI4:%u\n", + &ip->saddr, ntohs(tcp->source), + &ip->daddr, ntohs(tcp->dest)); + + if (fcount) + fmatch = FMATCH_OK; + + return fmatch == FMATCH_OK; +} +EXPORT_SYMBOL_GPL(nf_osf_match); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 91e80aa852d6..a5f3743fda65 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -415,13 +415,17 @@ static struct nft_table *nft_table_lookup(const struct net *net, { struct nft_table *table; + if (nla == NULL) + return ERR_PTR(-EINVAL); + list_for_each_entry(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) return table; } - return NULL; + + return ERR_PTR(-ENOENT); } static struct nft_table *nft_table_lookup_byhandle(const struct net *net, @@ -435,37 +439,6 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, nft_active_genmask(table, genmask)) return table; } - return NULL; -} - -static struct nft_table *nf_tables_table_lookup(const struct net *net, - const struct nlattr *nla, - u8 family, u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup(net, nla, family, genmask); - if (table != NULL) - return table; - - return ERR_PTR(-ENOENT); -} - -static struct nft_table 
*nf_tables_table_lookup_byhandle(const struct net *net, - const struct nlattr *nla, - u8 genmask) -{ - struct nft_table *table; - - if (nla == NULL) - return ERR_PTR(-EINVAL); - - table = nft_table_lookup_byhandle(net, nla, genmask); - if (table != NULL) - return table; return ERR_PTR(-ENOENT); } @@ -637,10 +610,11 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]); return PTR_ERR(table); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -756,21 +730,23 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - const struct nlattr *name; - struct nft_table *table; int family = nfmsg->nfgen_family; + const struct nlattr *attr; + struct nft_table *table; u32 flags = 0; struct nft_ctx ctx; int err; - name = nla[NFTA_TABLE_NAME]; - table = nf_tables_table_lookup(net, name, family, genmask); + attr = nla[NFTA_TABLE_NAME]; + table = nft_table_lookup(net, attr, family, genmask); if (IS_ERR(table)) { if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; @@ -789,7 +765,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table == NULL) goto err_kzalloc; - table->name = nla_strdup(name, GFP_KERNEL); + table->name = nla_strdup(attr, GFP_KERNEL); if (table->name == NULL) goto err_strdup; @@ -912,8 +888,9 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); - struct nft_table *table; int family = nfmsg->nfgen_family; + const struct nlattr *attr; + struct nft_table *table; struct nft_ctx ctx; nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); @@ -921,16 +898,18 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); - if (nla[NFTA_TABLE_HANDLE]) - table = nf_tables_table_lookup_byhandle(net, - nla[NFTA_TABLE_HANDLE], - genmask); - else - table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], - family, genmask); + if (nla[NFTA_TABLE_HANDLE]) { + attr = nla[NFTA_TABLE_HANDLE]; + table = nft_table_lookup_byhandle(net, attr, genmask); + } else { + attr = nla[NFTA_TABLE_NAME]; + table = nft_table_lookup(net, attr, family, genmask); + } - if (IS_ERR(table)) + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(table); + } if (nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) @@ -978,8 +957,7 @@ EXPORT_SYMBOL_GPL(nft_unregister_chain_type); */ static struct nft_chain * -nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, - u8 genmask) +nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) { struct nft_chain *chain; @@ -992,9 +970,8 @@ nf_tables_chain_lookup_byhandle(const struct nft_table *table, u64 handle, return ERR_PTR(-ENOENT); } -static struct nft_chain *nf_tables_chain_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +static struct nft_chain *nft_chain_lookup(const struct 
nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_chain *chain; @@ -1223,14 +1200,17 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return PTR_ERR(chain); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -1542,8 +1522,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nla[NFTA_CHAIN_NAME]) { struct nft_chain *chain2; - chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], - genmask); + chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); if (!IS_ERR(chain2)) return -EEXIST; } @@ -1593,9 +1572,9 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - const struct nlattr * uninitialized_var(name); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; u8 policy = NF_ACCEPT; @@ -1605,36 +1584,46 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } chain = NULL; - name = nla[NFTA_CHAIN_NAME]; + attr = nla[NFTA_CHAIN_NAME]; if (nla[NFTA_CHAIN_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup_byhandle(table, handle, genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]); return PTR_ERR(chain); + } + attr = nla[NFTA_CHAIN_HANDLE]; } else { - chain = nf_tables_chain_lookup(table, name, genmask); + chain = nft_chain_lookup(table, attr, genmask); if (IS_ERR(chain)) { - if (PTR_ERR(chain) != -ENOENT) + if (PTR_ERR(chain) != -ENOENT) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); + } chain = NULL; } } if (nla[NFTA_CHAIN_POLICY]) { if (chain != NULL && - !nft_is_base_chain(chain)) + !nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; + } if (chain == NULL && - nla[NFTA_CHAIN_HOOK] == NULL) + nla[NFTA_CHAIN_HOOK] == NULL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]); return -EOPNOTSUPP; + } policy = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY])); switch (policy) { @@ -1649,8 +1638,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain != NULL) { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; @@ -1667,28 +1658,34 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, 
{ const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule; - int family = nfmsg->nfgen_family; struct nft_ctx ctx; u64 handle; u32 use; int err; - table = nf_tables_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]); return PTR_ERR(table); + } if (nla[NFTA_CHAIN_HANDLE]) { - handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE])); - chain = nf_tables_chain_lookup_byhandle(table, handle, genmask); + attr = nla[NFTA_CHAIN_HANDLE]; + handle = be64_to_cpu(nla_get_be64(attr)); + chain = nft_chain_lookup_byhandle(table, handle, genmask); } else { - chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); + attr = nla[NFTA_CHAIN_NAME]; + chain = nft_chain_lookup(table, attr, genmask); } - if (IS_ERR(chain)) + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(chain); + } if (nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) @@ -1710,8 +1707,10 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, /* There are rules and elements that are still holding references to us, * we cannot do a recursive removal in this case. */ - if (use > 0) + if (use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } return nft_delchain(&ctx); } @@ -1968,8 +1967,8 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, - u64 handle) +static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, + u64 handle) { struct nft_rule *rule; @@ -1982,13 +1981,13 @@ static struct nft_rule *__nf_tables_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nf_tables_rule_lookup(const struct nft_chain *chain, - const struct nlattr *nla) +static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, + const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nf_tables_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -2220,18 +2219,23 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return netlink_dump_start(nlsk, skb, nlh, &c); } - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } - rule = nf_tables_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -2301,23 +2305,30 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? 
true : false; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nf_tables_rule_lookup(chain, handle); - if (IS_ERR(rule)) + rule = __nft_rule_lookup(chain, handle); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else @@ -2336,9 +2347,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nf_tables_rule_lookup(chain, pos_handle); - if (IS_ERR(old_rule)) + old_rule = __nft_rule_lookup(chain, pos_handle); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); + } } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); @@ -2476,32 +2489,37 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; - table = nf_tables_table_lookup(net, nla[NFTA_RULE_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]); return PTR_ERR(table); + } if (nla[NFTA_RULE_CHAIN]) { - chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN], - genmask); - if (IS_ERR(chain)) + chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask); + if (IS_ERR(chain)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]); return PTR_ERR(chain); + } } nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nf_tables_rule_lookup(chain, - nla[NFTA_RULE_HANDLE]); - if (IS_ERR(rule)) + rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); + } err = nft_delrule(&ctx, rule); } else if (nla[NFTA_RULE_ID]) { rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_ID]); - if (IS_ERR(rule)) + if (IS_ERR(rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]); return PTR_ERR(rule); + } err = nft_delrule(&ctx, rule); } else { @@ -2546,14 +2564,12 @@ void nft_unregister_set(struct nft_set_type *type) EXPORT_SYMBOL_GPL(nft_unregister_set); #define NFT_SET_FEATURES (NFT_SET_INTERVAL | NFT_SET_MAP | \ - NFT_SET_TIMEOUT | NFT_SET_OBJECT) + NFT_SET_TIMEOUT | NFT_SET_OBJECT | \ + NFT_SET_EVAL) -static bool nft_set_ops_candidate(const struct nft_set_ops *ops, u32 flags) +static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) { - if ((flags & NFT_SET_EVAL) && !ops->update) - return false; - - return (flags & ops->features) == (flags & NFT_SET_FEATURES); + return (flags & type->features) == (flags & NFT_SET_FEATURES); } /* @@ -2590,14 +2606,9 @@ nft_select_set_ops(const struct nft_ctx *ctx, best.space = 
~0; list_for_each_entry(type, &nf_tables_set_types, list) { - if (!type->select_ops) - ops = type->ops; - else - ops = type->select_ops(ctx, desc, flags); - if (!ops) - continue; + ops = &type->ops; - if (!nft_set_ops_candidate(ops, flags)) + if (!nft_set_ops_candidate(type, flags)) continue; if (!ops->estimate(desc, flags, &est)) continue; @@ -2628,7 +2639,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, if (!try_module_get(type->owner)) continue; if (bops != NULL) - module_put(bops->type->owner); + module_put(to_set_type(bops)->owner); bops = ops; best = est; @@ -2669,6 +2680,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], + struct netlink_ext_ack *extack, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2676,18 +2688,20 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, struct nft_table *table = NULL; if (nla[NFTA_SET_TABLE] != NULL) { - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); + } } nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; } -static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2702,14 +2716,12 @@ static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u8 genmask) { struct nft_set *set; - if (nla == NULL) - return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { if (be64_to_cpu(nla_get_be64(nla)) == set->handle && nft_active_genmask(set, genmask)) @@ -2718,9 +2730,8 @@ static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *tab return ERR_PTR(-ENOENT); } -static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, - u8 genmask) +static struct nft_set *nft_set_lookup_byid(const struct net *net, + const struct nlattr *nla, u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); @@ -2744,12 +2755,12 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nf_tables_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; - set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); + set = nft_set_lookup_byid(net, nla_set_id, genmask); } return set; } @@ -2809,6 +2820,27 @@ cont: return 0; } +static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) +{ + u64 ms = be64_to_cpu(nla_get_be64(nla)); + u64 max = (u64)(~((u64)0)); + + max = div_u64(max, NSEC_PER_MSEC); + if (ms >= max) + return -ERANGE; + + ms *= NSEC_PER_MSEC; + *result = nsecs_to_jiffies64(ms); + return 0; +} + +static u64 nf_jiffies64_to_msecs(u64 input) +{ + u64 ms = jiffies64_to_nsecs(input); + + return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct 
nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -2856,7 +2888,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->timeout && nla_put_be64(skb, NFTA_SET_TIMEOUT, - cpu_to_be64(jiffies_to_msecs(set->timeout)), + nf_jiffies64_to_msecs(set->timeout), NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && @@ -2994,7 +3026,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; @@ -3021,7 +3054,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3151,8 +3184,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_TIMEOUT]))); + + err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &timeout); + if (err) + return err; } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -3173,22 +3208,28 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false; - table = nf_tables_table_lookup(net, nla[NFTA_SET_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]); return PTR_ERR(table); + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { - if (PTR_ERR(set) != -ENOENT) + if (PTR_ERR(set) != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; + } if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; + return 0; } @@ -3265,14 +3306,14 @@ err3: err2: kvfree(set); err1: - module_put(ops->type->owner); + module_put(to_set_type(ops)->owner); return err; } static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); - module_put(set->ops->type->owner); + module_put(to_set_type(set->ops)->owner); kfree(set->name); kvfree(set); } @@ -3291,6 +3332,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); + const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; int err; @@ -3300,20 +3342,28 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - if (nla[NFTA_SET_HANDLE]) - set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask); - else - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask); - if (IS_ERR(set)) - return PTR_ERR(set); + if (nla[NFTA_SET_HANDLE]) { + attr = nla[NFTA_SET_HANDLE]; + set = 
nft_set_lookup_byhandle(ctx.table, attr, genmask); + } else { + attr = nla[NFTA_SET_NAME]; + set = nft_set_lookup(ctx.table, attr, genmask); + } + if (IS_ERR(set)) { + NL_SET_BAD_ATTR(extack, attr); + return PTR_ERR(set); + } if (!list_empty(&set->bindings) || - (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) + (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } return nft_delset(&ctx, set); } @@ -3403,8 +3453,8 @@ const struct nft_set_ext_type nft_set_ext_types[] = { .align = __alignof__(u64), }, [NFT_SET_EXT_EXPIRATION] = { - .len = sizeof(unsigned long), - .align = __alignof__(unsigned long), + .len = sizeof(u64), + .align = __alignof__(u64), }, [NFT_SET_EXT_USERDATA] = { .len = sizeof(struct nft_userdata), @@ -3441,16 +3491,19 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], + struct netlink_ext_ack *extack, u8 genmask) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); int family = nfmsg->nfgen_family; struct nft_table *table; - table = nf_tables_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]); return PTR_ERR(table); + } nft_ctx_init(ctx, net, skb, nlh, family, table, NULL, nla); return 0; @@ -3494,22 +3547,21 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(jiffies_to_msecs( - *nft_set_ext_timeout(ext))), + nf_jiffies64_to_msecs(*nft_set_ext_timeout(ext)), NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { - unsigned long expires, now = jiffies; + u64 expires, now = get_jiffies_64(); expires = *nft_set_ext_expiration(ext); - if (time_before(now, expires)) + if (time_before64(now, expires)) expires -= now; else expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)), + nf_jiffies64_to_msecs(expires), NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3780,12 +3832,12 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); @@ -3884,7 +3936,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + timeout; + get_jiffies_64() + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3971,8 +4023,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( - nla[NFTA_SET_ELEM_TIMEOUT]))); + err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT], + &timeout); + if (err) + return err; } else if (set->flags & NFT_SET_TIMEOUT) 
{ timeout = set->timeout; } @@ -3997,8 +4051,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -EINVAL; goto err2; } - obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], - set->objtype, genmask); + obj = nft_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF], + set->objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err2; @@ -4137,7 +4191,8 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; @@ -4325,12 +4380,12 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, genmask); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + genmask); if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); + set = nft_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) @@ -4418,9 +4473,9 @@ void nft_unregister_obj(struct nft_object_type *obj_type) } EXPORT_SYMBOL_GPL(nft_unregister_obj); -struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +struct nft_object *nft_obj_lookup(const struct nft_table *table, + const struct nlattr *nla, u32 objtype, + u8 genmask) { struct nft_object *obj; @@ -4432,11 +4487,11 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); +EXPORT_SYMBOL_GPL(nft_obj_lookup); -static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4580,22 +4635,25 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_DATA]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); if (IS_ERR(obj)) { err = PTR_ERR(obj); - if (err != -ENOENT) + if (err != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return err; - + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; - + } return 0; } @@ -4806,15 +4864,18 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, !nla[NFTA_OBJ_TYPE]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - obj = 
nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); - if (IS_ERR(obj)) + obj = nft_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask); + if (IS_ERR(obj)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return PTR_ERR(obj); + } skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb2) @@ -4853,6 +4914,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -4862,22 +4924,29 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family, - genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]); return PTR_ERR(table); + } objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); - if (nla[NFTA_OBJ_HANDLE]) - obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE], - objtype, genmask); - else - obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], - objtype, genmask); - if (IS_ERR(obj)) + if (nla[NFTA_OBJ_HANDLE]) { + attr = nla[NFTA_OBJ_HANDLE]; + obj = nft_obj_lookup_byhandle(table, attr, objtype, genmask); + } else { + attr = nla[NFTA_OBJ_NAME]; + obj = nft_obj_lookup(table, attr, objtype, genmask); + } + + if (IS_ERR(obj)) { + NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(obj); - if (obj->use > 0) + } + if (obj->use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -4948,9 +5017,8 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 }, }; -struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, - const struct nlattr *nla, - u8 genmask) +struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -4961,11 +5029,11 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); +EXPORT_SYMBOL_GPL(nft_flowtable_lookup); static struct nft_flowtable * -nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +nft_flowtable_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; @@ -5064,7 +5132,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, flowtable->ops[i].pf = NFPROTO_NETDEV; flowtable->ops[i].hooknum = hooknum; flowtable->ops[i].priority = priority; - flowtable->ops[i].priv = &flowtable->data.rhashtable; + flowtable->ops[i].priv = &flowtable->data; flowtable->ops[i].hook = flowtable->data.type->hook; flowtable->ops[i].dev = dev_array[i]; flowtable->dev_name[i] = kstrdup(dev_array[i]->name, @@ -5105,23 +5173,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family) return ERR_PTR(-ENOENT); } -void nft_flow_table_iterate(struct net *net, - void (*iter)(struct nf_flowtable *flowtable, void *data), - void *data) -{ - struct nft_flowtable *flowtable; - const struct nft_table *table; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &net->nft.tables, list) { - 
list_for_each_entry(flowtable, &table->flowtables, list) { - iter(&flowtable->data, data); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); -} -EXPORT_SYMBOL_GPL(nft_flow_table_iterate); - static void nft_unregister_flowtable_net_hooks(struct net *net, struct nft_flowtable *flowtable) { @@ -5155,20 +5206,26 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HOOK]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); - if (err != -ENOENT) + if (err != -ENOENT) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return err; + } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (nlh->nlmsg_flags & NLM_F_EXCL) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; + } return 0; } @@ -5195,14 +5252,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, } flowtable->data.type = type; - err = rhashtable_init(&flowtable->data.rhashtable, type->params); + err = type->init(&flowtable->data); if (err < 0) goto err3; err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], flowtable); if (err < 0) - goto err3; + goto err4; for (i = 0; i < flowtable->ops_len; i++) { if (!flowtable->ops[i].dev) @@ -5216,37 +5273,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (flowtable->ops[i].dev == ft->ops[k].dev && flowtable->ops[i].pf == ft->ops[k].pf) { err = -EBUSY; - goto err4; + goto err5; } } } err = nf_register_net_hook(net, &flowtable->ops[i]); if (err < 0) - goto err4; + goto err5; } err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable); if (err < 0) - goto err5; - - INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc); - queue_delayed_work(system_power_efficient_wq, - &flowtable->data.gc_work, HZ); + goto err6; list_add_tail_rcu(&flowtable->list, &table->flowtables); table->use++; return 0; -err5: +err6: i = flowtable->ops_len; -err4: +err5: for (k = i - 1; k >= 0; k--) { kfree(flowtable->dev_name[k]); nf_unregister_net_hook(net, &flowtable->ops[k]); } kfree(flowtable->ops); +err4: + flowtable->data.type->free(&flowtable->data); err3: module_put(type->owner); err2: @@ -5266,6 +5321,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -5274,23 +5330,29 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, !nla[NFTA_FLOWTABLE_HANDLE])) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); - if (IS_ERR(table)) + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); + if (IS_ERR(table)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]); return PTR_ERR(table); + } - if (nla[NFTA_FLOWTABLE_HANDLE]) - flowtable = nf_tables_flowtable_lookup_byhandle(table, - nla[NFTA_FLOWTABLE_HANDLE], - genmask); - else - flowtable = nf_tables_flowtable_lookup(table, - nla[NFTA_FLOWTABLE_NAME], - genmask); - if (IS_ERR(flowtable)) - return 
PTR_ERR(flowtable); - if (flowtable->use > 0) + if (nla[NFTA_FLOWTABLE_HANDLE]) { + attr = nla[NFTA_FLOWTABLE_HANDLE]; + flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); + } else { + attr = nla[NFTA_FLOWTABLE_NAME]; + flowtable = nft_flowtable_lookup(table, attr, genmask); + } + + if (IS_ERR(flowtable)) { + NL_SET_BAD_ATTR(extack, attr); + return PTR_ERR(flowtable); + } + if (flowtable->use > 0) { + NL_SET_BAD_ATTR(extack, attr); return -EBUSY; + } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -5471,13 +5533,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (!nla[NFTA_FLOWTABLE_NAME]) return -EINVAL; - table = nf_tables_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], - family, genmask); + table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family, + genmask); if (IS_ERR(table)) return PTR_ERR(table); - flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); @@ -5530,11 +5592,9 @@ err: static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) { - cancel_delayed_work_sync(&flowtable->data.gc_work); kfree(flowtable->ops); kfree(flowtable->name); flowtable->data.type->free(&flowtable->data); - rhashtable_destroy(&flowtable->data.rhashtable); module_put(flowtable->data.type->owner); } @@ -6459,8 +6519,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, case NFT_GOTO: if (!tb[NFTA_VERDICT_CHAIN]) return -EINVAL; - chain = nf_tables_chain_lookup(ctx->table, - tb[NFTA_VERDICT_CHAIN], genmask); + chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN], + genmask); if (IS_ERR(chain)) return PTR_ERR(chain); if (nft_is_base_chain(chain)) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 942702a2776f..ebb9799350ed 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -258,6 +258,9 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_payload_type, &nft_dynset_type, &nft_range_type, + &nft_meta_type, + &nft_rt_type, + &nft_exthdr_type, }; int __init nf_tables_core_module_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7b46aa4c478d..e5cc4d9b9ce7 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -37,7 +37,6 @@ #include <net/sock.h> #include <net/netfilter/nf_log.h> #include <net/netns/generic.h> -#include <net/netfilter/nfnetlink_log.h> #include <linux/atomic.h> #include <linux/refcount.h> @@ -47,6 +46,7 @@ #include "../bridge/br_private.h" #endif +#define NFULNL_COPY_DISABLED 0xff #define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE #define NFULNL_TIMEOUT_DEFAULT 100 /* every second */ #define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ @@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = { }; /* log handler for internal netfilter logging api */ -void +static void nfulnl_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, @@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net, struct nfulnl_instance *inst; const struct nf_loginfo *li; unsigned int qthreshold; - unsigned int plen; + unsigned int plen = 0; struct nfnl_log_net *log = nfnl_log_pernet(net); const struct nfnl_ct_hook *nfnl_ct = NULL; struct nf_conn *ct = NULL; @@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net, if (!inst) return; - plen = 0; if (prefix) plen = strlen(prefix) + 1; @@ -760,7 +759,6 @@ 
alloc_failure: /* FIXME: statistics */ goto unlock_and_release; } -EXPORT_SYMBOL_GPL(nfulnl_log_packet); static int nfulnl_rcv_nl_event(struct notifier_block *this, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 04863fad05dd..b07a3fd9eeea 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -36,7 +36,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, u64 timeout; void *elem; - if (set->size && !atomic_add_unless(&set->nelems, 1, set->size)) + if (!atomic_add_unless(&set->nelems, 1, set->size)) return NULL; timeout = priv->timeout ? : set->timeout; @@ -81,7 +81,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; - *nft_set_ext_expiration(ext) = jiffies + timeout; + *nft_set_ext_expiration(ext) = get_jiffies_64() + timeout; } if (sexpr != NULL) @@ -216,6 +216,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (err < 0) goto err1; + if (set->size == 0) + set->size = 0xffff; + priv->set = set; return 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 47ec1046ad11..a940c9fd9045 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -10,11 +10,10 @@ #include <asm/unaligned.h> #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/tcp.h> @@ -353,7 +352,6 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static struct nft_expr_type nft_exthdr_type; static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -407,27 +405,10 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EOPNOTSUPP); } -static struct nft_expr_type nft_exthdr_type __read_mostly = { +struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", .select_ops = nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, }; - -static int __init nft_exthdr_module_init(void) -{ - return nft_register_expr(&nft_exthdr_type); -} - -static void __exit nft_exthdr_module_exit(void) -{ - nft_unregister_expr(&nft_exthdr_type); -} - -module_init(nft_exthdr_module_init); -module_exit(nft_exthdr_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_ALIAS_NFT_EXPR("exthdr"); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b65829b2be22..d6bab8c3cbb0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -142,9 +142,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nf_tables_flowtable_lookup(ctx->table, - tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], + genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 24f2f7567ddb..e235c17f1b8b 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -97,7 +97,7 @@ static int nft_jhash_init(const struct nft_ctx *ctx, priv->len = len; priv->modulus = 
ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); - if (priv->modulus <= 1) + if (priv->modulus < 1) return -ERANGE; if (priv->offset + priv->modulus - 1 < priv->offset) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8fb91940e2e7..5348bd058c88 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -1,5 +1,7 @@ /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2014 Intel Corporation + * Author: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -9,8 +11,6 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -24,21 +24,35 @@ #include <net/tcp_states.h> /* for TCP_TIME_WAIT */ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> -#include <net/netfilter/nft_meta.h> #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +struct nft_meta { + enum nft_meta_keys key:8; + union { + enum nft_registers dreg:8; + enum nft_registers sreg:8; + }; +}; + static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); -void nft_meta_get_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +#ifdef CONFIG_NF_TABLES_BRIDGE +#include "../bridge/br_private.h" +#endif + +static void nft_meta_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = nft_in(pkt), *out = nft_out(pkt); struct sock *sk; u32 *dest = ®s->data[priv->dreg]; +#ifdef CONFIG_NF_TABLES_BRIDGE + const struct net_bridge_port *p; +#endif switch (priv->key) { case NFT_META_LEN: @@ -215,6 +229,18 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, !!skb->sp); break; #endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + if (in == NULL || (p = br_port_get_rcu(in)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; + case NFT_META_BRI_OIFNAME: + if (out == NULL || (p = br_port_get_rcu(out)) == NULL) + goto err; + strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); + return; +#endif default: WARN_ON(1); goto err; @@ -224,11 +250,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, err: regs->verdict.code = NFT_BREAK; } -EXPORT_SYMBOL_GPL(nft_meta_get_eval); -void nft_meta_set_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_meta_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { const struct nft_meta *meta = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; @@ -258,18 +283,16 @@ void nft_meta_set_eval(const struct nft_expr *expr, WARN_ON(1); } } -EXPORT_SYMBOL_GPL(nft_meta_set_eval); -const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { +static const struct nla_policy nft_meta_policy[NFTA_META_MAX + 1] = { [NFTA_META_DREG] = { .type = NLA_U32 }, [NFTA_META_KEY] = { .type = NLA_U32 }, [NFTA_META_SREG] = { .type = NLA_U32 }, }; -EXPORT_SYMBOL_GPL(nft_meta_policy); -int nft_meta_get_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const 
tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -318,6 +341,14 @@ int nft_meta_get_init(const struct nft_ctx *ctx, len = sizeof(u8); break; #endif +#ifdef CONFIG_NF_TABLES_BRIDGE + case NFT_META_BRI_IIFNAME: + case NFT_META_BRI_OIFNAME: + if (ctx->family != NFPROTO_BRIDGE) + return -EOPNOTSUPP; + len = IFNAMSIZ; + break; +#endif default: return -EOPNOTSUPP; } @@ -326,7 +357,6 @@ int nft_meta_get_init(const struct nft_ctx *ctx, return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } -EXPORT_SYMBOL_GPL(nft_meta_get_init); static int nft_meta_get_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -360,9 +390,9 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx, #endif } -int nft_meta_set_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) +static int nft_meta_set_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int hooks; @@ -388,11 +418,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, return nft_chain_validate_hooks(ctx->chain, hooks); } -EXPORT_SYMBOL_GPL(nft_meta_set_validate); -int nft_meta_set_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_meta_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { struct nft_meta *priv = nft_expr_priv(expr); unsigned int len; @@ -424,10 +453,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return 0; } -EXPORT_SYMBOL_GPL(nft_meta_set_init); -int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -440,10 +468,8 @@ int nft_meta_get_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -457,19 +483,16 @@ int nft_meta_set_dump(struct sk_buff *skb, nla_put_failure: return -1; } -EXPORT_SYMBOL_GPL(nft_meta_set_dump); -void nft_meta_set_destroy(const struct nft_ctx *ctx, - const struct nft_expr *expr) +static void nft_meta_set_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) { const struct nft_meta *priv = nft_expr_priv(expr); if (priv->key == NFT_META_NFTRACE) static_branch_dec(&nft_trace_enabled); } -EXPORT_SYMBOL_GPL(nft_meta_set_destroy); -static struct nft_expr_type nft_meta_type; static const struct nft_expr_ops nft_meta_get_ops = { .type = &nft_meta_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), @@ -508,27 +531,10 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } -static struct nft_expr_type nft_meta_type __read_mostly = { +struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; - -static int __init nft_meta_module_init(void) -{ - return nft_register_expr(&nft_meta_type); -} - -static void __exit nft_meta_module_exit(void) -{ - nft_unregister_expr(&nft_meta_type); -} - -module_init(nft_meta_module_init); -module_exit(nft_meta_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 
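/*
 * Illustrative sketch, not verbatim from the patch: why the module
 * boilerplate above and below gets deleted. meta (and, further on, rt and
 * exthdr) become built-in nf_tables expressions: the expression type is
 * declared non-static, exposed via nf_tables_core.h, and appended to the
 * nft_basic_types[] array that nf_tables_core registers at init time.
 * example_basic_types[] and example_core_init() are made-up names showing
 * only the registration loop; the real array also carries the other core
 * expressions.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

static struct nft_expr_type *example_basic_types[] = {
	&nft_meta_type,
	&nft_rt_type,
	&nft_exthdr_type,
};

static int __init example_core_init(void)
{
	int err, i;

	for (i = 0; i < ARRAY_SIZE(example_basic_types); i++) {
		err = nft_register_expr(example_basic_types[i]);
		if (err)
			goto err_unwind;
	}
	return 0;

err_unwind:
	while (--i >= 0)
		nft_unregister_expr(example_basic_types[i]);
	return err;
}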
-MODULE_ALIAS_NFT_EXPR("meta"); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 1f36954c2ba9..c15807d10b91 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -43,7 +43,7 @@ static void nft_nat_eval(const struct nft_expr *expr, const struct nft_nat *priv = nft_expr_priv(expr); enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo); - struct nf_nat_range range; + struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); if (priv->sreg_addr_min) { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 5a3a52c71545..8a64db8f2e69 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -24,13 +24,11 @@ struct nft_ng_inc { u32 modulus; atomic_t counter; u32 offset; + struct nft_set *map; }; -static void nft_ng_inc_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static u32 nft_ng_inc_gen(struct nft_ng_inc *priv) { - struct nft_ng_inc *priv = nft_expr_priv(expr); u32 nval, oval; do { @@ -38,7 +36,36 @@ static void nft_ng_inc_eval(const struct nft_expr *expr, nval = (oval + 1 < priv->modulus) ? oval + 1 : 0; } while (atomic_cmpxchg(&priv->counter, oval, nval) != oval); - regs->data[priv->dreg] = nval + priv->offset; + return nval + priv->offset; +} + +static void nft_ng_inc_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + + regs->data[priv->dreg] = nft_ng_inc_gen(priv); +} + +static void nft_ng_inc_map_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + const struct nft_set *map = priv->map; + const struct nft_set_ext *ext; + u32 result; + bool found; + + result = nft_ng_inc_gen(priv); + found = map->ops->lookup(nft_net(pkt), map, &result, &ext); + + if (!found) + return; + + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), map->dlen); } static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { @@ -46,6 +73,9 @@ static const struct nla_policy nft_ng_policy[NFTA_NG_MAX + 1] = { [NFTA_NG_MODULUS] = { .type = NLA_U32 }, [NFTA_NG_TYPE] = { .type = NLA_U32 }, [NFTA_NG_OFFSET] = { .type = NLA_U32 }, + [NFTA_NG_SET_NAME] = { .type = NLA_STRING, + .len = NFT_SET_MAXNAMELEN - 1 }, + [NFTA_NG_SET_ID] = { .type = NLA_U32 }, }; static int nft_ng_inc_init(const struct nft_ctx *ctx, @@ -71,6 +101,25 @@ static int nft_ng_inc_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_ng_inc_map_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_ng_inc *priv = nft_expr_priv(expr); + u8 genmask = nft_genmask_next(ctx->net); + + nft_ng_inc_init(ctx, expr, tb); + + priv->map = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_NG_SET_NAME], + tb[NFTA_NG_SET_ID], genmask); + + if (IS_ERR(priv->map)) + return PTR_ERR(priv->map); + + return 0; +} + static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, u32 modulus, enum nft_ng_types type, u32 offset) { @@ -97,6 +146,22 @@ static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) priv->offset); } +static int nft_ng_inc_map_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_ng_inc *priv = nft_expr_priv(expr); + + if (nft_ng_dump(skb, priv->dreg, priv->modulus, + NFT_NG_INCREMENTAL, priv->offset) || + nla_put_string(skb, NFTA_NG_SET_NAME, priv->map->name)) + goto 
nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + struct nft_ng_random { enum nft_registers dreg:8; u32 modulus; @@ -156,6 +221,14 @@ static const struct nft_expr_ops nft_ng_inc_ops = { .dump = nft_ng_inc_dump, }; +static const struct nft_expr_ops nft_ng_inc_map_ops = { + .type = &nft_ng_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_inc)), + .eval = nft_ng_inc_map_eval, + .init = nft_ng_inc_map_init, + .dump = nft_ng_inc_map_dump, +}; + static const struct nft_expr_ops nft_ng_random_ops = { .type = &nft_ng_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ng_random)), @@ -178,6 +251,8 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) switch (type) { case NFT_NG_INCREMENTAL: + if (tb[NFTA_NG_SET_NAME]) + return &nft_ng_inc_map_ops; return &nft_ng_inc_ops; case NFT_NG_RANDOM: return &nft_ng_random_ops; diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 0b02407773ad..cdf348f751ec 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -38,8 +38,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return -EINVAL; objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE])); - obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, - genmask); + obj = nft_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], objtype, + genmask); if (IS_ERR(obj)) return -ENOENT; diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 11a2071b6dd4..76dba9f6b6f6 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -7,8 +7,6 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> @@ -179,7 +177,6 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp return nft_chain_validate_hooks(ctx->chain, hooks); } -static struct nft_expr_type nft_rt_type; static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_rt)), @@ -189,27 +186,10 @@ static const struct nft_expr_ops nft_rt_get_ops = { .validate = nft_rt_validate, }; -static struct nft_expr_type nft_rt_type __read_mostly = { +struct nft_expr_type nft_rt_type __read_mostly = { .name = "rt", .ops = &nft_rt_get_ops, .policy = nft_rt_policy, .maxattr = NFTA_RT_MAX, .owner = THIS_MODULE, }; - -static int __init nft_rt_module_init(void) -{ - return nft_register_expr(&nft_rt_type); -} - -static void __exit nft_rt_module_exit(void) -{ - nft_unregister_expr(&nft_rt_type); -} - -module_init(nft_rt_module_init); -module_exit(nft_rt_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Anders K. 
Pedersen <akp@cohaesio.com>"); -MODULE_ALIAS_NFT_EXPR("rt"); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 45fb2752fb63..d6626e01c7ee 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,27 +296,23 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type; -static struct nft_set_ops nft_bitmap_ops __read_mostly = { - .type = &nft_bitmap_type, - .privsize = nft_bitmap_privsize, - .elemsize = offsetof(struct nft_bitmap_elem, ext), - .estimate = nft_bitmap_estimate, - .init = nft_bitmap_init, - .destroy = nft_bitmap_destroy, - .insert = nft_bitmap_insert, - .remove = nft_bitmap_remove, - .deactivate = nft_bitmap_deactivate, - .flush = nft_bitmap_flush, - .activate = nft_bitmap_activate, - .lookup = nft_bitmap_lookup, - .walk = nft_bitmap_walk, - .get = nft_bitmap_get, -}; - static struct nft_set_type nft_bitmap_type __read_mostly = { - .ops = &nft_bitmap_ops, .owner = THIS_MODULE, + .ops = { + .privsize = nft_bitmap_privsize, + .elemsize = offsetof(struct nft_bitmap_elem, ext), + .estimate = nft_bitmap_estimate, + .init = nft_bitmap_init, + .destroy = nft_bitmap_destroy, + .insert = nft_bitmap_insert, + .remove = nft_bitmap_remove, + .deactivate = nft_bitmap_deactivate, + .flush = nft_bitmap_flush, + .activate = nft_bitmap_activate, + .lookup = nft_bitmap_lookup, + .walk = nft_bitmap_walk, + .get = nft_bitmap_get, + }, }; static int __init nft_bitmap_module_init(void) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index fc9c6d5d64cd..dbf1f4ad077c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -605,6 +605,12 @@ static void nft_hash_destroy(const struct nft_set *set) static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { + if (!desc->size) + return false; + + if (desc->klen == 4) + return false; + est->size = sizeof(struct nft_hash) + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + desc->size * sizeof(struct nft_hash_elem); @@ -614,91 +620,100 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_hash_type; -static struct nft_set_ops nft_rhash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_rhash_privsize, - .elemsize = offsetof(struct nft_rhash_elem, ext), - .estimate = nft_rhash_estimate, - .init = nft_rhash_init, - .destroy = nft_rhash_destroy, - .insert = nft_rhash_insert, - .activate = nft_rhash_activate, - .deactivate = nft_rhash_deactivate, - .flush = nft_rhash_flush, - .remove = nft_rhash_remove, - .lookup = nft_rhash_lookup, - .update = nft_rhash_update, - .walk = nft_rhash_walk, - .get = nft_rhash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, -}; +static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + if (!desc->size) + return false; -static struct nft_set_ops nft_hash_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = 
NFT_SET_MAP | NFT_SET_OBJECT, -}; + if (desc->klen != 4) + return false; -static struct nft_set_ops nft_hash_fast_ops __read_mostly = { - .type = &nft_hash_type, - .privsize = nft_hash_privsize, - .elemsize = offsetof(struct nft_hash_elem, ext), - .estimate = nft_hash_estimate, - .init = nft_hash_init, - .destroy = nft_hash_destroy, - .insert = nft_hash_insert, - .activate = nft_hash_activate, - .deactivate = nft_hash_deactivate, - .flush = nft_hash_flush, - .remove = nft_hash_remove, - .lookup = nft_hash_lookup_fast, - .walk = nft_hash_walk, - .get = nft_hash_get, - .features = NFT_SET_MAP | NFT_SET_OBJECT, -}; - -static const struct nft_set_ops * -nft_hash_select_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, - u32 flags) -{ - if (desc->size && !(flags & (NFT_SET_EVAL | NFT_SET_TIMEOUT))) { - switch (desc->klen) { - case 4: - return &nft_hash_fast_ops; - default: - return &nft_hash_ops; - } - } + est->size = sizeof(struct nft_hash) + + nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + desc->size * sizeof(struct nft_hash_elem); + est->lookup = NFT_SET_CLASS_O_1; + est->space = NFT_SET_CLASS_O_N; - return &nft_rhash_ops; + return true; } +static struct nft_set_type nft_rhash_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT | + NFT_SET_TIMEOUT | NFT_SET_EVAL, + .ops = { + .privsize = nft_rhash_privsize, + .elemsize = offsetof(struct nft_rhash_elem, ext), + .estimate = nft_rhash_estimate, + .init = nft_rhash_init, + .destroy = nft_rhash_destroy, + .insert = nft_rhash_insert, + .activate = nft_rhash_activate, + .deactivate = nft_rhash_deactivate, + .flush = nft_rhash_flush, + .remove = nft_rhash_remove, + .lookup = nft_rhash_lookup, + .update = nft_rhash_update, + .walk = nft_rhash_walk, + .get = nft_rhash_get, + }, +}; + static struct nft_set_type nft_hash_type __read_mostly = { - .select_ops = nft_hash_select_ops, .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, +}; + +static struct nft_set_type nft_hash_fast_type __read_mostly = { + .owner = THIS_MODULE, + .features = NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_hash_privsize, + .elemsize = offsetof(struct nft_hash_elem, ext), + .estimate = nft_hash_fast_estimate, + .init = nft_hash_init, + .destroy = nft_hash_destroy, + .insert = nft_hash_insert, + .activate = nft_hash_activate, + .deactivate = nft_hash_deactivate, + .flush = nft_hash_flush, + .remove = nft_hash_remove, + .lookup = nft_hash_lookup_fast, + .walk = nft_hash_walk, + .get = nft_hash_get, + }, }; static int __init nft_hash_module_init(void) { - return nft_register_set(&nft_hash_type); + if (nft_register_set(&nft_hash_fast_type) || + nft_register_set(&nft_hash_type) || + nft_register_set(&nft_rhash_type)) + return 1; + return 0; } static void __exit nft_hash_module_exit(void) { + nft_unregister_set(&nft_rhash_type); nft_unregister_set(&nft_hash_type); + nft_unregister_set(&nft_hash_fast_type); } module_init(nft_hash_module_init); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index e6f08bc5f359..22c57d7612c4 100644 --- 
a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -393,28 +393,24 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type; -static struct nft_set_ops nft_rbtree_ops __read_mostly = { - .type = &nft_rbtree_type, - .privsize = nft_rbtree_privsize, - .elemsize = offsetof(struct nft_rbtree_elem, ext), - .estimate = nft_rbtree_estimate, - .init = nft_rbtree_init, - .destroy = nft_rbtree_destroy, - .insert = nft_rbtree_insert, - .remove = nft_rbtree_remove, - .deactivate = nft_rbtree_deactivate, - .flush = nft_rbtree_flush, - .activate = nft_rbtree_activate, - .lookup = nft_rbtree_lookup, - .walk = nft_rbtree_walk, - .get = nft_rbtree_get, - .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, -}; - static struct nft_set_type nft_rbtree_type __read_mostly = { - .ops = &nft_rbtree_ops, .owner = THIS_MODULE, + .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT, + .ops = { + .privsize = nft_rbtree_privsize, + .elemsize = offsetof(struct nft_rbtree_elem, ext), + .estimate = nft_rbtree_estimate, + .init = nft_rbtree_init, + .destroy = nft_rbtree_destroy, + .insert = nft_rbtree_insert, + .remove = nft_rbtree_remove, + .deactivate = nft_rbtree_deactivate, + .flush = nft_rbtree_flush, + .activate = nft_rbtree_activate, + .lookup = nft_rbtree_lookup, + .walk = nft_rbtree_walk, + .get = nft_rbtree_get, + }, }; static int __init nft_rbtree_module_init(void) diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c index 58aa9dd3c5b7..1d437875e15a 100644 --- a/net/netfilter/xt_NETMAP.c +++ b/net/netfilter/xt_NETMAP.c @@ -21,8 +21,8 @@ static unsigned int netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; - struct nf_nat_range newrange; + const struct nf_nat_range2 *range = par->targinfo; + struct nf_nat_range2 newrange; struct nf_conn *ct; enum ip_conntrack_info ctinfo; union nf_inet_addr new_addr, netmask; @@ -56,7 +56,7 @@ netmap_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int netmap_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return -EINVAL; @@ -75,7 +75,7 @@ netmap_tg4(struct sk_buff *skb, const struct xt_action_param *par) enum ip_conntrack_info ctinfo; __be32 new_ip, netmask; const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range newrange; + struct nf_nat_range2 newrange; WARN_ON(xt_hooknum(par) != NF_INET_PRE_ROUTING && xt_hooknum(par) != NF_INET_POST_ROUTING && diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index c7f8958cea4a..1ed0cac585c4 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -13,7 +13,6 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> #include <net/netfilter/nf_log.h> -#include <net/netfilter/nfnetlink_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); @@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; - nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb, - xt_in(par), xt_out(par), &li, info->prefix); + nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), + xt_out(par), &li, "%s", info->prefix); + 
return XT_CONTINUE; } @@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return 0; + + return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); +} + +static void nflog_tg_destroy(const struct xt_tgdtor_param *par) +{ + nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg __read_mostly = { @@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = { .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = nflog_tg_check, + .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c index 98a4c6d4f1cb..5ce9461e979c 100644 --- a/net/netfilter/xt_REDIRECT.c +++ b/net/netfilter/xt_REDIRECT.c @@ -36,7 +36,7 @@ redirect_tg6(struct sk_buff *skb, const struct xt_action_param *par) static int redirect_tg6_checkentry(const struct xt_tgchk_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; diff --git a/net/netfilter/xt_nat.c b/net/netfilter/xt_nat.c index bdb689cdc829..8af9707f8789 100644 --- a/net/netfilter/xt_nat.c +++ b/net/netfilter/xt_nat.c @@ -37,11 +37,12 @@ static void xt_nat_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static void xt_nat_convert_range(struct nf_nat_range *dst, +static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); + memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; @@ -54,7 +55,7 @@ static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -71,7 +72,7 @@ static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; - struct nf_nat_range range; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -86,7 +87,8 @@ xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -95,13 +97,49 @@ xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); - return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { - const struct nf_nat_range *range = par->targinfo; + const struct nf_nat_range *range_v1 = par->targinfo; + struct nf_nat_range2 range; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && 
+ (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); + + memcpy(&range, range_v1, sizeof(*range_v1)); + memset(&range.base_proto, 0, sizeof(range.base_proto)); + + return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); +} + +static unsigned int +xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + WARN_ON(!(ct != NULL && + (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED_REPLY))); + + return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); +} + +static unsigned int +xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -163,6 +201,28 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = { (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, + { + .name = "SNAT", + .revision = 2, + .checkentry = xt_nat_checkentry, + .destroy = xt_nat_destroy, + .target = xt_snat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, + { + .name = "DNAT", + .revision = 2, + .target = xt_dnat_target_v2, + .targetsize = sizeof(struct nf_nat_range2), + .table = "nat", + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT), + .me = THIS_MODULE, + }, }; static int __init xt_nat_init(void) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index a34f314a8c23..9cfef73b4107 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -37,21 +37,6 @@ #include <net/netfilter/nf_log.h> #include <linux/netfilter/xt_osf.h> -struct xt_osf_finger { - struct rcu_head rcu_head; - struct list_head finger_entry; - struct xt_osf_user_finger finger; -}; - -enum osf_fmatch_states { - /* Packet does not match the fingerprint */ - FMATCH_WRONG = 0, - /* Packet matches the fingerprint */ - FMATCH_OK, - /* Options do not match the fingerprint, but header does */ - FMATCH_OPT_WRONG, -}; - /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. 
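/*
 * Illustrative sketch, not lifted from the patch: the recurring conversion
 * pattern in the xt_nat/xt_NETMAP/xt_REDIRECT hunks above. struct
 * nf_nat_range2 extends the old struct nf_nat_range with a trailing
 * base_proto member, so revision 0/1 targets copy the old layout into the
 * new one and explicitly zero the tail before calling nf_nat_setup_info().
 * widen_range() is a made-up helper name for the sketch.
 */
#include <linux/string.h>
#include <linux/netfilter/nf_nat.h>

static void widen_range(struct nf_nat_range2 *dst,
			const struct nf_nat_range *src)
{
	/* the v1 fields are a prefix of the v2 layout */
	memcpy(dst, src, sizeof(*src));
	/* the member that did not exist in v1 must not stay uninitialized */
	memset(&dst->base_proto, 0, sizeof(dst->base_proto));
}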
@@ -164,200 +149,17 @@ static const struct nfnetlink_subsystem xt_osf_nfnetlink = { .cb = xt_osf_nfnetlink_callbacks, }; -static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, - unsigned char f_ttl) -{ - const struct iphdr *ip = ip_hdr(skb); - - if (info->flags & XT_OSF_TTL) { - if (info->ttl == XT_OSF_TTL_TRUE) - return ip->ttl == f_ttl; - if (info->ttl == XT_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) - return 1; - else { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); - int ret = 0; - - for_ifa(in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } - } - endfor_ifa(in_dev); - - return ret; - } - } - - return ip->ttl == f_ttl; -} - static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { const struct xt_osf_info *info = p->matchinfo; - const struct iphdr *ip = ip_hdr(skb); - const struct tcphdr *tcp; - struct tcphdr _tcph; - int fmatch = FMATCH_WRONG, fcount = 0; - unsigned int optsize = 0, check_WSS = 0; - u16 window, totlen, mss = 0; - bool df; - const unsigned char *optp = NULL, *_optp = NULL; - unsigned char opts[MAX_IPOPTLEN]; - const struct xt_osf_finger *kf; - const struct xt_osf_user_finger *f; struct net *net = xt_net(p); if (!info) return false; - tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); - if (!tcp) - return false; - - if (!tcp->syn) - return false; - - totlen = ntohs(ip->tot_len); - df = ntohs(ip->frag_off) & IP_DF; - window = ntohs(tcp->window); - - if (tcp->doff * 4 > sizeof(struct tcphdr)) { - optsize = tcp->doff * 4 - sizeof(struct tcphdr); - - _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + - sizeof(struct tcphdr), optsize, opts); - } - - list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { - int foptsize, optnum; - - f = &kf->finger; - - if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) - continue; - - optp = _optp; - fmatch = FMATCH_WRONG; - - if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl)) - continue; - - /* - * Should not happen if userspace parser was written correctly. - */ - if (f->wss.wc >= OSF_WSS_MAX) - continue; - - /* Check options */ - - foptsize = 0; - for (optnum = 0; optnum < f->opt_num; ++optnum) - foptsize += f->opt[optnum].length; - - if (foptsize > MAX_IPOPTLEN || - optsize > MAX_IPOPTLEN || - optsize != foptsize) - continue; - - check_WSS = f->wss.wc; - - for (optnum = 0; optnum < f->opt_num; ++optnum) { - if (f->opt[optnum].kind == (*optp)) { - __u32 len = f->opt[optnum].length; - const __u8 *optend = optp + len; - - fmatch = FMATCH_OK; - - switch (*optp) { - case OSFOPT_MSS: - mss = optp[3]; - mss <<= 8; - mss |= optp[2]; - - mss = ntohs((__force __be16)mss); - break; - case OSFOPT_TS: - break; - } - - optp = optend; - } else - fmatch = FMATCH_OPT_WRONG; - - if (fmatch != FMATCH_OK) - break; - } - - if (fmatch != FMATCH_OPT_WRONG) { - fmatch = FMATCH_WRONG; - - switch (check_WSS) { - case OSF_WSS_PLAIN: - if (f->wss.val == 0 || window == f->wss.val) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MSS: - /* - * Some smart modems decrease mangle MSS to - * SMART_MSS_2, so we check standard, decreased - * and the one provided in the fingerprint MSS - * values. 
- */ -#define SMART_MSS_1 1460 -#define SMART_MSS_2 1448 - if (window == f->wss.val * mss || - window == f->wss.val * SMART_MSS_1 || - window == f->wss.val * SMART_MSS_2) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MTU: - if (window == f->wss.val * (mss + 40) || - window == f->wss.val * (SMART_MSS_1 + 40) || - window == f->wss.val * (SMART_MSS_2 + 40)) - fmatch = FMATCH_OK; - break; - case OSF_WSS_MODULO: - if ((window % f->wss.val) == 0) - fmatch = FMATCH_OK; - break; - } - } - - if (fmatch != FMATCH_OK) - continue; - - fcount++; - - if (info->flags & XT_OSF_LOG) - nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, - xt_in(p), xt_out(p), NULL, - "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", - f->genre, f->version, f->subtype, - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest), - f->ttl - ip->ttl); - - if ((info->flags & XT_OSF_LOG) && - info->loglevel == XT_OSF_LOGLEVEL_FIRST) - break; - } - - if (!fcount && (info->flags & XT_OSF_LOG)) - nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p), - xt_out(p), NULL, - "Remote OS is not known: %pI4:%u -> %pI4:%u\n", - &ip->saddr, ntohs(tcp->source), - &ip->daddr, ntohs(tcp->dest)); - - if (fcount) - fmatch = FMATCH_OK; - - return fmatch == FMATCH_OK; + return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), + xt_out(p), info, net, xt_osf_fingers); } static struct xt_match xt_osf_match = { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c5904f629091..02fc343feb66 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -72,7 +72,7 @@ struct ovs_conntrack_info { struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED - struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ + struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. 
*/ #endif }; @@ -710,7 +710,7 @@ static bool skb_nfct_cached(struct net *net, */ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct nf_nat_range *range, + const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { int hooknum, nh_off, err = NF_ACCEPT; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e9422fe45179..2cc98c763003 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); +static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { @@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { - struct net_device *dev = skb->dev; - struct sk_buff *orig_skb = skb; - struct netdev_queue *txq; - int ret = NETDEV_TX_BUSY; - bool again = false; - - if (unlikely(!netif_running(dev) || - !netif_carrier_ok(dev))) - goto drop; - - skb = validate_xmit_skb_list(skb, dev, &again); - if (skb != orig_skb) - goto drop; - - packet_pick_tx_queue(dev, skb); - txq = skb_get_tx_queue(dev, skb); - - local_bh_disable(); - - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq, false); - HARD_TX_UNLOCK(dev, txq); - - local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - - return ret; -drop: - atomic_long_inc(&dev->tx_dropped); - kfree_skb_list(skb); - return NET_XMIT_DROP; + return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) @@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; } -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 packet_pick_tx_queue(struct sk_buff *skb) { + struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; u16 queue_index; @@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) queue_index = __packet_pick_tx_queue(dev, skb); } - skb_set_queue_mapping(skb, queue_index); + return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 326fd97444f5..1944834d225c 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -21,4 +21,11 @@ config QRTR_SMD Say Y here to support SMD based ipcrouter channels. SMD is the most common transport for IPC Router. +config QRTR_TUN + tristate "TUN device for Qualcomm IPC Router" + ---help--- + Say Y here to expose a character device that allows user space to + implement endpoints of QRTR, for purpose of tunneling data to other + hosts or testing purposes. 
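/*
 * Illustrative user-space sketch, not part of the patch: how the qrtr-tun
 * character device added below is intended to be used. misc_register()
 * exposes /dev/qrtr-tun; opening it registers the process as a QRTR
 * endpoint, read() pulls router packets queued by qrtr_tun_send(), and
 * write() feeds packets back in via qrtr_endpoint_post(). Error handling is
 * abbreviated and the device path assumes the usual devtmpfs naming.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	fd = open("/dev/qrtr-tun", O_RDWR);
	if (fd < 0) {
		perror("open /dev/qrtr-tun");
		return 1;
	}

	/* pull one packet from the local IPC router */
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		printf("received %zd byte QRTR packet\n", n);

	/* a write() here would inject a packet coming from the remote side:
	 * write(fd, buf, n);
	 */

	close(fd);
	return 0;
}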
+ endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index ab09e40f7c74..be012bfd3e52 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_QRTR) := qrtr.o obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o +obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o +qrtr-tun-y := tun.o diff --git a/net/qrtr/tun.c b/net/qrtr/tun.c new file mode 100644 index 000000000000..ccff1e544c21 --- /dev/null +++ b/net/qrtr/tun.c @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018, Linaro Ltd */ + +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/poll.h> +#include <linux/skbuff.h> +#include <linux/uaccess.h> + +#include "qrtr.h" + +struct qrtr_tun { + struct qrtr_endpoint ep; + + struct sk_buff_head queue; + wait_queue_head_t readq; +}; + +static int qrtr_tun_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_tun *tun = container_of(ep, struct qrtr_tun, ep); + + skb_queue_tail(&tun->queue, skb); + + /* wake up any blocking processes, waiting for new data */ + wake_up_interruptible(&tun->readq); + + return 0; +} + +static int qrtr_tun_open(struct inode *inode, struct file *filp) +{ + struct qrtr_tun *tun; + + tun = kzalloc(sizeof(*tun), GFP_KERNEL); + if (!tun) + return -ENOMEM; + + skb_queue_head_init(&tun->queue); + init_waitqueue_head(&tun->readq); + + tun->ep.xmit = qrtr_tun_send; + + filp->private_data = tun; + + return qrtr_endpoint_register(&tun->ep, QRTR_EP_NID_AUTO); +} + +static ssize_t qrtr_tun_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *filp = iocb->ki_filp; + struct qrtr_tun *tun = filp->private_data; + struct sk_buff *skb; + int count; + + while (!(skb = skb_dequeue(&tun->queue))) { + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + /* Wait until we get data or the endpoint goes away */ + if (wait_event_interruptible(tun->readq, + !skb_queue_empty(&tun->queue))) + return -ERESTARTSYS; + } + + count = min_t(size_t, iov_iter_count(to), skb->len); + if (copy_to_iter(skb->data, count, to) != count) + count = -EFAULT; + + kfree_skb(skb); + + return count; +} + +static ssize_t qrtr_tun_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *filp = iocb->ki_filp; + struct qrtr_tun *tun = filp->private_data; + size_t len = iov_iter_count(from); + ssize_t ret; + void *kbuf; + + kbuf = kzalloc(len, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + if (!copy_from_iter_full(kbuf, len, from)) + return -EFAULT; + + ret = qrtr_endpoint_post(&tun->ep, kbuf, len); + + return ret < 0 ? 
ret : len; +} + +static __poll_t qrtr_tun_poll(struct file *filp, poll_table *wait) +{ + struct qrtr_tun *tun = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &tun->readq, wait); + + if (!skb_queue_empty(&tun->queue)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int qrtr_tun_release(struct inode *inode, struct file *filp) +{ + struct qrtr_tun *tun = filp->private_data; + struct sk_buff *skb; + + qrtr_endpoint_unregister(&tun->ep); + + /* Discard all SKBs */ + while (!skb_queue_empty(&tun->queue)) { + skb = skb_dequeue(&tun->queue); + kfree_skb(skb); + } + + kfree(tun); + + return 0; +} + +static const struct file_operations qrtr_tun_ops = { + .owner = THIS_MODULE, + .open = qrtr_tun_open, + .poll = qrtr_tun_poll, + .read_iter = qrtr_tun_read_iter, + .write_iter = qrtr_tun_write_iter, + .release = qrtr_tun_release, +}; + +static struct miscdevice qrtr_tun_miscdev = { + MISC_DYNAMIC_MINOR, + "qrtr-tun", + &qrtr_tun_ops, +}; + +static int __init qrtr_tun_init(void) +{ + int ret; + + ret = misc_register(&qrtr_tun_miscdev); + if (ret) + pr_err("failed to register Qualcomm IPC Router tun device\n"); + + return ret; +} + +static void __exit qrtr_tun_exit(void) +{ + misc_deregister(&qrtr_tun_miscdev); +} + +module_init(qrtr_tun_init); +module_exit(qrtr_tun_exit); + +MODULE_DESCRIPTION("Qualcomm IPC Router TUN device"); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 7e28b2ce1437..526a8e491626 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -648,6 +648,11 @@ static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static size_t tcf_csum_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_csum)); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -658,6 +663,7 @@ static struct tc_action_ops act_csum_ops = { .cleanup = tcf_csum_cleanup, .walk = tcf_csum_walker, .lookup = tcf_csum_search, + .get_fill_size = tcf_csum_get_fill_size, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d964e60c730e..eacaaf803914 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -61,16 +61,18 @@ struct fl_flow_mask_range { struct fl_flow_mask { struct fl_flow_key key; struct fl_flow_mask_range range; - struct rcu_head rcu; + struct rhash_head ht_node; + struct rhashtable ht; + struct rhashtable_params filter_ht_params; + struct flow_dissector dissector; + struct list_head filters; + struct rcu_head rcu; + struct list_head list; }; struct cls_fl_head { struct rhashtable ht; - struct fl_flow_mask mask; - struct flow_dissector dissector; - bool mask_assigned; - struct list_head filters; - struct rhashtable_params ht_params; + struct list_head masks; union { struct work_struct work; struct rcu_head rcu; @@ -79,6 +81,7 @@ struct cls_fl_head { }; struct cls_fl_filter { + struct fl_flow_mask *mask; struct rhash_head ht_node; struct fl_flow_key mkey; struct tcf_exts exts; @@ -94,6 +97,13 @@ struct cls_fl_filter { struct net_device *hw_dev; }; +static const struct rhashtable_params mask_ht_params = { + .key_offset = offsetof(struct fl_flow_mask, key), + .key_len = sizeof(struct fl_flow_key), + .head_offset = offsetof(struct fl_flow_mask, ht_node), + .automatic_shrinking = true, +}; + static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) { return mask->range.end - mask->range.start; @@ -103,13 +113,19 @@ static void 
fl_mask_update_range(struct fl_flow_mask *mask) { const u8 *bytes = (const u8 *) &mask->key; size_t size = sizeof(mask->key); - size_t i, first = 0, last = size - 1; + size_t i, first = 0, last; - for (i = 0; i < sizeof(mask->key); i++) { + for (i = 0; i < size; i++) { + if (bytes[i]) { + first = i; + break; + } + } + last = first; + for (i = size - 1; i != first; i--) { if (bytes[i]) { - if (!first && i) - first = i; last = i; + break; } } mask->range.start = rounddown(first, sizeof(long)); @@ -140,12 +156,11 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } -static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head, +static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, struct fl_flow_key *mkey) { - return rhashtable_lookup_fast(&head->ht, - fl_key_get_start(mkey, &head->mask), - head->ht_params); + return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask), + mask->filter_ht_params); } static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, @@ -153,28 +168,28 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_fl_head *head = rcu_dereference_bh(tp->root); struct cls_fl_filter *f; + struct fl_flow_mask *mask; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - if (!atomic_read(&head->ht.nelems)) - return -1; - - fl_clear_masked_range(&skb_key, &head->mask); + list_for_each_entry_rcu(mask, &head->masks, list) { + fl_clear_masked_range(&skb_key, mask); - skb_key.indev_ifindex = skb->skb_iif; - /* skb_flow_dissect() does not set n_proto in case an unknown protocol, - * so do it rather here. - */ - skb_key.basic.n_proto = skb->protocol; - skb_flow_dissect_tunnel_info(skb, &head->dissector, &skb_key); - skb_flow_dissect(skb, &head->dissector, &skb_key, 0); + skb_key.indev_ifindex = skb->skb_iif; + /* skb_flow_dissect() does not set n_proto in case an unknown + * protocol, so do it rather here. 
+ */ + skb_key.basic.n_proto = skb->protocol; + skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); + skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); + fl_set_masked_key(&skb_mkey, &skb_key, mask); - f = fl_lookup(head, &skb_mkey); - if (f && !tc_skip_sw(f->flags)) { - *res = f->res; - return tcf_exts_exec(skb, &f->exts, res); + f = fl_lookup(mask, &skb_mkey); + if (f && !tc_skip_sw(f->flags)) { + *res = f->res; + return tcf_exts_exec(skb, &f->exts, res); + } } return -1; } @@ -187,11 +202,28 @@ static int fl_init(struct tcf_proto *tp) if (!head) return -ENOBUFS; - INIT_LIST_HEAD_RCU(&head->filters); + INIT_LIST_HEAD_RCU(&head->masks); rcu_assign_pointer(tp->root, head); idr_init(&head->handle_idr); - return 0; + return rhashtable_init(&head->ht, &mask_ht_params); +} + +static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask, + bool async) +{ + if (!list_empty(&mask->filters)) + return false; + + rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params); + rhashtable_destroy(&mask->ht); + list_del_rcu(&mask->list); + if (async) + kfree_rcu(mask, rcu); + else + kfree(mask); + + return true; } static void __fl_destroy_filter(struct cls_fl_filter *f) @@ -234,8 +266,6 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f, } static int fl_hw_replace_filter(struct tcf_proto *tp, - struct flow_dissector *dissector, - struct fl_flow_key *mask, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { @@ -247,8 +277,8 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; - cls_flower.dissector = dissector; - cls_flower.mask = mask; + cls_flower.dissector = &f->mask->dissector; + cls_flower.mask = &f->mask->key; cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; @@ -283,28 +313,31 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) &cls_flower, false); } -static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, +static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); + bool async = tcf_exts_get_net(&f->exts); + bool last; idr_remove(&head->handle_idr, f->handle); list_del_rcu(&f->list); + last = fl_mask_put(head, f->mask, async); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f, extack); tcf_unbind_filter(tp, &f->res); - if (tcf_exts_get_net(&f->exts)) + if (async) call_rcu(&f->rcu, fl_destroy_filter); else __fl_destroy_filter(f); + + return last; } static void fl_destroy_sleepable(struct work_struct *work) { struct cls_fl_head *head = container_of(work, struct cls_fl_head, work); - if (head->mask_assigned) - rhashtable_destroy(&head->ht); kfree(head); module_put(THIS_MODULE); } @@ -320,10 +353,15 @@ static void fl_destroy_rcu(struct rcu_head *rcu) static void fl_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct cls_fl_head *head = rtnl_dereference(tp->root); + struct fl_flow_mask *mask, *next_mask; struct cls_fl_filter *f, *next; - list_for_each_entry_safe(f, next, &head->filters, list) - __fl_delete(tp, f, extack); + list_for_each_entry_safe(mask, next_mask, &head->masks, list) { + list_for_each_entry_safe(f, next, &mask->filters, list) { + if (__fl_delete(tp, f, extack)) + break; + } + } 
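/*
 * Illustrative sketch, not the kernel code: the idea behind the
 * fl_mask_update_range() rewrite above. One forward scan finds the first
 * non-zero mask byte, one backward scan finds the last, and both are rounded
 * out to long-word boundaries so the per-mask rhashtable can hash an aligned
 * slice of the key. find_masked_range() is a made-up name and size is
 * assumed non-zero; the real function stores the result in mask->range and
 * uses the kernel rounddown()/roundup() helpers.
 */
#include <stddef.h>

static void find_masked_range(const unsigned char *mask, size_t size,
			      size_t *start, size_t *end)
{
	size_t i, first = 0, last;

	for (i = 0; i < size; i++) {
		if (mask[i]) {
			first = i;
			break;
		}
	}
	last = first;
	for (i = size - 1; i != first; i--) {
		if (mask[i]) {
			last = i;
			break;
		}
	}

	/* round out so the hashed window starts and ends on long boundaries */
	*start = first - (first % sizeof(long));
	*end = (last - (last % sizeof(long))) + sizeof(long);
}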
idr_destroy(&head->handle_idr); __module_get(THIS_MODULE); @@ -715,14 +753,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, return ret; } -static bool fl_mask_eq(struct fl_flow_mask *mask1, - struct fl_flow_mask *mask2) +static void fl_mask_copy(struct fl_flow_mask *dst, + struct fl_flow_mask *src) { - const long *lmask1 = fl_key_get_start(&mask1->key, mask1); - const long *lmask2 = fl_key_get_start(&mask2->key, mask2); + const void *psrc = fl_key_get_start(&src->key, src); + void *pdst = fl_key_get_start(&dst->key, src); - return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && - !memcmp(lmask1, lmask2, fl_mask_range(mask1)); + memcpy(pdst, psrc, fl_mask_range(src)); + dst->range = src->range; } static const struct rhashtable_params fl_ht_params = { @@ -731,14 +769,13 @@ static const struct rhashtable_params fl_ht_params = { .automatic_shrinking = true, }; -static int fl_init_hashtable(struct cls_fl_head *head, - struct fl_flow_mask *mask) +static int fl_init_mask_hashtable(struct fl_flow_mask *mask) { - head->ht_params = fl_ht_params; - head->ht_params.key_len = fl_mask_range(mask); - head->ht_params.key_offset += mask->range.start; + mask->filter_ht_params = fl_ht_params; + mask->filter_ht_params.key_len = fl_mask_range(mask); + mask->filter_ht_params.key_offset += mask->range.start; - return rhashtable_init(&head->ht, &head->ht_params); + return rhashtable_init(&mask->ht, &mask->filter_ht_params); } #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) @@ -761,8 +798,7 @@ static int fl_init_hashtable(struct cls_fl_head *head, FL_KEY_SET(keys, cnt, id, member); \ } while(0); -static void fl_init_dissector(struct cls_fl_head *head, - struct fl_flow_mask *mask) +static void fl_init_dissector(struct fl_flow_mask *mask) { struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; @@ -802,31 +838,66 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); - skb_flow_dissector_init(&head->dissector, keys, cnt); + skb_flow_dissector_init(&mask->dissector, keys, cnt); +} + +static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, + struct fl_flow_mask *mask) +{ + struct fl_flow_mask *newmask; + int err; + + newmask = kzalloc(sizeof(*newmask), GFP_KERNEL); + if (!newmask) + return ERR_PTR(-ENOMEM); + + fl_mask_copy(newmask, mask); + + err = fl_init_mask_hashtable(newmask); + if (err) + goto errout_free; + + fl_init_dissector(newmask); + + INIT_LIST_HEAD_RCU(&newmask->filters); + + err = rhashtable_insert_fast(&head->ht, &newmask->ht_node, + mask_ht_params); + if (err) + goto errout_destroy; + + list_add_tail_rcu(&newmask->list, &head->masks); + + return newmask; + +errout_destroy: + rhashtable_destroy(&newmask->ht); +errout_free: + kfree(newmask); + + return ERR_PTR(err); } static int fl_check_assign_mask(struct cls_fl_head *head, + struct cls_fl_filter *fnew, + struct cls_fl_filter *fold, struct fl_flow_mask *mask) { - int err; + struct fl_flow_mask *newmask; - if (head->mask_assigned) { - if (!fl_mask_eq(&head->mask, mask)) + fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params); + if (!fnew->mask) { + if (fold) return -EINVAL; - else - return 0; - } - /* Mask is not assigned yet. So assign it and init hashtable - * according to that. 
- */ - err = fl_init_hashtable(head, mask); - if (err) - return err; - memcpy(&head->mask, mask, sizeof(head->mask)); - head->mask_assigned = true; + newmask = fl_create_new_mask(head, mask); + if (IS_ERR(newmask)) + return PTR_ERR(newmask); - fl_init_dissector(head, mask); + fnew->mask = newmask; + } else if (fold && fold->mask == fnew->mask) { + return -EINVAL; + } return 0; } @@ -924,30 +995,26 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout_idr; - err = fl_check_assign_mask(head, &mask); + err = fl_check_assign_mask(head, fnew, fold, &mask); if (err) goto errout_idr; if (!tc_skip_sw(fnew->flags)) { - if (!fold && fl_lookup(head, &fnew->mkey)) { + if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) { err = -EEXIST; - goto errout_idr; + goto errout_mask; } - err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, - head->ht_params); + err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node, + fnew->mask->filter_ht_params); if (err) - goto errout_idr; + goto errout_mask; } if (!tc_skip_hw(fnew->flags)) { - err = fl_hw_replace_filter(tp, - &head->dissector, - &mask.key, - fnew, - extack); + err = fl_hw_replace_filter(tp, fnew, extack); if (err) - goto errout_idr; + goto errout_mask; } if (!tc_in_hw(fnew->flags)) @@ -955,8 +1022,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (fold) { if (!tc_skip_sw(fold->flags)) - rhashtable_remove_fast(&head->ht, &fold->ht_node, - head->ht_params); + rhashtable_remove_fast(&fold->mask->ht, + &fold->ht_node, + fold->mask->filter_ht_params); if (!tc_skip_hw(fold->flags)) fl_hw_destroy_filter(tp, fold, NULL); } @@ -970,12 +1038,15 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, fl_destroy_filter); } else { - list_add_tail_rcu(&fnew->list, &head->filters); + list_add_tail_rcu(&fnew->list, &fnew->mask->filters); } kfree(tb); return 0; +errout_mask: + fl_mask_put(head, fnew->mask, false); + errout_idr: if (fnew->handle) idr_remove(&head->handle_idr, fnew->handle); @@ -994,10 +1065,10 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last, struct cls_fl_filter *f = arg; if (!tc_skip_sw(f->flags)) - rhashtable_remove_fast(&head->ht, &f->ht_node, - head->ht_params); + rhashtable_remove_fast(&f->mask->ht, &f->ht_node, + f->mask->filter_ht_params); __fl_delete(tp, f, extack); - *last = list_empty(&head->filters); + *last = list_empty(&head->masks); return 0; } @@ -1005,16 +1076,19 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; - - list_for_each_entry_rcu(f, &head->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; - } + struct fl_flow_mask *mask; + + list_for_each_entry_rcu(mask, &head->masks, list) { + list_for_each_entry_rcu(f, &mask->filters, list) { + if (arg->count < arg->skip) + goto skip; + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; + } skip: - arg->count++; + arg->count++; + } } } @@ -1150,7 +1224,6 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { - struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = fh; struct nlattr *nest; struct fl_flow_key *key, *mask; @@ -1169,7 +1242,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, goto 
nla_put_failure; key = &f->key; - mask = &head->mask.key; + mask = &f->mask->key; if (mask->indev_ifindex) { struct net_device *dev; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 39c144b6ff98..760ab1b09f8b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -373,33 +373,24 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, */ static inline bool qdisc_restart(struct Qdisc *q, int *packets) { - bool more, validate, nolock = q->flags & TCQ_F_NOLOCK; spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; struct sk_buff *skb; + bool validate; /* Dequeue packet */ - if (nolock && test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) - return false; - skb = dequeue_skb(q, &validate, packets); - if (unlikely(!skb)) { - if (nolock) - clear_bit(__QDISC_STATE_RUNNING, &q->state); + if (unlikely(!skb)) return false; - } - if (!nolock) + if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); - more = sch_direct_xmit(skb, q, dev, txq, root_lock, validate); - if (nolock) - clear_bit(__QDISC_STATE_RUNNING, &q->state); - return more; + return sch_direct_xmit(skb, q, dev, txq, root_lock, validate); } void __qdisc_run(struct Qdisc *q) @@ -665,7 +656,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) if (__skb_array_empty(q)) continue; - skb = skb_array_consume_bh(q); + skb = __skb_array_consume(q); } if (likely(skb)) { qdisc_qstats_cpu_backlog_dec(qdisc, skb); @@ -706,7 +697,7 @@ static void pfifo_fast_reset(struct Qdisc *qdisc) if (!q->ring.queue) continue; - while ((skb = skb_array_consume_bh(q)) != NULL) + while ((skb = __skb_array_consume(q)) != NULL) kfree_skb(skb); } @@ -867,6 +858,11 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ + spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); lockdep_set_class(&sch->running, dev->qdisc_running_key ?: &qdisc_running_key); @@ -1106,6 +1102,10 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); if (!(qdisc->flags & TCQ_F_BUILTIN)) @@ -1115,6 +1115,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); } } @@ -1131,17 +1133,13 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - if (q->flags & TCQ_F_NOLOCK) { - val = test_bit(__QDISC_STATE_SCHED, &q->state); - } else { - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + root_lock = qdisc_lock(q); + spin_lock_bh(root_lock); - val = (qdisc_is_running(q) || - test_bit(__QDISC_STATE_SCHED, &q->state)); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - spin_unlock_bh(root_lock); - } + spin_unlock_bh(root_lock); if (val) return true; diff --git a/net/sctp/associola.c b/net/sctp/associola.c index a47179da24e6..5d5a16204d50 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -652,33 +652,20 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, */ peer->param_flags = 
asoc->param_flags; - sctp_transport_route(peer, NULL, sp); - /* Initialize the pmtu of the transport. */ - if (peer->param_flags & SPP_PMTUD_DISABLE) { - if (asoc->pathmtu) - peer->pathmtu = asoc->pathmtu; - else - peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; - } + sctp_transport_route(peer, NULL, sp); /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ - if (asoc->pathmtu) - asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); - else - asoc->pathmtu = peer->pathmtu; - - pr_debug("%s: association:%p PMTU set to %d\n", __func__, asoc, - asoc->pathmtu); + sctp_assoc_set_pmtu(asoc, asoc->pathmtu ? + min_t(int, peer->pathmtu, asoc->pathmtu) : + peer->pathmtu); peer->pmtu_pending = 0; - asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); - /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. */ @@ -988,31 +975,6 @@ out: return match; } -/* Is this the association we are looking for? */ -struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc, - struct net *net, - const union sctp_addr *laddr, - const union sctp_addr *paddr) -{ - struct sctp_transport *transport; - - if ((htons(asoc->base.bind_addr.port) == laddr->v4.sin_port) && - (htons(asoc->peer.port) == paddr->v4.sin_port) && - net_eq(sock_net(asoc->base.sk), net)) { - transport = sctp_assoc_lookup_paddr(asoc, paddr); - if (!transport) - goto out; - - if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr, - sctp_sk(asoc->base.sk))) - goto out; - } - transport = NULL; - -out: - return transport; -} - /* Do delayed input processing. This is scheduled by sctp_rcv(). */ static void sctp_assoc_bh_rcv(struct work_struct *work) { @@ -1434,6 +1396,31 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc, } } +void sctp_assoc_update_frag_point(struct sctp_association *asoc) +{ + int frag = sctp_mtu_payload(sctp_sk(asoc->base.sk), asoc->pathmtu, + sctp_datachk_len(&asoc->stream)); + + if (asoc->user_frag) + frag = min_t(int, frag, asoc->user_frag); + + frag = min_t(int, frag, SCTP_MAX_CHUNK_LEN - + sctp_datachk_len(&asoc->stream)); + + asoc->frag_point = SCTP_TRUNC4(frag); +} + +void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu) +{ + if (asoc->pathmtu != pmtu) { + asoc->pathmtu = pmtu; + sctp_assoc_update_frag_point(asoc); + } + + pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc, + asoc->pathmtu, asoc->frag_point); +} + /* Update the association's pmtu and frag_point by going through all the * transports. This routine is called when a transport's PMTU has changed. */ @@ -1446,24 +1433,16 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) return; /* Get the lowest pmtu of all the transports. 
*/ - list_for_each_entry(t, &asoc->peer.transport_addr_list, - transports) { + list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu( - t, SCTP_TRUNC4(dst_mtu(t->dst))); + sctp_transport_update_pmtu(t, sctp_dst_mtu(t->dst)); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) pmtu = t->pathmtu; } - if (pmtu) { - asoc->pathmtu = pmtu; - asoc->frag_point = sctp_frag_point(asoc, pmtu); - } - - pr_debug("%s: asoc:%p, pmtu:%d, frag_point:%d\n", __func__, asoc, - asoc->pathmtu, asoc->frag_point); + sctp_assoc_set_pmtu(asoc, pmtu); } /* Should we send a SACK to update our peer? */ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index be296d633e95..79daa98208c3 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -172,8 +172,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; - struct sctp_sock *sp; - struct sctp_af *af; int err; msg = sctp_datamsg_new(GFP_KERNEL); @@ -192,12 +190,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, /* This is the biggest possible DATA chunk that can fit into * the packet */ - sp = sctp_sk(asoc->base.sk); - af = sp->pf->af; - max_data = asoc->pathmtu - af->net_header_len - - sizeof(struct sctphdr) - sctp_datachk_len(&asoc->stream) - - af->ip_options_len(asoc->base.sk); - max_data = SCTP_TRUNC4(max_data); + max_data = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with @@ -222,9 +215,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, } } - /* Check what's our max considering the above */ - max_data = min_t(size_t, max_data, asoc->frag_point); - /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; diff --git a/net/sctp/output.c b/net/sctp/output.c index 690d8557bb7b..e672dee302c7 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -90,8 +90,8 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; + struct sctp_sock *sp = NULL; struct sock *sk; - size_t overhead = sizeof(struct ipv6hdr) + sizeof(struct sctphdr); pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; @@ -102,28 +102,20 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; + if (asoc) { - struct sctp_sock *sp = sctp_sk(asoc->base.sk); - struct sctp_af *af = sp->pf->af; - - overhead = af->net_header_len + - af->ip_options_len(asoc->base.sk); - overhead += sizeof(struct sctphdr); - packet->overhead = overhead; - packet->size = overhead; - } else { - packet->overhead = overhead; - packet->size = overhead; - return; + sk = asoc->base.sk; + sp = sctp_sk(sk); } + packet->overhead = sctp_mtu_payload(sp, 0, 0); + packet->size = packet->overhead; + + if (!asoc) + return; /* update dst or transport pathmtu if in need */ - sk = asoc->base.sk; if (!sctp_transport_dst_check(tp)) { - sctp_transport_route(tp, NULL, sctp_sk(sk)); - if (asoc->param_flags & SPP_PMTUD_ENABLE) - sctp_assoc_sync_pmtu(asoc); - } else if (!sctp_transport_pmtu_check(tp)) { + sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } diff --git a/net/sctp/outqueue.c 
b/net/sctp/outqueue.c index f211b3db6a35..d68aa33485a9 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -601,14 +601,14 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, /* * Transmit DATA chunks on the retransmit queue. Upon return from - * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which + * __sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which * need to be transmitted by the caller. * We assume that pkt->transport has already been set. * * The return value is a normal kernel error return value. */ -static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, - int rtx_timeout, int *start_timer) +static int __sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, + int rtx_timeout, int *start_timer, gfp_t gfp) { struct sctp_transport *transport = pkt->transport; struct sctp_chunk *chunk, *chunk1; @@ -684,12 +684,12 @@ redo: * control chunks are already freed so there * is nothing we can do. */ - sctp_packet_transmit(pkt, GFP_ATOMIC); + sctp_packet_transmit(pkt, gfp); goto redo; } /* Send this packet. */ - error = sctp_packet_transmit(pkt, GFP_ATOMIC); + error = sctp_packet_transmit(pkt, gfp); /* If we are retransmitting, we should only * send a single packet. @@ -705,7 +705,7 @@ redo: case SCTP_XMIT_RWND_FULL: /* Send this packet. */ - error = sctp_packet_transmit(pkt, GFP_ATOMIC); + error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA as there is no more room * at the receiver. @@ -715,7 +715,7 @@ redo: case SCTP_XMIT_DELAY: /* Send this packet. */ - error = sctp_packet_transmit(pkt, GFP_ATOMIC); + error = sctp_packet_transmit(pkt, gfp); /* Stop sending DATA because of nagle delay. */ done = 1; @@ -776,68 +776,43 @@ void sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp) sctp_outq_flush(q, 0, gfp); } - -/* - * Try to flush an outqueue. - * - * Description: Send everything in q which we legally can, subject to - * congestion limitations. - * * Note: This function can be called from multiple contexts so appropriate - * locking concerns must be made. Today we use the sock lock to protect - * this function. - */ -static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +static int sctp_packet_singleton(struct sctp_transport *transport, + struct sctp_chunk *chunk, gfp_t gfp) { - struct sctp_packet *packet; + const struct sctp_association *asoc = transport->asoc; + const __u16 sport = asoc->base.bind_addr.port; + const __u16 dport = asoc->peer.port; + const __u32 vtag = asoc->peer.i.init_tag; struct sctp_packet singleton; - struct sctp_association *asoc = q->asoc; - __u16 sport = asoc->base.bind_addr.port; - __u16 dport = asoc->peer.port; - __u32 vtag = asoc->peer.i.init_tag; - struct sctp_transport *transport = NULL; - struct sctp_transport *new_transport; - struct sctp_chunk *chunk, *tmp; - enum sctp_xmit status; - int error = 0; - int start_timer = 0; - int one_packet = 0; + sctp_packet_init(&singleton, transport, sport, dport); + sctp_packet_config(&singleton, vtag, 0); + sctp_packet_append_chunk(&singleton, chunk); + return sctp_packet_transmit(&singleton, gfp); +} + +/* Struct to hold the context during sctp outq flush */ +struct sctp_flush_ctx { + struct sctp_outq *q; + /* Current transport being used. It's NOT the same as curr active one */ + struct sctp_transport *transport; /* These transports have chunks to send. 
*/ struct list_head transport_list; - struct list_head *ltransport; - - INIT_LIST_HEAD(&transport_list); - packet = NULL; - - /* - * 6.10 Bundling - * ... - * When bundling control chunks with DATA chunks, an - * endpoint MUST place control chunks first in the outbound - * SCTP packet. The transmitter MUST transmit DATA chunks - * within a SCTP packet in increasing order of TSN. - * ... - */ - - list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) { - /* RFC 5061, 5.3 - * F1) This means that until such time as the ASCONF - * containing the add is acknowledged, the sender MUST - * NOT use the new IP address as a source for ANY SCTP - * packet except on carrying an ASCONF Chunk. - */ - if (asoc->src_out_of_asoc_ok && - chunk->chunk_hdr->type != SCTP_CID_ASCONF) - continue; - - list_del_init(&chunk->list); + struct sctp_association *asoc; + /* Packet on the current transport above */ + struct sctp_packet *packet; + gfp_t gfp; +}; - /* Pick the right transport to use. */ - new_transport = chunk->transport; +/* transport: current transport */ +static void sctp_outq_select_transport(struct sctp_flush_ctx *ctx, + struct sctp_chunk *chunk) +{ + struct sctp_transport *new_transport = chunk->transport; - if (!new_transport) { - /* - * If we have a prior transport pointer, see if + if (!new_transport) { + if (!sctp_chunk_is_data(chunk)) { + /* If we have a prior transport pointer, see if * the destination address of the chunk * matches the destination address of the * current transport. If not a match, then @@ -846,22 +821,26 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) * after processing ASCONFs, we may have new * transports created. */ - if (transport && - sctp_cmp_addr_exact(&chunk->dest, - &transport->ipaddr)) - new_transport = transport; + if (ctx->transport && sctp_cmp_addr_exact(&chunk->dest, + &ctx->transport->ipaddr)) + new_transport = ctx->transport; else - new_transport = sctp_assoc_lookup_paddr(asoc, - &chunk->dest); + new_transport = sctp_assoc_lookup_paddr(ctx->asoc, + &chunk->dest); + } - /* if we still don't have a new transport, then - * use the current active path. - */ - if (!new_transport) - new_transport = asoc->peer.active_path; - } else if ((new_transport->state == SCTP_INACTIVE) || - (new_transport->state == SCTP_UNCONFIRMED) || - (new_transport->state == SCTP_PF)) { + /* if we still don't have a new transport, then + * use the current active path. + */ + if (!new_transport) + new_transport = ctx->asoc->peer.active_path; + } else { + __u8 type; + + switch (new_transport->state) { + case SCTP_INACTIVE: + case SCTP_UNCONFIRMED: + case SCTP_PF: /* If the chunk is Heartbeat or Heartbeat Ack, * send it to chunk->transport, even if it's * inactive. @@ -875,29 +854,64 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) * * ASCONF_ACKs also must be sent to the source. */ - if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT && - chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK && - chunk->chunk_hdr->type != SCTP_CID_ASCONF_ACK) - new_transport = asoc->peer.active_path; + type = chunk->chunk_hdr->type; + if (type != SCTP_CID_HEARTBEAT && + type != SCTP_CID_HEARTBEAT_ACK && + type != SCTP_CID_ASCONF_ACK) + new_transport = ctx->asoc->peer.active_path; + break; + default: + break; } + } + + /* Are we switching transports? Take care of transport locks. */ + if (new_transport != ctx->transport) { + ctx->transport = new_transport; + ctx->packet = &ctx->transport->packet; - /* Are we switching transports? 
- * Take care of transport locks. + if (list_empty(&ctx->transport->send_ready)) + list_add_tail(&ctx->transport->send_ready, + &ctx->transport_list); + + sctp_packet_config(ctx->packet, + ctx->asoc->peer.i.init_tag, + ctx->asoc->peer.ecn_capable); + /* We've switched transports, so apply the + * Burst limit to the new transport. */ - if (new_transport != transport) { - transport = new_transport; - if (list_empty(&transport->send_ready)) { - list_add_tail(&transport->send_ready, - &transport_list); - } - packet = &transport->packet; - sctp_packet_config(packet, vtag, - asoc->peer.ecn_capable); - } + sctp_transport_burst_limited(ctx->transport); + } +} + +static void sctp_outq_flush_ctrl(struct sctp_flush_ctx *ctx) +{ + struct sctp_chunk *chunk, *tmp; + enum sctp_xmit status; + int one_packet, error; + + list_for_each_entry_safe(chunk, tmp, &ctx->q->control_chunk_list, list) { + one_packet = 0; + + /* RFC 5061, 5.3 + * F1) This means that until such time as the ASCONF + * containing the add is acknowledged, the sender MUST + * NOT use the new IP address as a source for ANY SCTP + * packet except on carrying an ASCONF Chunk. + */ + if (ctx->asoc->src_out_of_asoc_ok && + chunk->chunk_hdr->type != SCTP_CID_ASCONF) + continue; + + list_del_init(&chunk->list); + + /* Pick the right transport to use. Should always be true for + * the first chunk as we don't have a transport by then. + */ + sctp_outq_select_transport(ctx, chunk); switch (chunk->chunk_hdr->type) { - /* - * 6.10 Bundling + /* 6.10 Bundling * ... * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN * COMPLETE with any other chunks. [Send them immediately.] @@ -905,20 +919,19 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: case SCTP_CID_SHUTDOWN_COMPLETE: - sctp_packet_init(&singleton, transport, sport, dport); - sctp_packet_config(&singleton, vtag, 0); - sctp_packet_append_chunk(&singleton, chunk); - error = sctp_packet_transmit(&singleton, gfp); + error = sctp_packet_singleton(ctx->transport, chunk, + ctx->gfp); if (error < 0) { - asoc->base.sk->sk_err = -error; + ctx->asoc->base.sk->sk_err = -error; return; } break; case SCTP_CID_ABORT: if (sctp_test_T_bit(chunk)) - packet->vtag = asoc->c.my_vtag; + ctx->packet->vtag = ctx->asoc->c.my_vtag; /* fallthru */ + /* The following chunks are "response" chunks, i.e. * they are generated in response to something we * received. If we are sending these, then we can @@ -942,27 +955,27 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) case SCTP_CID_FWD_TSN: case SCTP_CID_I_FWD_TSN: case SCTP_CID_RECONF: - status = sctp_packet_transmit_chunk(packet, chunk, - one_packet, gfp); - if (status != SCTP_XMIT_OK) { + status = sctp_packet_transmit_chunk(ctx->packet, chunk, + one_packet, ctx->gfp); + if (status != SCTP_XMIT_OK) { /* put the chunk back */ - list_add(&chunk->list, &q->control_chunk_list); + list_add(&chunk->list, &ctx->q->control_chunk_list); break; } - asoc->stats.octrlchunks++; + ctx->asoc->stats.octrlchunks++; /* PR-SCTP C5) If a FORWARD TSN is sent, the * sender MUST assure that at least one T3-rtx * timer is running. 
*/ if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN || chunk->chunk_hdr->type == SCTP_CID_I_FWD_TSN) { - sctp_transport_reset_t3_rtx(transport); - transport->last_time_sent = jiffies; + sctp_transport_reset_t3_rtx(ctx->transport); + ctx->transport->last_time_sent = jiffies; } - if (chunk == asoc->strreset_chunk) - sctp_transport_reset_reconf_timer(transport); + if (chunk == ctx->asoc->strreset_chunk) + sctp_transport_reset_reconf_timer(ctx->transport); break; @@ -971,232 +984,186 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) BUG(); } } +} - if (q->asoc->src_out_of_asoc_ok) - goto sctp_flush_out; +/* Returns false if new data shouldn't be sent */ +static bool sctp_outq_flush_rtx(struct sctp_flush_ctx *ctx, + int rtx_timeout) +{ + int error, start_timer = 0; + + if (ctx->asoc->peer.retran_path->state == SCTP_UNCONFIRMED) + return false; + + if (ctx->transport != ctx->asoc->peer.retran_path) { + /* Switch transports & prepare the packet. */ + ctx->transport = ctx->asoc->peer.retran_path; + ctx->packet = &ctx->transport->packet; + + if (list_empty(&ctx->transport->send_ready)) + list_add_tail(&ctx->transport->send_ready, + &ctx->transport_list); + + sctp_packet_config(ctx->packet, ctx->asoc->peer.i.init_tag, + ctx->asoc->peer.ecn_capable); + } + + error = __sctp_outq_flush_rtx(ctx->q, ctx->packet, rtx_timeout, + &start_timer, ctx->gfp); + if (error < 0) + ctx->asoc->base.sk->sk_err = -error; + + if (start_timer) { + sctp_transport_reset_t3_rtx(ctx->transport); + ctx->transport->last_time_sent = jiffies; + } + + /* This can happen on COOKIE-ECHO resend. Only + * one chunk can get bundled with a COOKIE-ECHO. + */ + if (ctx->packet->has_cookie_echo) + return false; + + /* Don't send new data if there is still data + * waiting to retransmit. + */ + if (!list_empty(&ctx->q->retransmit)) + return false; + + return true; +} + +static void sctp_outq_flush_data(struct sctp_flush_ctx *ctx, + int rtx_timeout) +{ + struct sctp_chunk *chunk; + enum sctp_xmit status; /* Is it OK to send data chunks? */ - switch (asoc->state) { + switch (ctx->asoc->state) { case SCTP_STATE_COOKIE_ECHOED: /* Only allow bundling when this packet has a COOKIE-ECHO * chunk. */ - if (!packet || !packet->has_cookie_echo) - break; + if (!ctx->packet || !ctx->packet->has_cookie_echo) + return; /* fallthru */ case SCTP_STATE_ESTABLISHED: case SCTP_STATE_SHUTDOWN_PENDING: case SCTP_STATE_SHUTDOWN_RECEIVED: - /* - * RFC 2960 6.1 Transmission of DATA Chunks - * - * C) When the time comes for the sender to transmit, - * before sending new DATA chunks, the sender MUST - * first transmit any outstanding DATA chunks which - * are marked for retransmission (limited by the - * current cwnd). - */ - if (!list_empty(&q->retransmit)) { - if (asoc->peer.retran_path->state == SCTP_UNCONFIRMED) - goto sctp_flush_out; - if (transport == asoc->peer.retran_path) - goto retran; - - /* Switch transports & prepare the packet. */ - - transport = asoc->peer.retran_path; + break; - if (list_empty(&transport->send_ready)) { - list_add_tail(&transport->send_ready, - &transport_list); - } + default: + /* Do nothing. 
*/ + return; + } - packet = &transport->packet; - sctp_packet_config(packet, vtag, - asoc->peer.ecn_capable); - retran: - error = sctp_outq_flush_rtx(q, packet, - rtx_timeout, &start_timer); - if (error < 0) - asoc->base.sk->sk_err = -error; + /* RFC 2960 6.1 Transmission of DATA Chunks + * + * C) When the time comes for the sender to transmit, + * before sending new DATA chunks, the sender MUST + * first transmit any outstanding DATA chunks which + * are marked for retransmission (limited by the + * current cwnd). + */ + if (!list_empty(&ctx->q->retransmit) && + !sctp_outq_flush_rtx(ctx, rtx_timeout)) + return; - if (start_timer) { - sctp_transport_reset_t3_rtx(transport); - transport->last_time_sent = jiffies; - } + /* Apply Max.Burst limitation to the current transport in + * case it will be used for new data. We are going to + * rest it before we return, but we want to apply the limit + * to the currently queued data. + */ + if (ctx->transport) + sctp_transport_burst_limited(ctx->transport); - /* This can happen on COOKIE-ECHO resend. Only - * one chunk can get bundled with a COOKIE-ECHO. - */ - if (packet->has_cookie_echo) - goto sctp_flush_out; + /* Finally, transmit new packets. */ + while ((chunk = sctp_outq_dequeue_data(ctx->q)) != NULL) { + __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* Don't send new data if there is still data - * waiting to retransmit. - */ - if (!list_empty(&q->retransmit)) - goto sctp_flush_out; + /* Has this chunk expired? */ + if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(ctx->q, chunk); + sctp_chunk_fail(chunk, 0); + sctp_chunk_free(chunk); + continue; } - /* Apply Max.Burst limitation to the current transport in - * case it will be used for new data. We are going to - * rest it before we return, but we want to apply the limit - * to the currently queued data. - */ - if (transport) - sctp_transport_burst_limited(transport); - - /* Finally, transmit new packets. */ - while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { - __u32 sid = ntohs(chunk->subh.data_hdr->stream); - - /* Has this chunk expired? */ - if (sctp_chunk_abandoned(chunk)) { - sctp_sched_dequeue_done(q, chunk); - sctp_chunk_fail(chunk, 0); - sctp_chunk_free(chunk); - continue; - } + if (ctx->asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { + sctp_outq_head_data(ctx->q, chunk); + break; + } - if (asoc->stream.out[sid].state == SCTP_STREAM_CLOSED) { - sctp_outq_head_data(q, chunk); - goto sctp_flush_out; - } + sctp_outq_select_transport(ctx, chunk); - /* If there is a specified transport, use it. - * Otherwise, we want to use the active path. + pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p skb->users:%d\n", + __func__, ctx->q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : + "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), + chunk->skb ? chunk->skb->head : NULL, chunk->skb ? + refcount_read(&chunk->skb->users) : -1); + + /* Add the chunk to the packet. */ + status = sctp_packet_transmit_chunk(ctx->packet, chunk, 0, + ctx->gfp); + if (status != SCTP_XMIT_OK) { + /* We could not append this chunk, so put + * the chunk back on the output queue. 
*/ - new_transport = chunk->transport; - if (!new_transport || - ((new_transport->state == SCTP_INACTIVE) || - (new_transport->state == SCTP_UNCONFIRMED) || - (new_transport->state == SCTP_PF))) - new_transport = asoc->peer.active_path; - if (new_transport->state == SCTP_UNCONFIRMED) { - WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); - sctp_sched_dequeue_done(q, chunk); - sctp_chunk_fail(chunk, 0); - sctp_chunk_free(chunk); - continue; - } - - /* Change packets if necessary. */ - if (new_transport != transport) { - transport = new_transport; + pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", + __func__, ntohl(chunk->subh.data_hdr->tsn), + status); - /* Schedule to have this transport's - * packet flushed. - */ - if (list_empty(&transport->send_ready)) { - list_add_tail(&transport->send_ready, - &transport_list); - } - - packet = &transport->packet; - sctp_packet_config(packet, vtag, - asoc->peer.ecn_capable); - /* We've switched transports, so apply the - * Burst limit to the new transport. - */ - sctp_transport_burst_limited(transport); - } - - pr_debug("%s: outq:%p, chunk:%p[%s], tx-tsn:0x%x skb->head:%p " - "skb->users:%d\n", - __func__, q, chunk, chunk && chunk->chunk_hdr ? - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) : - "illegal chunk", ntohl(chunk->subh.data_hdr->tsn), - chunk->skb ? chunk->skb->head : NULL, chunk->skb ? - refcount_read(&chunk->skb->users) : -1); - - /* Add the chunk to the packet. */ - status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp); - - switch (status) { - case SCTP_XMIT_PMTU_FULL: - case SCTP_XMIT_RWND_FULL: - case SCTP_XMIT_DELAY: - /* We could not append this chunk, so put - * the chunk back on the output queue. - */ - pr_debug("%s: could not transmit tsn:0x%x, status:%d\n", - __func__, ntohl(chunk->subh.data_hdr->tsn), - status); - - sctp_outq_head_data(q, chunk); - goto sctp_flush_out; - - case SCTP_XMIT_OK: - /* The sender is in the SHUTDOWN-PENDING state, - * The sender MAY set the I-bit in the DATA - * chunk header. - */ - if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) - chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; - if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) - asoc->stats.ouodchunks++; - else - asoc->stats.oodchunks++; - - /* Only now it's safe to consider this - * chunk as sent, sched-wise. - */ - sctp_sched_dequeue_done(q, chunk); - - break; + sctp_outq_head_data(ctx->q, chunk); + break; + } - default: - BUG(); - } + /* The sender is in the SHUTDOWN-PENDING state, + * The sender MAY set the I-bit in the DATA + * chunk header. + */ + if (ctx->asoc->state == SCTP_STATE_SHUTDOWN_PENDING) + chunk->chunk_hdr->flags |= SCTP_DATA_SACK_IMM; + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + ctx->asoc->stats.ouodchunks++; + else + ctx->asoc->stats.oodchunks++; - /* BUG: We assume that the sctp_packet_transmit() - * call below will succeed all the time and add the - * chunk to the transmitted list and restart the - * timers. - * It is possible that the call can fail under OOM - * conditions. - * - * Is this really a problem? Won't this behave - * like a lost TSN? - */ - list_add_tail(&chunk->transmitted_list, - &transport->transmitted); + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(ctx->q, chunk); - sctp_transport_reset_t3_rtx(transport); - transport->last_time_sent = jiffies; + list_add_tail(&chunk->transmitted_list, + &ctx->transport->transmitted); - /* Only let one DATA chunk get bundled with a - * COOKIE-ECHO chunk. 
- */ - if (packet->has_cookie_echo) - goto sctp_flush_out; - } - break; + sctp_transport_reset_t3_rtx(ctx->transport); + ctx->transport->last_time_sent = jiffies; - default: - /* Do nothing. */ - break; + /* Only let one DATA chunk get bundled with a + * COOKIE-ECHO chunk. + */ + if (ctx->packet->has_cookie_echo) + break; } +} -sctp_flush_out: +static void sctp_outq_flush_transports(struct sctp_flush_ctx *ctx) +{ + struct list_head *ltransport; + struct sctp_packet *packet; + struct sctp_transport *t; + int error = 0; - /* Before returning, examine all the transports touched in - * this call. Right now, we bluntly force clear all the - * transports. Things might change after we implement Nagle. - * But such an examination is still required. - * - * --xguo - */ - while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL) { - struct sctp_transport *t = list_entry(ltransport, - struct sctp_transport, - send_ready); + while ((ltransport = sctp_list_dequeue(&ctx->transport_list)) != NULL) { + t = list_entry(ltransport, struct sctp_transport, send_ready); packet = &t->packet; if (!sctp_packet_empty(packet)) { - error = sctp_packet_transmit(packet, gfp); + error = sctp_packet_transmit(packet, ctx->gfp); if (error < 0) - asoc->base.sk->sk_err = -error; + ctx->q->asoc->base.sk->sk_err = -error; } /* Clear the burst limited state, if any */ @@ -1204,6 +1171,47 @@ sctp_flush_out: } } +/* Try to flush an outqueue. + * + * Description: Send everything in q which we legally can, subject to + * congestion limitations. + * * Note: This function can be called from multiple contexts so appropriate + * locking concerns must be made. Today we use the sock lock to protect + * this function. + */ + +static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) +{ + struct sctp_flush_ctx ctx = { + .q = q, + .transport = NULL, + .transport_list = LIST_HEAD_INIT(ctx.transport_list), + .asoc = q->asoc, + .packet = NULL, + .gfp = gfp, + }; + + /* 6.10 Bundling + * ... + * When bundling control chunks with DATA chunks, an + * endpoint MUST place control chunks first in the outbound + * SCTP packet. The transmitter MUST transmit DATA chunks + * within a SCTP packet in increasing order of TSN. + * ... + */ + + sctp_outq_flush_ctrl(&ctx); + + if (q->asoc->src_out_of_asoc_ok) + goto sctp_flush_out; + + sctp_outq_flush_data(&ctx, rtx_timeout); + +sctp_flush_out: + + sctp_outq_flush_transports(&ctx); +} + /* Update unack_data based on the incoming SACK chunk */ static void sctp_sack_update_unack_data(struct sctp_association *assoc, struct sctp_sackhdr *sack) @@ -1457,7 +1465,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, * the outstanding bytes for this chunk, so only * count bytes associated with a transport. */ - if (transport) { + if (transport && !tchunk->tsn_gap_acked) { /* If this chunk is being used for RTT * measurement, calculate the RTT and update * the RTO using this value. @@ -1469,14 +1477,34 @@ static void sctp_check_transmitted(struct sctp_outq *q, * first instance of the packet or a later * instance). 
*/ - if (!tchunk->tsn_gap_acked && - !sctp_chunk_retransmitted(tchunk) && + if (!sctp_chunk_retransmitted(tchunk) && tchunk->rtt_in_progress) { tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); } + + if (TSN_lte(tsn, sack_ctsn)) { + /* + * SFR-CACC algorithm: + * 2) If the SACK contains gap acks + * and the flag CHANGEOVER_ACTIVE is + * set the receiver of the SACK MUST + * take the following action: + * + * B) For each TSN t being acked that + * has not been acked in any SACK so + * far, set cacc_saw_newack to 1 for + * the destination that the TSN was + * sent to. + */ + if (sack->num_gap_ack_blocks && + q->asoc->peer.primary_path->cacc. + changeover_active) + transport->cacc.cacc_saw_newack + = 1; + } } /* If the chunk hasn't been marked as ACKED, @@ -1508,28 +1536,6 @@ static void sctp_check_transmitted(struct sctp_outq *q, restart_timer = 1; forward_progress = true; - if (!tchunk->tsn_gap_acked) { - /* - * SFR-CACC algorithm: - * 2) If the SACK contains gap acks - * and the flag CHANGEOVER_ACTIVE is - * set the receiver of the SACK MUST - * take the following action: - * - * B) For each TSN t being acked that - * has not been acked in any SACK so - * far, set cacc_saw_newack to 1 for - * the destination that the TSN was - * sent to. - */ - if (transport && - sack->num_gap_ack_blocks && - q->asoc->peer.primary_path->cacc. - changeover_active) - transport->cacc.cacc_saw_newack - = 1; - } - list_add_tail(&tchunk->transmitted_list, &q->sacked); } else { @@ -1756,7 +1762,7 @@ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) if (TSN_lte(tsn, ctsn)) goto pass; - /* 3.3.4 Selective Acknowledgement (SACK) (3): + /* 3.3.4 Selective Acknowledgment (SACK) (3): * * Gap Ack Blocks: * These fields contain the Gap Ack Blocks. They are repeated diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e62addb60434..4a4fd1971255 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); -static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, - const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) @@ -154,12 +152,11 @@ static const struct sctp_paramhdr prsctp_param = { cpu_to_be16(sizeof(struct sctp_paramhdr)), }; -/* A helper to initialize an op error inside a - * provided chunk, as most cause codes will be embedded inside an - * abort chunk. +/* A helper to initialize an op error inside a provided chunk, as most + * cause codes will be embedded inside an abort chunk. */ -void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, - size_t paylen) +int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, + size_t paylen) { struct sctp_errhdr err; __u16 len; @@ -167,33 +164,16 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, /* Cause code constants are now defined in network order. */ err.cause = cause_code; len = sizeof(err) + paylen; - err.length = htons(len); - chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); -} - -/* A helper to initialize an op error inside a - * provided chunk, as most cause codes will be embedded inside an - * abort chunk. 
Differs from sctp_init_cause in that it won't oops - * if there isn't enough space in the op error chunk - */ -static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code, - size_t paylen) -{ - struct sctp_errhdr err; - __u16 len; - - /* Cause code constants are now defined in network order. */ - err.cause = cause_code; - len = sizeof(err) + paylen; - err.length = htons(len); + err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; - chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err); + chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } + /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two @@ -779,10 +759,9 @@ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ -struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) +struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; - struct sctp_association *aptr = (struct sctp_association *)asoc; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; @@ -857,7 +836,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) /* Add the duplicate TSN information. */ if (num_dup_tsns) { - aptr->stats.idupchunks += num_dup_tsns; + asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } @@ -869,11 +848,11 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) * association so no transport will match after a wrap event like this, * Until the next sack */ - if (++aptr->peer.sack_generation == 0) { + if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; - aptr->peer.sack_generation = 1; + asoc->peer.sack_generation = 1; } nodata: return retval; @@ -1258,20 +1237,26 @@ nodata: return retval; } -/* Create an Operation Error chunk of a fixed size, - * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - * This is a helper function to allocate an error chunk for - * for those invalid parameter codes in which we may not want - * to report all the errors, if the incoming chunk is large +/* Create an Operation Error chunk of a fixed size, specifically, + * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. + * This is a helper function to allocate an error chunk for for those + * invalid parameter codes in which we may not want to report all the + * errors, if the incoming chunk is large. If it can't fit in a single + * packet, we ignore it. */ -static inline struct sctp_chunk *sctp_make_op_error_fixed( +static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { - size_t size = asoc ? asoc->pathmtu : 0; + size_t size = SCTP_DEFAULT_MAXSEGMENT; + struct sctp_sock *sp = NULL; - if (!size) - size = SCTP_DEFAULT_MAXSEGMENT; + if (asoc) { + size = min_t(size_t, size, asoc->pathmtu); + sp = sctp_sk(asoc->base.sk); + } + + size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } @@ -1523,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) return target; } -/* Append bytes to the end of a chunk. 
Returns NULL if there isn't sufficient - * space in the chunk - */ -static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk, - int len, const void *data) -{ - if (skb_tailroom(chunk->skb) >= len) - return sctp_addto_chunk(chunk, len, data); - else - return NULL; -} - /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. @@ -1829,6 +1802,9 @@ no_hmac: kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { + suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); + __be32 n = htonl(usecs); + /* * Section 3.3.10.3 Stale Cookie Error (3) * @@ -1837,17 +1813,12 @@ no_hmac: * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ - len = ntohs(chunk->chunk_hdr->length); - *errp = sctp_make_op_error_space(asoc, chunk, len); - if (*errp) { - suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); - __be32 n = htonl(usecs); - - sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, - sizeof(n)); - sctp_addto_chunk(*errp, sizeof(n), &n); + *errp = sctp_make_op_error(asoc, chunk, + SCTP_ERROR_STALE_COOKIE, &n, + sizeof(n), 0); + if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; - } else + else *error = -SCTP_IERROR_NOMEM; goto fail; @@ -1998,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc, if (*errp) sctp_chunk_free(*errp); - *errp = sctp_make_op_error_space(asoc, chunk, len); - - if (*errp) { - sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len); - sctp_addto_chunk(*errp, len, param.v); - } + *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, + param.v, len, 0); /* Stop processing this chunk. */ return 0; @@ -2128,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param( /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ - if (NULL == *errp) - *errp = sctp_make_op_error_fixed(asoc, chunk); - - if (*errp) { - if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM, - SCTP_PAD4(ntohs(param.p->length)))) - sctp_addto_chunk_fixed(*errp, - SCTP_PAD4(ntohs(param.p->length)), - param.v); - } else { - /* If there is no memory for generating the ERROR - * report as specified, an ABORT will be triggered - * to the peer and the association won't be - * established. - */ - retval = SCTP_IERROR_NOMEM; + if (!*errp) { + *errp = sctp_make_op_error_limited(asoc, chunk); + if (!*errp) { + /* If there is no memory for generating the + * ERROR report as specified, an ABORT will be + * triggered to the peer and the association + * won't be established. + */ + retval = SCTP_IERROR_NOMEM; + break; + } } + + if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, + ntohs(param.p->length))) + sctp_addto_chunk(*errp, ntohs(param.p->length), + param.v); break; default: break; @@ -2220,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net, * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. 
*/ - if (SCTP_AUTH_RANDOM_LENGTH != - ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { + if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, - chunk, err_chunk); + chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 80835ac26d2c..1b4593b842b0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -644,16 +644,15 @@ static int sctp_send_asconf_add_ip(struct sock *sk, list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { - /* Clear the source and route cache */ - sctp_transport_dst_release(trans); trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); trans->ssthresh = asoc->peer.i.a_rwnd; trans->rto = asoc->rto_initial; sctp_max_rto(asoc, trans); trans->rtt = trans->srtt = trans->rttvar = 0; + /* Clear the source and route cache */ sctp_transport_route(trans, NULL, - sctp_sk(asoc->base.sk)); + sctp_sk(asoc->base.sk)); } } retval = sctp_send_asconf(asoc, chunk); @@ -896,7 +895,6 @@ skip_mkasconf: */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - sctp_transport_dst_release(transport); sctp_transport_route(transport, NULL, sctp_sk(asoc->base.sk)); } @@ -1895,6 +1893,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo) { struct sock *sk = asoc->base.sk; + struct sctp_sock *sp = sctp_sk(sk); struct net *net = sock_net(sk); struct sctp_datamsg *datamsg; bool wait_connect = false; @@ -1913,13 +1912,16 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, goto err; } - if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) { + if (sp->disable_fragments && msg_len > asoc->frag_point) { err = -EMSGSIZE; goto err; } - if (asoc->pmtu_pending) - sctp_assoc_pending_pmtu(asoc); + if (asoc->pmtu_pending) { + if (sp->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(asoc); + asoc->pmtu_pending = 0; + } if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -1936,7 +1938,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (err) goto err; - if (sctp_sk(sk)->strm_interleave) { + if (sp->strm_interleave) { timeo = sock_sndtimeo(sk, 0); err = sctp_wait_for_connect(asoc, &timeo); if (err) @@ -2539,7 +2541,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, trans->pathmtu = params->spp_pathmtu; sctp_assoc_sync_pmtu(asoc); } else if (asoc) { - asoc->pathmtu = params->spp_pathmtu; + sctp_assoc_set_pmtu(asoc, params->spp_pathmtu); } else { sp->pathmtu = params->spp_pathmtu; } @@ -3209,7 +3211,6 @@ static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, unsign static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int optlen) { struct sctp_sock *sp = sctp_sk(sk); - struct sctp_af *af = sp->pf->af; struct sctp_assoc_value params; struct sctp_association *asoc; int val; @@ -3231,30 +3232,24 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned return -EINVAL; } + asoc = sctp_id2assoc(sk, params.assoc_id); + if (val) { int min_len, max_len; + __u16 datasize = asoc ? 
sctp_datachk_len(&asoc->stream) : + sizeof(struct sctp_data_chunk); - min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len; - min_len -= af->ip_options_len(sk); - min_len -= sizeof(struct sctphdr) + - sizeof(struct sctp_data_chunk); - - max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk); + min_len = sctp_mtu_payload(sp, SCTP_DEFAULT_MINSEGMENT, + datasize); + max_len = SCTP_MAX_CHUNK_LEN - datasize; if (val < min_len || val > max_len) return -EINVAL; } - asoc = sctp_id2assoc(sk, params.assoc_id); if (asoc) { - if (val == 0) { - val = asoc->pathmtu - af->net_header_len; - val -= af->ip_options_len(sk); - val -= sizeof(struct sctphdr) + - sctp_datachk_len(&asoc->stream); - } asoc->user_frag = val; - asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu); + sctp_assoc_update_frag_point(asoc); } else { if (params.assoc_id && sctp_style(sk, UDP)) return -EINVAL; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 47f82bd794d9..4a95e260b674 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -242,9 +242,18 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) &transport->fl, sk); } - if (transport->dst) { - transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); - } else + if (transport->param_flags & SPP_PMTUD_DISABLE) { + struct sctp_association *asoc = transport->asoc; + + if (!transport->pathmtu && asoc && asoc->pathmtu) + transport->pathmtu = asoc->pathmtu; + if (transport->pathmtu) + return; + } + + if (transport->dst) + transport->pathmtu = sctp_dst_mtu(transport->dst); + else transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } @@ -290,6 +299,7 @@ void sctp_transport_route(struct sctp_transport *transport, struct sctp_association *asoc = transport->asoc; struct sctp_af *af = transport->af_specific; + sctp_transport_dst_release(transport); af->get_dst(transport, saddr, &transport->fl, sctp_opt2sk(opt)); if (saddr) @@ -297,21 +307,14 @@ void sctp_transport_route(struct sctp_transport *transport, else af->get_saddr(opt, transport, &transport->fl); - if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { - return; - } - if (transport->dst) { - transport->pathmtu = SCTP_TRUNC4(dst_mtu(transport->dst)); + sctp_transport_pmtu(transport, sctp_opt2sk(opt)); - /* Initialize sk->sk_rcv_saddr, if the transport is the - * association's active path for getsockname(). - */ - if (asoc && (!asoc->peer.primary_path || - (transport == asoc->peer.active_path))) - opt->pf->to_sk_saddr(&transport->saddr, - asoc->base.sk); - } else - transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; + /* Initialize sk->sk_rcv_saddr, if the transport is the + * association's active path for getsockname(). + */ + if (transport->dst && asoc && + (!asoc->peer.primary_path || transport == asoc->peer.active_path)) + opt->pf->to_sk_saddr(&transport->saddr, asoc->base.sk); } /* Hold a reference to a transport. 
*/ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 544bab42f925..48530dab5c94 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -29,6 +29,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/smc.h> +#include <asm/ioctls.h> #include "smc.h" #include "smc_clc.h" @@ -45,11 +46,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group * creation */ -struct smc_lgr_list smc_lgr_list = { /* established link groups */ - .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), - .list = LIST_HEAD_INIT(smc_lgr_list.list), -}; - static void smc_tcp_listen_work(struct work_struct *); static void smc_set_keepalive(struct sock *sk, int val) @@ -192,8 +188,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); + spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); @@ -292,19 +290,28 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +/* register a new rmb, optionally send confirm_rkey msg to register with peer */ +static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, + bool conf_rkey) { /* register memory region for new rmb */ if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { rmb_desc->regerr = 1; return -EFAULT; } + if (!conf_rkey) + return 0; + /* exchange confirm_rkey msg with peer */ + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->regerr = 1; + return -EFAULT; + } return 0; } static int smc_clnt_conf_first_link(struct smc_sock *smc) { + struct net *net = sock_net(smc->clcsock->sk); struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; int rest; @@ -332,7 +339,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK response over RoCE fabric */ @@ -362,7 +369,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TCL; - link->state = SMC_LNK_ACTIVE; + smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); return 0; } @@ -370,10 +377,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - smc->conn.peer_conn_idx = clc->conn_idx; + int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + + smc->conn.peer_rmbe_idx = clc->rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); - smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smc_link_save_peer_info(struct smc_link *link, @@ -386,160 +396,186 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } -/* setup for RDMA connection of client */ -static int smc_connect_rdma(struct smc_sock *smc) +/* fall back during connect */ +static int smc_connect_fallback(struct smc_sock *smc) { - struct smc_clc_msg_accept_confirm 
aclc; - int local_contact = SMC_FIRST_CONTACT; - struct smc_ib_device *smcibdev; - struct smc_link *link; - u8 srv_first_contact; - int reason_code = 0; - int rc = 0; - u8 ibport; + smc->use_fallback = true; + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + return 0; +} - sock_hold(&smc->sk); /* sock put in passive closing */ +/* decline and fall back during connect */ +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) +{ + int rc; - if (!tcp_sk(smc->clcsock->sk)->syn_smc) { - /* peer has not signalled SMC-capability */ - smc->use_fallback = true; - goto out_connected; + if (reason_code < 0) /* error, fallback is not possible */ + return reason_code; + if (reason_code != SMC_CLC_DECL_REPLY) { + rc = smc_clc_send_decline(smc, reason_code); + if (rc < 0) + return rc; } + return smc_connect_fallback(smc); +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* abort connecting */ +static int smc_connect_abort(struct smc_sock *smc, int reason_code, + int local_contact) +{ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(smc->conn.lgr); + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); + if (reason_code < 0 && smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return reason_code; +} + +/* check if there is a rdma device available for this connection. */ +/* called for connect and listen */ +static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, + u8 *ibport) +{ + int reason_code = 0; /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { + smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); + if (!(*ibdev)) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + + return reason_code; +} + +/* CLC handshake during connect */ +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int rc = 0; /* do inband token exchange */ - reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) /* configuration error */ - goto decline_rdma; + rc = smc_clc_send_proposal(smc, ibdev, ibport); + if (rc) + return rc; /* receive SMC Accept CLC message */ - reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), - SMC_CLC_ACCEPT); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) - goto decline_rdma; + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int local_contact = SMC_FIRST_CONTACT; + struct smc_link *link; + int reason_code = 0; - srv_first_contact = aclc.hdr.flag; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, - srv_first_contact); + local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, + aclc->hdr.flag); if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) + if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient 
memory*/ - else if (rc == -ENOLINK) + else if (local_contact == -ENOLINK) reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ - goto decline_rdma_unlock; + else + reason_code = SMC_CLC_DECL_INTERR; /* other error */ + return smc_connect_abort(smc, reason_code, 0); } link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; - smc_conn_save_peer_info(smc, &aclc); + smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - rc = smc_buf_create(smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(smc)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &aclc); + smc_link_save_peer_info(link, aclc); - rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); smc_close_init(smc); smc_rx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_ib_ready_link(link)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } else { - if (!smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } - } + if (!smc->conn.rmb_desc->reused && + smc_reg_rmb(link, smc->conn.rmb_desc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } smc_rmb_sync_sg_for_device(&smc->conn); - rc = smc_clc_send_confirm(smc); - if (rc) - goto out_err_unlock; + reason_code = smc_clc_send_confirm(smc); + if (reason_code) + return smc_connect_abort(smc, reason_code, local_contact); + + smc_tx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); - if (reason_code < 0) { - rc = reason_code; - goto out_err_unlock; - } - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + return smc_connect_abort(smc, reason_code, + local_contact); } - mutex_unlock(&smc_create_lgr_pending); - smc_tx_init(smc); -out_connected: smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; - return rc ? 
rc : local_contact; + return 0; +} -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc->use_fallback = true; - if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code); - if (rc < 0) - goto out_err; - } - goto out_connected; +/* perform steps before actually connecting */ +static int __smc_connect(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm aclc; + struct smc_ib_device *ibdev; + int rc = 0; + u8 ibport; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -out_err: - if (smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ - return rc; + sock_hold(&smc->sk); /* sock put in passive closing */ + + if (smc->use_fallback) + return smc_connect_fallback(smc); + + /* if peer has not signalled SMC-capability, fall back */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) + return smc_connect_fallback(smc); + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); + + /* check if a RDMA device is available; if not, fall back */ + if (smc_check_rdma(smc, &ibdev, &ibport)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + + /* perform CLC handshake */ + rc = smc_connect_clc(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + /* connect using rdma */ + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + return 0; } static int smc_connect(struct socket *sock, struct sockaddr *addr, @@ -575,8 +611,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc) goto out; - /* setup RDMA connection */ - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) goto out; else @@ -716,6 +751,7 @@ void smc_close_non_accepted(struct sock *sk) static int smc_serv_conf_first_link(struct smc_sock *smc) { + struct net *net = sock_net(smc->clcsock->sk); struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; int rest; @@ -723,7 +759,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link = &lgr->lnk[SMC_SINGLE_LINK]; - if (smc_reg_rmb(link, smc->conn.rmb_desc)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -768,184 +804,244 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return rc; } - link->state = SMC_LNK_ACTIVE; + smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); return 0; } -/* setup for RDMA connection of server */ -static void smc_listen_work(struct work_struct *work) +/* listen worker: finish */ +static void smc_listen_out(struct smc_sock *new_smc) { - struct smc_sock *new_smc = container_of(work, struct smc_sock, - smc_listen_work); - struct smc_clc_msg_proposal_prefix *pclc_prfx; - struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; - struct smc_clc_msg_accept_confirm cclc; - int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *smcibdev; - u8 buf[SMC_CLC_MAX_LEN]; - struct smc_link *link; - int reason_code = 0; - int rc = 0; - u8 ibport; - /* check if 
peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; - goto out_connected; + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); } + release_sock(&lsmc->sk); - /* do inband token exchange - - *wait for and receive SMC Proposal CLC message - */ - reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), - SMC_CLC_PROPOSAL); - if (reason_code < 0) - goto out_err; - if (reason_code > 0) - goto decline_rdma; + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(new_smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* listen worker: finish in state connected */ +static void smc_listen_out_connected(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; - /* PNET table look up: search active ib_device and port - * within same PNETID that also contains the ethernet device - * used for the internal TCP socket - */ - smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; + sk_refcnt_debug_inc(newsmcsk); + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; + + smc_listen_out(new_smc); +} + +/* listen worker: finish in error state */ +static void smc_listen_out_err(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; + + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ + newsmcsk->sk_state = SMC_CLOSED; + smc_conn_free(&new_smc->conn); + + smc_listen_out(new_smc); +} + +/* listen worker: decline and fall back if possible */ +static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, + int local_contact) +{ + /* RDMA setup failed, switch back to TCP */ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + if (reason_code < 0) { /* error, no fallback possible */ + smc_listen_out_err(new_smc); + return; } + smc_conn_free(&new_smc->conn); + new_smc->use_fallback = true; + if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { + if (smc_clc_send_decline(new_smc, reason_code) < 0) { + smc_listen_out_err(new_smc); + return; + } + } + smc_listen_out_connected(new_smc); +} + +/* listen worker: check prefixes */ +static int smc_listen_rdma_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct socket *newclcsock = new_smc->clcsock; - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (smc_clc_prfx_match(newclcsock, pclc_prfx)) + return SMC_CLC_DECL_CNFERR; - rc = smc_clc_prfx_match(newclcsock, pclc_prfx); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + return 0; +} +/* listen worker: initialize connection and buffers */ +static int smc_listen_rdma_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_ib_device *ibdev, u8 ibport, + int *local_contact) +{ /* allocate connection / link group */ - mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, - 0); - if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* 
insufficient memory*/ - goto decline_rdma_unlock; + *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ } - link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; /* create send buffer and rmb */ - rc = smc_buf_create(new_smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(new_smc)) + return SMC_CLC_DECL_MEM; - smc_close_init(new_smc); - smc_rx_init(new_smc); + return 0; +} + +/* listen worker: register buffers */ +static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_INTERR; } } smc_rmb_sync_sg_for_device(&new_smc->conn); - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) - goto out_err_unlock; + return 0; +} + +/* listen worker: finish RDMA setup */ +static void smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + int reason_code = 0; - /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); - if (reason_code < 0) - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; - smc_conn_save_peer_info(new_smc, &cclc); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &cclc); + smc_link_save_peer_info(link, cclc); - rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); - if (rc) { + if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { + if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) - /* peer is not aware of a problem */ - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + goto decline; } + return; - smc_tx_init(new_smc); +decline: mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); +} -out_connected: - sk_refcnt_debug_inc(newsmcsk); - if (newsmcsk->sk_state == SMC_INIT) - newsmcsk->sk_state = SMC_ACTIVE; -enqueue: - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); - if (lsmc->sk.sk_state == SMC_LISTEN) { - smc_accept_enqueue(&lsmc->sk, newsmcsk); - } else { /* no longer listening */ - smc_close_non_accepted(newsmcsk); +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_proposal *pclc; + struct smc_ib_device *ibdev; + u8 buf[SMC_CLC_MAX_LEN]; + int local_contact = 0; + int reason_code = 0; + int rc = 0; + u8 ibport; + + if (new_smc->use_fallback) { + smc_listen_out_connected(new_smc); + return; } - 
release_sock(&lsmc->sk); - /* Wake up accept */ - lsmc->sk.sk_data_ready(&lsmc->sk); - sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ - return; + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + new_smc->use_fallback = true; + smc_listen_out_connected(new_smc); + return; + } -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; - if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - if (smc_clc_send_decline(new_smc, reason_code) < 0) - goto out_err; + /* do inband token exchange - + * wait for and receive SMC Proposal CLC message + */ + pclc = (struct smc_clc_msg_proposal *)&buf; + reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL); + if (reason_code) { + smc_listen_decline(new_smc, reason_code, 0); + return; } - goto out_connected; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); + return; + } + + mutex_lock(&smc_create_lgr_pending); + smc_close_init(new_smc); + smc_rx_init(new_smc); + smc_tx_init(new_smc); + + /* check if RDMA is available */ + if (smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact)) { + /* SMC not supported, decline */ + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); + return; + } + + /* send SMC Accept CLC message */ + rc = smc_clc_send_accept(new_smc, local_contact); + if (rc) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, rc, local_contact); + return; + } + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); + return; + } + + /* finish worker */ + smc_listen_rdma_finish(new_smc, &cclc, local_contact); + smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); -out_err: - if (newsmcsk->sk_state == SMC_INIT) - sock_put(&new_smc->sk); /* passive closing */ - newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); - goto enqueue; /* queue new sock with sk_err set */ + smc_listen_out_connected(new_smc); } static void smc_tcp_listen_work(struct work_struct *work) @@ -965,7 +1061,7 @@ static void smc_tcp_listen_work(struct work_struct *work) continue; new_smc->listen_smc = lsmc; - new_smc->use_fallback = false; /* assume rdma capability first*/ + new_smc->use_fallback = lsmc->use_fallback; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1001,7 +1097,8 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); - tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (!smc->use_fallback) + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1034,6 +1131,7 @@ static int smc_accept(struct socket *sock, struct socket 
*new_sock, if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; + release_sock(sk); goto out; } @@ -1061,9 +1159,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (!rc) rc = sock_error(nsk); + release_sock(sk); + if (rc) + goto out; + + if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + /* wait till data arrives on the socket */ + timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * + MSEC_PER_SEC); + if (smc_sk(nsk)->use_fallback) { + struct sock *clcsk = smc_sk(nsk)->clcsock->sk; + + lock_sock(clcsk); + if (skb_queue_empty(&clcsk->sk_receive_queue)) + sk_wait_data(clcsk, &timeo, NULL); + release_sock(clcsk); + } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { + lock_sock(nsk); + smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + release_sock(nsk); + } + } out: - release_sock(sk); sock_put(sk); /* sock_hold above */ return rc; } @@ -1094,6 +1212,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) goto out; + + if (msg->msg_flags & MSG_FASTOPEN) { + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + rc = -EINVAL; + goto out; + } + } + if (smc->use_fallback) rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); else @@ -1122,10 +1250,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto out; } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); - else - rc = smc_rx_recvmsg(smc, msg, len, flags); + } else { + msg->msg_namelen = 0; + rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + } out: release_sock(sk); @@ -1172,7 +1302,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_state == SMC_INIT && mask & EPOLLOUT && smc->clcsock->sk->sk_state != TCP_CLOSE) { - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) mask |= EPOLLERR; /* success cases including fallback */ @@ -1273,14 +1403,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct smc_sock *smc; + int val, rc; smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ - return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); + if (smc->clcsock->sk->sk_err) { + sk->sk_err = smc->clcsock->sk->sk_err; + sk->sk_error_report(sk); + } + if (rc) + return rc; + + if (optlen < sizeof(int)) + return rc; + get_user(val, (int __user *)optval); + + lock_sock(sk); + switch (optname) { + case TCP_ULP: + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: + case TCP_FASTOPEN_NO_COOKIE: + /* option not supported by SMC */ + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + if (!smc->use_fallback) + rc = -EINVAL; + } + break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (val && !smc->use_fallback) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (!val && !smc->use_fallback) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_DEFER_ACCEPT: + smc->sockopt_defer_accept = val; + break; + default: + break; + } + release_sock(sk); + + return rc; } static int smc_getsockopt(struct socket *sock, int level, int optname, @@ 
-1298,12 +1478,38 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct smc_sock *smc; + int answ; smc = smc_sk(sock->sk); - if (smc->use_fallback) + if (smc->use_fallback) { + if (!smc->clcsock) + return -EBADF; return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); - else - return sock_no_ioctl(sock, cmd, arg); + } + switch (cmd) { + case SIOCINQ: /* same as FIONREAD */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + answ = atomic_read(&smc->conn.bytes_to_rcv); + break; + case SIOCOUTQ: + /* output queue size (not send + not acked) */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + answ = smc->conn.sndbuf_desc->len - + atomic_read(&smc->conn.sndbuf_space); + break; + case SIOCOUTQNSD: + /* output queue size (not send only) */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + answ = smc_tx_prepared_sends(&smc->conn); + break; + default: + return -ENOIOCTLCMD; + } + + return put_user(answ, (int __user *)arg); } static ssize_t smc_sendpage(struct socket *sock, struct page *page, @@ -1330,9 +1536,15 @@ out: return rc; } +/* Map the affected portions of the rmbe into an spd, note the number of bytes + * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor + * updates till whenever a respective page has been fully processed. + * Note that subsequent recv() calls have to wait till all splice() processing + * completed. + */ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, - unsigned int flags) + unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1340,16 +1552,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + + if (sk->sk_state == SMC_INIT || + sk->sk_state == SMC_LISTEN || + sk->sk_state == SMC_CLOSED) goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; + goto out; + } + if (smc->use_fallback) { rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, pipe, len, flags); } else { - rc = -EOPNOTSUPP; + if (*ppos) { + rc = -ESPIPE; + goto out; + } + if (flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; + rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: release_sock(sk); + return rc; } @@ -1482,18 +1712,7 @@ out_pnet: static void __exit smc_exit(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); - - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - cancel_delayed_work_sync(&lgr->free_work); - smc_lgr_free(lgr); /* free link group */ - } + smc_core_exit(); static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h index e4829a2f46ba..a1467e411645 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -118,7 +118,7 @@ struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ u32 alert_token_local; /* unique conn. 
id */ - u8 peer_conn_idx; /* from tcp handshake */ + u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ atomic_t peer_rmbe_space;/* remaining free bytes in peer * rmbe @@ -126,9 +126,7 @@ struct smc_connection { int rtoken_idx; /* idx to peer RMB rkey/addr */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ - int sndbuf_size; /* sndbuf size <== sock wmem */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size; /* RMBE size <== sock rmem */ int rmbe_size_short;/* compressed notation */ int rmbe_update_limit; /* lower limit for consumer @@ -153,6 +151,7 @@ struct smc_connection { u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ + u32 tx_off; /* base offset in peer rmb */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt @@ -164,6 +163,9 @@ struct smc_connection { atomic_t bytes_to_rcv; /* arrived data, * not yet received */ + atomic_t splice_pending; /* number of spliced bytes + * pending processing + */ #ifndef KERNEL_HAS_ATOMIC64 spinlock_t acurs_lock; /* protect cursors */ #endif @@ -180,6 +182,10 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent @@ -214,41 +220,6 @@ static inline u32 ntoh24(u8 *net) return be32_to_cpu(t); } -#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ - -#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ -/* theoretically, the RFC states that largest size would be 512K, - * i.e. compressed 5 and thus 6 sizes (0..5), despite - * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) - */ - -/* convert the RMB size into the compressed notation - minimum 16K. - * In contrast to plain ilog2, this rounds towards the next power of 2, - * so the socket application gets at least its desired sndbuf / rcvbuf size. 
- */ -static inline u8 smc_compress_bufsize(int size) -{ - u8 compressed; - - if (size <= SMC_BUF_MIN_SIZE) - return 0; - - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; - return compressed; -} - -/* convert the RMB size from compressed notation into integer */ -static inline int smc_uncompress_bufsize(u8 compressed) -{ - u32 size; - - size = 0x00000001 << (((int)compressed) + 14); - return (int)size; -} - #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { @@ -262,12 +233,6 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif -struct smc_clc_msg_local; - -void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, - struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b42395d24cba..8d2c079c87b0 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(cdcpend->conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { - diff = smc_curs_diff(cdcpend->conn->sndbuf_size, + diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in smc_sendmsg */ smp_mb__before_atomic(); atomic_add(diff, &cdcpend->conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_write(&cdcpend->conn->tx_curs_fin, smc_curs_read(&cdcpend->cursor, cdcpend->conn), @@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, @@ -165,19 +165,12 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2) } static void smc_cdc_msg_recv_action(struct smc_sock *smc, - struct smc_link *link, struct smc_cdc_msg *cdc) { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; int diff_cons, diff_prod; - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; - } smc_curs_write(&prod_old, smc_curs_read(&conn->local_rx_ctrl.prod, conn), conn); @@ -198,13 +191,13 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smp_mb__after_atomic(); } - diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, + diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, &conn->local_rx_ctrl.prod); if (diff_prod) { /* bytes_to_rcv is decreased in smc_recvmsg */ smp_mb__before_atomic(); atomic_add(diff_prod, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); smc->sk.sk_data_ready(&smc->sk); } else if 
((conn->local_rx_ctrl.prod_flags.write_blocked) || @@ -236,26 +229,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* called under tasklet context */ -static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, - struct smc_link *link, u64 wr_id) +static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - struct smc_connection *connection; - struct smc_sock *smc; - - /* lookup connection */ - read_lock_bh(&lgr->conns_lock); - connection = smc_lgr_find_conn(ntohl(cdc->token), lgr); - if (!connection) { - read_unlock_bh(&lgr->conns_lock); - return; - } - smc = container_of(connection, struct smc_sock, conn); sock_hold(&smc->sk); - read_unlock_bh(&lgr->conns_lock); bh_lock_sock(&smc->sk); - smc_cdc_msg_recv_action(smc, link, cdc); + smc_cdc_msg_recv_action(smc, cdc); bh_unlock_sock(&smc->sk); sock_put(&smc->sk); /* no free sk in softirq-context */ } @@ -266,12 +244,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_cdc_msg *cdc = buf; + struct smc_connection *conn; + struct smc_link_group *lgr; + struct smc_sock *smc; if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) return; /* short message */ if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ - smc_cdc_msg_recv(cdc, link, wc->wr_id); + + /* lookup connection */ + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + read_lock_bh(&lgr->conns_lock); + conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); + read_unlock_bh(&lgr->conns_lock); + if (!conn) + return; + smc = container_of(conn, struct smc_sock, conn); + + if (!cdc->prod_flags.failover_validation) { + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + } + smc_cdc_msg_recv(smc, cdc); } static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index ab240b37ad11..d2012fd22100 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,7 +48,7 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __aligned(8); +} __packed; /* format defined in RFC7609 */ static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3a988c22f627..717449b1da0b 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -316,7 +316,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { - smc->conn.lgr->sync_err = true; + smc->conn.lgr->sync_err = 1; smc_lgr_terminate(smc->conn.lgr); } } @@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; @@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE 
*/ + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 63bf1dc2c1f9..41ff9ea96139 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_local lcl; u8 qpn[3]; /* QP number */ __be32 rmb_rkey; /* RMB rkey */ - u8 conn_idx; /* Connection index, which RMBE in RMB */ + u8 rmbe_idx; /* Index of RMBE in RMB */ __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d4bd01bb44e1..1e5c0e90a706 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -30,10 +30,14 @@ #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) -static u32 smc_lgr_num; /* unique link group number */ +static struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = LIST_HEAD_INIT(smc_lgr_list.list), + .num = 0, +}; -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb); +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc); static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { @@ -148,8 +152,11 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); - if (!delayed_work_pending(&lgr->free_work)) + if (!delayed_work_pending(&lgr->free_work)) { + if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); smc_lgr_free(lgr); + } } /* create a new SMC link group */ @@ -169,7 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc, goto out; } lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; - lgr->sync_err = false; + lgr->sync_err = 0; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); @@ -178,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc, INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } - smc_lgr_num += SMC_LGR_NUM_INCR; - memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); + smc_lgr_list.num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; @@ -194,9 +201,12 @@ static int smc_lgr_create(struct smc_sock *smc, smc_ib_setup_per_ibdev(smcibdev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_wr_alloc_link_mem(lnk); + rc = smc_llc_link_init(lnk); if (rc) goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; @@ -206,10 +216,6 @@ static int smc_lgr_create(struct smc_sock *smc, rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; - init_completion(&lnk->llc_confirm); - init_completion(&lnk->llc_confirm_resp); - init_completion(&lnk->llc_add); - init_completion(&lnk->llc_add_resp); smc->conn.lgr = lgr; rwlock_init(&lgr->conns_lock); @@ -224,6 +230,8 @@ dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); free_lgr: kfree(lgr); out: @@ -232,26 +240,21 @@ out: static void smc_buf_unuse(struct smc_connection *conn) { - if (conn->sndbuf_desc) { + if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - conn->sndbuf_size = 0; - } if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; - conn->rmbe_size = 0; } else { /* buf registration failed, reuse not possible */ struct smc_link_group *lgr = conn->lgr; - struct smc_link *lnk; write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - smc_buf_free(conn->rmb_desc, lnk, true); + smc_buf_free(lgr, true, conn->rmb_desc); } } } @@ -269,6 +272,7 @@ void smc_conn_free(struct smc_connection *conn) static void smc_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; + smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); @@ -276,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb) +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + if (is_rmb) { if (buf_desc->mr_rx[SMC_SINGLE_LINK]) smc_ib_put_memory_region( @@ -290,14 +296,13 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, DMA_TO_DEVICE); } sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); - if (buf_desc->cpu_addr) - free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order); + if (buf_desc->pages) + __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; @@ -310,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) 
list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); } } } @@ -347,7 +352,11 @@ void smc_lgr_terminate(struct smc_link_group *lgr) struct smc_sock *smc; struct rb_node *node; + if (lgr->terminating) + return; /* lgr already terminating */ + lgr->terminating = 1; smc_lgr_forget(lgr); + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -368,13 +377,26 @@ void smc_lgr_terminate(struct smc_link_group *lgr) smc_lgr_schedule_free_work(lgr); } +/* Called when IB port is terminated */ +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + smc_lgr_terminate(lgr); + } +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); - int rc = 0; + struct net_device *ndev; + int i, nest_lvl, rc = 0; *vlan_id = 0; if (!dst) { @@ -386,8 +408,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) goto out_rel; } - if (is_vlan_dev(dst->dev)) - *vlan_id = vlan_dev_vlan_id(dst->dev); + ndev = dst->dev; + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + goto out_rel; + } + + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + break; + } + } + rtnl_unlock(); out_rel: dst_release(dst); @@ -432,10 +473,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_clc_msg_local *lcl, int srv_first_contact) { struct smc_connection *conn = &smc->conn; + int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; unsigned short vlan_id; enum smc_lgr_role role; - int local_contact = SMC_FIRST_CONTACT; int rc = 0; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; @@ -501,14 +542,39 @@ out: return rc ? rc : local_contact; } +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. 
+ */ +static u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static inline -struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, - int compressed_bufsize, - rwlock_t *lock, - struct list_head *buf_list) +static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, + rwlock_t *lock, + struct list_head *buf_list) { struct smc_buf_desc *buf_slot; @@ -544,23 +610,23 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->cpu_addr = - (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | - __GFP_NORETRY | __GFP_ZERO, - get_order(bufsize)); - if (!buf_desc->cpu_addr) { + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (!buf_desc->pages) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } - buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, GFP_KERNEL); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, @@ -571,7 +637,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(-EAGAIN); } @@ -582,19 +648,20 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, IB_ACCESS_LOCAL_WRITE, buf_desc); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } } + buf_desc->len = bufsize; return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; @@ -622,7 +689,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) continue; /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ @@ -646,14 +713,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size = bufsize; conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); } else { conn->sndbuf_desc = buf_desc; - conn->sndbuf_size = bufsize; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } @@ -709,8 +774,7 @@ int smc_buf_create(struct smc_sock *smc) /* create rmb */ rc = __smc_buf_create(smc, true); if (rc) - smc_buf_free(smc->conn.sndbuf_desc, - &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false); + smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; } @@ -777,3 +841,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return conn->rtoken_idx; return 0; } + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_freeing_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!list_empty(&smc_lgr_list.list)) + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + spin_unlock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { + list_del_init(&lgr->list); + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); /* free link group */ + } +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5dfcb15d529f..93cb3523bf50 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -23,10 +23,9 @@ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ + u32 num; /* unique link group number */ }; -extern struct smc_lgr_list smc_lgr_list; /* list of link groups */ - enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ @@ -79,6 +78,7 @@ struct smc_link { dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ + unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ @@ -95,12 +95,18 @@ struct smc_link { u8 link_id; /* unique # 
within link group */ enum smc_link_state state; /* state of link */ + struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ int llc_confirm_resp_rc; /* rc from conf_resp msg */ struct completion llc_add; /* wait for rx of add link */ struct completion llc_add_resp; /* wait for rx of add link rsp*/ + struct delayed_work llc_testlink_wrk; /* testlink worker */ + struct completion llc_testlink_resp; /* wait for rx of testlink */ + int llc_testlink_time; /* testlink interval */ + struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ + int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -116,6 +122,8 @@ struct smc_link { struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ + struct page *pages; + int len; /* length of buffer */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: memory region @@ -133,6 +141,12 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_LGR_ID_SIZE 4 +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ struct smc_link_group { struct list_head list; @@ -158,7 +172,8 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ - bool sync_err; /* lgr no longer fits to peer */ + u8 sync_err : 1; /* lgr no longer fits to peer */ + u8 terminating : 1;/* lgr is terminating */ }; /* Find the connection associated with the given alert token in the link group. 
@@ -196,11 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn( struct smc_sock; struct smc_clc_msg_accept_confirm; +struct smc_clc_msg_local; void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); int smc_buf_create(struct smc_sock *smc); +int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); @@ -209,4 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact); +void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 427b91c1c964..839354402215 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - r->diag_family = sk->sk_family; if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; sock_diag_save_cookie(sk, r->id.idiag_cookie); - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; - r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; + if (sk->sk_protocol == SMCPROTO_SMC) { + r->diag_family = PF_INET; + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_protocol == SMCPROTO_SMC6) { + r->diag_family = PF_INET6; + memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, + sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); + memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, + sizeof(smc->clcsock->sk->sk_v6_daddr)); +#endif + } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, @@ -91,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, - .sndbuf_size = conn->sndbuf_size, - .rmbe_size = conn->rmbe_size, + .sndbuf_size = conn->sndbuf_desc ? + conn->sndbuf_desc->len : 0, + .rmbe_size = conn->rmb_desc ? 
conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, @@ -153,7 +164,8 @@ errout: return -EMSGSIZE; } -static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, + struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *bc = NULL; @@ -161,8 +173,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct sock *sk; int rc = 0; - read_lock(&smc_proto.h.smc_hash->lock); - head = &smc_proto.h.smc_hash->ht; + read_lock(&prot->h.smc_hash->lock); + head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; @@ -175,7 +187,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } out: - read_unlock(&smc_proto.h.smc_hash->lock); + read_unlock(&prot->h.smc_hash->lock); + return rc; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int rc = 0; + + rc = smc_diag_dump_proto(&smc_proto, skb, cb); + if (!rc) + rc = smc_diag_dump_proto(&smc_proto6, skb, cb); return rc; } diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 26df554f7588..0eed7ab9f28b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -143,17 +143,6 @@ out: return rc; } -static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - smc_lgr_terminate(lgr); - } -} - /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work) smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) - smc_ib_port_terminate(smcibdev, port_idx + 1); + smc_port_terminate(smcibdev, port_idx + 1); } } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ea4b21981b4b..5800a6b43d83 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -214,12 +214,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], return rc; } -/* send ADD LINK request or response */ -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, - enum smc_llc_reqresp reqresp) +/* send LLC confirm rkey request */ +static int smc_llc_send_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) { - struct smc_llc_msg_add_link *addllc; + struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; @@ -227,7 +226,25 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) return rc; - addllc = (struct smc_llc_msg_add_link *)wr_buf; + rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; + rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + rkeyllc->rtoken[0].rmb_key = + htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* prepare an add link message */ +static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, + struct 
smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ memset(addllc, 0, sizeof(*addllc)); addllc->hd.common.type = SMC_LLC_ADD_LINK; addllc->hd.length = sizeof(struct smc_llc_msg_add_link); @@ -239,16 +256,14 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], } memcpy(addllc->sender_mac, mac, ETH_ALEN); memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - return rc; } -/* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp) +/* send ADD LINK request or response */ +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) { - struct smc_llc_msg_del_link *delllc; + struct smc_llc_msg_add_link *addllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; @@ -256,7 +271,18 @@ int smc_llc_send_delete_link(struct smc_link *link, rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) return rc; - delllc = (struct smc_llc_msg_del_link *)wr_buf; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* prepare a delete link message */ +static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, + struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ memset(delllc, 0, sizeof(*delllc)); delllc->hd.common.type = SMC_LLC_DELETE_LINK; delllc->hd.length = sizeof(struct smc_llc_msg_add_link); @@ -266,14 +292,29 @@ int smc_llc_send_delete_link(struct smc_link *link, delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc->link_num = link->link_id; +} + +/* send DELETE LINK request or response */ +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_del_link *delllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + delllc = (struct smc_llc_msg_del_link *)wr_buf; + smc_llc_prep_delete_link(delllc, link, reqresp); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* send LLC test link request or response */ -int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], - enum smc_llc_reqresp reqresp) +/* send LLC test link request */ +static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) { struct smc_llc_msg_test_link *testllc; struct smc_wr_tx_pend_priv *pend; @@ -287,28 +328,52 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], memset(testllc, 0, sizeof(*testllc)); testllc->hd.common.type = SMC_LLC_TEST_LINK; testllc->hd.length = sizeof(struct smc_llc_msg_test_link); - if (reqresp == SMC_LLC_RESP) - testllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* send a prepared message */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +struct smc_llc_send_work { + struct work_struct work; + struct smc_link *link; + int llclen; + union smc_llc_msg llcbuf; +}; + +/* worker that sends a prepared message */ +static void smc_llc_send_message_work(struct work_struct *work) { + struct smc_llc_send_work *llcwrk = container_of(work, + struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct 
smc_wr_buf *wr_buf; int rc; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (llcwrk->link->state == SMC_LNK_INACTIVE) + goto out; + rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); if (rc) - return rc; - memcpy(wr_buf, llcbuf, llclen); - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - return rc; + goto out; + memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); + smc_wr_tx_send(llcwrk->link, pend); +out: + kfree(llcwrk); +} + +/* copy llcbuf and schedule an llc send on link */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +{ + struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + + if (!wrk) + return -ENOMEM; + INIT_WORK(&wrk->work, smc_llc_send_message_work); + wrk->link = link; + wrk->llclen = llclen; + memcpy(&wrk->llcbuf, llcbuf, llclen); + queue_work(link->llc_wq, &wrk->work); + return 0; } /********************************* receive ***********************************/ @@ -359,17 +424,18 @@ static void smc_llc_rx_add_link(struct smc_link *link, } if (lgr->role == SMC_SERV) { - smc_llc_send_add_link(link, + smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], &link->smcibdev->gid[link->ibport - 1], SMC_LLC_REQ); } else { - smc_llc_send_add_link(link, + smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], &link->smcibdev->gid[link->ibport - 1], SMC_LLC_RESP); } + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -385,9 +451,11 @@ static void smc_llc_rx_delete_link(struct smc_link *link, } else { if (lgr->role == SMC_SERV) { smc_lgr_forget(lgr); - smc_llc_send_delete_link(link, SMC_LLC_REQ); + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ); + smc_llc_send_message(link, llc, sizeof(*llc)); } else { - smc_llc_send_delete_link(link, SMC_LLC_RESP); + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP); + smc_llc_send_message(link, llc, sizeof(*llc)); smc_lgr_terminate(lgr); } } @@ -397,9 +465,11 @@ static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); } else { - smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP); + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -412,7 +482,9 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + link->llc_confirm_rkey_rc = llc->hd.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey); } else { rc = smc_rtoken_add(lgr, llc->rtoken[0].rmb_vaddr, @@ -423,7 +495,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, llc->hd.flags |= SMC_LLC_FLAG_RESP; if (rc < 0) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -435,7 +507,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, } else { /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -463,7 +535,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + 
smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -476,6 +548,8 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ + if (link->state == SMC_LNK_INACTIVE) + return; /* link not active, drop msg */ switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -502,6 +576,100 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) } } +/***************************** worker, utils *********************************/ + +static void smc_llc_testlink_work(struct work_struct *work) +{ + struct smc_link *link = container_of(to_delayed_work(work), + struct smc_link, llc_testlink_wrk); + unsigned long next_interval; + struct smc_link_group *lgr; + unsigned long expire_time; + u8 user_data[16] = { 0 }; + int rc; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + if (link->state != SMC_LNK_ACTIVE) + return; /* don't reschedule worker */ + expire_time = link->wr_rx_tstamp + link->llc_testlink_time; + if (time_is_after_jiffies(expire_time)) { + next_interval = expire_time - jiffies; + goto out; + } + reinit_completion(&link->llc_testlink_resp); + smc_llc_send_test_link(link, user_data); + /* receive TEST LINK response over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, + SMC_LLC_WAIT_TIME); + if (rc <= 0) { + smc_lgr_terminate(lgr); + return; + } + next_interval = link->llc_testlink_time; +out: + queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, + next_interval); +} + +int smc_llc_link_init(struct smc_link *link) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, + *((u32 *)lgr->id), + link->link_id); + if (!link->llc_wq) + return -ENOMEM; + init_completion(&link->llc_confirm); + init_completion(&link->llc_confirm_resp); + init_completion(&link->llc_add); + init_completion(&link->llc_add_resp); + init_completion(&link->llc_confirm_rkey); + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; +} + +void smc_llc_link_active(struct smc_link *link, int testlink_time) +{ + link->state = SMC_LNK_ACTIVE; + if (testlink_time) { + link->llc_testlink_time = testlink_time * HZ; + queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, + link->llc_testlink_time); + } +} + +/* called in tasklet context */ +void smc_llc_link_inactive(struct smc_link *link) +{ + link->state = SMC_LNK_INACTIVE; + cancel_delayed_work(&link->llc_testlink_wrk); +} + +/* called in worker context */ +void smc_llc_link_clear(struct smc_link *link) +{ + flush_workqueue(link->llc_wq); + destroy_workqueue(link->llc_wq); +} + +/* register a new rtoken at the remote peer */ +int smc_llc_do_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + int rc; + + reinit_completion(&link->llc_confirm_rkey); + smc_llc_send_confirm_rkey(link, rmb_desc); + /* receive CONFIRM RKEY response from server over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, + SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_rc) + return -EFAULT; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index e4a7d5e234d5..65c8645e96a1 100644 --- a/net/smc/smc_llc.h +++ 
b/net/smc/smc_llc.h @@ -42,8 +42,12 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp); -int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], - enum smc_llc_reqresp reqresp); +int smc_llc_link_init(struct smc_link *link); +void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_inactive(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link); +int smc_llc_do_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index eff4e0d0bb31..290a434471d1 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -22,11 +22,10 @@ #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" -/* callback implementation for sk.sk_data_ready() - * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data(). +/* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). */ -static void smc_rx_data_ready(struct sock *sk) +static void smc_rx_wake_up(struct sock *sk) { struct socket_wq *wq; @@ -44,28 +43,140 @@ static void smc_rx_data_ready(struct sock *sk) rcu_read_unlock(); } +/* Update consumer cursor + * @conn connection to update + * @cons consumer cursor + * @len number of Bytes consumed + */ +static void smc_rx_update_consumer(struct smc_connection *conn, + union smc_host_cursor cons, size_t len) +{ + smc_curs_add(conn->rmb_desc->len, &cons, len); + smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), + conn); + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn); +} + +struct smc_spd_priv { + struct smc_sock *smc; + size_t len; +}; + +static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; + struct smc_sock *smc = priv->smc; + struct smc_connection *conn; + union smc_host_cursor cons; + struct sock *sk = &smc->sk; + + if (sk->sk_state == SMC_CLOSED || + sk->sk_state == SMC_PEERFINCLOSEWAIT || + sk->sk_state == SMC_APPFINCLOSEWAIT) + goto out; + conn = &smc->conn; + lock_sock(sk); + smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_rx_update_consumer(conn, cons, priv->len); + release_sock(sk); + if (atomic_sub_and_test(priv->len, &conn->splice_pending)) + smc_rx_wake_up(sk); +out: + kfree(priv); + put_page(buf->page); + sock_put(sk); +} + +static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations smc_pipe_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = smc_rx_pipe_buf_release, + .steal = smc_rx_pipe_buf_nosteal, + .get = generic_pipe_buf_get +}; + +static void smc_rx_spd_release(struct splice_pipe_desc *spd, + unsigned int i) +{ + put_page(spd->pages[i]); +} + +static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, + struct smc_sock *smc) +{ + struct splice_pipe_desc spd; + struct partial_page partial; + struct smc_spd_priv *priv; + struct page *page; + int bytes; + + page = virt_to_page(smc->conn.rmb_desc->cpu_addr); + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + priv->len = len; + priv->smc = smc; 
+ partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial.len = len; + partial.private = (unsigned long)priv; + + spd.nr_pages_max = 1; + spd.nr_pages = 1; + spd.pages = &page; + spd.partial = &partial; + spd.ops = &smc_pipe_ops; + spd.spd_release = smc_rx_spd_release; + + bytes = splice_to_pipe(pipe, &spd); + if (bytes > 0) { + sock_hold(&smc->sk); + get_page(smc->conn.rmb_desc->pages); + atomic_add(bytes, &smc->conn.splice_pending); + } + + return bytes; +} + +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv) && + !atomic_read(&conn->splice_pending); +} + /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int rc; - if (atomic_read(&conn->bytes_to_rcv)) + if (fcrit(conn)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - atomic_read(&conn->bytes_to_rcv) || + fcrit(conn) || smc_cdc_rxed_any_close_or_senddone(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); @@ -73,19 +184,25 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) return rc; } -/* rcvbuf consumer: main API called by socket layer. - * called under sk lock. +/* smc_rx_recvmsg - receive data from RMBE + * @msg: copy data to receive buffer + * @pipe: copy data to pipe if set - indicates splice() call + * + * rcvbuf consumer: main API called by socket layer. + * Called under sk lock. */ -int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, - int flags) +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags) { size_t copylen, read_done = 0, read_remaining = len; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; + int (*func)(struct smc_connection *conn); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; struct sock *sk; + int splbytes; long timeo; int target; /* Read at least these many bytes */ int rc; @@ -101,37 +218,32 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - msg->msg_namelen = 0; /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rmb_desc->cpu_addr; do { /* while (read_remaining) */ - if (read_done >= target) + if (read_done >= target || (pipe && read_done)) break; if (atomic_read(&conn->bytes_to_rcv)) goto copy; + if (sk->sk_shutdown & RCV_SHUTDOWN || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + break; + if (read_done) { if (sk->sk_err || sk->sk_state == SMC_CLOSED || - sk->sk_shutdown & RCV_SHUTDOWN || !timeo || - signal_pending(current) || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags. 
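The splice support built here (smc_rx_splice() wrapping the receive buffer page into a one-entry splice_pipe_desc) is consumed through the ordinary splice(2) interface on an AF_SMC socket. A minimal userspace sketch, illustrative only and with error handling trimmed, using a pipe as the intermediate buffer:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* move up to 'chunk' bytes per iteration from an SMC socket to out_fd,
 * going through a pipe so the receive side can take the splice path
 */
static long drain_smc_to_fd(int smc_sock, int out_fd, size_t chunk)
{
	long total = 0;
	ssize_t n;
	int pfd[2];

	if (pipe(pfd) < 0)
		return -1;
	while ((n = splice(smc_sock, NULL, pfd[1], NULL, chunk, 0)) > 0) {
		if (splice(pfd[0], NULL, out_fd, NULL, n, 0) != n)
			break;
		total += n;
	}
	close(pfd[0]);
	close(pfd[1]);
	return total;
}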
- peer_conn_abort) + signal_pending(current)) break; } else { if (sk->sk_err) { read_done = sock_error(sk); break; } - if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags. - peer_conn_abort) - break; if (sk->sk_state == SMC_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read @@ -150,32 +262,52 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, return -EAGAIN; } - if (!atomic_read(&conn->bytes_to_rcv)) { - smc_rx_wait_data(smc, &timeo); + if (!smc_rx_data_available(conn)) { + smc_rx_wait(smc, &timeo, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ - /* could be just 1 byte, even after smc_rx_wait_data above */ + /* could be just 1 byte, even after waiting on data above */ readable = atomic_read(&conn->bytes_to_rcv); + splbytes = atomic_read(&conn->splice_pending); + if (!readable || (msg && splbytes)) { + if (splbytes) + func = smc_rx_data_available_and_no_splice_pend; + else + func = smc_rx_data_available; + smc_rx_wait(smc, &timeo, func); + continue; + } + /* not more than what user space asked for */ copylen = min_t(size_t, read_remaining, readable); smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); + /* subsequent splice() calls pick up where previous left */ + if (splbytes) + smc_curs_add(conn->rmb_desc->len, &cons, splbytes); /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->rmbe_size - cons.count); + chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - + cons.count); chunk_len_sum = chunk_len; chunk_off = cons.count; smc_rmb_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { if (!(flags & MSG_TRUNC)) { - rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, - chunk_len); - if (rc) { + if (msg) { + rc = memcpy_to_msg(msg, rcvbuf_base + + chunk_off, + chunk_len); + } else { + rc = smc_rx_splice(pipe, rcvbuf_base + + chunk_off, chunk_len, + smc); + } + if (rc < 0) { if (!read_done) read_done = -EFAULT; smc_rmb_sync_sg_for_device(conn); @@ -196,18 +328,13 @@ copy: /* update cursors */ if (!(flags & MSG_PEEK)) { - smc_curs_add(conn->rmbe_size, &cons, copylen); /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); - smc_curs_write(&conn->local_tx_ctrl.cons, - smc_curs_read(&cons, conn), - conn); - /* send consumer cursor update if required */ - /* similar to advertising new TCP rcv_wnd if required */ - smc_tx_consumer_update(conn); + if (msg) + smc_rx_update_consumer(conn, cons, copylen); } } while (read_remaining); out: @@ -217,5 +344,6 @@ out: /* Initialize receive properties on connection establishment. NB: not __init! 
*/ void smc_rx_init(struct smc_sock *smc) { - smc->sk.sk_data_ready = smc_rx_data_ready; + smc->sk.sk_data_ready = smc_rx_wake_up; + atomic_set(&smc->conn.splice_pending, 0); } diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index 3a32b59bf06c..db823c97d824 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -18,7 +18,14 @@ #include "smc.h" void smc_rx_init(struct smc_sock *smc); -int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, - int flags); + +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags); +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)); +static inline int smc_rx_data_available(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv); +} #endif /* SMC_RX_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 72f004c9c9b1..1f4a38b857f0 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -19,6 +19,7 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_wr.h" @@ -26,6 +27,7 @@ #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ @@ -115,6 +117,13 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) return rc; } +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -171,8 +180,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->sndbuf_size - tx_cnt_prep); + chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - + tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; smc_sndbuf_sync_sg_for_cpu(conn); @@ -197,19 +206,28 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ - smc_curs_add(conn->sndbuf_size, &prep, copylen); + smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_write(&conn->tx_curs_prep, smc_curs_read(&prep, conn), conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - smc_tx_sndbuf_nonempty(conn); + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_desc->len >> 1))) + /* for a corked socket defer the RDMA writes if there + * is still sufficient sndbuf_space available + */ + schedule_delayed_work(&conn->tx_work, + SMC_TX_CORK_DELAY); + else + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; @@ -243,7 +261,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr.remote_addr = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + /* RMBE within RMB */ - ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + + conn->tx_off + /* offset within RMBE */ 
peer_rmbe_offset; rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; @@ -268,7 +286,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); - smc_curs_add(conn->sndbuf_size, sent, len); + smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; @@ -291,7 +309,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); /* cf. wmem_alloc - (snd_max - snd_una) */ - to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); + to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; @@ -333,12 +351,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len_sum = dst_len; src_off = sent.count; /* dst_len determines the maximum src_len */ - if (sent.count + dst_len <= conn->sndbuf_size) { + if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ - src_len = conn->sndbuf_size - sent.count; + src_len = conn->sndbuf_desc->len - sent.count; } src_len_sum = src_len; dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); @@ -350,8 +368,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; num_sges++; src_off += src_len; - if (src_off >= conn->sndbuf_size) - src_off -= conn->sndbuf_size; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ @@ -369,7 +387,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, - dst_len, conn->sndbuf_size - sent.count); + dst_len, conn->sndbuf_desc->len - sent.count); src_len_sum = src_len; } @@ -409,8 +427,8 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) } rc = 0; if (conn->alert_token_local) /* connection healthy */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } @@ -432,7 +450,7 @@ out_unlock: /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ -static void smc_tx_work(struct work_struct *work) +void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, @@ -466,11 +484,11 @@ void smc_tx_consumer_update(struct smc_connection *conn) smc_curs_write(&cfed, smc_curs_read(&conn->rx_curs_confirmed, conn), conn); - to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); + to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmbe_size / 2)) || + ((to_confirm > (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection healthy */ @@ -494,6 +512,4 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { 
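The corking change above defers the RDMA write only while MSG_MORE is set (or the internal TCP socket is corked, per smc_tx_is_corked()) and more than half of the send buffer is still free, holding data back for at most SMC_TX_CORK_DELAY. A short userspace sketch of the usage pattern this is aimed at, illustrative only:

#include <sys/types.h>
#include <sys/socket.h>

/* header and body are handed over separately; MSG_MORE on the first
 * send lets the stack batch both into a single transmission
 */
static int send_hdr_and_body(int smc_sock, const void *hdr, size_t hlen,
			     const void *body, size_t blen)
{
	if (send(smc_sock, hdr, hlen, MSG_MORE) != (ssize_t)hlen)
		return -1;
	if (send(smc_sock, body, blen, 0) != (ssize_t)blen)
		return -1;
	return 0;
}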
smc->sk.sk_write_space = smc_tx_write_space; - INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); - spin_lock_init(&smc->conn.send_lock); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 78255964fa4d..44d077942976 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -24,9 +24,10 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); - return smc_curs_diff(conn->sndbuf_size, &sent, &prep); + return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1b8af23e6e2b..cc7c1bb60fe8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ } else { diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index f7d47c89d658..2dfb492a7c94 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -697,6 +697,9 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) goto prop_msg_full; + if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) + goto prop_msg_full; nla_nest_end(msg->skb, prop); @@ -979,12 +982,23 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_TOL]) { b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - tipc_node_apply_tolerance(net, b); + tipc_node_apply_property(net, b, TIPC_NLA_PROP_TOL); } if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (props[TIPC_NLA_PROP_MTU]) { + if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; +#ifdef CONFIG_TIPC_MEDIA_UDP + if (tipc_udp_mtu_bad(nla_get_u32 + (props[TIPC_NLA_PROP_MTU]))) + return -EINVAL; + b->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); + tipc_node_apply_property(net, b, TIPC_NLA_PROP_MTU); +#endif + } } return 0; @@ -1029,6 +1043,9 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) goto prop_msg_full; + if (media->type_id == TIPC_MEDIA_TYPE_UDP) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) + goto prop_msg_full; nla_nest_end(msg->skb, prop); nla_nest_end(msg->skb, attrs); @@ -1158,6 +1175,16 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (props[TIPC_NLA_PROP_MTU]) { + if (m->type_id != TIPC_MEDIA_TYPE_UDP) + return -EINVAL; +#ifdef CONFIG_TIPC_MEDIA_UDP + if (tipc_udp_mtu_bad(nla_get_u32 + (props[TIPC_NLA_PROP_MTU]))) + return -EINVAL; + m->mtu = nla_get_u32(props[TIPC_NLA_PROP_MTU]); +#endif + } } return 0; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 6efcee63a381..394290cbbb1d 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h 
@@ -94,6 +94,8 @@ struct tipc_bearer; * @priority: default link (and bearer) priority * @tolerance: default time (in ms) before declaring link failure * @window: default window (in packets) before declaring link congestion + * @mtu: max packet size bearer can support for media type not dependent on + * underlying device MTU * @type_id: TIPC media identifier * @hwaddr_len: TIPC media address len * @name: media name @@ -118,6 +120,7 @@ struct tipc_media { u32 priority; u32 tolerance; u32 window; + u32 mtu; u32 type_id; u32 hwaddr_len; char name[TIPC_MAX_MEDIA_NAME]; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index dd1c4fa2eb78..bebe88cae07b 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -136,12 +136,12 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) } /** - * tipc_service_find_range - find service range matching a service instance + * tipc_service_first_range - find first service range in tree matching instance * * Very time-critical, so binary search through range rb tree */ -static struct service_range *tipc_service_find_range(struct tipc_service *sc, - u32 instance) +static struct service_range *tipc_service_first_range(struct tipc_service *sc, + u32 instance) { struct rb_node *n = sc->ranges.rb_node; struct service_range *sr; @@ -158,6 +158,30 @@ static struct service_range *tipc_service_find_range(struct tipc_service *sc, return NULL; } +/* tipc_service_find_range - find service range matching publication parameters + */ +static struct service_range *tipc_service_find_range(struct tipc_service *sc, + u32 lower, u32 upper) +{ + struct rb_node *n = sc->ranges.rb_node; + struct service_range *sr; + + sr = tipc_service_first_range(sc, lower); + if (!sr) + return NULL; + + /* Look for exact match */ + for (n = &sr->tree_node; n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper == upper) + break; + } + if (!n || sr->lower != lower || sr->upper != upper) + return NULL; + + return sr; +} + static struct service_range *tipc_service_create_range(struct tipc_service *sc, u32 lower, u32 upper) { @@ -238,54 +262,19 @@ err: /** * tipc_service_remove_publ - remove a publication from a service */ -static struct publication *tipc_service_remove_publ(struct net *net, - struct tipc_service *sc, - u32 lower, u32 upper, - u32 node, u32 key, - struct service_range **rng) +static struct publication *tipc_service_remove_publ(struct service_range *sr, + u32 node, u32 key) { - struct tipc_subscription *sub, *tmp; - struct service_range *sr; struct publication *p; - bool found = false; - bool last = false; - struct rb_node *n; - - sr = tipc_service_find_range(sc, lower); - if (!sr) - return NULL; - /* Find exact matching service range */ - for (n = &sr->tree_node; n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper == upper) - break; - } - if (!n || sr->lower != lower || sr->upper != upper) - return NULL; - - /* Find publication, if it exists */ list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->key != key || (node && node != p->node)) continue; - found = true; - break; + list_del(&p->all_publ); + list_del(&p->local_publ); + return p; } - if (!found) - return NULL; - - list_del(&p->all_publ); - list_del(&p->local_publ); - if (list_empty(&sr->all_publ)) - last = true; - - /* Notify any waiting subscriptions */ - list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { - tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN, - 
p->port, p->node, p->scope, last); - } - *rng = sr; - return p; + return NULL; } /** @@ -376,17 +365,31 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, u32 node, u32 key) { struct tipc_service *sc = tipc_service_find(net, type); + struct tipc_subscription *sub, *tmp; struct service_range *sr = NULL; struct publication *p = NULL; + bool last; if (!sc) return NULL; spin_lock_bh(&sc->lock); - p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr); + sr = tipc_service_find_range(sc, lower, upper); + if (!sr) + goto exit; + p = tipc_service_remove_publ(sr, node, key); + if (!p) + goto exit; + + /* Notify any waiting subscriptions */ + last = list_empty(&sr->all_publ); + list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { + tipc_sub_report_overlap(sub, lower, upper, TIPC_WITHDRAWN, + p->port, node, p->scope, last); + } /* Remove service range item if this was its last publication */ - if (sr && list_empty(&sr->all_publ)) { + if (list_empty(&sr->all_publ)) { rb_erase(&sr->tree_node, &sc->ranges); kfree(sr); } @@ -396,6 +399,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } +exit: spin_unlock_bh(&sc->lock); return p; } @@ -437,7 +441,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) goto not_found; spin_lock_bh(&sc->lock); - sr = tipc_service_find_range(sc, instance); + sr = tipc_service_first_range(sc, instance); if (unlikely(!sr)) goto no_match; @@ -484,7 +488,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, spin_lock_bh(&sc->lock); - sr = tipc_service_find_range(sc, instance); + sr = tipc_service_first_range(sc, instance); if (!sr) goto no_match; @@ -756,8 +760,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { - tipc_service_remove_publ(net, sc, p->lower, p->upper, - p->node, p->key, &sr); + tipc_service_remove_publ(sr, p->node, p->key); kfree_rcu(p, rcu); } rb_erase(&sr->tree_node, &sc->ranges); diff --git a/net/tipc/node.c b/net/tipc/node.c index f29549de9245..6a44eb812baf 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -195,6 +195,27 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) return mtu; } +bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) +{ + u8 *own_id = tipc_own_id(net); + struct tipc_node *n; + + if (!own_id) + return true; + + if (addr == tipc_own_addr(net)) { + memcpy(id, own_id, TIPC_NODEID_LEN); + return true; + } + n = tipc_node_find(net, addr); + if (!n) + return false; + + memcpy(id, &n->peer_id, TIPC_NODEID_LEN); + tipc_node_put(n); + return true; +} + u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; @@ -1681,7 +1702,8 @@ discard: kfree_skb(skb); } -void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b) +void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, + int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; @@ -1696,8 +1718,13 @@ void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b) list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; - if (e->link) - tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); + if (e->link) { + if (prop == TIPC_NLA_PROP_TOL) + tipc_link_set_tolerance(e->link, 
b->tolerance, + &xmitq); + else if (prop == TIPC_NLA_PROP_MTU) + tipc_link_set_mtu(e->link, b->mtu); + } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); } diff --git a/net/tipc/node.h b/net/tipc/node.h index f24b83500df1..846c8f240872 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -60,6 +60,7 @@ enum { #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); +bool tipc_node_get_id(struct net *net, u32 addr, u8 *id); u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, @@ -67,7 +68,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); -void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b); +void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 6be21575503a..930852c54d7a 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2974,7 +2974,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct sock *sk = sock->sk; + struct net *net = sock_net(sock->sk); + struct tipc_sioc_nodeid_req nr = {0}; struct tipc_sioc_ln_req lnr; void __user *argp = (void __user *)arg; @@ -2982,7 +2983,7 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGETLINKNAME: if (copy_from_user(&lnr, argp, sizeof(lnr))) return -EFAULT; - if (!tipc_node_get_linkname(sock_net(sk), + if (!tipc_node_get_linkname(net, lnr.bearer_id & 0xffff, lnr.peer, lnr.linkname, TIPC_MAX_LINK_NAME)) { if (copy_to_user(argp, &lnr, sizeof(lnr))) @@ -2990,6 +2991,14 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; } return -EADDRNOTAVAIL; + case SIOCGETNODEID: + if (copy_from_user(&nr, argp, sizeof(nr))) + return -EFAULT; + if (!tipc_node_get_id(net, nr.peer, nr.node_id)) + return -EADDRNOTAVAIL; + if (copy_to_user(argp, &nr, sizeof(nr))) + return -EFAULT; + return 0; default: return -ENOIOCTLCMD; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index e7d91f5d5cae..9783101bc4a9 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -713,8 +713,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = -EINVAL; goto err; } - b->mtu = dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr); + b->mtu = b->media->mtu; #if IS_ENABLED(CONFIG_IPV6) } else if (local.proto == htons(ETH_P_IPV6)) { udp_conf.family = AF_INET6; @@ -803,6 +802,7 @@ struct tipc_media udp_media_info = { .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, .window = TIPC_DEF_LINK_WIN, + .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, .name = "udp" diff --git a/net/tipc/udp_media.h b/net/tipc/udp_media.h index 281bbae87726..e7455cc73e16 100644 --- a/net/tipc/udp_media.h +++ b/net/tipc/udp_media.h @@ -38,9 +38,23 @@ #ifndef _TIPC_UDP_MEDIA_H #define _TIPC_UDP_MEDIA_H +#include <linux/ip.h> +#include <linux/udp.h> + int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr); int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct 
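The SIOCGETNODEID ioctl added to tipc_ioctl() above can be exercised from userspace roughly as sketched below. This is an illustration, not part of the patch; the request structure, SIOCGETNODEID and TIPC_NODEID_LEN are assumed to come from the UAPI headers of a kernel carrying this series, with field names taken from the diff (nr.peer, nr.node_id):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/tipc.h>

/* print the 128-bit node identity of the TIPC node with address 'peer' */
static int print_node_id(int tipc_sock, __u32 peer)
{
	struct tipc_sioc_nodeid_req nr;
	int i;

	memset(&nr, 0, sizeof(nr));
	nr.peer = peer;
	if (ioctl(tipc_sock, SIOCGETNODEID, &nr) < 0)
		return -1;
	for (i = 0; i < TIPC_NODEID_LEN; i++)
		printf("%02x", (unsigned char)nr.node_id[i]);
	printf("\n");
	return 0;
}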
tipc_bearer *b); int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb); +/* check if configured MTU is too low for tipc headers */ +static inline bool tipc_udp_mtu_bad(u32 mtu) +{ + if (mtu >= (TIPC_MIN_BEARER_MTU + sizeof(struct iphdr) + + sizeof(struct udphdr))) + return false; + + pr_warn("MTU too low for tipc bearer\n"); + return true; +} + #endif #endif diff --git a/net/tls/Kconfig b/net/tls/Kconfig index 89b8745a986f..73f05ece53d0 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -14,3 +14,13 @@ config TLS encryption handling of the TLS protocol to be done in-kernel. If unsure, say N. + +config TLS_DEVICE + bool "Transport Layer Security HW offload" + depends on TLS + select SOCK_VALIDATE_XMIT + default n + help + Enable kernel support for HW offload of the TLS protocol. + + If unsure, say N. diff --git a/net/tls/Makefile b/net/tls/Makefile index a930fd1c4f7b..4d6b728a67d0 100644 --- a/net/tls/Makefile +++ b/net/tls/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_TLS) += tls.o tls-y := tls_main.o tls_sw.o + +tls-$(CONFIG_TLS_DEVICE) += tls_device.o tls_device_fallback.o diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c new file mode 100644 index 000000000000..a7a8f8e20ff3 --- /dev/null +++ b/net/tls/tls_device.c @@ -0,0 +1,766 @@ +/* Copyright (c) 2018, Mellanox Technologies All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <crypto/aead.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <net/dst.h> +#include <net/inet_connection_sock.h> +#include <net/tcp.h> +#include <net/tls.h> + +/* device_offload_lock is used to synchronize tls_dev_add + * against NETDEV_DOWN notifications. 
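For reference, the TX offload introduced below is reached through the same kernel TLS socket API as the existing software path: userspace installs the "tls" ULP and passes the negotiated crypto state via setsockopt(), and the device path is taken when the egress device advertises NETIF_F_HW_TLS_TX (see the feature check in tls_set_device_offload()). A minimal sketch, illustrative only, assuming the AES-GCM-128 key material comes from a TLS 1.2 handshake completed in userspace:

#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <linux/tls.h>

#ifndef SOL_TLS
#define SOL_TLS 282	/* socket level used by the kernel for TLS */
#endif
#ifndef TCP_ULP
#define TCP_ULP 31
#endif

static int enable_ktls_tx(int sock,
			  const unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE],
			  const unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE],
			  const unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE],
			  const unsigned char seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE])
{
	struct tls12_crypto_info_aes_gcm_128 ci;

	memset(&ci, 0, sizeof(ci));
	ci.info.version = TLS_1_2_VERSION;
	ci.info.cipher_type = TLS_CIPHER_AES_GCM_128;
	memcpy(ci.key, key, sizeof(ci.key));
	memcpy(ci.salt, salt, sizeof(ci.salt));
	memcpy(ci.iv, iv, sizeof(ci.iv));
	memcpy(ci.rec_seq, seq, sizeof(ci.rec_seq));

	/* attach the TLS upper layer protocol, then install TX crypto state */
	if (setsockopt(sock, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
		return -1;
	return setsockopt(sock, SOL_TLS, TLS_TX, &ci, sizeof(ci));
}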
+ */ +static DECLARE_RWSEM(device_offload_lock); + +static void tls_device_gc_task(struct work_struct *work); + +static DECLARE_WORK(tls_device_gc_work, tls_device_gc_task); +static LIST_HEAD(tls_device_gc_list); +static LIST_HEAD(tls_device_list); +static DEFINE_SPINLOCK(tls_device_lock); + +static void tls_device_free_ctx(struct tls_context *ctx) +{ + struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx); + + kfree(offload_ctx); + kfree(ctx); +} + +static void tls_device_gc_task(struct work_struct *work) +{ + struct tls_context *ctx, *tmp; + unsigned long flags; + LIST_HEAD(gc_list); + + spin_lock_irqsave(&tls_device_lock, flags); + list_splice_init(&tls_device_gc_list, &gc_list); + spin_unlock_irqrestore(&tls_device_lock, flags); + + list_for_each_entry_safe(ctx, tmp, &gc_list, list) { + struct net_device *netdev = ctx->netdev; + + if (netdev) { + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + dev_put(netdev); + } + + list_del(&ctx->list); + tls_device_free_ctx(ctx); + } +} + +static void tls_device_queue_ctx_destruction(struct tls_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&tls_device_lock, flags); + list_move_tail(&ctx->list, &tls_device_gc_list); + + /* schedule_work inside the spinlock + * to make sure tls_device_down waits for that work. + */ + schedule_work(&tls_device_gc_work); + + spin_unlock_irqrestore(&tls_device_lock, flags); +} + +/* We assume that the socket is already connected */ +static struct net_device *get_netdev_for_sock(struct sock *sk) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct net_device *netdev = NULL; + + if (likely(dst)) { + netdev = dst->dev; + dev_hold(netdev); + } + + dst_release(dst); + + return netdev; +} + +static void destroy_record(struct tls_record_info *record) +{ + int nr_frags = record->num_frags; + skb_frag_t *frag; + + while (nr_frags-- > 0) { + frag = &record->frags[nr_frags]; + __skb_frag_unref(frag); + } + kfree(record); +} + +static void delete_all_records(struct tls_offload_context *offload_ctx) +{ + struct tls_record_info *info, *temp; + + list_for_each_entry_safe(info, temp, &offload_ctx->records_list, list) { + list_del(&info->list); + destroy_record(info); + } + + offload_ctx->retransmit_hint = NULL; +} + +static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_record_info *info, *temp; + struct tls_offload_context *ctx; + u64 deleted_records = 0; + unsigned long flags; + + if (!tls_ctx) + return; + + ctx = tls_offload_ctx(tls_ctx); + + spin_lock_irqsave(&ctx->lock, flags); + info = ctx->retransmit_hint; + if (info && !before(acked_seq, info->end_seq)) { + ctx->retransmit_hint = NULL; + list_del(&info->list); + destroy_record(info); + deleted_records++; + } + + list_for_each_entry_safe(info, temp, &ctx->records_list, list) { + if (before(acked_seq, info->end_seq)) + break; + list_del(&info->list); + + destroy_record(info); + deleted_records++; + } + + ctx->unacked_record_sn += deleted_records; + spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* At this point, there should be no references on this + * socket and no in-flight SKBs associated with this + * socket, so it is safe to free all the resources. 
+ */ +void tls_device_sk_destruct(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + + if (ctx->open_record) + destroy_record(ctx->open_record); + + delete_all_records(ctx); + crypto_free_aead(ctx->aead_send); + ctx->sk_destruct(sk); + clean_acked_data_disable(inet_csk(sk)); + + if (refcount_dec_and_test(&tls_ctx->refcount)) + tls_device_queue_ctx_destruction(tls_ctx); +} +EXPORT_SYMBOL(tls_device_sk_destruct); + +static void tls_append_frag(struct tls_record_info *record, + struct page_frag *pfrag, + int size) +{ + skb_frag_t *frag; + + frag = &record->frags[record->num_frags - 1]; + if (frag->page.p == pfrag->page && + frag->page_offset + frag->size == pfrag->offset) { + frag->size += size; + } else { + ++frag; + frag->page.p = pfrag->page; + frag->page_offset = pfrag->offset; + frag->size = size; + ++record->num_frags; + get_page(pfrag->page); + } + + pfrag->offset += size; + record->len += size; +} + +static int tls_push_record(struct sock *sk, + struct tls_context *ctx, + struct tls_offload_context *offload_ctx, + struct tls_record_info *record, + struct page_frag *pfrag, + int flags, + unsigned char record_type) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct page_frag dummy_tag_frag; + skb_frag_t *frag; + int i; + + /* fill prepend */ + frag = &record->frags[0]; + tls_fill_prepend(ctx, + skb_frag_address(frag), + record->len - ctx->tx.prepend_size, + record_type); + + /* HW doesn't care about the data in the tag, because it fills it. */ + dummy_tag_frag.page = skb_frag_page(frag); + dummy_tag_frag.offset = 0; + + tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size); + record->end_seq = tp->write_seq + record->len; + spin_lock_irq(&offload_ctx->lock); + list_add_tail(&record->list, &offload_ctx->records_list); + spin_unlock_irq(&offload_ctx->lock); + offload_ctx->open_record = NULL; + set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); + tls_advance_record_sn(sk, &ctx->tx); + + for (i = 0; i < record->num_frags; i++) { + frag = &record->frags[i]; + sg_unmark_end(&offload_ctx->sg_tx_data[i]); + sg_set_page(&offload_ctx->sg_tx_data[i], skb_frag_page(frag), + frag->size, frag->page_offset); + sk_mem_charge(sk, frag->size); + get_page(skb_frag_page(frag)); + } + sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]); + + /* all ready, send */ + return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags); +} + +static int tls_create_new_record(struct tls_offload_context *offload_ctx, + struct page_frag *pfrag, + size_t prepend_size) +{ + struct tls_record_info *record; + skb_frag_t *frag; + + record = kmalloc(sizeof(*record), GFP_KERNEL); + if (!record) + return -ENOMEM; + + frag = &record->frags[0]; + __skb_frag_set_page(frag, pfrag->page); + frag->page_offset = pfrag->offset; + skb_frag_size_set(frag, prepend_size); + + get_page(pfrag->page); + pfrag->offset += prepend_size; + + record->num_frags = 1; + record->len = prepend_size; + offload_ctx->open_record = record; + return 0; +} + +static int tls_do_allocation(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct page_frag *pfrag, + size_t prepend_size) +{ + int ret; + + if (!offload_ctx->open_record) { + if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, + sk->sk_allocation))) { + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return -ENOMEM; + } + + ret = tls_create_new_record(offload_ctx, pfrag, prepend_size); + if (ret) + return ret; + + if (pfrag->size > pfrag->offset) + return 0; + } + + 
if (!sk_page_frag_refill(sk, pfrag)) + return -ENOMEM; + + return 0; +} + +static int tls_push_data(struct sock *sk, + struct iov_iter *msg_iter, + size_t size, int flags, + unsigned char record_type) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; + int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); + struct tls_record_info *record = ctx->open_record; + struct page_frag *pfrag; + size_t orig_size = size; + u32 max_open_record_len; + int copy, rc = 0; + bool done = false; + long timeo; + + if (flags & + ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST)) + return -ENOTSUPP; + + if (sk->sk_err) + return -sk->sk_err; + + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + rc = tls_complete_pending_work(sk, tls_ctx, flags, &timeo); + if (rc < 0) + return rc; + + pfrag = sk_page_frag(sk); + + /* TLS_HEADER_SIZE is not counted as part of the TLS record, and + * we need to leave room for an authentication tag. + */ + max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + tls_ctx->tx.prepend_size; + do { + rc = tls_do_allocation(sk, ctx, pfrag, + tls_ctx->tx.prepend_size); + if (rc) { + rc = sk_stream_wait_memory(sk, &timeo); + if (!rc) + continue; + + record = ctx->open_record; + if (!record) + break; +handle_error: + if (record_type != TLS_RECORD_TYPE_DATA) { + /* avoid sending partial + * record with type != + * application_data + */ + size = orig_size; + destroy_record(record); + ctx->open_record = NULL; + } else if (record->len > tls_ctx->tx.prepend_size) { + goto last_record; + } + + break; + } + + record = ctx->open_record; + copy = min_t(size_t, size, (pfrag->size - pfrag->offset)); + copy = min_t(size_t, copy, (max_open_record_len - record->len)); + + if (copy_from_iter_nocache(page_address(pfrag->page) + + pfrag->offset, + copy, msg_iter) != copy) { + rc = -EFAULT; + goto handle_error; + } + tls_append_frag(record, pfrag, copy); + + size -= copy; + if (!size) { +last_record: + tls_push_record_flags = flags; + if (more) { + tls_ctx->pending_open_record_frags = + record->num_frags; + break; + } + + done = true; + } + + if (done || record->len >= max_open_record_len || + (record->num_frags >= MAX_SKB_FRAGS - 1)) { + rc = tls_push_record(sk, + tls_ctx, + ctx, + record, + pfrag, + tls_push_record_flags, + record_type); + if (rc < 0) + break; + } + } while (!done); + + if (orig_size - size > 0) + rc = orig_size - size; + + return rc; +} + +int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + unsigned char record_type = TLS_RECORD_TYPE_DATA; + int rc; + + lock_sock(sk); + + if (unlikely(msg->msg_controllen)) { + rc = tls_proccess_cmsg(sk, msg, &record_type); + if (rc) + goto out; + } + + rc = tls_push_data(sk, &msg->msg_iter, size, + msg->msg_flags, record_type); + +out: + release_sock(sk); + return rc; +} + +int tls_device_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct iov_iter msg_iter; + char *kaddr = kmap(page); + struct kvec iov; + int rc; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + lock_sock(sk); + + if (flags & MSG_OOB) { + rc = -ENOTSUPP; + goto out; + } + + iov.iov_base = kaddr + offset; + iov.iov_len = size; + iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, &iov, 1, size); + rc = tls_push_data(sk, &msg_iter, size, + flags, TLS_RECORD_TYPE_DATA); + kunmap(page); + +out: + release_sock(sk); + return rc; +} + +struct tls_record_info *tls_get_record(struct 
tls_offload_context *context, + u32 seq, u64 *p_record_sn) +{ + u64 record_sn = context->hint_record_sn; + struct tls_record_info *info; + + info = context->retransmit_hint; + if (!info || + before(seq, info->end_seq - info->len)) { + /* if retransmit_hint is irrelevant start + * from the beggining of the list + */ + info = list_first_entry(&context->records_list, + struct tls_record_info, list); + record_sn = context->unacked_record_sn; + } + + list_for_each_entry_from(info, &context->records_list, list) { + if (before(seq, info->end_seq)) { + if (!context->retransmit_hint || + after(info->end_seq, + context->retransmit_hint->end_seq)) { + context->hint_record_sn = record_sn; + context->retransmit_hint = info; + } + *p_record_sn = record_sn; + return info; + } + record_sn++; + } + + return NULL; +} +EXPORT_SYMBOL(tls_get_record); + +static int tls_device_push_pending_record(struct sock *sk, int flags) +{ + struct iov_iter msg_iter; + + iov_iter_kvec(&msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); + return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA); +} + +int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) +{ + u16 nonce_size, tag_size, iv_size, rec_seq_size; + struct tls_record_info *start_marker_record; + struct tls_offload_context *offload_ctx; + struct tls_crypto_info *crypto_info; + struct net_device *netdev; + char *iv, *rec_seq; + struct sk_buff *skb; + int rc = -EINVAL; + __be64 rcd_sn; + + if (!ctx) + goto out; + + if (ctx->priv_ctx_tx) { + rc = -EEXIST; + goto out; + } + + start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL); + if (!start_marker_record) { + rc = -ENOMEM; + goto out; + } + + offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL); + if (!offload_ctx) { + rc = -ENOMEM; + goto free_marker_record; + } + + crypto_info = &ctx->crypto_send; + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; + break; + default: + rc = -EINVAL; + goto free_offload_ctx; + } + + ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; + ctx->tx.tag_size = tag_size; + ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; + ctx->tx.iv_size = iv_size; + ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!ctx->tx.iv) { + rc = -ENOMEM; + goto free_offload_ctx; + } + + memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + + ctx->tx.rec_seq_size = rec_seq_size; + ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!ctx->tx.rec_seq) { + rc = -ENOMEM; + goto free_iv; + } + memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); + + rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info); + if (rc) + goto free_rec_seq; + + /* start at rec_seq - 1 to account for the start marker record */ + memcpy(&rcd_sn, ctx->tx.rec_seq, sizeof(rcd_sn)); + offload_ctx->unacked_record_sn = be64_to_cpu(rcd_sn) - 1; + + start_marker_record->end_seq = tcp_sk(sk)->write_seq; + start_marker_record->len = 0; + start_marker_record->num_frags = 0; + + INIT_LIST_HEAD(&offload_ctx->records_list); + list_add_tail(&start_marker_record->list, &offload_ctx->records_list); + spin_lock_init(&offload_ctx->lock); + sg_init_table(offload_ctx->sg_tx_data, + 
ARRAY_SIZE(offload_ctx->sg_tx_data)); + + clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); + ctx->push_pending_record = tls_device_push_pending_record; + offload_ctx->sk_destruct = sk->sk_destruct; + + /* TLS offload is greatly simplified if we don't send + * SKBs where only part of the payload needs to be encrypted. + * So mark the last skb in the write queue as end of record. + */ + skb = tcp_write_queue_tail(sk); + if (skb) + TCP_SKB_CB(skb)->eor = 1; + + refcount_set(&ctx->refcount, 1); + + /* We support starting offload on multiple sockets + * concurrently, so we only need a read lock here. + * This lock must precede get_netdev_for_sock to prevent races between + * NETDEV_DOWN and setsockopt. + */ + down_read(&device_offload_lock); + netdev = get_netdev_for_sock(sk); + if (!netdev) { + pr_err_ratelimited("%s: netdev not found\n", __func__); + rc = -EINVAL; + goto release_lock; + } + + if (!(netdev->features & NETIF_F_HW_TLS_TX)) { + rc = -ENOTSUPP; + goto release_netdev; + } + + /* Avoid offloading if the device is down + * We don't want to offload new flows after + * the NETDEV_DOWN event + */ + if (!(netdev->flags & IFF_UP)) { + rc = -EINVAL; + goto release_netdev; + } + + ctx->priv_ctx_tx = offload_ctx; + rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX, + &ctx->crypto_send, + tcp_sk(sk)->write_seq); + if (rc) + goto release_netdev; + + ctx->netdev = netdev; + + spin_lock_irq(&tls_device_lock); + list_add_tail(&ctx->list, &tls_device_list); + spin_unlock_irq(&tls_device_lock); + + sk->sk_validate_xmit_skb = tls_validate_xmit_skb; + /* following this assignment tls_is_sk_tx_device_offloaded + * will return true and the context might be accessed + * by the netdev's xmit function. + */ + smp_store_release(&sk->sk_destruct, + &tls_device_sk_destruct); + up_read(&device_offload_lock); + goto out; + +release_netdev: + dev_put(netdev); +release_lock: + up_read(&device_offload_lock); + clean_acked_data_disable(inet_csk(sk)); + crypto_free_aead(offload_ctx->aead_send); +free_rec_seq: + kfree(ctx->tx.rec_seq); +free_iv: + kfree(ctx->tx.iv); +free_offload_ctx: + kfree(offload_ctx); + ctx->priv_ctx_tx = NULL; +free_marker_record: + kfree(start_marker_record); +out: + return rc; +} + +static int tls_device_down(struct net_device *netdev) +{ + struct tls_context *ctx, *tmp; + unsigned long flags; + LIST_HEAD(list); + + /* Request a write lock to block new offload attempts */ + down_write(&device_offload_lock); + + spin_lock_irqsave(&tls_device_lock, flags); + list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) { + if (ctx->netdev != netdev || + !refcount_inc_not_zero(&ctx->refcount)) + continue; + + list_move(&ctx->list, &list); + } + spin_unlock_irqrestore(&tls_device_lock, flags); + + list_for_each_entry_safe(ctx, tmp, &list, list) { + netdev->tlsdev_ops->tls_dev_del(netdev, ctx, + TLS_OFFLOAD_CTX_DIR_TX); + ctx->netdev = NULL; + dev_put(netdev); + list_del_init(&ctx->list); + + if (refcount_dec_and_test(&ctx->refcount)) + tls_device_free_ctx(ctx); + } + + up_write(&device_offload_lock); + + flush_work(&tls_device_gc_work); + + return NOTIFY_DONE; +} + +static int tls_dev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + if (!(dev->features & NETIF_F_HW_TLS_TX)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_FEAT_CHANGE: + if (dev->tlsdev_ops && + dev->tlsdev_ops->tls_dev_add && + dev->tlsdev_ops->tls_dev_del) + return NOTIFY_DONE; 
+ else + return NOTIFY_BAD; + case NETDEV_DOWN: + return tls_device_down(dev); + } + return NOTIFY_DONE; +} + +static struct notifier_block tls_dev_notifier = { + .notifier_call = tls_dev_event, +}; + +void __init tls_device_init(void) +{ + register_netdevice_notifier(&tls_dev_notifier); +} + +void __exit tls_device_cleanup(void) +{ + unregister_netdevice_notifier(&tls_dev_notifier); + flush_work(&tls_device_gc_work); +} diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c new file mode 100644 index 000000000000..748914abdb60 --- /dev/null +++ b/net/tls/tls_device_fallback.c @@ -0,0 +1,450 @@ +/* Copyright (c) 2018, Mellanox Technologies All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <net/tls.h> +#include <crypto/aead.h> +#include <crypto/scatterwalk.h> +#include <net/ip6_checksum.h> + +static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk) +{ + struct scatterlist *src = walk->sg; + int diff = walk->offset - src->offset; + + sg_set_page(sg, sg_page(src), + src->length - diff, walk->offset); + + scatterwalk_crypto_chain(sg, sg_next(src), 0, 2); +} + +static int tls_enc_record(struct aead_request *aead_req, + struct crypto_aead *aead, char *aad, + char *iv, __be64 rcd_sn, + struct scatter_walk *in, + struct scatter_walk *out, int *in_len) +{ + unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE]; + struct scatterlist sg_in[3]; + struct scatterlist sg_out[3]; + u16 len; + int rc; + + len = min_t(int, *in_len, ARRAY_SIZE(buf)); + + scatterwalk_copychunks(buf, in, len, 0); + scatterwalk_copychunks(buf, out, len, 1); + + *in_len -= len; + if (!*in_len) + return 0; + + scatterwalk_pagedone(in, 0, 1); + scatterwalk_pagedone(out, 1, 1); + + len = buf[4] | (buf[3] << 8); + len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; + + tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, + (char *)&rcd_sn, sizeof(rcd_sn), buf[0]); + + memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, + TLS_CIPHER_AES_GCM_128_IV_SIZE); + + sg_init_table(sg_in, ARRAY_SIZE(sg_in)); + sg_init_table(sg_out, ARRAY_SIZE(sg_out)); + sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE); + sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE); + chain_to_walk(sg_in + 1, in); + chain_to_walk(sg_out + 1, out); + + *in_len -= len; + if (*in_len < 0) { + *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE; + /* the input buffer doesn't contain the entire record. + * trim len accordingly. The resulting authentication tag + * will contain garbage, but we don't care, so we won't + * include any of it in the output skb + * Note that we assume the output buffer length + * is larger then input buffer length + tag size + */ + if (*in_len < 0) + len += *in_len; + + *in_len = 0; + } + + if (*in_len) { + scatterwalk_copychunks(NULL, in, len, 2); + scatterwalk_pagedone(in, 0, 1); + scatterwalk_copychunks(NULL, out, len, 2); + scatterwalk_pagedone(out, 1, 1); + } + + len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE; + aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv); + + rc = crypto_aead_encrypt(aead_req); + + return rc; +} + +static void tls_init_aead_request(struct aead_request *aead_req, + struct crypto_aead *aead) +{ + aead_request_set_tfm(aead_req, aead); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); +} + +static struct aead_request *tls_alloc_aead_request(struct crypto_aead *aead, + gfp_t flags) +{ + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(aead); + struct aead_request *aead_req; + + aead_req = kzalloc(req_size, flags); + if (aead_req) + tls_init_aead_request(aead_req, aead); + return aead_req; +} + +static int tls_enc_records(struct aead_request *aead_req, + struct crypto_aead *aead, struct scatterlist *sg_in, + struct scatterlist *sg_out, char *aad, char *iv, + u64 rcd_sn, int len) +{ + struct scatter_walk out, in; + int rc; + + scatterwalk_start(&in, sg_in); + scatterwalk_start(&out, sg_out); + + do { + rc = tls_enc_record(aead_req, aead, aad, iv, + cpu_to_be64(rcd_sn), &in, &out, &len); + rcd_sn++; + + } while (rc == 0 && len); + + scatterwalk_done(&in, 0, 0); + scatterwalk_done(&out, 1, 0); + + return rc; +} + +/* Can't use icsk->icsk_af_ops->send_check here because the ip addresses + * might have been changed by NAT. 
+ */ +static void update_chksum(struct sk_buff *skb, int headln) +{ + struct tcphdr *th = tcp_hdr(skb); + int datalen = skb->len - headln; + const struct ipv6hdr *ipv6h; + const struct iphdr *iph; + + /* We only changed the payload so if we are using partial we don't + * need to update anything. + */ + if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) + return; + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + + if (skb->sk->sk_family == AF_INET6) { + ipv6h = ipv6_hdr(skb); + th->check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, + datalen, IPPROTO_TCP, 0); + } else { + iph = ip_hdr(skb); + th->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, + IPPROTO_TCP, 0); + } +} + +static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln) +{ + skb_copy_header(nskb, skb); + + skb_put(nskb, skb->len); + memcpy(nskb->data, skb->data, headln); + update_chksum(nskb, headln); + + nskb->destructor = skb->destructor; + nskb->sk = skb->sk; + skb->destructor = NULL; + skb->sk = NULL; + refcount_add(nskb->truesize - skb->truesize, + &nskb->sk->sk_wmem_alloc); +} + +/* This function may be called after the user socket is already + * closed so make sure we don't use anything freed during + * tls_sk_proto_close here + */ + +static int fill_sg_in(struct scatterlist *sg_in, + struct sk_buff *skb, + struct tls_offload_context *ctx, + u64 *rcd_sn, + s32 *sync_size, + int *resync_sgs) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + int payload_len = skb->len - tcp_payload_offset; + u32 tcp_seq = ntohl(tcp_hdr(skb)->seq); + struct tls_record_info *record; + unsigned long flags; + int remaining; + int i; + + spin_lock_irqsave(&ctx->lock, flags); + record = tls_get_record(ctx, tcp_seq, rcd_sn); + if (!record) { + spin_unlock_irqrestore(&ctx->lock, flags); + WARN(1, "Record not found for seq %u\n", tcp_seq); + return -EINVAL; + } + + *sync_size = tcp_seq - tls_record_start_seq(record); + if (*sync_size < 0) { + int is_start_marker = tls_record_is_start_marker(record); + + spin_unlock_irqrestore(&ctx->lock, flags); + /* This should only occur if the relevant record was + * already acked. In that case it should be ok + * to drop the packet and avoid retransmission. + * + * There is a corner case where the packet contains + * both an acked and a non-acked record. + * We currently don't handle that case and rely + * on TCP to retranmit a packet that doesn't contain + * already acked payload. 
+ */ + if (!is_start_marker) + *sync_size = 0; + return -EINVAL; + } + + remaining = *sync_size; + for (i = 0; remaining > 0; i++) { + skb_frag_t *frag = &record->frags[i]; + + __skb_frag_ref(frag); + sg_set_page(sg_in + i, skb_frag_page(frag), + skb_frag_size(frag), frag->page_offset); + + remaining -= skb_frag_size(frag); + + if (remaining < 0) + sg_in[i].length += remaining; + } + *resync_sgs = i; + + spin_unlock_irqrestore(&ctx->lock, flags); + if (skb_to_sgvec(skb, &sg_in[i], tcp_payload_offset, payload_len) < 0) + return -EINVAL; + + return 0; +} + +static void fill_sg_out(struct scatterlist sg_out[3], void *buf, + struct tls_context *tls_ctx, + struct sk_buff *nskb, + int tcp_payload_offset, + int payload_len, + int sync_size, + void *dummy_buf) +{ + sg_set_buf(&sg_out[0], dummy_buf, sync_size); + sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len); + /* Add room for authentication tag produced by crypto */ + dummy_buf += sync_size; + sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE); +} + +static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, + struct scatterlist sg_out[3], + struct scatterlist *sg_in, + struct sk_buff *skb, + s32 sync_size, u64 rcd_sn) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int payload_len = skb->len - tcp_payload_offset; + void *buf, *iv, *aad, *dummy_buf; + struct aead_request *aead_req; + struct sk_buff *nskb = NULL; + int buf_len; + + aead_req = tls_alloc_aead_request(ctx->aead_send, GFP_ATOMIC); + if (!aead_req) + return NULL; + + buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE + + TLS_AAD_SPACE_SIZE + + sync_size + + TLS_CIPHER_AES_GCM_128_TAG_SIZE; + buf = kmalloc(buf_len, GFP_ATOMIC); + if (!buf) + goto free_req; + + iv = buf; + memcpy(iv, tls_ctx->crypto_send_aes_gcm_128.salt, + TLS_CIPHER_AES_GCM_128_SALT_SIZE); + aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + + TLS_CIPHER_AES_GCM_128_IV_SIZE; + dummy_buf = aad + TLS_AAD_SPACE_SIZE; + + nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC); + if (!nskb) + goto free_buf; + + skb_reserve(nskb, skb_headroom(skb)); + + fill_sg_out(sg_out, buf, tls_ctx, nskb, tcp_payload_offset, + payload_len, sync_size, dummy_buf); + + if (tls_enc_records(aead_req, ctx->aead_send, sg_in, sg_out, aad, iv, + rcd_sn, sync_size + payload_len) < 0) + goto free_nskb; + + complete_skb(nskb, skb, tcp_payload_offset); + + /* validate_xmit_skb_list assumes that if the skb wasn't segmented + * nskb->prev will point to the skb itself + */ + nskb->prev = nskb; + +free_buf: + kfree(buf); +free_req: + kfree(aead_req); + return nskb; +free_nskb: + kfree_skb(nskb); + nskb = NULL; + goto free_buf; +} + +static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb) +{ + int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx); + int payload_len = skb->len - tcp_payload_offset; + struct scatterlist *sg_in, sg_out[3]; + struct sk_buff *nskb = NULL; + int sg_in_max_elements; + int resync_sgs = 0; + s32 sync_size = 0; + u64 rcd_sn; + + /* worst case is: + * MAX_SKB_FRAGS in tls_record_info + * MAX_SKB_FRAGS + 1 in SKB head and frags. 
+ */ + sg_in_max_elements = 2 * MAX_SKB_FRAGS + 1; + + if (!payload_len) + return skb; + + sg_in = kmalloc_array(sg_in_max_elements, sizeof(*sg_in), GFP_ATOMIC); + if (!sg_in) + goto free_orig; + + sg_init_table(sg_in, sg_in_max_elements); + sg_init_table(sg_out, ARRAY_SIZE(sg_out)); + + if (fill_sg_in(sg_in, skb, ctx, &rcd_sn, &sync_size, &resync_sgs)) { + /* bypass packets before kernel TLS socket option was set */ + if (sync_size < 0 && payload_len <= -sync_size) + nskb = skb_get(skb); + goto put_sg; + } + + nskb = tls_enc_skb(tls_ctx, sg_out, sg_in, skb, sync_size, rcd_sn); + +put_sg: + while (resync_sgs) + put_page(sg_page(&sg_in[--resync_sgs])); + kfree(sg_in); +free_orig: + kfree_skb(skb); + return nskb; +} + +struct sk_buff *tls_validate_xmit_skb(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb) +{ + if (dev == tls_get_ctx(sk)->netdev) + return skb; + + return tls_sw_fallback(sk, skb); +} + +int tls_sw_fallback_init(struct sock *sk, + struct tls_offload_context *offload_ctx, + struct tls_crypto_info *crypto_info) +{ + const u8 *key; + int rc; + + offload_ctx->aead_send = + crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(offload_ctx->aead_send)) { + rc = PTR_ERR(offload_ctx->aead_send); + pr_err_ratelimited("crypto_alloc_aead failed rc=%d\n", rc); + offload_ctx->aead_send = NULL; + goto err_out; + } + + key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; + + rc = crypto_aead_setkey(offload_ctx->aead_send, key, + TLS_CIPHER_AES_GCM_128_KEY_SIZE); + if (rc) + goto free_aead; + + rc = crypto_aead_setauthsize(offload_ctx->aead_send, + TLS_CIPHER_AES_GCM_128_TAG_SIZE); + if (rc) + goto free_aead; + + return 0; +free_aead: + crypto_free_aead(offload_ctx->aead_send); +err_out: + return rc; +} diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 20cd93be6236..301f22430469 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -51,12 +51,12 @@ enum { TLSV6, TLS_NUM_PROTS, }; - enum { TLS_BASE, - TLS_SW_TX, - TLS_SW_RX, - TLS_SW_RXTX, + TLS_SW, +#ifdef CONFIG_TLS_DEVICE + TLS_HW, +#endif TLS_HW_RECORD, TLS_NUM_CONFIG, }; @@ -65,14 +65,14 @@ static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static LIST_HEAD(device_list); static DEFINE_MUTEX(device_mutex); -static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; -static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) +static void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? 
TLSV6 : TLSV4; - sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; + sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -254,7 +254,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->conf == TLS_BASE || ctx->conf == TLS_HW_RECORD) { + if ((ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD) || + (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)) { free_ctx = true; goto skip_tx_cleanup; } @@ -275,15 +276,26 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } } - kfree(ctx->tx.rec_seq); - kfree(ctx->tx.iv); - kfree(ctx->rx.rec_seq); - kfree(ctx->rx.iv); + /* We need these for tls_sw_fallback handling of other packets */ + if (ctx->tx_conf == TLS_SW) { + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); + tls_sw_free_resources_tx(sk); + } - if (ctx->conf == TLS_SW_TX || - ctx->conf == TLS_SW_RX || - ctx->conf == TLS_SW_RXTX) { - tls_sw_free_resources(sk); + if (ctx->rx_conf == TLS_SW) { + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); + tls_sw_free_resources_rx(sk); + } + +#ifdef CONFIG_TLS_DEVICE + if (ctx->tx_conf != TLS_HW) { +#else + { +#endif + kfree(ctx); + ctx = NULL; } skip_tx_cleanup: @@ -446,25 +458,29 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, goto err_crypto_info; } - /* currently SW is default, we will have ethtool in future */ if (tx) { - rc = tls_set_sw_offload(sk, ctx, 1); - if (ctx->conf == TLS_SW_RX) - conf = TLS_SW_RXTX; - else - conf = TLS_SW_TX; +#ifdef CONFIG_TLS_DEVICE + rc = tls_set_device_offload(sk, ctx); + conf = TLS_HW; + if (rc) { +#else + { +#endif + rc = tls_set_sw_offload(sk, ctx, 1); + conf = TLS_SW; + } } else { rc = tls_set_sw_offload(sk, ctx, 0); - if (ctx->conf == TLS_SW_TX) - conf = TLS_SW_RXTX; - else - conf = TLS_SW_RX; + conf = TLS_SW; } if (rc) goto err_crypto_info; - ctx->conf = conf; + if (tx) + ctx->tx_conf = conf; + else + ctx->rx_conf = conf; update_sk_prot(sk, ctx); if (tx) { ctx->sk_write_space = sk->sk_write_space; @@ -540,7 +556,8 @@ static int tls_hw_prot(struct sock *sk) ctx->hash = sk->sk_prot->hash; ctx->unhash = sk->sk_prot->unhash; ctx->sk_proto_close = sk->sk_prot->close; - ctx->conf = TLS_HW_RECORD; + ctx->rx_conf = TLS_HW_RECORD; + ctx->tx_conf = TLS_HW_RECORD; update_sk_prot(sk, ctx); rc = 1; break; @@ -584,29 +601,40 @@ static int tls_hw_hash(struct sock *sk) return err; } -static void build_protos(struct proto *prot, struct proto *base) +static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], + struct proto *base) { - prot[TLS_BASE] = *base; - prot[TLS_BASE].setsockopt = tls_setsockopt; - prot[TLS_BASE].getsockopt = tls_getsockopt; - prot[TLS_BASE].close = tls_sk_proto_close; - - prot[TLS_SW_TX] = prot[TLS_BASE]; - prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; - prot[TLS_SW_TX].sendpage = tls_sw_sendpage; - - prot[TLS_SW_RX] = prot[TLS_BASE]; - prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; - prot[TLS_SW_RX].close = tls_sk_proto_close; - - prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; - prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; - prot[TLS_SW_RXTX].close = tls_sk_proto_close; - - prot[TLS_HW_RECORD] = *base; - prot[TLS_HW_RECORD].hash = tls_hw_hash; - prot[TLS_HW_RECORD].unhash = tls_hw_unhash; - prot[TLS_HW_RECORD].close = tls_sk_proto_close; + prot[TLS_BASE][TLS_BASE] = *base; + prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; + prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].close = 
tls_sk_proto_close; + + prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; + prot[TLS_SW][TLS_BASE].sendpage = tls_sw_sendpage; + + prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; + + prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; + prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; + prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; + +#ifdef CONFIG_TLS_DEVICE + prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; + prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_BASE].sendpage = tls_device_sendpage; + + prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; + prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; + prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage; +#endif + + prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; + prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_hw_hash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD][TLS_HW_RECORD].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) @@ -637,7 +665,7 @@ static int tls_init(struct sock *sk) ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; - /* Build IPv6 TLS whenever the address of tcpv6_prot changes */ + /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) { mutex_lock(&tcpv6_prot_mutex); @@ -648,7 +676,8 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } - ctx->conf = TLS_BASE; + ctx->tx_conf = TLS_BASE; + ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); out: return rc; @@ -686,6 +715,9 @@ static int __init tls_register(void) tls_sw_proto_ops.poll = tls_sw_poll; tls_sw_proto_ops.splice_read = tls_sw_splice_read; +#ifdef CONFIG_TLS_DEVICE + tls_device_init(); +#endif tcp_register_ulp(&tcp_tls_ulp_ops); return 0; @@ -694,6 +726,9 @@ static int __init tls_register(void) static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); +#ifdef CONFIG_TLS_DEVICE + tls_device_cleanup(); +#endif } module_init(tls_register); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e1c93ce74e0f..839e1e165a0c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -52,7 +52,7 @@ static int tls_do_decryption(struct sock *sk, gfp_t flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); struct aead_request *aead_req; @@ -122,7 +122,7 @@ out: static void trim_both_sgl(struct sock *sk, int target_size) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); trim_sg(sk, ctx->sg_plaintext_data, &ctx->sg_plaintext_num_elem, @@ -141,7 +141,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int rc = 0; rc = sk_alloc_sg(sk, len, @@ -155,7 +155,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len) static int alloc_plaintext_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = 
tls_sw_ctx_tx(tls_ctx); int rc = 0; rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, @@ -181,7 +181,7 @@ static void free_sg(struct sock *sk, struct scatterlist *sg, static void tls_free_both_sg(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); free_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size); @@ -191,7 +191,7 @@ static void tls_free_both_sg(struct sock *sk) } static int tls_do_encryption(struct tls_context *tls_ctx, - struct tls_sw_context *ctx, size_t data_len, + struct tls_sw_context_tx *ctx, size_t data_len, gfp_t flags) { unsigned int req_size = sizeof(struct aead_request) + @@ -227,7 +227,7 @@ static int tls_push_record(struct sock *sk, int flags, unsigned char record_type) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int rc; sg_mark_end(ctx->sg_plaintext_data + ctx->sg_plaintext_num_elem - 1); @@ -339,7 +339,7 @@ static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, int bytes) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct scatterlist *sg = ctx->sg_plaintext_data; int copy, i, rc = 0; @@ -367,7 +367,7 @@ out: int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int ret = 0; int required_size; long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -522,7 +522,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); int ret = 0; long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); bool eor; @@ -636,7 +636,7 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags, long timeo, int *err) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct sk_buff *skb; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -674,7 +674,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + MAX_IV_SIZE]; struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; struct scatterlist *sgin = &sgin_arr[0]; @@ -692,8 +692,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, if (!sgout) { nsg = skb_cow_data(skb, 0, &unused) + 1; sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); - if (!sgout) - sgout = sgin; + sgout = sgin; } sg_init_table(sgin, nsg); @@ -723,7 +722,7 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, unsigned int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = strp_msg(skb); if (len < rxm->full_len) { @@ -749,7 +748,7 @@ int tls_sw_recvmsg(struct sock *sk, int *addr_len) { struct 
tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); unsigned char control; struct strp_msg *rxm; struct sk_buff *skb; @@ -869,7 +868,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, size_t len, unsigned int flags) { struct tls_context *tls_ctx = tls_get_ctx(sock->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm = NULL; struct sock *sk = sock->sk; struct sk_buff *skb; @@ -922,7 +921,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock, unsigned int ret; struct sock *sk = sock->sk; struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); /* Grab POLLOUT and POLLHUP from the underlying socket */ ret = ctx->sk_poll(file, sock, wait); @@ -938,7 +937,7 @@ unsigned int tls_sw_poll(struct file *file, struct socket *sock, static int tls_read_size(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); char header[tls_ctx->rx.prepend_size]; struct strp_msg *rxm = strp_msg(skb); size_t cipher_overhead; @@ -987,7 +986,7 @@ read_failure: static void tls_queue(struct strparser *strp, struct sk_buff *skb) { struct tls_context *tls_ctx = tls_get_ctx(strp->sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); struct strp_msg *rxm; rxm = strp_msg(skb); @@ -1003,18 +1002,28 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb) static void tls_data_ready(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); strp_data_ready(&ctx->strp); } -void tls_sw_free_resources(struct sock *sk) +void tls_sw_free_resources_tx(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + tls_free_both_sg(sk); + + kfree(ctx); +} + +void tls_sw_free_resources_rx(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + if (ctx->aead_recv) { if (ctx->recv_pkt) { kfree_skb(ctx->recv_pkt); @@ -1030,10 +1039,7 @@ void tls_sw_free_resources(struct sock *sk) lock_sock(sk); } - tls_free_both_sg(sk); - kfree(ctx); - kfree(tls_ctx); } int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) @@ -1041,7 +1047,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; - struct tls_sw_context *sw_ctx; + struct tls_sw_context_tx *sw_ctx_tx = NULL; + struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; struct crypto_aead **aead; struct strp_callbacks cb; @@ -1054,27 +1061,32 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto out; } - if (!ctx->priv_ctx) { - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { + if (tx) { + sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL); + if (!sw_ctx_tx) { rc = -ENOMEM; goto out; 
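For reference, nothing in this rework changes the user-space API: the socket is still switched to the "tls" ULP and TX key material installed with setsockopt(SOL_TLS, TLS_TX, ...), and it is do_tls_setsockopt_conf() above that then decides between TLS_HW (device offload) and TLS_SW. A minimal sketch of that user-space side, with placeholder key material, error handling omitted, and fallback values for TCP_ULP/SOL_TLS in case older libc headers lack them:

#include <string.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <linux/tls.h>

#ifndef TCP_ULP
#define TCP_ULP 31
#endif
#ifndef SOL_TLS
#define SOL_TLS 282
#endif

/* fd is a connected TCP socket; key/iv/salt/rec_seq come from the TLS
 * handshake performed in user space (placeholders here).
 */
static int enable_ktls_tx(int fd, const unsigned char *key,
			  const unsigned char *iv,
			  const unsigned char *salt,
			  const unsigned char *rec_seq)
{
	struct tls12_crypto_info_aes_gcm_128 ci = {
		.info.version		= TLS_1_2_VERSION,
		.info.cipher_type	= TLS_CIPHER_AES_GCM_128,
	};

	memcpy(ci.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
	memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
	memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
	memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

	if (setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls")))
		return -1;
	/* Lands in do_tls_setsockopt_conf(sk, ..., tx=1) above. */
	return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
}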
} - crypto_init_wait(&sw_ctx->async_wait); + crypto_init_wait(&sw_ctx_tx->async_wait); + ctx->priv_ctx_tx = sw_ctx_tx; } else { - sw_ctx = ctx->priv_ctx; + sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL); + if (!sw_ctx_rx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx_rx->async_wait); + ctx->priv_ctx_rx = sw_ctx_rx; } - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - if (tx) { crypto_info = &ctx->crypto_send; cctx = &ctx->tx; - aead = &sw_ctx->aead_send; + aead = &sw_ctx_tx->aead_send; } else { crypto_info = &ctx->crypto_recv; cctx = &ctx->rx; - aead = &sw_ctx->aead_recv; + aead = &sw_ctx_rx->aead_recv; } switch (crypto_info->cipher_type) { @@ -1121,22 +1133,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) } memcpy(cctx->rec_seq, rec_seq, rec_seq_size); - if (tx) { - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); - - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + if (sw_ctx_tx) { + sg_init_table(sw_ctx_tx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx_tx->sg_encrypted_data)); + sg_init_table(sw_ctx_tx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx_tx->sg_plaintext_data)); + + sg_init_table(sw_ctx_tx->sg_aead_in, 2); + sg_set_buf(&sw_ctx_tx->sg_aead_in[0], sw_ctx_tx->aad_space, + sizeof(sw_ctx_tx->aad_space)); + sg_unmark_end(&sw_ctx_tx->sg_aead_in[1]); + sg_chain(sw_ctx_tx->sg_aead_in, 2, + sw_ctx_tx->sg_plaintext_data); + sg_init_table(sw_ctx_tx->sg_aead_out, 2); + sg_set_buf(&sw_ctx_tx->sg_aead_out[0], sw_ctx_tx->aad_space, + sizeof(sw_ctx_tx->aad_space)); + sg_unmark_end(&sw_ctx_tx->sg_aead_out[1]); + sg_chain(sw_ctx_tx->sg_aead_out, 2, + sw_ctx_tx->sg_encrypted_data); } if (!*aead) { @@ -1161,22 +1175,22 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) if (rc) goto free_aead; - if (!tx) { + if (sw_ctx_rx) { /* Set up strparser */ memset(&cb, 0, sizeof(cb)); cb.rcv_msg = tls_queue; cb.parse_msg = tls_read_size; - strp_init(&sw_ctx->strp, sk, &cb); + strp_init(&sw_ctx_rx->strp, sk, &cb); write_lock_bh(&sk->sk_callback_lock); - sw_ctx->saved_data_ready = sk->sk_data_ready; + sw_ctx_rx->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = tls_data_ready; write_unlock_bh(&sk->sk_callback_lock); - sw_ctx->sk_poll = sk->sk_socket->ops->poll; + sw_ctx_rx->sk_poll = sk->sk_socket->ops->poll; - strp_check_rcv(&sw_ctx->strp); + strp_check_rcv(&sw_ctx_rx->strp); } goto out; @@ -1188,11 +1202,16 @@ free_rec_seq: kfree(cctx->rec_seq); cctx->rec_seq = NULL; free_iv: - kfree(ctx->tx.iv); - ctx->tx.iv = NULL; + kfree(cctx->iv); + cctx->iv = NULL; free_priv: - kfree(ctx->priv_ctx); - ctx->priv_ctx = NULL; + if (tx) { + kfree(ctx->priv_ctx_tx); + ctx->priv_ctx_tx = NULL; + } else { + kfree(ctx->priv_ctx_rx); + ctx->priv_ctx_rx = NULL; + } out: return rc; } diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig new file mode 100644 index 000000000000..90e4a7152854 --- /dev/null +++ b/net/xdp/Kconfig @@ -0,0 +1,7 @@ +config XDP_SOCKETS + bool "XDP sockets" + depends on BPF_SYSCALL + default n + help + XDP sockets 
allows a channel between XDP programs and + userspace applications. diff --git a/net/xdp/Makefile b/net/xdp/Makefile new file mode 100644 index 000000000000..074fb2b2d51c --- /dev/null +++ b/net/xdp/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o + diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c new file mode 100644 index 000000000000..2b47a1dd7c6c --- /dev/null +++ b/net/xdp/xdp_umem.c @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/mm.h> + +#include "xdp_umem.h" + +#define XDP_UMEM_MIN_FRAME_SIZE 2048 + +int xdp_umem_create(struct xdp_umem **umem) +{ + *umem = kzalloc(sizeof(**umem), GFP_KERNEL); + + if (!(*umem)) + return -ENOMEM; + + return 0; +} + +static void xdp_umem_unpin_pages(struct xdp_umem *umem) +{ + unsigned int i; + + if (umem->pgs) { + for (i = 0; i < umem->npgs; i++) { + struct page *page = umem->pgs[i]; + + set_page_dirty_lock(page); + put_page(page); + } + + kfree(umem->pgs); + umem->pgs = NULL; + } +} + +static void xdp_umem_unaccount_pages(struct xdp_umem *umem) +{ + if (umem->user) { + atomic_long_sub(umem->npgs, &umem->user->locked_vm); + free_uid(umem->user); + } +} + +static void xdp_umem_release(struct xdp_umem *umem) +{ + struct task_struct *task; + struct mm_struct *mm; + + if (umem->fq) { + xskq_destroy(umem->fq); + umem->fq = NULL; + } + + if (umem->cq) { + xskq_destroy(umem->cq); + umem->cq = NULL; + } + + if (umem->pgs) { + xdp_umem_unpin_pages(umem); + + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + mmput(mm); + umem->pgs = NULL; + } + + xdp_umem_unaccount_pages(umem); +out: + kfree(umem); +} + +static void xdp_umem_release_deferred(struct work_struct *work) +{ + struct xdp_umem *umem = container_of(work, struct xdp_umem, work); + + xdp_umem_release(umem); +} + +void xdp_get_umem(struct xdp_umem *umem) +{ + atomic_inc(&umem->users); +} + +void xdp_put_umem(struct xdp_umem *umem) +{ + if (!umem) + return; + + if (atomic_dec_and_test(&umem->users)) { + INIT_WORK(&umem->work, xdp_umem_release_deferred); + schedule_work(&umem->work); + } +} + +static int xdp_umem_pin_pages(struct xdp_umem *umem) +{ + unsigned int gup_flags = FOLL_WRITE; + long npgs; + int err; + + umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL); + if (!umem->pgs) + return -ENOMEM; + + down_write(¤t->mm->mmap_sem); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + up_write(¤t->mm->mmap_sem); + + if (npgs != umem->npgs) { + if (npgs >= 0) { + umem->npgs = npgs; + err = -ENOMEM; + goto out_pin; + } + err = npgs; + goto out_pgs; + } + return 0; + +out_pin: + xdp_umem_unpin_pages(umem); +out_pgs: + kfree(umem->pgs); + umem->pgs = NULL; + 
return err; +} + +static int xdp_umem_account_pages(struct xdp_umem *umem) +{ + unsigned long lock_limit, new_npgs, old_npgs; + + if (capable(CAP_IPC_LOCK)) + return 0; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + umem->user = get_uid(current_user()); + + do { + old_npgs = atomic_long_read(&umem->user->locked_vm); + new_npgs = old_npgs + umem->npgs; + if (new_npgs > lock_limit) { + free_uid(umem->user); + umem->user = NULL; + return -ENOBUFS; + } + } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, + new_npgs) != old_npgs); + return 0; +} + +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) +{ + u32 frame_size = mr->frame_size, frame_headroom = mr->frame_headroom; + u64 addr = mr->addr, size = mr->len; + unsigned int nframes, nfpp; + int size_chk, err; + + if (!umem) + return -EINVAL; + + if (frame_size < XDP_UMEM_MIN_FRAME_SIZE || frame_size > PAGE_SIZE) { + /* Strictly speaking we could support this, if: + * - huge pages, or* + * - using an IOMMU, or + * - making sure the memory area is consecutive + * but for now, we simply say "computer says no". + */ + return -EINVAL; + } + + if (!is_power_of_2(frame_size)) + return -EINVAL; + + if (!PAGE_ALIGNED(addr)) { + /* Memory area has to be page size aligned. For + * simplicity, this might change. + */ + return -EINVAL; + } + + if ((addr + size) < addr) + return -EINVAL; + + nframes = (unsigned int)div_u64(size, frame_size); + if (nframes == 0 || nframes > UINT_MAX) + return -EINVAL; + + nfpp = PAGE_SIZE / frame_size; + if (nframes < nfpp || nframes % nfpp) + return -EINVAL; + + frame_headroom = ALIGN(frame_headroom, 64); + + size_chk = frame_size - frame_headroom - XDP_PACKET_HEADROOM; + if (size_chk < 0) + return -EINVAL; + + umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->size = (size_t)size; + umem->address = (unsigned long)addr; + umem->props.frame_size = frame_size; + umem->props.nframes = nframes; + umem->frame_headroom = frame_headroom; + umem->npgs = size / PAGE_SIZE; + umem->pgs = NULL; + umem->user = NULL; + + umem->frame_size_log2 = ilog2(frame_size); + umem->nfpp_mask = nfpp - 1; + umem->nfpplog2 = ilog2(nfpp); + atomic_set(&umem->users, 1); + + err = xdp_umem_account_pages(umem); + if (err) + goto out; + + err = xdp_umem_pin_pages(umem); + if (err) + goto out_account; + return 0; + +out_account: + xdp_umem_unaccount_pages(umem); +out: + put_pid(umem->pid); + return err; +} + +bool xdp_umem_validate_queues(struct xdp_umem *umem) +{ + return (umem->fq && umem->cq); +} diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h new file mode 100644 index 000000000000..7e0b2fab8522 --- /dev/null +++ b/net/xdp/xdp_umem.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef XDP_UMEM_H_ +#define XDP_UMEM_H_ + +#include <linux/mm.h> +#include <linux/if_xdp.h> +#include <linux/workqueue.h> + +#include "xsk_queue.h" +#include "xdp_umem_props.h" + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 npgs; + u32 frame_headroom; + u32 nfpp_mask; + u32 nfpplog2; + u32 frame_size_log2; + struct user_struct *user; + struct pid *pid; + unsigned long address; + size_t size; + atomic_t users; + struct work_struct work; +}; + +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +{ + u64 pg, off; + char *data; + + pg = idx >> umem->nfpplog2; + off = (idx & umem->nfpp_mask) << umem->frame_size_log2; + + data = page_address(umem->pgs[pg]); + return data + off; +} + +static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, + u32 idx) +{ + return xdp_umem_get_data(umem, idx) + umem->frame_headroom; +} + +bool xdp_umem_validate_queues(struct xdp_umem *umem); +int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); +void xdp_get_umem(struct xdp_umem *umem); +void xdp_put_umem(struct xdp_umem *umem); +int xdp_umem_create(struct xdp_umem **umem); + +#endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xdp_umem_props.h b/net/xdp/xdp_umem_props.h new file mode 100644 index 000000000000..77fb5daf29f3 --- /dev/null +++ b/net/xdp/xdp_umem_props.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space packet buffer + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef XDP_UMEM_PROPS_H_ +#define XDP_UMEM_PROPS_H_ + +struct xdp_umem_props { + u32 frame_size; + u32 nframes; +}; + +#endif /* XDP_UMEM_PROPS_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c new file mode 100644 index 000000000000..009c5af5bba5 --- /dev/null +++ b/net/xdp/xsk.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets + * + * AF_XDP sockets allows a channel between XDP programs and userspace + * applications. + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * Author(s): Björn Töpel <bjorn.topel@intel.com> + * Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ + +#include <linux/if_xdp.h> +#include <linux/init.h> +#include <linux/sched/mm.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/socket.h> +#include <linux/file.h> +#include <linux/uaccess.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <net/xdp_sock.h> +#include <net/xdp.h> + +#include "xsk_queue.h" +#include "xdp_umem.h" + +#define TX_BATCH_SIZE 16 + +static struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) +{ + return !!xs->rx; +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 *id, len = xdp->data_end - xdp->data; + void *buffer; + int err = 0; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + id = xskq_peek_id(xs->umem->fq); + if (!id) + return -ENOSPC; + + buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, *id, len, + xs->umem->frame_headroom); + if (!err) + xskq_discard_id(xs->umem->fq); + + return err; +} + +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (likely(!err)) + xdp_return_buff(xdp); + else + xs->rx_dropped++; + + return err; +} + +void xsk_flush(struct xdp_sock *xs) +{ + xskq_produce_flush_desc(xs->rx); + xs->sk.sk_data_ready(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (!err) + xsk_flush(xs); + else + xs->rx_dropped++; + + return err; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + u32 id = (u32)(long)skb_shinfo(skb)->destructor_arg; + struct xdp_sock *xs = xdp_sk(skb->sk); + + WARN_ON_ONCE(xskq_produce_id(xs->umem->cq, id)); + + sock_wfree(skb); +} + +static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, + size_t total_len) +{ + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); + u32 max_batch = TX_BATCH_SIZE; + struct xdp_sock *xs = xdp_sk(sk); + bool sent_frame = false; + struct xdp_desc desc; + struct sk_buff *skb; + int err = 0; + + if (unlikely(!xs->tx)) + return -ENOBUFS; + if (need_wait) + return -EOPNOTSUPP; + + mutex_lock(&xs->mutex); + + while (xskq_peek_desc(xs->tx, &desc)) { + char *buffer; + u32 id, len; + + if (max_batch-- == 0) { + err = -EAGAIN; + goto out; + } + + if (xskq_reserve_id(xs->umem->cq)) { + err = -EAGAIN; + goto out; + } + + len = desc.len; + if (unlikely(len > xs->dev->mtu)) { + err = -EMSGSIZE; + goto out; + } + + skb = sock_alloc_send_skb(sk, len, !need_wait, &err); + if (unlikely(!skb)) { + err = -EAGAIN; + goto out; + } + + skb_put(skb, len); + id = desc.idx; + buffer = xdp_umem_get_data(xs->umem, id) + desc.offset; + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) { + kfree_skb(skb); + goto out; + } + + skb->dev = xs->dev; + skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; + skb_shinfo(skb)->destructor_arg = (void *)(long)id; + skb->destructor = xsk_destruct_skb; + + err = dev_direct_xmit(skb, xs->queue_id); + /* Ignore NET_XMIT_CN as packet might have been sent */ + if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { + err = -EAGAIN; + /* SKB consumed by dev_direct_xmit() */ + goto out; + } + + sent_frame = true; + xskq_discard_desc(xs->tx); + } + +out: + if (sent_frame) + sk->sk_write_space(sk); + + 
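Frames only reach xsk_rcv()/xsk_generic_rcv() above after an XDP program redirects them into the socket through a BPF map of AF_XDP sockets; that map type and the redirect plumbing live outside this section (kernel/bpf and net/core/filter.c), and the sketch below assumes the BPF_MAP_TYPE_XSKMAP value from this series' uapi headers. It uses the legacy samples-style map definition and a locally declared helper stub rather than any particular helper header; all names are illustrative.

/* Build with: clang -O2 -target bpf -c xdp_sock_kern.c -o xdp_sock_kern.o
 * The loader must also insert the AF_XDP socket fd into xsks_map at the
 * queue index the socket bound to.
 */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

/* Map definition layout expected by the old samples/ loaders. */
struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
};

static int (*bpf_redirect_map)(void *map, int key, int flags) =
	(void *)BPF_FUNC_redirect_map;

struct bpf_map_def SEC("maps") xsks_map = {
	.type		= BPF_MAP_TYPE_XSKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 4,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	/* On XDP_REDIRECT the frame is delivered to __xsk_rcv() above,
	 * which copies it into a UMEM frame taken from the fill queue.
	 */
	return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, 0);
}

char _license[] SEC("license") = "GPL";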
mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (unlikely(!xs->dev)) + return -ENXIO; + if (unlikely(!(xs->dev->flags & IFF_UP))) + return -ENETDOWN; + + return xsk_generic_xmit(sk, m, total_len); +} + +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (xs->rx && !xskq_empty_desc(xs->rx)) + mask |= POLLIN | POLLRDNORM; + if (xs->tx && !xskq_full_desc(xs->tx)) + mask |= POLLOUT | POLLWRNORM; + + return mask; +} + +static int xsk_init_queue(u32 entries, struct xsk_queue **queue, + bool umem_queue) +{ + struct xsk_queue *q; + + if (entries == 0 || *queue || !is_power_of_2(entries)) + return -EINVAL; + + q = xskq_create(entries, umem_queue); + if (!q) + return -ENOMEM; + + *queue = q; + return 0; +} + +static void __xsk_release(struct xdp_sock *xs) +{ + /* Wait for driver to stop using the xdp socket. */ + synchronize_net(); + + dev_put(xs->dev); +} + +static int xsk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + struct net *net; + + if (!sk) + return 0; + + net = sock_net(sk); + + local_bh_disable(); + sock_prot_inuse_add(net, sk->sk_prot, -1); + local_bh_enable(); + + if (xs->dev) { + __xsk_release(xs); + xs->dev = NULL; + } + + sock_orphan(sk); + sock->sk = NULL; + + sk_refcnt_debug_release(sk); + sock_put(sk); + + return 0; +} + +static struct socket *xsk_lookup_xsk_from_fd(int fd) +{ + struct socket *sock; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return ERR_PTR(-ENOTSOCK); + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return ERR_PTR(-ENOPROTOOPT); + } + + return sock; +} + +static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; + struct sock *sk = sock->sk; + struct net_device *dev, *dev_curr; + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_umem *old_umem = NULL; + int err = 0; + + if (addr_len < sizeof(struct sockaddr_xdp)) + return -EINVAL; + if (sxdp->sxdp_family != AF_XDP) + return -EINVAL; + + mutex_lock(&xs->mutex); + dev_curr = xs->dev; + dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); + if (!dev) { + err = -ENODEV; + goto out_release; + } + + if (!xs->rx && !xs->tx) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_queue_id >= dev->num_rx_queues) { + err = -EINVAL; + goto out_unlock; + } + + if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + struct xdp_sock *umem_xs; + struct socket *sock; + + if (xs->umem) { + /* We have already our own. */ + err = -EINVAL; + goto out_unlock; + } + + sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); + if (IS_ERR(sock)) { + err = PTR_ERR(sock); + goto out_unlock; + } + + umem_xs = xdp_sk(sock->sk); + if (!umem_xs->umem) { + /* No umem to inherit. */ + err = -EBADF; + sockfd_put(sock); + goto out_unlock; + } else if (umem_xs->dev != dev || + umem_xs->queue_id != sxdp->sxdp_queue_id) { + err = -EINVAL; + sockfd_put(sock); + goto out_unlock; + } + + xdp_get_umem(umem_xs->umem); + old_umem = xs->umem; + xs->umem = umem_xs->umem; + sockfd_put(sock); + } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { + err = -EINVAL; + goto out_unlock; + } else { + /* This xsk has its own umem. 
*/ + xskq_set_umem(xs->umem->fq, &xs->umem->props); + xskq_set_umem(xs->umem->cq, &xs->umem->props); + } + + /* Rebind? */ + if (dev_curr && (dev_curr != dev || + xs->queue_id != sxdp->sxdp_queue_id)) { + __xsk_release(xs); + if (old_umem) + xdp_put_umem(old_umem); + } + + xs->dev = dev; + xs->queue_id = sxdp->sxdp_queue_id; + + xskq_set_umem(xs->rx, &xs->umem->props); + xskq_set_umem(xs->tx, &xs->umem->props); + +out_unlock: + if (err) + dev_put(dev); +out_release: + mutex_unlock(&xs->mutex); + return err; +} + +static int xsk_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int err; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + switch (optname) { + case XDP_RX_RING: + case XDP_TX_RING: + { + struct xsk_queue **q; + int entries; + + if (optlen < sizeof(entries)) + return -EINVAL; + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; + err = xsk_init_queue(entries, q, false); + mutex_unlock(&xs->mutex); + return err; + } + case XDP_UMEM_REG: + { + struct xdp_umem_reg mr; + struct xdp_umem *umem; + + if (xs->umem) + return -EBUSY; + + if (copy_from_user(&mr, optval, sizeof(mr))) + return -EFAULT; + + mutex_lock(&xs->mutex); + err = xdp_umem_create(&umem); + + err = xdp_umem_reg(umem, &mr); + if (err) { + kfree(umem); + mutex_unlock(&xs->mutex); + return err; + } + + /* Make sure umem is ready before it can be seen by others */ + smp_wmb(); + + xs->umem = umem; + mutex_unlock(&xs->mutex); + return 0; + } + case XDP_UMEM_FILL_RING: + case XDP_UMEM_COMPLETION_RING: + { + struct xsk_queue **q; + int entries; + + if (!xs->umem) + return -EINVAL; + + if (copy_from_user(&entries, optval, sizeof(entries))) + return -EFAULT; + + mutex_lock(&xs->mutex); + q = (optname == XDP_UMEM_FILL_RING) ? 
&xs->umem->fq : + &xs->umem->cq; + err = xsk_init_queue(entries, q, true); + mutex_unlock(&xs->mutex); + return err; + } + default: + break; + } + + return -ENOPROTOOPT; +} + +static int xsk_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + int len; + + if (level != SOL_XDP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case XDP_STATISTICS: + { + struct xdp_statistics stats; + + if (len < sizeof(stats)) + return -EINVAL; + + mutex_lock(&xs->mutex); + stats.rx_dropped = xs->rx_dropped; + stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); + stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); + mutex_unlock(&xs->mutex); + + if (copy_to_user(optval, &stats, sizeof(stats))) + return -EFAULT; + if (put_user(sizeof(stats), optlen)) + return -EFAULT; + + return 0; + } + default: + break; + } + + return -EOPNOTSUPP; +} + +static int xsk_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct xdp_sock *xs = xdp_sk(sock->sk); + struct xsk_queue *q = NULL; + unsigned long pfn; + struct page *qpg; + + if (offset == XDP_PGOFF_RX_RING) { + q = xs->rx; + } else if (offset == XDP_PGOFF_TX_RING) { + q = xs->tx; + } else { + if (!xs->umem) + return -EINVAL; + + if (offset == XDP_UMEM_PGOFF_FILL_RING) + q = xs->umem->fq; + else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) + q = xs->umem->cq; + } + + if (!q) + return -EINVAL; + + qpg = virt_to_head_page(q->ring); + if (size > (PAGE_SIZE << compound_order(qpg))) + return -EINVAL; + + pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, + size, vma->vm_page_prot); +} + +static struct proto xsk_proto = { + .name = "XDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct xdp_sock), +}; + +static const struct proto_ops xsk_proto_ops = { + .family = PF_XDP, + .owner = THIS_MODULE, + .release = xsk_release, + .bind = xsk_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = xsk_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = xsk_setsockopt, + .getsockopt = xsk_getsockopt, + .sendmsg = xsk_sendmsg, + .recvmsg = sock_no_recvmsg, + .mmap = xsk_mmap, + .sendpage = sock_no_sendpage, +}; + +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xskq_destroy(xs->rx); + xskq_destroy(xs->tx); + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + +static int xsk_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + struct xdp_sock *xs; + + if (!ns_capable(net->user_ns, CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (protocol) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); + if (!sk) + return -ENOBUFS; + + sock->ops = &xsk_proto_ops; + + sock_init_data(sock, sk); + + sk->sk_family = PF_XDP; + + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + + xs = xdp_sk(sk); + mutex_init(&xs->mutex); + + local_bh_disable(); + sock_prot_inuse_add(net, &xsk_proto, 1); + local_bh_enable(); + + return 0; +} + 
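Taken together, xsk_create(), xsk_setsockopt(), xsk_mmap() and xsk_bind() above define the user-space bring-up sequence for an AF_XDP socket. The sketch below walks that sequence for an RX-only socket; it assumes the struct xdp_umem_reg, struct sockaddr_xdp, XDP_* socket options, mmap page offsets and ring struct layouts from this series' <linux/if_xdp.h> (not shown in this section), provides fallback values for AF_XDP/SOL_XDP in case libc headers predate them, and omits all error handling.

#include <stdlib.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44		/* PF_XDP from include/linux/socket.h */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#define NUM_FRAMES	256	/* multiple of frames-per-page, see xdp_umem_reg() */
#define FRAME_SIZE	2048	/* power of two in [XDP_UMEM_MIN_FRAME_SIZE, PAGE_SIZE] */
#define RING_ENTRIES	128	/* power of two, see xsk_init_queue() */

static int xsk_rx_socket(const char *ifname, __u32 queue_id)
{
	struct sockaddr_xdp sxdp = {};
	struct xdp_umem_reg mr = {};
	int entries = RING_ENTRIES;
	void *bufs, *fq, *cq, *rx;
	int fd;

	fd = socket(AF_XDP, SOCK_RAW, 0);

	/* Register the UMEM: a page-aligned buffer carved into fixed frames. */
	posix_memalign(&bufs, getpagesize(), NUM_FRAMES * FRAME_SIZE);
	mr.addr = (unsigned long)bufs;
	mr.len = NUM_FRAMES * FRAME_SIZE;
	mr.frame_size = FRAME_SIZE;
	mr.frame_headroom = 0;
	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

	/* xsk_bind() requires both UMEM rings plus at least one of RX/TX. */
	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &entries, sizeof(entries));
	setsockopt(fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));

	/* Map the rings created above (served by xsk_mmap()). */
	fq = mmap(NULL, sizeof(struct xdp_umem_ring) +
		  RING_ENTRIES * sizeof(__u32),
		  PROT_READ | PROT_WRITE, MAP_SHARED,
		  fd, XDP_UMEM_PGOFF_FILL_RING);
	cq = mmap(NULL, sizeof(struct xdp_umem_ring) +
		  RING_ENTRIES * sizeof(__u32),
		  PROT_READ | PROT_WRITE, MAP_SHARED,
		  fd, XDP_UMEM_PGOFF_COMPLETION_RING);
	rx = mmap(NULL, sizeof(struct xdp_rxtx_ring) +
		  RING_ENTRIES * sizeof(struct xdp_desc),
		  PROT_READ | PROT_WRITE, MAP_SHARED,
		  fd, XDP_PGOFF_RX_RING);

	/* Attach the socket to one RX queue of the interface. */
	sxdp.sxdp_family = AF_XDP;
	sxdp.sxdp_ifindex = if_nametoindex(ifname);
	sxdp.sxdp_queue_id = queue_id;
	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	(void)fq; (void)cq; (void)rx;
	return fd;
}

Frames start flowing once user space produces frame indices 0..NUM_FRAMES-1 into the mapped fill ring (consumed by xskq_peek_id() in __xsk_rcv()) and an XDP program redirects traffic to the socket, as in the XSKMAP sketch earlier.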
+static const struct net_proto_family xsk_family_ops = { + .family = PF_XDP, + .create = xsk_create, + .owner = THIS_MODULE, +}; + +static int __init xsk_init(void) +{ + int err; + + err = proto_register(&xsk_proto, 0 /* no slab */); + if (err) + goto out; + + err = sock_register(&xsk_family_ops); + if (err) + goto out_proto; + + return 0; + +out_proto: + proto_unregister(&xsk_proto); +out: + return err; +} + +fs_initcall(xsk_init); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c new file mode 100644 index 000000000000..d012e5e23591 --- /dev/null +++ b/net/xdp/xsk_queue.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/slab.h> + +#include "xsk_queue.h" + +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props) +{ + if (!q) + return; + + q->umem_props = *umem_props; +} + +static u32 xskq_umem_get_ring_size(struct xsk_queue *q) +{ + return sizeof(struct xdp_umem_ring) + q->nentries * sizeof(u32); +} + +static u32 xskq_rxtx_get_ring_size(struct xsk_queue *q) +{ + return (sizeof(struct xdp_ring) + + q->nentries * sizeof(struct xdp_desc)); +} + +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue) +{ + struct xsk_queue *q; + gfp_t gfp_flags; + size_t size; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return NULL; + + q->nentries = nentries; + q->ring_mask = nentries - 1; + + gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | + __GFP_COMP | __GFP_NORETRY; + size = umem_queue ? xskq_umem_get_ring_size(q) : + xskq_rxtx_get_ring_size(q); + + q->ring = (struct xdp_ring *)__get_free_pages(gfp_flags, + get_order(size)); + if (!q->ring) { + kfree(q); + return NULL; + } + + return q; +} + +void xskq_destroy(struct xsk_queue *q) +{ + if (!q) + return; + + page_frag_free(q->ring); + kfree(q); +} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h new file mode 100644 index 000000000000..7aa9a535db0e --- /dev/null +++ b/net/xdp/xsk_queue.h @@ -0,0 +1,247 @@ +/* SPDX-License-Identifier: GPL-2.0 + * XDP user-space ring structure + * Copyright(c) 2018 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef _LINUX_XSK_QUEUE_H +#define _LINUX_XSK_QUEUE_H + +#include <linux/types.h> +#include <linux/if_xdp.h> + +#include "xdp_umem_props.h" + +#define RX_BATCH_SIZE 16 + +struct xsk_queue { + struct xdp_umem_props umem_props; + u32 ring_mask; + u32 nentries; + u32 prod_head; + u32 prod_tail; + u32 cons_head; + u32 cons_tail; + struct xdp_ring *ring; + u64 invalid_descs; +}; + +/* Common functions operating for both RXTX and umem queues */ + +static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) +{ + return q ? q->invalid_descs : 0; +} + +static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries == 0) { + /* Refresh the local pointer */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + } + + return (entries > dcnt) ? dcnt : entries; +} + +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +{ + u32 free_entries = q->nentries - (producer - q->cons_tail); + + if (free_entries >= dcnt) + return free_entries; + + /* Refresh the local tail pointer */ + q->cons_tail = READ_ONCE(q->ring->consumer); + return q->nentries - (producer - q->cons_tail); +} + +/* UMEM queue */ + +static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +{ + if (unlikely(idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + return true; +} + +static inline u32 *xskq_validate_id(struct xsk_queue *q) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_id(q, ring->desc[idx])) + return &ring->desc[idx]; + + q->cons_tail++; + } + + return NULL; +} + +static inline u32 *xskq_peek_id(struct xsk_queue *q) +{ + struct xdp_umem_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_id(q); + } + + ring = (struct xdp_umem_ring *)q->ring; + return &ring->desc[q->cons_tail & q->ring_mask]; +} + +static inline void xskq_discard_id(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_id(q); +} + +static inline int xskq_produce_id(struct xsk_queue *q, u32 id) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[q->prod_tail++ & q->ring_mask] = id; + + /* Order producer and data */ + smp_wmb(); + + WRITE_ONCE(q->ring->producer, q->prod_tail); + return 0; +} + +static inline int xskq_reserve_id(struct xsk_queue *q) +{ + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + q->prod_head++; + return 0; +} + +/* Rx/Tx queue */ + +static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d) +{ + u32 buff_len; + + if (unlikely(d->idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + + buff_len = q->umem_props.frame_size; + if (unlikely(d->len > buff_len || d->len == 0 || + d->offset > buff_len || d->offset + d->len > buff_len)) { + q->invalid_descs++; + return false; + } + + return true; +} + +static inline struct xdp_desc *xskq_validate_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_desc(q, &ring->desc[idx])) { + if (desc) + *desc = ring->desc[idx]; + return desc; + } + + q->cons_tail++; + } + 
+ return NULL; +} + +static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q, + struct xdp_desc *desc) +{ + struct xdp_rxtx_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_desc(q, desc); + } + + ring = (struct xdp_rxtx_ring *)q->ring; + *desc = ring->desc[q->cons_tail & q->ring_mask]; + return desc; +} + +static inline void xskq_discard_desc(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_desc(q, NULL); +} + +static inline int xskq_produce_batch_desc(struct xsk_queue *q, + u32 id, u32 len, u16 offset) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx; + + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + idx = (q->prod_head++) & q->ring_mask; + ring->desc[idx].idx = id; + ring->desc[idx].len = len; + ring->desc[idx].offset = offset; + + return 0; +} + +static inline void xskq_produce_flush_desc(struct xsk_queue *q) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail = q->prod_head, + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + +static inline bool xskq_full_desc(struct xsk_queue *q) +{ + return (xskq_nb_avail(q, q->nentries) == q->nentries); +} + +static inline bool xskq_empty_desc(struct xsk_queue *q) +{ + return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); +} + +void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); +struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); +void xskq_destroy(struct xsk_queue *q_ops); + +#endif /* _LINUX_XSK_QUEUE_H */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6c177ae7a6d9..8308281f3253 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -42,6 +42,7 @@ static void xfrm_state_gc_task(struct work_struct *work); static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static __read_mostly seqcount_t xfrm_state_hash_generation = SEQCNT_ZERO(xfrm_state_hash_generation); +static struct kmem_cache *xfrm_state_cache __ro_after_init; static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); @@ -451,7 +452,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) } xfrm_dev_state_free(x); security_xfrm_state_free(x); - kfree(x); + kmem_cache_free(xfrm_state_cache, x); } static void xfrm_state_gc_task(struct work_struct *work) @@ -563,7 +564,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) { struct xfrm_state *x; - x = kzalloc(sizeof(struct xfrm_state), GFP_ATOMIC); + x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO); if (x) { write_pnet(&x->xs_net, net); @@ -2313,6 +2314,10 @@ int __net_init xfrm_state_init(struct net *net) { unsigned int sz; + if (net_eq(net, &init_net)) + xfrm_state_cache = KMEM_CACHE(xfrm_state, + SLAB_HWCACHE_ALIGN | SLAB_PANIC); + INIT_LIST_HEAD(&net->xfrm.state_all); sz = sizeof(struct hlist_head) * 8; |