Diffstat (limited to 'net'): 308 files changed, 9976 insertions, 4064 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 9649579b5b9f..71c3e045505b 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -138,7 +138,7 @@ int vlan_check_real_dev(struct net_device *real_dev, return 0; } -int register_vlan_dev(struct net_device *dev) +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; @@ -174,7 +174,7 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; @@ -270,7 +270,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; - err = register_vlan_dev(new_dev); + err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index df8bd65dd370..94f8eed9f9b3 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -107,7 +107,7 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result); int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id); void vlan_setup(struct net_device *dev); -int register_vlan_dev(struct net_device *dev); +int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index e2ed69850489..0bc31de9071a 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -21,6 +21,12 @@ bool vlan_do_receive(struct sk_buff **skbp) if (unlikely(!skb)) return false; + if (unlikely(!(vlan_dev->flags & IFF_UP))) { + kfree_skb(skb); + *skbp = NULL; + return false; + } + skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. 
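The recurring change in the vlan hunks above is the extra struct netlink_ext_ack *extack argument threaded from rtnetlink through register_vlan_dev() into netdev_upper_dev_link(), so a failed link can carry a textual error back to userspace. A minimal sketch of the pattern, assuming a hypothetical my_link_lower() helper that is not part of this series:

#include <linux/netdevice.h>
#include <linux/netlink.h>

/* Hypothetical helper showing how the new extack argument threads
 * down to netdev_upper_dev_link().
 */
static int my_link_lower(struct net_device *lower,
			 struct net_device *upper,
			 struct netlink_ext_ack *extack)
{
	int err;

	/* extack may be NULL for non-netlink callers, exactly like the
	 * register_vlan_dev(new_dev, NULL) call in the ioctl path above;
	 * NL_SET_ERR_MSG() is NULL-safe.
	 */
	err = netdev_upper_dev_link(lower, upper, extack);
	if (err)
		NL_SET_ERR_MSG(extack, "Failed to link lower device");

	return err;
}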
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 5e831de3103e..6e7c5a6a7930 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -160,7 +160,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; - return register_vlan_dev(dev); + return register_vlan_dev(dev, extack); } static inline size_t vlan_qos_map_size(unsigned int n) diff --git a/net/atm/clip.c b/net/atm/clip.c index 65f706e4344c..d4f6029d5109 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -153,7 +153,7 @@ static int neigh_check_cb(struct neighbour *n) return 1; } -static void idle_timer_check(unsigned long dummy) +static void idle_timer_check(struct timer_list *unused) { write_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); @@ -887,7 +887,7 @@ static int __init atm_clip_init(void) register_netdevice_notifier(&clip_dev_notifier); register_inetaddr_notifier(&clip_inet_notifier); - setup_timer(&idle_timer, idle_timer_check, 0); + timer_setup(&idle_timer, idle_timer_check, 0); #ifdef CONFIG_PROC_FS { diff --git a/net/atm/lec.c b/net/atm/lec.c index a3d93a1bb133..c976196da3ea 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1232,7 +1232,7 @@ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr, #define LEC_ARP_REFRESH_INTERVAL (3*HZ) static void lec_arp_check_expire(struct work_struct *work); -static void lec_arp_expire_arp(unsigned long data); +static void lec_arp_expire_arp(struct timer_list *t); /* * Arp table funcs @@ -1559,8 +1559,7 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, } ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); - setup_timer(&to_return->timer, lec_arp_expire_arp, - (unsigned long)to_return); + timer_setup(&to_return->timer, lec_arp_expire_arp, 0); to_return->last_used = jiffies; to_return->priv = priv; skb_queue_head_init(&to_return->tx_wait); @@ -1569,11 +1568,11 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv, } /* Arp sent timer expired */ -static void lec_arp_expire_arp(unsigned long data) +static void lec_arp_expire_arp(struct timer_list *t) { struct lec_arp_table *entry; - entry = (struct lec_arp_table *)data; + entry = from_timer(entry, t, timer); pr_debug("\n"); if (entry->status == ESI_ARP_PENDING) { @@ -1591,10 +1590,10 @@ static void lec_arp_expire_arp(unsigned long data) } /* Unknown/unused vcc expire, remove associated entry */ -static void lec_arp_expire_vcc(unsigned long data) +static void lec_arp_expire_vcc(struct timer_list *t) { unsigned long flags; - struct lec_arp_table *to_remove = (struct lec_arp_table *)data; + struct lec_arp_table *to_remove = from_timer(to_remove, t, timer); struct lec_priv *priv = to_remove->priv; del_timer(&to_remove->timer); @@ -1799,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); - entry->timer.function = lec_arp_expire_arp; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } @@ -1999,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = lec_arp_expire_vcc; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); 
dump_arp_table(priv); @@ -2083,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; - entry->timer.function = lec_arp_expire_vcc; + entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 5677147209e8..b43d99430eb6 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -799,7 +799,6 @@ static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg) int err; if (mpcs == NULL) { - init_timer(&mpc_timer); mpc_timer_refresh(); /* This lets us now how our LECs are doing */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 83ba5483455a..1b659ab652fb 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -916,8 +916,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; /* the interface gets activated here to avoid race conditions between @@ -1264,7 +1264,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, * drops as they can't send and receive at the same time. */ tq_iface_penalty = BATADV_TQ_MAX_VALUE; - if (if_outgoing && (if_incoming == if_outgoing) && + if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, bat_priv); @@ -1369,7 +1369,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, ret = BATADV_NEIGH_DUP; } else { set_mark = 0; - if (is_dup && (ret != BATADV_NEIGH_DUP)) + if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } @@ -1515,7 +1515,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ - if (!is_single_hop_neigh && (!orig_neigh_router)) { + if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; @@ -1535,7 +1535,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; - if (is_bidirect && ((dup_status == BATADV_NO_DUP) || + if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, @@ -1553,8 +1553,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ - if ((ogm_packet->ttl <= 2) && - (if_incoming != if_outgoing)) { + if (ogm_packet->ttl <= 2 && + if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; @@ -1590,7 +1590,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if_incoming, if_outgoing); out_neigh: - if ((orig_neigh_node) && (!is_single_hop_neigh)) + if (orig_neigh_node && 
!is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) @@ -2523,9 +2523,9 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; - if ((tmp_gw_factor > max_gw_factor) || - ((tmp_gw_factor == max_gw_factor) && - (tq_avg > max_tq))) { + if (tmp_gw_factor > max_gw_factor || + (tmp_gw_factor == max_gw_factor && + tq_avg > max_tq)) { if (curr_gw) batadv_gw_node_put(curr_gw); curr_gw = gw_node; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4e2724c5b33d..93ef1c06227e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -767,7 +767,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; - if (curr_gw && (bw <= max_bw)) + if (curr_gw && bw <= max_bw) goto next; if (curr_gw) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index bd1064d98e16..1de992c58b35 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -134,7 +134,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && (throughput != SPEED_UNKNOWN)) + if (throughput && throughput != SPEED_UNKNOWN) return throughput * 10; } @@ -263,8 +263,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) goto out; /* we are in the process of shutting this interface down */ - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) goto out; /* the interface was enabled but may not be ready yet */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8be61734fc43..c251445a42a0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -304,8 +304,8 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ - if ((throughput > 10) && - (if_incoming == if_outgoing) && + if (throughput > 10 && + if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; @@ -455,7 +455,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. 
*/ - if ((seq_diff < 0) && !protection_started) + if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; @@ -568,8 +568,8 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; - if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && - (router_throughput >= neigh_throughput)) + if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && + router_throughput >= neigh_throughput) goto out; } @@ -621,7 +621,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, return; /* only unknown & newer OGMs contain TVLVs we are interested in */ - if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT)) + if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, (unsigned char *)(ogm2 + 1), diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b6cfa78e9381..760c0de72582 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -492,8 +492,8 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, /* this is an hash collision with the temporary selected node. Choose * the one with the lowest address */ - if ((tmp_max == max) && max_orig_node && - (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)) + if (tmp_max == max && max_orig_node && + batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0) goto out; ret = true; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index de9955d5224d..10d521f0b17f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -248,12 +248,12 @@ void batadv_gw_election(struct batadv_priv *bat_priv) } } - if ((curr_gw) && (!next_gw)) { + if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - } else if ((!curr_gw) && (next_gw)) { + } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, @@ -411,8 +411,8 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, goto out; } - if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) && - (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))) + if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && + gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 33940c5c74a8..2c26039c23fc 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -56,8 +56,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff, if (strncasecmp(tmp_ptr, "mbit", 4) == 0) bw_unit_type = BATADV_BW_UNIT_MBIT; - if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) || - (bw_unit_type == BATADV_BW_UNIT_MBIT)) + if (strncasecmp(tmp_ptr, "kbit", 4) == 0 || + bw_unit_type == BATADV_BW_UNIT_MBIT) *tmp_ptr = '\0'; } @@ -190,7 +190,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, if (!up_new) up_new = 1; - if ((down_curr == down_new) && (up_curr == up_new)) + if (down_curr == down_new && up_curr == up_new) return count; batadv_gw_reselect(bat_priv); @@ -224,16 +224,16 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct 
batadv_priv *bat_priv, /* only fetch the tvlv value if the handler wasn't called via the * CIFNOTFND flag and if there is data to fetch */ - if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) || - (tvlv_value_len < sizeof(gateway))) { + if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND || + tvlv_value_len < sizeof(gateway)) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } else { gateway_ptr = tvlv_value; gateway.bandwidth_down = gateway_ptr->bandwidth_down; gateway.bandwidth_up = gateway_ptr->bandwidth_up; - if ((gateway.bandwidth_down == 0) || - (gateway.bandwidth_up == 0)) { + if (gateway.bandwidth_down == 0 || + gateway.bandwidth_up == 0) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } @@ -242,8 +242,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig, &gateway); /* restart gateway selection */ - if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)) + if (gateway.bandwidth_down != 0 && + atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index e348f76ea8c1..4e3d5340ad96 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -504,8 +504,8 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) @@ -568,8 +568,8 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) @@ -654,8 +654,8 @@ out: static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; @@ -738,7 +738,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, bat_priv = netdev_priv(hard_iface->soft_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL); + soft_iface, NULL, NULL, NULL); if (ret) goto err_dev; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 8ead292886d1..bded31121d12 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -132,10 +132,10 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, size_t packet_len; int error; - if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0)) + if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0) return -EAGAIN; - if ((!buf) || (count < sizeof(struct batadv_icmp_packet))) + if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; if (!access_ok(VERIFY_WRITE, buf, count)) diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 
fb381fb26a66..4daed7ad46f2 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -73,8 +73,8 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; -static int (*batadv_rx_handler[256])(struct sk_buff *, - struct batadv_hard_iface *); +static int (*batadv_rx_handler[256])(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -540,12 +540,12 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { - int (*curr)(struct sk_buff *, - struct batadv_hard_iface *); + int (*curr)(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; - if ((curr != batadv_recv_unhandled_packet) && - (curr != batadv_recv_unhandled_unicast_packet)) + if (curr != batadv_recv_unhandled_packet && + curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 05cc7637c064..edb2f239d04d 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2017.3" +#define BATADV_SOURCE_VERSION "2017.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index d327670641ac..e553a8770a89 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1126,7 +1126,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, bool orig_initialized; if (orig_mcast_enabled && tvlv_value && - (tvlv_value_len >= sizeof(mcast_flags))) + tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; spin_lock_bh(&orig->mcast_handler_lock); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8e2a4b205257..2967b86c13da 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1062,9 +1062,9 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1106,9 +1106,9 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1155,13 +1155,13 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; - if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) || - (if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) { - if ((if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == 
BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) + if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) || + if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) { + if (if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n", orig_node->orig, neigh_node->addr, diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f10e3ff26f9d..40d9bf3e5bfe 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -93,14 +93,14 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, batadv_orig_ifinfo_put(orig_ifinfo); /* route deleted */ - if ((curr_router) && (!neigh_node)) { + if (curr_router && !neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Deleting route towards: %pM\n", orig_node->orig); batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Deleted route towards originator"); /* route added */ - } else if ((!curr_router) && (neigh_node)) { + } else if (!curr_router && neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Adding route towards: %pM (via %pM)\n", orig_node->orig, neigh_node->addr); @@ -381,7 +381,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, /* add record route information if not full */ if ((icmph->msg_type == BATADV_ECHO_REPLY || icmph->msg_type == BATADV_ECHO_REQUEST) && - (skb->len >= sizeof(struct batadv_icmp_packet_rr))) { + skb->len >= sizeof(struct batadv_icmp_packet_rr)) { if (skb_linearize(skb) < 0) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 054a65e6eb68..7895323fd2a7 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -142,7 +142,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, #ifdef CONFIG_BATMAN_ADV_BATMAN_V hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); - if ((hardif_neigh) && (ret != NET_XMIT_DROP)) + if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; if (hardif_neigh) @@ -615,8 +615,8 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list, * we delete only packets belonging to the given interface */ if (hard_iface && - (forw_packet->if_incoming != hard_iface) && - (forw_packet->if_outgoing != hard_iface)) + forw_packet->if_incoming != hard_iface && + forw_packet->if_outgoing != hard_iface) continue; hlist_del(&forw_packet->list); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c2c986746d0b..543d2c3e0f0d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,8 +69,8 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using __skb_header_release in our skbs to allow skb_cow_header to - * work optimally. This means that those skbs are not allowed to read + * data using __skb_header_release in our skbs to allow skb_cow_header + * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. 
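Several files in this diff (net/atm/clip.c, net/atm/lec.c, net/can) convert the old setup_timer()/unsigned long callbacks to the new timer_setup()/from_timer() API; the (TIMER_FUNC_TYPE) casts seen in net/atm/lec.c are the transitional shim for timers whose function pointer is reassigned by hand. A minimal sketch of the new pattern, using the hypothetical names struct foo and foo_expire:

#include <linux/jiffies.h>
#include <linux/timer.h>

/* Hypothetical structure embedding a timer. */
struct foo {
	struct timer_list timer;
	int state;
};

static void foo_expire(struct timer_list *t)
{
	/* from_timer() is container_of() keyed on the timer member,
	 * replacing the old (struct foo *)data cast.
	 */
	struct foo *f = from_timer(f, t, timer);

	f->state = 0;
}

static void foo_start(struct foo *f)
{
	/* No more passing the object as an unsigned long cookie. */
	timer_setup(&f->timer, foo_expire, 0);
	mod_timer(&f->timer, jiffies + HZ);
}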
@@ -160,7 +160,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { /* check ranges */ - if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev))) + if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; @@ -867,7 +867,8 @@ free_bat_counters: * Return: 0 if successful or error otherwise. */ static int batadv_softif_slave_add(struct net_device *dev, - struct net_device *slave_dev) + struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct batadv_hard_iface *hard_iface; struct net *net = dev_net(dev); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 0ae8b30e4eaa..aa187fd42475 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -925,8 +925,8 @@ static int batadv_store_mesh_iface_finish(struct net_device *net_dev, if (hard_iface->if_status == status_tmp) goto out; - if ((hard_iface->soft_iface) && - (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)) + if (hard_iface->soft_iface && + strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0) goto out; if (status_tmp == BATADV_IF_NOT_IN_USE) { diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bfe8effe9238..4b90033f35a8 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1206,7 +1206,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, /* send the ack */ r = batadv_send_skb_to_orig(skb, orig_node, NULL); - if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { + if (unlikely(r < 0) || r == NET_XMIT_DROP) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index c18115d22f00..db82a40875e8 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -126,14 +126,4 @@ config BT_DEBUGFS Provide extensive information about internal Bluetooth states in debugfs. -config BT_LEGACY_IOCTL - bool "Enable legacy ioctl interfaces" - depends on BT && BT_BREDR - default y - help - Enable support for legacy ioctl interfaces. This is only needed - for old and deprecated applications using direct ioctl calls for - controller management. Since Linux 3.4 all configuration and - setup is done via mgmt interface and this is no longer needed. 
- source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0bad296fe0af..65d734c165bd 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,6 @@ static int hci_sock_release(struct socket *sock) return 0; } -#ifdef CONFIG_BT_LEGACY_IOCTL static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg) { bdaddr_t bdaddr; @@ -1050,7 +1049,6 @@ done: release_sock(sk); return err; } -#endif static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) @@ -1971,11 +1969,7 @@ static const struct proto_ops hci_sock_ops = { .getname = hci_sock_getname, .sendmsg = hci_sock_sendmsg, .recvmsg = hci_sock_recvmsg, -#ifdef CONFIG_BT_LEGACY_IOCTL .ioctl = hci_sock_ioctl, -#else - .ioctl = sock_no_ioctl, -#endif .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6be41a44d688..a86e6687026e 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, if (is_l2) __skb_push(skb, ETH_HLEN); if (is_direct_pkt_access) - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); if (!is_l2) __skb_push(skb, ETH_HLEN); @@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, xdp.data_hard_start = data; xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; retval = bpf_test_run(prog, &xdp, repeat, &duration); diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 40b1ede527ca..4aee55fdcc92 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o \ - br_netlink_tunnel.o + br_netlink_tunnel.o br_arp_nd_proxy.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c new file mode 100644 index 000000000000..2cf7716254be --- /dev/null +++ b/net/bridge/br_arp_nd_proxy.c @@ -0,0 +1,469 @@ +/* + * Handle bridge arp/nd proxy/suppress + * + * Copyright (C) 2017 Cumulus Networks + * Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com> + * + * Authors: + * Roopa Prabhu <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/neighbour.h> +#include <net/arp.h> +#include <linux/if_vlan.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_checksum.h> +#endif + +#include "br_private.h" + +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool neigh_suppress = false; + + list_for_each_entry(p, &br->port_list, list) { + if (p->flags & BR_NEIGH_SUPPRESS) { + neigh_suppress = true; + break; + } + } + + br->neigh_suppress_enabled = neigh_suppress; +} + +#if IS_ENABLED(CONFIG_INET) +static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, + struct net_device *dev, __be32 dest_ip, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, + __be16 vlan_proto, u16 vlan_tci) +{ + struct net_bridge_vlan_group *vg; + struct sk_buff *skb; + u16 pvid; + + netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n", + dev->name, &dest_ip, dest_hw, &src_ip, src_hw); + + if (!vlan_tci) { + arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + return; + } + + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + + if (p) { + arp_xmit(skb); + } else { + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_HOST; + + netif_rx_ni(skb); + } +} + +static int br_chk_addr_ip(struct net_device *dev, void *data) +{ + __be32 ip = *(__be32 *)data; + struct in_device *in_dev; + __be32 addr = 0; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip, + RT_SCOPE_HOST); + + if (addr == ip) + return 1; + + return 0; +} + +static bool br_is_local_ip(struct net_device *dev, __be32 ip) +{ + if (br_chk_addr_ip(dev, &ip)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip)) + return true; + + return false; +} + +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = dev; + struct neighbour *n; + struct arphdr *parp; + u8 *arpptr, *sha; + __be32 sip, tip; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) + return; + + parp = arp_hdr(skb); + + if (parp->ar_pro != htons(ETH_P_IP) || + parp->ar_hln != dev->addr_len || + parp->ar_pln != 4) + return; + + arpptr = (u8 *)parp + sizeof(struct arphdr); + sha = arpptr; + arpptr += dev->addr_len; /* sha */ + memcpy(&sip, arpptr, sizeof(sip)); + arpptr += sizeof(sip); + arpptr += dev->addr_len; /* tha */ + memcpy(&tip, arpptr, sizeof(tip)); + + if (ipv4_is_loopback(tip) || + ipv4_is_multicast(tip)) + return; + + if (br->neigh_suppress_enabled) { + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + if (ipv4_is_zeronet(sip) || sip == tip) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + } + + if (parp->ar_op != 
htons(ARPOP_REQUEST)) + return; + + if (vid != 0) { + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } + + if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) { + /* its our local ip, so don't proxy reply + * and don't forward to neigh suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(&arp_tbl, &tip, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if ((p && (p->flags & BR_PROXYARP)) || + (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI | + BR_NEIGH_SUPPRESS)))) { + if (!vid) + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, 0, 0); + else + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, + skb->vlan_proto, + skb_vlan_tag_get(skb)); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to arp replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + + neigh_release(n); + } +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) +{ + struct nd_msg *m; + + m = skb_header_pointer(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr), sizeof(*msg), msg); + if (!m) + return NULL; + + if (m->icmph.icmp6_code != 0 || + (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && + m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) + return NULL; + + return m; +} + +static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, + struct sk_buff *request, struct neighbour *n, + __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns) +{ + struct net_device *dev = request->dev; + struct net_bridge_vlan_group *vg; + struct sk_buff *reply; + struct nd_msg *na; + struct ipv6hdr *pip6; + int na_olen = 8; /* opt hdr + ETH_ALEN for target */ + int ns_olen; + int i, len; + u8 *daddr; + u16 pvid; + + if (!dev) + return; + + len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + + sizeof(*na) + na_olen + dev->needed_tailroom; + + reply = alloc_skb(len, GFP_ATOMIC); + if (!reply) + return; + + reply->protocol = htons(ETH_P_IPV6); + reply->dev = dev; + skb_reserve(reply, LL_RESERVED_SPACE(dev)); + skb_push(reply, sizeof(struct ethhdr)); + skb_set_mac_header(reply, 0); + + daddr = eth_hdr(request)->h_source; + + /* Do we need option processing ? 
*/ + ns_olen = request->len - (skb_network_offset(request) + + sizeof(struct ipv6hdr)) - sizeof(*ns); + for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { + daddr = ns->opt + i + sizeof(struct nd_opt_hdr); + break; + } + } + + /* Ethernet header */ + ether_addr_copy(eth_hdr(reply)->h_dest, daddr); + ether_addr_copy(eth_hdr(reply)->h_source, n->ha); + eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); + reply->protocol = htons(ETH_P_IPV6); + + skb_pull(reply, sizeof(struct ethhdr)); + skb_set_network_header(reply, 0); + skb_put(reply, sizeof(struct ipv6hdr)); + + /* IPv6 header */ + pip6 = ipv6_hdr(reply); + memset(pip6, 0, sizeof(struct ipv6hdr)); + pip6->version = 6; + pip6->priority = ipv6_hdr(request)->priority; + pip6->nexthdr = IPPROTO_ICMPV6; + pip6->hop_limit = 255; + pip6->daddr = ipv6_hdr(request)->saddr; + pip6->saddr = *(struct in6_addr *)n->primary_key; + + skb_pull(reply, sizeof(struct ipv6hdr)); + skb_set_transport_header(reply, 0); + + na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); + + /* Neighbor Advertisement */ + memset(na, 0, sizeof(*na) + na_olen); + na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */ + na->icmph.icmp6_override = 1; + na->icmph.icmp6_solicited = 1; + na->target = ns->target; + ether_addr_copy(&na->opt[2], n->ha); + na->opt[0] = ND_OPT_TARGET_LL_ADDR; + na->opt[1] = na_olen >> 3; + + na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, + &pip6->daddr, + sizeof(*na) + na_olen, + IPPROTO_ICMPV6, + csum_partial(na, sizeof(*na) + na_olen, 0)); + + pip6->payload_len = htons(sizeof(*na) + na_olen); + + skb_push(reply, sizeof(struct ipv6hdr)); + skb_push(reply, sizeof(struct ethhdr)); + + reply->ip_summed = CHECKSUM_UNNECESSARY; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci); + + netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n", + dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha); + + if (p) { + dev_queue_xmit(reply); + } else { + skb_reset_mac_header(reply); + __skb_pull(reply, skb_network_offset(reply)); + reply->ip_summed = CHECKSUM_UNNECESSARY; + reply->pkt_type = PACKET_HOST; + + netif_rx_ni(reply); + } +} + +static int br_chk_addr_ip6(struct net_device *dev, void *data) +{ + struct in6_addr *addr = (struct in6_addr *)data; + + if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) + return 1; + + return 0; +} + +static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) + +{ + if (br_chk_addr_ip6(dev, addr)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr)) + return true; + + return false; +} + +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = NULL; + struct in6_addr *saddr, *daddr; + struct ipv6hdr *iphdr; + struct neighbour *n; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && + !msg->icmph.icmp6_solicited) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (msg->icmph.icmp6_type != 
NDISC_NEIGHBOUR_SOLICITATION) + return; + + iphdr = ipv6_hdr(skb); + saddr = &iphdr->saddr; + daddr = &iphdr->daddr; + + if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (vid != 0) { + /* build neigh table lookup on the vlan device */ + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } else { + vlandev = dev; + } + + if (br_is_local_ip6(vlandev, &msg->target)) { + /* its our own ip, so don't proxy reply + * and don't forward to arp suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) { + if (vid != 0) + br_nd_send(br, p, skb, n, + skb->vlan_proto, + skb_vlan_tag_get(skb), msg); + else + br_nd_send(br, p, skb, n, 0, 0, msg); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to NEIGH_SUPPRESS ports that we + * have replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + neigh_release(n); + } +} +#endif diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f6b6a92f1c48..28bb22186fa0 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; const unsigned char *dest; + struct ethhdr *eth; u16 vid = 0; rcu_read_lock(); @@ -57,11 +58,30 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); + eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; + if (IS_ENABLED(CONFIG_INET) && + (eth->h_proto == htons(ETH_P_ARP) || + eth->h_proto == htons(ETH_P_RARP)) && + br->neigh_suppress_enabled) { + br_do_proxy_suppress_arp(skb, br, vid, NULL); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, NULL, msg); + } + dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true); @@ -320,12 +340,13 @@ void br_netpoll_disable(struct net_bridge_port *p) #endif -static int br_add_slave(struct net_device *dev, struct net_device *slave_dev) +static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, + struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); - return br_add_if(br, slave_dev); + return br_add_if(br, slave_dev, extack); } static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac..b4eed113d2ec 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy 
ARP */ if (p->flags & BR_PROXYARP) continue; - if ((p->flags & BR_PROXYARP_WIFI) && + if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) && BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3aef22931ab..ae38547bbf91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_recalculate_neigh_suppress_enabled(br); + br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); @@ -480,7 +482,8 @@ netdev_features_t br_features_recompute(struct net_bridge *br, } /* called with RTNL */ -int br_add_if(struct net_bridge *br, struct net_device *dev) +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; @@ -500,16 +503,22 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) return -EINVAL; /* No bridging of bridges */ - if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) + if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { + NL_SET_ERR_MSG(extack, + "Can not enslave a bridge to a bridge"); return -ELOOP; + } /* Device is already being bridged */ if (br_port_exists(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. wireless) */ - if (dev->priv_flags & IFF_DONT_BRIDGE) + if (dev->priv_flags & IFF_DONT_BRIDGE) { + NL_SET_ERR_MSG(extack, + "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; + } p = new_nbp(br, dev); if (IS_ERR(p)) @@ -540,7 +549,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) dev->priv_flags |= IFF_BRIDGE_PORT; - err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL); + err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; @@ -653,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_AUTO_MASK) nbp_update_port_count(br); + + if (mask & BR_NEIGH_SUPPRESS) + br_recalculate_neigh_suppress_enabled(br); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..a096d3e189da 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb) br_netif_receive_skb); } -static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid, struct net_bridge_port *p) -{ - struct net_device *dev = br->dev; - struct neighbour *n; - struct arphdr *parp; - u8 *arpptr, *sha; - __be32 sip, tip; - - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - - if ((dev->flags & IFF_NOARP) || - !pskb_may_pull(skb, arp_hdr_len(dev))) - return; - - parp = arp_hdr(skb); - - if (parp->ar_pro != htons(ETH_P_IP) || - parp->ar_op != htons(ARPOP_REQUEST) || - parp->ar_hln != dev->addr_len || - parp->ar_pln != 4) - return; - - arpptr = (u8 *)parp + sizeof(struct arphdr); - sha = arpptr; - arpptr += dev->addr_len; /* sha */ - memcpy(&sip, arpptr, sizeof(sip)); - arpptr += sizeof(sip); - arpptr += dev->addr_len; /* tha */ - memcpy(&tip, arpptr, sizeof(tip)); - - if (ipv4_is_loopback(tip) || - ipv4_is_multicast(tip)) - return; - - n = neigh_lookup(&arp_tbl, &tip, dev); - if (n) { - struct net_bridge_fdb_entry *f; - - if (!(n->nud_state & NUD_VALID)) { - neigh_release(n); - return; - } - - f = br_fdb_find_rcu(br, n->ha, vid); - if (f && ((p->flags & BR_PROXYARP) || - (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, - sha, n->ha, sha); - 
BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; - } - - neigh_release(n); - } -} - /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -171,8 +115,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb BR_INPUT_SKB_CB(skb)->brdev = br->dev; - if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid, p); + if (IS_ENABLED(CONFIG_INET) && + (skb->protocol == htons(ETH_P_ARP) || + skb->protocol == htons(ETH_P_RARP))) { + br_do_proxy_suppress_arp(skb, br, vid, p); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, p, msg); + } switch (pkt_type) { case BR_PKT_MULTICAST: @@ -289,6 +247,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) * * Others reserved for future standardization */ + fwd_mask |= p->group_fwd_mask; switch (dest[5]) { case 0x00: /* Bridge Group Address */ /* If STP is turned off, diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 66cd98772051..8f29103935a3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -98,7 +98,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) return -EINVAL; if (isadd) - ret = br_add_if(br, dev); + ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8dc5c8d69bcd..7947e0436e18 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -859,8 +859,32 @@ out: spin_unlock(&br->multicast_lock); } +static void br_mc_router_state_change(struct net_bridge *p, + bool is_mc_router) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + .flags = SWITCHDEV_F_DEFER, + .u.mrouter = is_mc_router, + }; + + switchdev_port_attr_set(p->dev, &attr); +} + static void br_multicast_local_router_expired(unsigned long data) { + struct net_bridge *br = (struct net_bridge *)data; + + spin_lock(&br->multicast_lock); + if (br->multicast_router == MDB_RTR_TYPE_DISABLED || + br->multicast_router == MDB_RTR_TYPE_PERM || + timer_pending(&br->multicast_router_timer)) + goto out; + + br_mc_router_state_change(br, false); +out: + spin_unlock(&br->multicast_lock); } static void br_multicast_querier_expired(struct net_bridge *br, @@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!timer_pending(&br->multicast_router_timer)) + br_mc_router_state_change(br, true); mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); + } return; } @@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br) spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, - br_multicast_local_router_expired, 0); + br_multicast_local_router_expired, (unsigned long)br); setup_timer(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, (unsigned long)br); setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, @@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) switch (val) { 
case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: + br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); del_timer(&br->multicast_router_timer); - /* fall through */ + br->multicast_router = val; + err = 0; + break; case MDB_RTR_TYPE_TEMP_QUERY: + if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(br, false); br->multicast_router = val; err = 0; break; @@ -2184,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_enabled); +bool br_multicast_router(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + bool is_router; + + spin_lock_bh(&br->multicast_lock); + is_router = br_multicast_is_router(br); + spin_unlock_bh(&br->multicast_lock); + return is_router; +} +EXPORT_SYMBOL_GPL(br_multicast_router); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3bc890716c89..f0e82682e071 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -152,6 +153,7 @@ static inline size_t br_port_info_size(void) #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + 0; } @@ -208,7 +210,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, p->topology_change_ack) || nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & - BR_VLAN_TUNNEL))) + BR_VLAN_TUNNEL)) || + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || + nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, + !!(p->flags & BR_NEIGH_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -637,6 +642,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, }; /* Change the state of the port and notify spanning tree */ @@ -773,6 +779,20 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) return err; } #endif + + if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) { + u16 fwd_mask = nla_get_u16(tb[IFLA_BRPORT_GROUP_FWD_MASK]); + + if (fwd_mask & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = fwd_mask; + } + + err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, + BR_NEIGH_SUPPRESS); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e870cfc85b14..fa0039f44818 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -36,7 +36,14 @@ /* Control of forwarding link local multicast */ #define BR_GROUPFWD_DEFAULT 0 /* Don't allow forwarding of control protocols like STP, MAC PAUSE and LACP */ -#define BR_GROUPFWD_RESTRICTED 0x0007u +enum { + BR_GROUPFWD_STP = BIT(0), + 
BR_GROUPFWD_MACPAUSE = BIT(1), + BR_GROUPFWD_LACP = BIT(2), +}; + +#define BR_GROUPFWD_RESTRICTED (BR_GROUPFWD_STP | BR_GROUPFWD_MACPAUSE | \ + BR_GROUPFWD_LACP) /* The Nearest Customer Bridge Group Address, 01-80-C2-00-00-[00,0B,0C,0D,0F] */ #define BR_GROUPFWD_8021AD 0xB801u @@ -268,6 +275,7 @@ struct net_bridge_port { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + u16 group_fwd_mask; }; #define br_auto_port(p) ((p)->flags & BR_AUTO_MASK) @@ -396,6 +404,7 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + bool neigh_suppress_enabled; }; struct br_input_skb_cb { @@ -558,7 +567,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, void br_port_carrier_check(struct net_bridge_port *p); int br_add_bridge(struct net *net, const char *name); int br_del_bridge(struct net *net, const char *name); -int br_add_if(struct net_bridge *br, struct net_device *dev); +int br_add_if(struct net_bridge *br, struct net_device *dev, + struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); int br_min_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, @@ -1130,4 +1140,11 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +/* br_arp_nd_proxy.c */ +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p); +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg); +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 5d5d413a6cf8..0a1fa9ccd8b7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -165,6 +165,23 @@ static int store_flush(struct net_bridge_port *p, unsigned long v) } static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%#x\n", p->group_fwd_mask); +} + +static int store_group_fwd_mask(struct net_bridge_port *p, + unsigned long v) +{ + if (v & BR_GROUPFWD_MACPAUSE) + return -EINVAL; + p->group_fwd_mask = v; + + return 0; +} +static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, + store_group_fwd_mask); + BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); @@ -174,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -223,6 +241,8 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, &brport_attr_broadcast_flood, + &brport_attr_group_fwd_mask, + &brport_attr_neigh_suppress, NULL }; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 2585b100ebbb..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - 
net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); - return PTR_ERR_OR_ZERO(net->xt.broute_table); + return ebt_register_table(net, &broute_table, NULL, + &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 45a00dbdbcad..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); - return PTR_ERR_OR_ZERO(net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter, + &net->xt.frame_filter); } static void __net_exit frame_filter_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 57cd5bb154e7..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); - return PTR_ERR_OR_ZERO(net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat, + &net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83951f978445..3b3dcf719e07 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) kfree(table); } -struct ebt_table * -ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops) +int ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops, struct ebt_table **res) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, repl->entries == NULL || repl->entries_size == 0 || repl->counters != NULL || input_table->private != NULL) { BUGPRINT("Bad table data for ebt_register_table!!!\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } /* Don't add one table to multiple lists. 
*/ @@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + WRITE_ONCE(*res, table); + if (!ops) - return table; + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); - return ERR_PTR(ret); + *res = NULL; } - return table; + return ret; free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: @@ -1276,7 +1277,7 @@ free_newinfo: free_table: kfree(table); out: - return ERR_PTR(ret); + return ret; } void ebt_unregister_table(struct net *net, struct ebt_table *table, diff --git a/net/can/af_can.c b/net/can/af_can.c index 88edac0f3e36..1f75e11ac35a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -882,8 +882,8 @@ static int can_pernet_init(struct net *net) if (IS_ENABLED(CONFIG_PROC_FS)) { /* the statistics are updated every second (timer triggered) */ if (stats_timer) { - setup_timer(&net->can.can_stattimer, can_stat_update, - (unsigned long)net); + timer_setup(&net->can.can_stattimer, can_stat_update, + 0); mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ)); } diff --git a/net/can/af_can.h b/net/can/af_can.h index d0ef45bb2a72..eca6463c6213 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -113,6 +113,6 @@ struct s_pstats { /* function prototypes for the CAN networklayer procfs (proc.c) */ void can_init_proc(struct net *net); void can_remove_proc(struct net *net); -void can_stat_update(unsigned long data); +void can_stat_update(struct timer_list *t); #endif /* AF_CAN_H */ diff --git a/net/can/proc.c b/net/can/proc.c index 83045f00c63c..d979b3dc49a6 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -115,9 +115,9 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, return rate; } -void can_stat_update(unsigned long data) +void can_stat_update(struct timer_list *t) { - struct net *net = (struct net *)data; + struct net *net = from_timer(net, t, can.can_stattimer); struct s_stats *can_stats = net->can.can_stats; unsigned long j = jiffies; /* snapshot */ @@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", can_stats->total_rx_match_ratio); @@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v) user_reset = 1; - if (net->can.can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", can_pstats->stats_reset + 1); } else { diff --git a/net/core/datagram.c b/net/core/datagram.c index f7fb7e3f2acf..0b7b4c22719e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -188,7 +188,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, } if (!skb->len) { skb = skb_set_peeked(skb); - if (unlikely(IS_ERR(skb))) { + if (IS_ERR(skb)) { *err = PTR_ERR(skb); return NULL; } diff --git a/net/core/dev.c b/net/core/dev.c index 97abddd9039a..cf5894f0e6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -145,6 +145,7 @@ #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> +#include <linux/net_namespace.h> #include "net-sysfs.h" @@ -162,7 +163,6 @@ static struct list_head offload_base __read_mostly; static int netif_rx_internal(struct sk_buff *skb); 
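The timer conversions in this hunk all follow the same tree-wide pattern: the callback takes the timer_list itself and recovers its container with from_timer(), which is container_of() keyed on the timer member (nested members such as can.can_stattimer work for the same reason). The (TIMER_FUNC_TYPE) casts are transitional shims while both callback prototypes coexist. A generic sketch with hypothetical names:

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	struct foo {
		struct timer_list timer;
		unsigned long ticks;
	};

	static void foo_timeout(struct timer_list *t)
	{
		struct foo *foo = from_timer(foo, t, timer); /* container_of() */

		foo->ticks++;
		mod_timer(&foo->timer, jiffies + HZ);	/* re-arm */
	}

	static void foo_start(struct foo *foo)
	{
		timer_setup(&foo->timer, foo_timeout, 0); /* no more data cast */
		mod_timer(&foo->timer, jiffies + HZ);
	}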
static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info); static struct napi_struct *napi_by_id(unsigned int napi_id); @@ -188,6 +188,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id); DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +static DEFINE_MUTEX(ifalias_mutex); + /* protects napi_hash addition/deletion and napi_gen_id */ static DEFINE_SPINLOCK(napi_hash_lock); @@ -1265,29 +1267,53 @@ rollback: */ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) { - char *new_ifalias; - - ASSERT_RTNL(); + struct dev_ifalias *new_alias = NULL; if (len >= IFALIASZ) return -EINVAL; - if (!len) { - kfree(dev->ifalias); - dev->ifalias = NULL; - return 0; + if (len) { + new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL); + if (!new_alias) + return -ENOMEM; + + memcpy(new_alias->ifalias, alias, len); + new_alias->ifalias[len] = 0; } - new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); - if (!new_ifalias) - return -ENOMEM; - dev->ifalias = new_ifalias; - memcpy(dev->ifalias, alias, len); - dev->ifalias[len] = 0; + mutex_lock(&ifalias_mutex); + rcu_swap_protected(dev->ifalias, new_alias, + mutex_is_locked(&ifalias_mutex)); + mutex_unlock(&ifalias_mutex); + + if (new_alias) + kfree_rcu(new_alias, rcuhead); return len; } +/** + * dev_get_alias - get ifalias of a device + * @dev: device + * @name: buffer to store name of ifalias + * @len: size of buffer + * + * get ifalias for a device. Caller must make sure dev cannot go + * away, e.g. rcu read lock or own a reference count to device. + */ +int dev_get_alias(const struct net_device *dev, char *name, size_t len) +{ + const struct dev_ifalias *alias; + int ret = 0; + + rcu_read_lock(); + alias = rcu_dereference(dev->ifalias); + if (alias) + ret = snprintf(name, len, "%s", alias->ifalias); + rcu_read_unlock(); + + return ret; +} /** * netdev_features_change - device changes features @@ -1312,10 +1338,11 @@ EXPORT_SYMBOL(netdev_features_change); void netdev_state_change(struct net_device *dev) { if (dev->flags & IFF_UP) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info.dev = dev, + }; - change_info.flags_changed = 0; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); } @@ -1536,9 +1563,10 @@ EXPORT_SYMBOL(dev_disable_lro); static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - netdev_notifier_info_init(&info, dev); return nb->notifier_call(nb, val, &info); } @@ -1663,11 +1691,9 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); */ static int call_netdevice_notifiers_info(unsigned long val, - struct net_device *dev, struct netdev_notifier_info *info) { ASSERT_RTNL(); - netdev_notifier_info_init(info, dev); return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -1682,9 +1708,11 @@ static int call_netdevice_notifiers_info(unsigned long val, int call_netdevice_notifiers(unsigned long val, struct net_device *dev) { - struct netdev_notifier_info info; + struct netdev_notifier_info info = { + .dev = dev, + }; - return call_netdevice_notifiers_info(val, dev, &info); + return call_netdevice_notifiers_info(val, &info); } EXPORT_SYMBOL(call_netdevice_notifiers); @@ -2012,6 +2040,7 @@ int netdev_txq_to_tc(struct 
net_device *dev, unsigned int txq) return 0; } +EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS static DEFINE_MUTEX(xps_map_mutex); @@ -3864,8 +3893,8 @@ drop: static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + u32 metalen, act = XDP_DROP; struct xdp_buff xdp; - u32 act = XDP_DROP; void *orig_data; int hlen, off; u32 mac_len; @@ -3876,8 +3905,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_cloned(skb)) return XDP_PASS; - if (skb_linearize(skb)) - goto do_drop; + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (troom > 0 && __skb_linearize(skb)) + goto do_drop; + } /* The XDP program wants to see the packet starting at the MAC * header. @@ -3885,6 +3931,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; xdp.data = skb->data - mac_len; + xdp.data_meta = xdp.data; xdp.data_end = xdp.data + hlen; xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; @@ -3902,10 +3949,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_REDIRECT: case XDP_TX: __skb_push(skb, mac_len); - /* fall through */ + break; case XDP_PASS: + metalen = xdp.data - xdp.data_meta; + if (metalen) + skb_metadata_set(skb, metalen); break; - default: bpf_warn_invalid_xdp_action(act); /* fall through */ @@ -4443,6 +4492,33 @@ out: return ret; } +/** + * netif_receive_skb_core - special purpose version of netif_receive_skb + * @skb: buffer to process + * + * More direct receive version of netif_receive_skb(). It should + * only be used by callers that have a need to skip RPS and Generic XDP. + * Caller must also take care of handling if (page_is_)pfmemalloc. + * + * This function may only be called from softirq context and interrupts + * should be enabled. 
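With the dev_set_alias() rework earlier in dev.c, the alias string is now an RCU-managed object, so readers no longer need rtnl (the net-sysfs show/store conversion later in this series relies on exactly that). A sketch of the reader side, with a hypothetical caller:

	static void example_print_alias(struct net_device *dev)
	{
		char buf[IFALIASZ];

		/* dev_get_alias() snapshots the string under
		 * rcu_read_lock(); the caller only has to keep dev
		 * itself alive.
		 */
		if (dev_get_alias(dev, buf, sizeof(buf)) > 0)
			netdev_info(dev, "ifalias: %s\n", buf);
	}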
+ * + * Return values (usually ignored): + * NET_RX_SUCCESS: no congestion + * NET_RX_DROP: packet was dropped + */ +int netif_receive_skb_core(struct sk_buff *skb) +{ + int ret; + + rcu_read_lock(); + ret = __netif_receive_skb_core(skb, false); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(netif_receive_skb_core); + static int __netif_receive_skb(struct sk_buff *skb) { int ret; @@ -4695,6 +4771,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); @@ -6228,9 +6305,19 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, - void *upper_priv, void *upper_info) -{ - struct netdev_notifier_changeupper_info changeupper_info; + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .upper_dev = upper_dev, + .master = master, + .linking = true, + .upper_info = upper_info, + }; int ret = 0; ASSERT_RTNL(); @@ -6248,12 +6335,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - changeupper_info.upper_dev = upper_dev; - changeupper_info.master = master; - changeupper_info.linking = true; - changeupper_info.upper_info = upper_info; - - ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6264,7 +6346,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (ret) return ret; - ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); ret = notifier_to_errno(ret); if (ret) @@ -6289,9 +6371,11 @@ rollback: * returns zero. 
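Threading extack through netdev_notifier_info, rather than adding it to every notifier payload, means any NETDEV_PRECHANGEUPPER listener can now veto a link with a message that reaches userspace. A hedged sketch of such a listener — the driver and its check are hypothetical:

	static int example_netdev_event(struct notifier_block *nb,
					unsigned long event, void *ptr)
	{
		struct netdev_notifier_changeupper_info *info = ptr;

		if (event != NETDEV_PRECHANGEUPPER || !info->linking)
			return NOTIFY_DONE;

		/* info->info.extack may be NULL on paths that have no
		 * netlink request behind them; NL_SET_ERR_MSG() copes
		 * with that.
		 */
		NL_SET_ERR_MSG(info->info.extack,
			       "example: cannot be enslaved to this upper device");
		return notifier_from_errno(-EOPNOTSUPP);
	}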
*/ int netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev) + struct net_device *upper_dev, + struct netlink_ext_ack *extack) { - return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); + return __netdev_upper_dev_link(dev, upper_dev, false, + NULL, NULL, extack); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -6310,10 +6394,11 @@ EXPORT_SYMBOL(netdev_upper_dev_link); */ int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, - void *upper_priv, void *upper_info) + void *upper_priv, void *upper_info, + struct netlink_ext_ack *extack) { return __netdev_upper_dev_link(dev, upper_dev, true, - upper_priv, upper_info); + upper_priv, upper_info, extack); } EXPORT_SYMBOL(netdev_master_upper_dev_link); @@ -6328,20 +6413,24 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - struct netdev_notifier_changeupper_info changeupper_info; + struct netdev_notifier_changeupper_info changeupper_info = { + .info = { + .dev = dev, + }, + .upper_dev = upper_dev, + .linking = false, + }; ASSERT_RTNL(); - changeupper_info.upper_dev = upper_dev; changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; - changeupper_info.linking = false; - call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, &changeupper_info.info); __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); - call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); @@ -6357,11 +6446,13 @@ EXPORT_SYMBOL(netdev_upper_dev_unlink); void netdev_bonding_info_change(struct net_device *dev, struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info info; + struct netdev_notifier_bonding_info info = { + .info.dev = dev, + }; memcpy(&info.bonding_info, bonding_info, sizeof(struct netdev_bonding_info)); - call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, &info.info); } EXPORT_SYMBOL(netdev_bonding_info_change); @@ -6487,11 +6578,13 @@ EXPORT_SYMBOL(dev_get_nest_level); void netdev_lower_state_changed(struct net_device *lower_dev, void *lower_state_info) { - struct netdev_notifier_changelowerstate_info changelowerstate_info; + struct netdev_notifier_changelowerstate_info changelowerstate_info = { + .info.dev = lower_dev, + }; ASSERT_RTNL(); changelowerstate_info.lower_state_info = lower_state_info; - call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, + call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, &changelowerstate_info.info); } EXPORT_SYMBOL(netdev_lower_state_changed); @@ -6782,11 +6875,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, if (dev->flags & IFF_UP && (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { - struct netdev_notifier_change_info change_info; + struct netdev_notifier_change_info change_info = { + .info = { + .dev = dev, + }, + .flags_changed = changes, + }; - change_info.flags_changed = changes; - call_netdevice_notifiers_info(NETDEV_CHANGE, dev, - &change_info.info); + call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); } } @@ -7157,7 +7253,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL); 
+ GFP_KERNEL, NULL); /* * Flush the unicast and multicast chains @@ -8244,7 +8340,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err; + int err, new_nsid; ASSERT_RTNL(); @@ -8300,7 +8396,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) + new_nsid = peernet2id_alloc(dev_net(dev), net); + else + new_nsid = peernet2id(dev_net(dev), net); + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); /* * Flush the unicast and multicast chains diff --git a/net/core/dst.c b/net/core/dst.c index a6c47da7d0f8..662a2d4a3d19 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ +#ifdef CONFIG_DST_CACHE + int cpu; + + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index 82edad58d066..09e011f20291 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -43,6 +43,7 @@ #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/unaligned.h> +#include <asm/cmpxchg.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> @@ -989,10 +990,14 @@ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { - bool ret = __sk_filter_charge(sk, fp); - if (ret) - refcount_inc(&fp->refcnt); - return ret; + if (!refcount_inc_not_zero(&fp->refcnt)) + return false; + + if (!__sk_filter_charge(sk, fp)) { + sk_filter_release(fp); + return false; + } + return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) @@ -1402,7 +1407,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb, { int err = __bpf_try_make_writable(skb, write_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return err; } @@ -1962,7 +1967,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -1984,7 +1989,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2178,7 +2183,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2303,7 +2308,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) ret = shrink ? 
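The sk_filter_charge() rewrite above appears to fix an order-of-operations race: the old code charged socket memory first and only then bumped the refcount unconditionally, so a filter whose count had already dropped to zero could be revived. The generic shape of the fix, with hypothetical names:

	static bool obj_charge(struct sock *sk, struct obj *obj)
	{
		/* Take the reference first, and only if the object is
		 * still live; refcount_inc_not_zero() fails once the
		 * count has hit 0.
		 */
		if (!refcount_inc_not_zero(&obj->refcnt))
			return false;

		if (!__obj_account_memory(sk, obj)) {
			obj_put(obj);	/* undo the reference we took */
			return false;
		}
		return true;
	}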
bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2394,7 +2399,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, skb_gso_reset(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return ret; } @@ -2434,7 +2439,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); return 0; } @@ -2447,14 +2452,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) +{ + return xdp_data_meta_unsupported(xdp) ? 0 : + xdp->data - xdp->data_meta; +} + BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + unsigned long metalen = xdp_get_metalen(xdp); + void *data_start = xdp->data_hard_start + metalen; void *data = xdp->data + offset; - if (unlikely(data < xdp->data_hard_start || + if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; + if (metalen) + memmove(xdp->data_meta + offset, + xdp->data_meta, metalen); + xdp->data_meta += offset; xdp->data = data; return 0; @@ -2468,6 +2485,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) +{ + void *meta = xdp->data_meta + offset; + unsigned long metalen = xdp->data - meta; + + if (xdp_data_meta_unsupported(xdp)) + return -ENOTSUPP; + if (unlikely(meta < xdp->data_hard_start || + meta > xdp->data)) + return -EINVAL; + if (unlikely((metalen & (sizeof(__u32) - 1)) || + (metalen > 32))) + return -EACCES; + + xdp->data_meta = meta; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { + .func = bpf_xdp_adjust_meta, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, @@ -2482,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev, err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); if (err) return err; - if (map) + dev->netdev_ops->ndo_xdp_flush(dev); + return 0; +} + +static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, + struct bpf_map *map, + struct xdp_buff *xdp, + u32 index) +{ + int err; + + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + struct net_device *dev = fwd; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + if (err) + return err; __dev_map_insert_ctx(map, index); - else - dev->netdev_ops->ndo_xdp_flush(dev); + + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + struct bpf_cpu_map_entry *rcpu = fwd; + + err = cpu_map_enqueue(rcpu, xdp, dev_rx); + if (err) + return err; + __cpu_map_insert_ctx(map, index); + } return 0; } @@ -2495,11 +2565,33 @@ void xdp_do_flush_map(void) struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; - if (map) - __dev_map_flush(map); + if (map) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + __dev_map_flush(map); + break; + case BPF_MAP_TYPE_CPUMAP: + __cpu_map_flush(map); + break; + default: + break; + } + } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); +static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +{ + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: + return __dev_map_lookup_elem(map, index); + 
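bpf_xdp_adjust_meta() and the new data_meta pointer give XDP a scratch area in front of the packet — at most 32 bytes, 4-byte aligned, per the -EACCES check above — that generic XDP carries over to the skb via skb_metadata_set(), where a tc classifier can read it through __sk_buff->data_meta. A sketch of the producer/consumer pair; section names and the bpf_helpers.h include follow the kernel's samples, and the loader wiring is assumed:

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include "bpf_helpers.h"	/* SEC(); from samples/bpf or selftests */

	struct meta {
		__u32 mark;	/* total meta size must stay 4-byte aligned */
	};

	SEC("xdp_mark")
	int xdp_store_meta(struct xdp_md *ctx)
	{
		struct meta *meta;
		void *data;

		/* a negative offset grows the metadata area in front of data */
		if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
			return XDP_PASS;

		meta = (void *)(unsigned long)ctx->data_meta;
		data = (void *)(unsigned long)ctx->data;
		if ((void *)(meta + 1) > data)	/* bounds check for the verifier */
			return XDP_PASS;

		meta->mark = 42;
		return XDP_PASS;
	}

	SEC("classifier_read_meta")
	int tc_read_meta(struct __sk_buff *skb)
	{
		struct meta *meta = (void *)(unsigned long)skb->data_meta;
		void *data = (void *)(unsigned long)skb->data;

		if ((void *)(meta + 1) > data)	/* no (or foreign) metadata */
			return TC_ACT_OK;

		skb->mark = meta->mark;	/* hand the XDP result to tc */
		return TC_ACT_OK;
	}

	char _license[] SEC("license") = "GPL";

The skb_metadata_differs() check added to gro_list_prepare() earlier keeps packets carrying different metadata from being merged by GRO, so the per-packet values survive aggregation.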
case BPF_MAP_TYPE_CPUMAP: + return __cpu_map_lookup_elem(map, index); + default: + return NULL; + } +} + static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, unsigned long aux) { @@ -2512,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err; ri->ifindex = 0; @@ -2526,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, goto err; } - fwd = __dev_map_lookup_elem(map, index); + fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; @@ -2534,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); - err = __bpf_tx_xdp(fwd, map, xdp, index); + err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; @@ -2576,54 +2668,88 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + +int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; struct net_device *fwd = NULL; u32 index = ri->ifindex; - unsigned int len; int err = 0; ri->ifindex = 0; ri->map = NULL; ri->map_owner = 0; - if (map) { - if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { - err = -EFAULT; - map = NULL; - goto err; - } - fwd = __dev_map_lookup_elem(map, index); - } else { - fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { + err = -EFAULT; + map = NULL; + goto err; } + fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } - if (unlikely(!(fwd->flags & IFF_UP))) { - err = -ENETDOWN; + if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; + } else { + /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ + err = -EBADRQC; goto err; } - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) { - err = -EMSGSIZE; + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + return 0; +err: + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + return err; +} + +int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + u32 index = ri->ifindex; + struct net_device *fwd; + int err = 0; + + if (ri->map) + return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + + ri->ifindex = 0; + fwd = dev_get_by_index_rcu(dev_net(dev), index); + if (unlikely(!fwd)) { + err = -EINVAL; goto err; } + if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + goto err; + skb->dev = fwd; - map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) - : _trace_xdp_redirect(dev, xdp_prog, index); + _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - map ? 
_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) - : _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); @@ -2692,7 +2818,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head) + func == bpf_xdp_adjust_head || + func == bpf_xdp_adjust_meta) return true; return false; @@ -2943,14 +3070,15 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* Race is not possible, since it's called from verifier - * that is holding verifier mutex. - */ - md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, - METADATA_IP_TUNNEL, - GFP_KERNEL); - if (!md_dst) + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); } switch (which) { @@ -3288,6 +3416,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; + case BPF_FUNC_xdp_adjust_meta: + return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: @@ -3418,6 +3548,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; @@ -3444,6 +3575,7 @@ static bool sk_filter_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; @@ -3468,6 +3600,7 @@ static bool lwt_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): + case bpf_ctx_range(struct __sk_buff, data_meta): return false; } @@ -3586,6 +3719,9 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; + case bpf_ctx_range(struct __sk_buff, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3596,6 +3732,25 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool +tc_cls_act_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct sk_buff, len): + return true; + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -3619,6 +3774,9 @@ static bool 
xdp_is_valid_access(int off, int size, case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; + case offsetof(struct xdp_md, data_meta): + info->reg_type = PTR_TO_PACKET_META; + break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; @@ -3627,6 +3785,21 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } +static bool xdp_is_valid_access_analyzer(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; +} + void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; @@ -3677,6 +3850,12 @@ static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + return false; + } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -3689,8 +3868,6 @@ static bool sk_skb_is_valid_access(int off, int size, } switch (off) { - case bpf_ctx_range(struct __sk_buff, tc_classid): - return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3847,6 +4024,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, offsetof(struct sk_buff, data)); break; + case offsetof(struct __sk_buff, data_meta): + off = si->off; + off -= offsetof(struct __sk_buff, data_meta); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_meta); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; + case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); @@ -4095,6 +4281,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; + case offsetof(struct xdp_md, data_meta): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, data_meta)); + break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, @@ -4238,68 +4429,103 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -const struct bpf_verifier_ops sk_filter_prog_ops = { +const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -const struct bpf_verifier_ops tc_cls_act_prog_ops = { +const struct bpf_prog_ops sk_filter_prog_ops = { +}; + +const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_verifier_ops tc_cls_act_analyzer_ops = { + .is_valid_access = tc_cls_act_is_valid_access_analyzer, +}; + +const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops xdp_prog_ops = { +const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, 
.convert_ctx_access = xdp_convert_ctx_access, +}; + +const struct bpf_verifier_ops xdp_analyzer_ops = { + .is_valid_access = xdp_is_valid_access_analyzer, +}; + +const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; -const struct bpf_verifier_ops cg_skb_prog_ops = { +const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_inout_prog_ops = { +const struct bpf_verifier_ops lwt_inout_verifier_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops lwt_inout_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops lwt_xmit_prog_ops = { +const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, +}; + +const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; -const struct bpf_verifier_ops cg_sock_prog_ops = { +const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -const struct bpf_verifier_ops sock_ops_prog_ops = { +const struct bpf_prog_ops cg_sock_prog_ops = { +}; + +const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; -const struct bpf_verifier_ops sk_skb_prog_ops = { +const struct bpf_prog_ops sock_ops_prog_ops = { +}; + +const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; +const struct bpf_prog_ops sk_skb_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0a977373d003..1f5caafb4492 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <net/dsa.h> +#include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> @@ -115,6 +116,102 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +static void +skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *ctrl; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) + return; + + ctrl = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + target_container); + ctrl->addr_type = type; +} + +static void +__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct ip_tunnel_info *info; + struct ip_tunnel_key *key; + + /* A quick check to see if there might be something to do. 
*/ + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL) && + !dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS)) + return; + + info = skb_tunnel_info(skb); + if (!info) + return; + + key = &info->key; + + switch (ip_tunnel_info_af(info)) { + case AF_INET: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { + struct flow_dissector_key_ipv4_addrs *ipv4; + + ipv4 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, + target_container); + ipv4->src = key->u.ipv4.src; + ipv4->dst = key->u.ipv4.dst; + } + break; + case AF_INET6: + skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS, + flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *ipv6; + + ipv6 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, + target_container); + ipv6->src = key->u.ipv6.src; + ipv6->dst = key->u.ipv6.dst; + } + break; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { + struct flow_dissector_key_keyid *keyid; + + keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_KEYID, + target_container); + keyid->keyid = tunnel_id_to_key32(key->tun_id); + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { + struct flow_dissector_key_ports *tp; + + tp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ENC_PORTS, + target_container); + tp->src = key->tp_src; + tp->dst = key->tp_dst; + } +} + static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -478,6 +575,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + __skb_flow_dissect_tunnel_info(skb, flow_dissector, + target_container); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 1307731ddfe4..e7e626fb87bb 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, */ preempt_disable(); rcu_read_lock(); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); rcu_read_unlock(); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 927a6dcbad96..51d5836d8fb9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -391,10 +391,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); ret = dev_set_alias(netdev, buf, count); - rtnl_unlock(); return ret < 0 ? 
ret : len; } @@ -403,13 +400,12 @@ static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); + char tmp[IFALIASZ]; ssize_t ret = 0; - if (!rtnl_trylock()) - return restart_syscall(); - if (netdev->ifalias) - ret = sprintf(buf, "%s\n", netdev->ifalias); - rtnl_unlock(); + ret = dev_get_alias(netdev, tmp, sizeof(tmp)); + if (ret > 0) + ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -1488,7 +1484,10 @@ static void netdev_release(struct device *d) BUG_ON(dev->reg_state != NETREG_RELEASED); - kfree(dev->ifalias); + /* no need to wait for rcu grace period: + * device is dead and about to be freed. + */ + kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 1132820c8e62..f4e4fa2db505 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> +#include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a78fd61da0ec..20b550d07fe3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -453,7 +453,7 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; - list_for_each_entry(ops, &rtnl_af_ops, list) { + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } @@ -470,32 +470,22 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) void rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); - list_add_tail(&ops->list, &rtnl_af_ops); + list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_register); /** - * __rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. - * @ops: struct rtnl_af_ops * to unregister - * - * The caller must hold the rtnl_mutex. - */ -void __rtnl_af_unregister(struct rtnl_af_ops *ops) -{ - list_del(&ops->list); -} -EXPORT_SYMBOL_GPL(__rtnl_af_unregister); - -/** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. 
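Converting rtnl_af_ops to an RCU-protected list lets the fill and dump paths walk it without holding rtnl; as the following hunk shows, unregister pairs list_del_rcu() with synchronize_rcu() so an ops struct cannot be freed under a reader. The reader pattern, sketched as if it lived in rtnetlink.c (the wrapper function is hypothetical):

	static int example_fill_af(struct sk_buff *skb,
				   const struct net_device *dev,
				   int family, u32 ext_filter_mask)
	{
		const struct rtnl_af_ops *ops;
		int err = -EAFNOSUPPORT;

		rcu_read_lock();
		list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
			if (ops->family != family || !ops->fill_link_af)
				continue;
			/* ops stays valid until rcu_read_unlock() */
			err = ops->fill_link_af(skb, dev, ext_filter_mask);
			break;
		}
		rcu_read_unlock();
		return err;
	}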
* @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); - __rtnl_af_unregister(ops); + list_del_rcu(&ops->list); rtnl_unlock(); + + synchronize_rcu(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -508,13 +498,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } + rcu_read_unlock(); return size; } @@ -522,11 +514,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev, static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; + bool ret = false; - master_dev = netdev_master_upper_dev_get((struct net_device *) dev); + rcu_read_lock(); + + master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) - return true; - return false; + ret = true; + rcu_read_unlock(); + return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, @@ -923,6 +919,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1211,6 +1208,36 @@ nla_put_vfinfo_failure: return -EMSGSIZE; } +static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, + struct net_device *dev, + u32 ext_filter_mask) +{ + struct nlattr *vfinfo; + int i, num_vfs; + + if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) + return 0; + + num_vfs = dev_num_vf(dev->dev.parent); + if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) + return -EMSGSIZE; + + if (!dev->netdev_ops->ndo_get_vf_config) + return 0; + + vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); + if (!vfinfo) + return -EMSGSIZE; + + for (i = 0; i < num_vfs; i++) { + if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) + return -EMSGSIZE; + } + + nla_nest_end(skb, vfinfo); + return 0; +} + static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; @@ -1307,16 +1334,106 @@ static u32 rtnl_get_event(unsigned long event) return rtnl_event_type; } +static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev) +{ + const struct net_device *upper_dev; + int ret = 0; + + rcu_read_lock(); + + upper_dev = netdev_master_upper_dev_get_rcu(dev); + if (upper_dev) + ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); + + rcu_read_unlock(); + return ret; +} + +static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev) +{ + int ifindex = dev_get_iflink(dev); + + if (dev->ifindex == ifindex) + return 0; + + return nla_put_u32(skb, IFLA_LINK, ifindex); +} + +static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, + struct net_device *dev) +{ + char buf[IFALIASZ]; + int ret; + + ret = dev_get_alias(dev, buf, sizeof(buf)); + return ret > 0 ? 
nla_put_string(skb, IFLA_IFALIAS, buf) : 0; +} + +static int rtnl_fill_link_netnsid(struct sk_buff *skb, + const struct net_device *dev) +{ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { + struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); + + if (!net_eq(dev_net(dev), link_net)) { + int id = peernet2id_alloc(dev_net(dev), link_net); + + if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) + return -EMSGSIZE; + } + } + + return 0; +} + +static int rtnl_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, + u32 ext_filter_mask) +{ + const struct rtnl_af_ops *af_ops; + struct nlattr *af_spec; + + af_spec = nla_nest_start(skb, IFLA_AF_SPEC); + if (!af_spec) + return -EMSGSIZE; + + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { + struct nlattr *af; + int err; + + if (!af_ops->fill_link_af) + continue; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + return -EMSGSIZE; + + err = af_ops->fill_link_af(skb, dev, ext_filter_mask); + /* + * Caller may return ENODATA to indicate that there + * was no data to be dumped. This is not an error, it + * means we should trim the attribute header and + * continue. + */ + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + return -EMSGSIZE; + + nla_nest_end(skb, af); + } + + nla_nest_end(skb, af_spec); + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event) + u32 event, int *new_nsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; - struct nlattr *af_spec; - struct rtnl_af_ops *af_ops; - struct net_device *upper_dev = netdev_master_upper_dev_get(dev); ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1345,15 +1462,12 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif - (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || - (upper_dev && - nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) || + nla_put_iflink(skb, dev) || + put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (dev->qdisc && nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || - (dev->ifalias && - nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || + nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_changes)) || nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) @@ -1385,27 +1499,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; - if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) && - nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent))) + if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent && - ext_filter_mask & RTEXT_FILTER_VF) { - int i; - struct nlattr *vfinfo; - int num_vfs = dev_num_vf(dev->dev.parent); - - vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); - if (!vfinfo) - goto nla_put_failure; - for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo)) - goto nla_put_failure; - } - - nla_nest_end(skb, vfinfo); - } - if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; @@ -1417,51 +1513,23 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; } - if (dev->rtnl_link_ops && - 
dev->rtnl_link_ops->get_link_net) { - struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); - - if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id_alloc(dev_net(dev), link_net); - - if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) - goto nla_put_failure; - } - } - - if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC))) + if (rtnl_fill_link_netnsid(skb, dev)) goto nla_put_failure; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { - if (af_ops->fill_link_af) { - struct nlattr *af; - int err; - - if (!(af = nla_nest_start(skb, af_ops->family))) - goto nla_put_failure; - - err = af_ops->fill_link_af(skb, dev, ext_filter_mask); - - /* - * Caller may return ENODATA to indicate that there - * was no data to be dumped. This is not an error, it - * means we should trim the attribute header and - * continue. - */ - if (err == -ENODATA) - nla_nest_cancel(skb, af); - else if (err < 0) - goto nla_put_failure; - - nla_nest_end(skb, af); - } - } + if (new_nsid && + nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) + goto nla_put_failure; - nla_nest_end(skb, af_spec); + rcu_read_lock(); + if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) + goto nla_put_failure_rcu; + rcu_read_unlock(); nlmsg_end(skb, nlh); return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; @@ -1658,7 +1726,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask, 0, NULL); if (err < 0) { if (likely(skb->len)) @@ -1723,17 +1791,27 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) + rcu_read_lock(); + af_ops = rtnl_af_lookup(nla_type(af)); + if (!af_ops) { + rcu_read_unlock(); return -EAFNOSUPPORT; + } - if (!af_ops->set_link_af) + if (!af_ops->set_link_af) { + rcu_read_unlock(); return -EOPNOTSUPP; + } if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); return err; + } } + + rcu_read_unlock(); } } @@ -1909,7 +1987,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) return err; } -static int do_set_master(struct net_device *dev, int ifindex) +static int do_set_master(struct net_device *dev, int ifindex, + struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; @@ -1934,7 +2013,7 @@ static int do_set_master(struct net_device *dev, int ifindex) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { - err = ops->ndo_add_slave(upper_dev, dev); + err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { @@ -2067,7 +2146,7 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -2190,13 +2269,18 @@ static int do_setlink(const struct sk_buff *skb, nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; + rcu_read_lock(); + if (!(af_ops = rtnl_af_lookup(nla_type(af)))) BUG(); err = af_ops->set_link_af(dev, af); - if (err < 0) + if (err < 0) { + rcu_read_unlock(); goto errout; + } + rcu_read_unlock(); status |= DO_SETLINK_NOTIFY; } } @@ -2576,12 +2660,6 @@ replay: 
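do_set_master() now forwards extack into ndo_add_slave(), so enslavement failures can report a reason to the requesting userspace instead of a bare errno. A hedged sketch of a master driver adopting the new signature — the rejection rule is hypothetical:

	static int example_add_slave(struct net_device *master,
				     struct net_device *slave,
				     struct netlink_ext_ack *extack)
	{
		if (is_vlan_dev(slave)) {
			NL_SET_ERR_MSG(extack,
				       "example: VLAN devices cannot be ports");
			return -EOPNOTSUPP;
		}

		/* the lower/upper link also carries extack now
		 * (see the dev.c hunks above)
		 */
		return netdev_master_upper_dev_link(slave, master,
						    NULL, NULL, extack);
	}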
return err; slave_data = slave_attr; } - if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data, - extack); - if (err < 0) - return err; - } } if (dev) { @@ -2711,7 +2789,8 @@ replay: goto out_unregister; } if (tb[IFLA_MASTER]) { - err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), + extack); if (err) goto out_unregister; } @@ -2771,7 +2850,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2856,7 +2935,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags) + u32 event, gfp_t flags, int *new_nsid) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2867,7 +2946,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event, + new_nsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2890,14 +2970,14 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags) + gfp_t flags, int *new_nsid) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -2905,10 +2985,17 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); } EXPORT_SYMBOL(rtmsg_ifinfo); +void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, + gfp_t flags, int *new_nsid) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + new_nsid); +} + static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, @@ -3014,21 +3101,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); -static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, + struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", - vid); + NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } @@ -3053,24 +3140,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = 
__dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3157,24 +3244,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3614,7 +3701,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } @@ -3689,7 +3776,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } @@ -3854,6 +3941,9 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, return -EMSGSIZE; ifsm = nlmsg_data(nlh); + ifsm->family = PF_UNSPEC; + ifsm->pad1 = 0; + ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; @@ -3937,25 +4027,30 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, if (!attr) goto nla_put_failure; - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; int err; af = nla_nest_start(skb, af_ops->family); - if (!af) + if (!af) { + rcu_read_unlock(); goto nla_put_failure; - + } err = af_ops->fill_stats_af(skb, dev); - if (err == -ENODATA) + if (err == -ENODATA) { nla_nest_cancel(skb, af); - else if (err < 0) + } else if (err < 0) { + rcu_read_unlock(); goto nla_put_failure; + } nla_nest_end(skb, af); } } + rcu_read_unlock(); nla_nest_end(skb, attr); @@ -4024,7 +4119,8 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); - list_for_each_entry(af_ops, &rtnl_af_ops, list) { + rcu_read_lock(); + list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); @@ -4033,6 +4129,7 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, size += nla_total_size(0); } } + rcu_read_unlock(); } return size; @@ -4284,7 +4381,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, 
rtnl_get_event(event), - GFP_KERNEL); + GFP_KERNEL, NULL); break; default: break; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 000ce735fa8d..40717501cbdd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1350,8 +1350,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) /* Set the tail pointer and length */ skb_put(n, skb->len); - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); copy_skb_header(n, skb); return n; @@ -1449,8 +1448,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, BUG_ON(nhead < 0); - if (skb_shared(skb)) - BUG(); + BUG_ON(skb_shared(skb)); size = SKB_DATA_ALIGN(size); @@ -1509,6 +1507,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + skb_metadata_clear(skb); + /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -1593,9 +1593,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)); copy_skb_header(n, skb); @@ -1876,8 +1875,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) return NULL; } - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) - BUG(); + BUG_ON(skb_copy_bits(skb, skb_headlen(skb), + skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. 
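The rtnetlink changes above consistently replace pr_info() log lines with extended-ack strings, so the error text travels back to the requesting netlink socket together with the errno instead of landing in the kernel log. A minimal sketch of that pattern, assuming a hypothetical foo_validate() helper that is not part of this series:

#include <linux/netlink.h>
#include <net/netlink.h>

static int foo_validate(const struct nlattr *attr,
			struct netlink_ext_ack *extack)
{
	/* NL_SET_ERR_MSG() attaches a constant string to the ack that
	 * userspace receives along with the error code; no pr_info()
	 * noise is needed any more.
	 */
	if (!attr || nla_len(attr) != sizeof(u16)) {
		NL_SET_ERR_MSG(extack, "invalid or missing u16 attribute");
		return -EINVAL;
	}

	return 0;
}

The extack pointer is simply threaded down from the doit() handler, which is why so many of the hunks above only add a struct netlink_ext_ack * parameter.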
@@ -4765,6 +4764,7 @@ EXPORT_SYMBOL(kfree_skb_partial); bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) { + struct skb_shared_info *to_shinfo, *from_shinfo; int i, delta, len = from->len; *fragstolen = false; @@ -4779,7 +4779,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, return true; } - if (skb_has_frag_list(to) || skb_has_frag_list(from)) + to_shinfo = skb_shinfo(to); + from_shinfo = skb_shinfo(from); + if (to_shinfo->frag_list || from_shinfo->frag_list) return false; if (skb_zcopy(to) || skb_zcopy(from)) return false; @@ -4788,8 +4790,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, struct page *page; unsigned int offset; - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags >= MAX_SKB_FRAGS) return false; if (skb_head_is_locked(from)) @@ -4800,12 +4802,12 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, page = virt_to_head_page(from->head); offset = from->data - (unsigned char *)page_address(page); - skb_fill_page_desc(to, skb_shinfo(to)->nr_frags, + skb_fill_page_desc(to, to_shinfo->nr_frags, page, offset, skb_headlen(from)); *fragstolen = true; } else { - if (skb_shinfo(to)->nr_frags + - skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS) + if (to_shinfo->nr_frags + + from_shinfo->nr_frags > MAX_SKB_FRAGS) return false; delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); @@ -4813,19 +4815,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, WARN_ON_ONCE(delta < len); - memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags, - skb_shinfo(from)->frags, - skb_shinfo(from)->nr_frags * sizeof(skb_frag_t)); - skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags; + memcpy(to_shinfo->frags + to_shinfo->nr_frags, + from_shinfo->frags, + from_shinfo->nr_frags * sizeof(skb_frag_t)); + to_shinfo->nr_frags += from_shinfo->nr_frags; if (!skb_cloned(from)) - skb_shinfo(from)->nr_frags = 0; + from_shinfo->nr_frags = 0; /* if the skb is not cloned this does nothing * since we set nr_frags to 0. 
*/ - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) - skb_frag_ref(from, i); + for (i = 0; i < from_shinfo->nr_frags; i++) + __skb_frag_ref(&from_shinfo->frags[i]); to->truesize += delta; to->len += len; diff --git a/net/core/sock.c b/net/core/sock.c index 9b7b6bbb2a23..35656a9e4e44 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1654,6 +1654,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); + newsk->sk_prot_creator = sk->sk_prot; + /* SANITY */ if (likely(newsk->sk_net_refcnt)) get_net(sock_net(newsk)); @@ -1682,13 +1684,16 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_reset_flag(newsk, SOCK_DONE); - filter = rcu_dereference_protected(newsk->sk_filter, 1); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) /* though it's an empty new sock, the charging may fail * if sysctl_optmem_max was changed between creation of * original socket and cloning */ is_charged = sk_filter_charge(newsk, filter); + RCU_INIT_POINTER(newsk->sk_filter, filter); + rcu_read_unlock(); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { /* We need to make sure that we don't uncharge the new @@ -2678,7 +2683,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk_init_common(sk); sk->sk_send_head = NULL; - init_timer(&sk->sk_timer); + timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; diff --git a/net/dccp/input.c b/net/dccp/input.c index fa6be9750bb4..d28d46bff6ab 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -534,6 +534,7 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, case DCCP_PKT_DATA: if (sk->sk_state == DCCP_RESPOND) break; + /* fall through */ case DCCP_PKT_DATAACK: case DCCP_PKT_ACK: /* diff --git a/net/dccp/options.c b/net/dccp/options.c index 51cdfc3bd8ca..4e40db017e19 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -227,8 +227,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, * Ack vectors are processed by the TX CCID if it is * interested. The RX CCID need not parse Ack Vectors, * since it is only interested in clearing old state. - * Fall through. */ + /* fall through */ case DCCPO_MIN_TX_CCID_SPECIFIC ... 
DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3a2c34027758..1e35526bf436 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -125,10 +125,11 @@ static void dccp_retransmit_timer(struct sock *sk) __sk_dst_reset(sk); } -static void dccp_write_timer(unsigned long data) +static void dccp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; int event = 0; bh_lock_sock(sk); @@ -161,19 +162,20 @@ out: sock_put(sk); } -static void dccp_keepalive_timer(unsigned long data) +static void dccp_keepalive_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); pr_err("dccp should not use a keepalive timer !\n"); sock_put(sk); } /* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(unsigned long data) +static void dccp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 73a0399dc7a2..518cea17b811 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -533,10 +533,6 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf scp->keepalive = 10 * HZ; scp->keepalive_fxn = dn_keepalive; - init_timer(&scp->delack_timer); - scp->delack_pending = 0; - scp->delack_fxn = dn_nsp_delayed_ack; - dn_start_slow_timer(sk); out: return sk; @@ -634,10 +630,12 @@ static void dn_destroy_sock(struct sock *sk) goto disc_reject; case DN_RUN: scp->state = DN_DI; + /* fall through */ case DN_DI: case DN_DR: disc_reject: dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); + /* fall through */ case DN_NC: case DN_NR: case DN_RJ: @@ -651,6 +649,7 @@ disc_reject: break; default: printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); + /* fall through */ case DN_O: dn_stop_slow_timer(sk); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 4d339de56862..92dbaa3f1eae 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1038,14 +1038,14 @@ static void dn_eth_down(struct net_device *dev) static void dn_dev_set_timer(struct net_device *dev); -static void dn_dev_timer_func(unsigned long arg) +static void dn_dev_timer_func(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; - struct dn_dev *dn_db; + struct dn_dev *dn_db = from_timer(dn_db, t, timer); + struct net_device *dev; struct dn_ifaddr *ifa; rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); + dev = dn_db->dev; if (dn_db->t3 <= dn_db->parms.t2) { if (dn_db->parms.timer3) { for (ifa = rcu_dereference(dn_db->ifa_list); @@ -1070,8 +1070,6 @@ static void dn_dev_set_timer(struct net_device *dev) if (dn_db->parms.t2 > dn_db->parms.t3) dn_db->parms.t2 = dn_db->parms.t3; - dn_db->timer.data = (unsigned long)dev; - dn_db->timer.function = dn_dev_timer_func; dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); add_timer(&dn_db->timer); @@ -1100,7 +1098,7 @@ static struct dn_dev *dn_dev_create(struct net_device *dev, int *err) 
rcu_assign_pointer(dev->dn_ptr, dn_db); dn_db->dev = dev; - init_timer(&dn_db->timer); + timer_setup(&dn_db->timer, dn_dev_timer_func, 0); dn_db->uptime = jiffies; diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index 66f035e476ea..e50a4adfcf7e 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -491,17 +491,6 @@ void dn_send_conn_ack (struct sock *sk) dn_nsp_send(skb); } -void dn_nsp_delayed_ack(struct sock *sk) -{ - struct dn_scp *scp = DN_SK(sk); - - if (scp->ackxmt_oth != scp->numoth_rcv) - dn_nsp_send_oth_ack(sk); - - if (scp->ackxmt_dat != scp->numdat_rcv) - dn_nsp_send_data_ack(sk); -} - static int dn_nsp_retrans_conn_conf(struct sock *sk) { struct dn_scp *scp = DN_SK(sk); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 0bd3afd01dd2..bff5ab88cdbb 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -338,7 +338,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); - dst_use(&rth->dst, now); + dst_hold_and_use(&rth->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); dst_release_immediate(&rt->dst); @@ -351,7 +351,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou rcu_assign_pointer(rt->dst.dn_next, dn_rt_hash_table[hash].chain); rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); - dst_use(&rt->dst, now); + dst_hold_and_use(&rt->dst, now); spin_unlock_bh(&dn_rt_hash_table[hash].lock); *rp = rt; return 0; @@ -1258,7 +1258,7 @@ static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn * (flp->flowidn_mark == rt->fld.flowidn_mark) && dn_is_output_route(rt) && (rt->fld.flowidn_oif == flp->flowidn_oif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock_bh(); *pprt = &rt->dst; return 0; @@ -1535,7 +1535,7 @@ static int dn_route_input(struct sk_buff *skb) (rt->fld.flowidn_oif == 0) && (rt->fld.flowidn_mark == skb->mark) && (rt->fld.flowidn_iif == cb->iif)) { - dst_use(&rt->dst, jiffies); + dst_hold_and_use(&rt->dst, jiffies); rcu_read_unlock(); skb_dst_set(skb, (struct dst_entry *)rt); return 0; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 81c852e32821..a3abf7a7b9a2 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,6 +14,7 @@ #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/notifier.h> #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_platform.h> @@ -160,12 +161,12 @@ EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { - struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct pcpu_sw_netstats *s; struct dsa_slave_priv *p; - if (unlikely(dst == NULL)) { + if (unlikely(!cpu_dp)) { kfree_skb(skb); return 0; } @@ -174,7 +175,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = dst->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev, pt); if (!nskb) { kfree_skb(skb); return 0; @@ -200,7 +201,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, #ifdef CONFIG_PM_SLEEP static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { - return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; + return ds->enabled_port_mask & (1 << p) && ds->ports[p].slave; } int dsa_switch_suspend(struct 
dsa_switch *ds) @@ -212,7 +213,7 @@ int dsa_switch_suspend(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_suspend(ds->ports[i].netdev); + ret = dsa_slave_suspend(ds->ports[i].slave); if (ret) return ret; } @@ -239,7 +240,7 @@ int dsa_switch_resume(struct dsa_switch *ds) if (!dsa_is_port_initialized(ds, i)) continue; - ret = dsa_slave_resume(ds->ports[i].netdev); + ret = dsa_slave_resume(ds->ports[i].slave); if (ret) return ret; } @@ -261,6 +262,28 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } +static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); + +int register_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(register_dsa_notifier); + +int unregister_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_dsa_notifier); + +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + info->dev = dev; + return atomic_notifier_call_chain(&dsa_notif_chain, val, info); +} +EXPORT_SYMBOL_GPL(call_dsa_notifiers); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index dcccaebde708..9e8b8aab049d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -279,7 +279,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) { dev_warn(ds->dev, "Failed to create slave %d: %d\n", port->index, err); - port->netdev = NULL; + port->slave = NULL; return err; } @@ -289,7 +289,7 @@ static int dsa_user_port_apply(struct dsa_port *port) if (err) return err; - devlink_port_type_eth_set(&port->devlink_port, port->netdev); + devlink_port_type_eth_set(&port->devlink_port, port->slave); return 0; } @@ -297,9 +297,9 @@ static int dsa_user_port_apply(struct dsa_port *port) static void dsa_user_port_unapply(struct dsa_port *port) { devlink_port_unregister(&port->devlink_port); - if (port->netdev) { - dsa_slave_destroy(port->netdev); - port->netdev = NULL; + if (port->slave) { + dsa_slave_destroy(port->slave); + port->slave = NULL; port->ds->enabled_port_mask &= ~(1 << port->index); } } @@ -336,12 +336,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; - if (ds->ops->set_addr) { - err = ds->ops->set_addr(ds, dst->cpu_dp->netdev->dev_addr); - if (err < 0) - return err; - } - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) @@ -438,9 +432,9 @@ static int dsa_dst_apply(struct dsa_switch_tree *dst) * sent to the tag format's receive function. 
*/ wmb(); - dst->cpu_dp->netdev->dsa_ptr = dst; + dst->cpu_dp->master->dsa_ptr = dst->cpu_dp; - err = dsa_master_ethtool_setup(dst->cpu_dp->netdev); + err = dsa_master_ethtool_setup(dst->cpu_dp->master); if (err) return err; @@ -457,9 +451,9 @@ static void dsa_dst_unapply(struct dsa_switch_tree *dst) if (!dst->applied) return; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -485,6 +479,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, struct dsa_switch_tree *dst, struct dsa_switch *ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; struct net_device *ethernet_dev; struct device_node *ethernet; @@ -504,7 +499,7 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, if (!dst->cpu_dp) { dst->cpu_dp = port; - dst->cpu_dp->netdev = ethernet_dev; + dst->cpu_dp->master = ethernet_dev; } /* Initialize cpu_port_mask now for drv->setup() @@ -514,14 +509,18 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, ds->cpu_port_mask |= BIT(index); tag_protocol = ds->ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); ds->cpu_port_mask &= ~BIT(index); - return PTR_ERR(dst->tag_ops); + return PTR_ERR(tag_ops); } - dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->tag_ops = tag_ops; + + /* Make a few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; + dst->cpu_dp->dst = dst; return 0; } diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 0298a0f6a349..1e9914062d0b 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -66,7 +66,7 @@ struct dsa_notifier_vlan_info { }; struct dsa_slave_priv { - /* Copy of dp->ds->dst->tag_ops->xmit for faster access in hot path */ + /* Copy of CPU port xmit for faster access in slave transmit hot path */ struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -79,7 +79,6 @@ struct dsa_slave_priv { * The phylib phy_device pointer for the PHY connected * to this port. 
*/ - struct phy_device *phy; phy_interface_t phy_interface; int old_link; int old_pause; @@ -114,6 +113,26 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int dsa_master_ethtool_setup(struct net_device *dev); void dsa_master_ethtool_restore(struct net_device *dev); +static inline struct net_device *dsa_master_find_slave(struct net_device *dev, + int device, int port) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_switch *ds; + + if (device < 0 || device >= DSA_MAX_SWITCHES) + return NULL; + + ds = dst->ds[device]; + if (!ds) + return NULL; + + if (port < 0 || port >= ds->num_ports) + return NULL; + + return ds->ports[port].slave; +} + /* port.c */ int dsa_port_set_state(struct dsa_port *dp, u8 state, struct switchdev_trans *trans); @@ -150,6 +169,21 @@ int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + return p->dp; +} + +static inline struct net_device * +dsa_slave_to_master(const struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dp->cpu_dp->master; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); @@ -178,14 +212,4 @@ extern const struct dsa_device_ops qca_netdev_ops; /* tag_trailer.c */ extern const struct dsa_device_ops trailer_netdev_ops; -static inline struct net_device *dsa_master_netdev(struct dsa_slave_priv *p) -{ - return p->dp->cpu_dp->netdev; -} - -static inline struct dsa_port *dsa_get_cpu_port(struct dsa_switch_tree *dst) -{ - return dst->cpu_dp; -} - #endif diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index ae505d8e4417..b6c88fd33d4f 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -120,7 +120,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, return -EINVAL; } dst->cpu_dp = &ds->ports[i]; - dst->cpu_dp->netdev = master; + dst->cpu_dp->master = master; ds->cpu_port_mask |= 1 << i; } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; @@ -144,14 +144,19 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, * switch. 
*/ if (dst->cpu_dp->ds == ds) { + const struct dsa_device_ops *tag_ops; enum dsa_tag_protocol tag_protocol; tag_protocol = ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) - return PTR_ERR(dst->tag_ops); + tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(tag_ops)) + return PTR_ERR(tag_ops); - dst->rcv = dst->tag_ops->rcv; + dst->cpu_dp->tag_ops = tag_ops; + + /* Few copies for faster access in master receive hot path */ + dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv; + dst->cpu_dp->dst = dst; } memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); @@ -167,12 +172,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (ret) return ret; - if (ops->set_addr) { - ret = ops->set_addr(ds, master->dev_addr); - if (ret < 0) - return ret; - } - if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) @@ -262,10 +261,10 @@ static void dsa_switch_destroy(struct dsa_switch *ds) if (!(ds->enabled_port_mask & (1 << port))) continue; - if (!ds->ports[port].netdev) + if (!ds->ports[port].slave) continue; - dsa_slave_destroy(ds->ports[port].netdev); + dsa_slave_destroy(ds->ports[port].slave); } /* Disable configuration of the CPU and DSA ports */ @@ -600,9 +599,9 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, * sent to the tag format's receive function. */ wmb(); - dev->dsa_ptr = dst; + dev->dsa_ptr = dst->cpu_dp; - return dsa_master_ethtool_setup(dst->cpu_dp->netdev); + return dsa_master_ethtool_setup(dst->cpu_dp->master); } static int dsa_probe(struct platform_device *pdev) @@ -667,9 +666,9 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; - dsa_master_ethtool_restore(dst->cpu_dp->netdev); + dsa_master_ethtool_restore(dst->cpu_dp->master); - dst->cpu_dp->netdev->dsa_ptr = NULL; + dst->cpu_dp->master->dsa_ptr = NULL; /* If we used a tagging format that doesn't have an ethertype * field, make sure that all packets from this point get sent @@ -684,7 +683,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) dsa_switch_destroy(ds); } - dev_put(dst->cpu_dp->netdev); + dev_put(dst->cpu_dp->master); } static int dsa_remove(struct platform_device *pdev) @@ -741,8 +740,7 @@ int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], const unsigned char *addr, u16 vid, u16 flags) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); return dsa_port_fdb_add(dp, addr, vid); } @@ -751,8 +749,7 @@ int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); return dsa_port_fdb_del(dp, addr, vid); } diff --git a/net/dsa/master.c b/net/dsa/master.c index 5e5147ec5a44..5f3f57e372e0 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -16,10 +16,10 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int count = 0; if (ops && ops->get_sset_count && 
ops->get_ethtool_stats) { @@ -28,15 +28,14 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev, } if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, port->index, data + count); + ds->ops->get_ethtool_stats(ds, port, data + count); } static int dsa_master_get_sset_count(struct net_device *dev, int sset) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; int count = 0; if (ops && ops->get_sset_count) @@ -51,17 +50,17 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; - const struct ethtool_ops *ops = port->orig_ethtool_ops; + struct dsa_port *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch *ds = cpu_dp->ds; + int port = cpu_dp->index; int len = ETH_GSTRING_LEN; int mcount = 0, count; unsigned int i; uint8_t pfx[4]; uint8_t *ndata; - snprintf(pfx, sizeof(pfx), "p%.2d", port->index); + snprintf(pfx, sizeof(pfx), "p%.2d", port); /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; @@ -76,7 +75,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, * the output after to prepend our CPU port prefix we * constructed earlier */ - ds->ops->get_strings(ds, port->index, ndata); + ds->ops->get_strings(ds, port, ndata); count = ds->ops->get_sset_count(ds); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), @@ -88,18 +87,17 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, int dsa_master_ethtool_setup(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; - struct dsa_switch *ds = port->ds; + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; - port->orig_ethtool_ops = dev->ethtool_ops; - if (port->orig_ethtool_ops) - memcpy(ops, port->orig_ethtool_ops, sizeof(*ops)); + cpu_dp->orig_ethtool_ops = dev->ethtool_ops; + if (cpu_dp->orig_ethtool_ops) + memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops)); ops->get_sset_count = dsa_master_get_sset_count; ops->get_ethtool_stats = dsa_master_get_ethtool_stats; @@ -112,9 +110,8 @@ int dsa_master_ethtool_setup(struct net_device *dev) void dsa_master_ethtool_restore(struct net_device *dev) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *port = dst->cpu_dp; + struct dsa_port *cpu_dp = dev->dsa_ptr; - dev->ethtool_ops = port->orig_ethtool_ops; - port->orig_ethtool_ops = NULL; + dev->ethtool_ops = cpu_dp->orig_ethtool_ops; + cpu_dp->orig_ethtool_ops = NULL; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index bd51ef56ec5b..6906de0f0050 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -64,16 +64,13 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) /* slave device handling ****************************************************/ static int dsa_slave_get_iflink(const struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - - return 
dsa_master_netdev(p)->ifindex; + return dsa_slave_to_master(dev)->ifindex; } static int dsa_slave_open(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); int err; if (!(master->flags & IFF_UP)) @@ -96,12 +93,12 @@ static int dsa_slave_open(struct net_device *dev) goto clear_allmulti; } - err = dsa_port_enable(dp, p->phy); + err = dsa_port_enable(dp, dev->phydev); if (err) goto clear_promisc; - if (p->phy) - phy_start(p->phy); + if (dev->phydev) + phy_start(dev->phydev); return 0; @@ -120,14 +117,13 @@ out: static int dsa_slave_close(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); - struct dsa_port *dp = p->dp; + struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); - if (p->phy) - phy_stop(p->phy); + if (dev->phydev) + phy_stop(dev->phydev); - dsa_port_disable(dp, p->phy); + dsa_port_disable(dp, dev->phydev); dev_mc_unsync(master, dev); dev_uc_unsync(master, dev); @@ -144,8 +140,7 @@ static int dsa_slave_close(struct net_device *dev) static void dsa_slave_change_rx_flags(struct net_device *dev, int change) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); if (change & IFF_ALLMULTI) dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); @@ -155,8 +150,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change) static void dsa_slave_set_rx_mode(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); dev_mc_sync(master, dev); dev_uc_sync(master, dev); @@ -164,8 +158,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev) static int dsa_slave_set_mac_address(struct net_device *dev, void *a) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); + struct net_device *master = dsa_slave_to_master(dev); struct sockaddr *addr = a; int err; @@ -246,14 +239,13 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; int err; err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump); @@ -264,20 +256,17 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return phy_mii_ioctl(p->phy, ifr, cmd); + if (!dev->phydev) + return -ENODEV; - return -EOPNOTSUPP; + return phy_mii_ioctl(dev->phydev, ifr, cmd); } static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int ret; switch (attr->id) { @@ -303,8 +292,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, const struct switchdev_obj *obj, struct 
switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err; /* For the prepare phase, ensure the full set of changes is feasible in @@ -331,8 +319,7 @@ static int dsa_slave_port_obj_add(struct net_device *dev, static int dsa_slave_port_obj_del(struct net_device *dev, const struct switchdev_obj *obj) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err; switch (obj->id) { @@ -353,8 +340,8 @@ static int dsa_slave_port_obj_del(struct net_device *dev, static int dsa_slave_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_PARENT_ID: @@ -415,38 +402,13 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ - nskb->dev = dsa_master_netdev(p); + nskb->dev = dsa_slave_to_master(dev); dev_queue_xmit(nskb); return NETDEV_TX_OK; } /* ethtool operations *******************************************************/ -static int -dsa_slave_get_link_ksettings(struct net_device *dev, - struct ethtool_link_ksettings *cmd) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (!p->phy) - return -EOPNOTSUPP; - - phy_ethtool_ksettings_get(p->phy, cmd); - - return 0; -} - -static int -dsa_slave_set_link_ksettings(struct net_device *dev, - const struct ethtool_link_ksettings *cmd) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return phy_ethtool_ksettings_set(p->phy, cmd); - - return -EOPNOTSUPP; -} static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) @@ -458,11 +420,11 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, static int dsa_slave_get_regs_len(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) - return ds->ops->get_regs_len(ds, p->dp->index); + return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } @@ -470,39 +432,27 @@ static int dsa_slave_get_regs_len(struct net_device *dev) static void dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) - ds->ops->get_regs(ds, p->dp->index, regs, _p); -} - -static int dsa_slave_nway_reset(struct net_device *dev) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - if (p->phy != NULL) - return genphy_restart_aneg(p->phy); - - return -EOPNOTSUPP; + ds->ops->get_regs(ds, dp->index, regs, _p); } static u32 dsa_slave_get_link(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + if (!dev->phydev) + return -ENODEV; - if (p->phy != NULL) { - genphy_update_link(p->phy); - return p->phy->link; - } + genphy_update_link(dev->phydev); - return -EOPNOTSUPP; + return dev->phydev->link; } static int dsa_slave_get_eeprom_len(struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = 
dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; @@ -516,8 +466,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) static int dsa_slave_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); @@ -528,8 +478,8 @@ static int dsa_slave_get_eeprom(struct net_device *dev, static int dsa_slave_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); @@ -540,8 +490,8 @@ static int dsa_slave_set_eeprom(struct net_device *dev, static void dsa_slave_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; @@ -551,7 +501,7 @@ static void dsa_slave_get_strings(struct net_device *dev, strncpy(data + 2 * len, "rx_packets", len); strncpy(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) - ds->ops->get_strings(ds, p->dp->index, data + 4 * len); + ds->ops->get_strings(ds, dp->index, data + 4 * len); } } @@ -559,8 +509,9 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; @@ -582,13 +533,13 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, p->dp->index, data + 4); + ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_slave_get_sset_count(struct net_device *dev, int sset) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count; @@ -605,77 +556,77 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (ds->ops->get_wol) - ds->ops->get_wol(ds, p->dp->index, w); + ds->ops->get_wol(ds, dp->index, w); } static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; if (ds->ops->set_wol) - ret = ds->ops->set_wol(ds, p->dp->index, w); + ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct 
dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->set_mac_eee(ds, p->dp->index, e); + ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; if (e->eee_enabled) { - ret = phy_init_eee(p->phy, 0); + ret = phy_init_eee(dev->phydev, 0); if (ret) return ret; } - return phy_ethtool_set_eee(p->phy, e); + return phy_ethtool_set_eee(dev->phydev, e); } static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; int ret; /* Port's PHY and MAC both need to be EEE capable */ - if (!p->phy) + if (!dev->phydev) return -ENODEV; if (!ds->ops->get_mac_eee) return -EOPNOTSUPP; - ret = ds->ops->get_mac_eee(ds, p->dp->index, e); + ret = ds->ops->get_mac_eee(ds, dp->index, e); if (ret) return ret; - return phy_ethtool_get_eee(p->phy, e); + return phy_ethtool_get_eee(dev->phydev, e); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_slave_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { + struct net_device *master = dsa_slave_to_master(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct net_device *master = dsa_master_netdev(p); struct netpoll *netpoll; int err = 0; @@ -715,9 +666,9 @@ static void dsa_slave_poll_controller(struct net_device *dev) static int dsa_slave_get_phys_port_name(struct net_device *dev, char *name, size_t len) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); - if (snprintf(name, len, "p%d", p->dp->index) >= len) + if (snprintf(name, len, "p%d", dp->index) >= len) return -EINVAL; return 0; @@ -740,14 +691,15 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; __be16 protocol = cls->common.protocol; - struct dsa_switch *ds = p->dp->ds; struct net *net = dev_net(dev); - struct dsa_slave_priv *to_p; + struct dsa_switch *ds = dp->ds; struct net_device *to_dev; const struct tc_action *a; + struct dsa_port *to_dp; int err = -EOPNOTSUPP; LIST_HEAD(actions); int ifindex; @@ -780,13 +732,12 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; - to_p = netdev_priv(to_dev); + to_dp = dsa_slave_to_port(to_dev); - mirror->to_local_port = to_p->dp->index; + mirror->to_local_port = to_dp->index; mirror->ingress = ingress; - err = ds->ops->port_mirror_add(ds, p->dp->index, mirror, - ingress); + err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress); if (err) { kfree(mall_tc_entry); return err; @@ -801,9 +752,9 @@ static int dsa_slave_add_cls_matchall(struct net_device *dev, static void dsa_slave_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; if (!ds->ops->port_mirror_del) return; @@ -816,8 +767,7 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev, switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: 
- ds->ops->port_mirror_del(ds, p->dp->index, - &mall_tc_entry->mirror); + ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; default: WARN_ON(1); @@ -894,32 +844,32 @@ static void dsa_slave_get_stats64(struct net_device *dev, static int dsa_slave_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; - return ds->ops->get_rxnfc(ds, p->dp->index, nfc, rule_locs); + return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_slave_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; - return ds->ops->set_rxnfc(ds, p->dp->index, nfc); + return ds->ops->set_rxnfc(ds, dp->index, nfc); } static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, .get_regs = dsa_slave_get_regs, - .nway_reset = dsa_slave_nway_reset, + .nway_reset = phy_ethtool_nway_reset, .get_link = dsa_slave_get_link, .get_eeprom_len = dsa_slave_get_eeprom_len, .get_eeprom = dsa_slave_get_eeprom, @@ -931,8 +881,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, .get_eee = dsa_slave_get_eee, - .get_link_ksettings = dsa_slave_get_link_ksettings, - .set_link_ksettings = dsa_slave_set_link_ksettings, + .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, }; @@ -972,43 +922,44 @@ static struct device_type dsa_type = { static void dsa_slave_adjust_link(struct net_device *dev) { + struct dsa_port *dp = dsa_slave_to_port(dev); struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; unsigned int status_changed = 0; - if (p->old_link != p->phy->link) { + if (p->old_link != dev->phydev->link) { status_changed = 1; - p->old_link = p->phy->link; + p->old_link = dev->phydev->link; } - if (p->old_duplex != p->phy->duplex) { + if (p->old_duplex != dev->phydev->duplex) { status_changed = 1; - p->old_duplex = p->phy->duplex; + p->old_duplex = dev->phydev->duplex; } - if (p->old_pause != p->phy->pause) { + if (p->old_pause != dev->phydev->pause) { status_changed = 1; - p->old_pause = p->phy->pause; + p->old_pause = dev->phydev->pause; } if (ds->ops->adjust_link && status_changed) - ds->ops->adjust_link(ds, p->dp->index, p->phy); + ds->ops->adjust_link(ds, dp->index, dev->phydev); if (status_changed) - phy_print_status(p->phy); + phy_print_status(dev->phydev); } static int dsa_slave_fixed_link_update(struct net_device *dev, struct fixed_phy_status *status) { - struct dsa_slave_priv *p; struct dsa_switch *ds; + struct dsa_port *dp; if (dev) { - p = netdev_priv(dev); - ds = p->dp->ds; + dp = dsa_slave_to_port(dev); + ds = dp->ds; if (ds->ops->fixed_link_update) - ds->ops->fixed_link_update(ds, p->dp->index, status); + ds->ops->fixed_link_update(ds, dp->index, status); } return 0; @@ -1017,32 +968,35 @@ static int dsa_slave_fixed_link_update(struct net_device *dev, /* slave device setup 
*******************************************************/ static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_switch *ds = p->dp->ds; + struct dsa_switch *ds = dp->ds; - p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr); - if (!p->phy) { + slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr); + if (!slave_dev->phydev) { netdev_err(slave_dev, "no phy at %d\n", addr); return -ENODEV; } /* Use already configured phy mode */ if (p->phy_interface == PHY_INTERFACE_MODE_NA) - p->phy_interface = p->phy->interface; - return phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link, - p->phy_interface); + p->phy_interface = slave_dev->phydev->interface; + + return phy_connect_direct(slave_dev, slave_dev->phydev, + dsa_slave_adjust_link, p->phy_interface); } static int dsa_slave_phy_setup(struct net_device *slave_dev) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct dsa_switch *ds = p->dp->ds; - struct device_node *phy_dn, *port_dn; + struct device_node *port_dn = dp->dn; + struct dsa_switch *ds = dp->ds; + struct device_node *phy_dn; bool phy_is_fixed = false; u32 phy_flags = 0; int mode, ret; - port_dn = p->dp->dn; mode = of_get_phy_mode(port_dn); if (mode < 0) mode = PHY_INTERFACE_MODE_NA; @@ -1063,7 +1017,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) } if (ds->ops->get_phy_flags) - phy_flags = ds->ops->get_phy_flags(ds, p->dp->index); + phy_flags = ds->ops->get_phy_flags(ds, dp->index); if (phy_dn) { int phy_id = of_mdio_parse_addr(&slave_dev->dev, phy_dn); @@ -1082,33 +1036,34 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } } else { - p->phy = of_phy_connect(slave_dev, phy_dn, - dsa_slave_adjust_link, - phy_flags, - p->phy_interface); + slave_dev->phydev = of_phy_connect(slave_dev, phy_dn, + dsa_slave_adjust_link, + phy_flags, + p->phy_interface); } of_node_put(phy_dn); } - if (p->phy && phy_is_fixed) - fixed_phy_set_link_update(p->phy, dsa_slave_fixed_link_update); + if (slave_dev->phydev && phy_is_fixed) + fixed_phy_set_link_update(slave_dev->phydev, + dsa_slave_fixed_link_update); /* We could not connect to a designated PHY, so use the switch internal * MDIO bus instead */ - if (!p->phy) { - ret = dsa_slave_phy_connect(slave_dev, p->dp->index); + if (!slave_dev->phydev) { + ret = dsa_slave_phy_connect(slave_dev, dp->index); if (ret) { netdev_err(slave_dev, "failed to connect to port %d: %d\n", - p->dp->index, ret); + dp->index, ret); if (phy_is_fixed) of_phy_deregister_fixed_link(port_dn); return ret; } } - phy_attached_info(p->phy); + phy_attached_info(slave_dev->phydev); return 0; } @@ -1128,12 +1083,12 @@ int dsa_slave_suspend(struct net_device *slave_dev) netif_device_detach(slave_dev); - if (p->phy) { - phy_stop(p->phy); + if (slave_dev->phydev) { + phy_stop(slave_dev->phydev); p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - phy_suspend(p->phy); + phy_suspend(slave_dev->phydev); } return 0; @@ -1141,31 +1096,39 @@ int dsa_slave_suspend(struct net_device *slave_dev) int dsa_slave_resume(struct net_device *slave_dev) { - struct dsa_slave_priv *p = netdev_priv(slave_dev); - netif_device_attach(slave_dev); - if (p->phy) { - phy_resume(p->phy); - phy_start(p->phy); + if (slave_dev->phydev) { + phy_resume(slave_dev->phydev); + phy_start(slave_dev->phydev); } return 0; } +static void dsa_slave_notify(struct net_device 
*dev, unsigned long val) +{ + struct net_device *master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_notifier_register_info rinfo = { + .switch_number = dp->ds->index, + .port_number = dp->index, + .master = master, + .info.dev = dev, + }; + + call_dsa_notifiers(val, dev, &rinfo.info); +} + int dsa_slave_create(struct dsa_port *port, const char *name) { + struct dsa_port *cpu_dp = port->cpu_dp; + struct net_device *master = cpu_dp->master; struct dsa_switch *ds = port->ds; - struct dsa_switch_tree *dst = ds->dst; - struct net_device *master; struct net_device *slave_dev; struct dsa_slave_priv *p; - struct dsa_port *cpu_dp; int ret; - cpu_dp = ds->dst->cpu_dp; - master = cpu_dp->netdev; - if (!ds->num_tx_queues) ds->num_tx_queues = 1; @@ -1201,51 +1164,58 @@ int dsa_slave_create(struct dsa_port *port, const char *name) } p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); - p->xmit = dst->tag_ops->xmit; + p->xmit = cpu_dp->tag_ops->xmit; p->old_pause = -1; p->old_link = -1; p->old_duplex = -1; - port->netdev = slave_dev; - ret = register_netdev(slave_dev); - if (ret) { - netdev_err(master, "error %d registering interface %s\n", - ret, slave_dev->name); - port->netdev = NULL; - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; - } + port->slave = slave_dev; netif_carrier_off(slave_dev); ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - unregister_netdev(slave_dev); - free_percpu(p->stats64); - free_netdev(slave_dev); - return ret; + goto out_free; + } + + dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); + + ret = register_netdev(slave_dev); + if (ret) { + netdev_err(master, "error %d registering interface %s\n", + ret, slave_dev->name); + goto out_phy; } return 0; + +out_phy: + phy_disconnect(slave_dev->phydev); + if (of_phy_is_fixed_link(port->dn)) + of_phy_deregister_fixed_link(port->dn); +out_free: + free_percpu(p->stats64); + free_netdev(slave_dev); + port->slave = NULL; + return ret; } void dsa_slave_destroy(struct net_device *slave_dev) { + struct dsa_port *dp = dsa_slave_to_port(slave_dev); struct dsa_slave_priv *p = netdev_priv(slave_dev); - struct device_node *port_dn; - - port_dn = p->dp->dn; + struct device_node *port_dn = dp->dn; netif_carrier_off(slave_dev); - if (p->phy) { - phy_disconnect(p->phy); + if (slave_dev->phydev) { + phy_disconnect(slave_dev->phydev); if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } + dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); free_percpu(p->stats64); free_netdev(slave_dev); @@ -1259,8 +1229,7 @@ static bool dsa_slave_dev_check(struct net_device *dev) static int dsa_slave_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_port *dp = p->dp; + struct dsa_port *dp = dsa_slave_to_port(dev); int err = NOTIFY_DONE; if (netif_is_bridge_master(info->upper_dev)) { @@ -1303,14 +1272,14 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) container_of(work, struct dsa_switchdev_event_work, work); struct net_device *dev = switchdev_work->dev; struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); int err; rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - err = dsa_port_fdb_add(p->dp, fdb_info->addr, fdb_info->vid); + err = 
dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb add failed err=%d\n", err); break; @@ -1321,7 +1290,7 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) case SWITCHDEV_FDB_DEL_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - err = dsa_port_fdb_del(p->dp, fdb_info->addr, fdb_info->vid); + err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid); if (err) { netdev_dbg(dev, "fdb del failed err=%d\n", err); dev_close(dev); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index dbb016434ace..9e082bae3cb0 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -61,7 +61,7 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 queue = skb_get_queue_mapping(skb); u8 *brcm_tag; @@ -82,9 +82,14 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev ((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT); brcm_tag[1] = 0; brcm_tag[2] = 0; - if (p->dp->index == 8) + if (dp->index == 8) brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; - brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; + brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK; + + /* Now tell the master network device about the desired output queue + * as well + */ + skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue)); return skb; } @@ -92,9 +97,6 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; int source_port; u8 *brcm_tag; @@ -117,8 +119,8 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Locate which port this is coming from */ source_port = brcm_tag[3] & BRCM_EG_PID_MASK; - /* Validate port against switch setup, either the port is totally */ - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) return NULL; /* Remove Broadcom tag and update checksum */ @@ -129,8 +131,6 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fbf9ca954773..dbbcdafed8c3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -18,7 +18,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *dsa_header; /* @@ -34,8 +34,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct tagged FROM_CPU DSA tag from 802.1q tag. */ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x60 | p->dp->ds->index; - dsa_header[1] = p->dp->index << 3; + dsa_header[0] = 0x60 | dp->ds->index; + dsa_header[1] = dp->index << 3; /* * Move CFI field from byte 2 to byte 1. @@ -55,8 +55,8 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) * Construct untagged FROM_CPU DSA tag. 
*/ dsa_header = skb->data + 2 * ETH_ALEN; - dsa_header[0] = 0x40 | p->dp->ds->index; - dsa_header[1] = p->dp->index << 3; + dsa_header[0] = 0x40 | dp->ds->index; + dsa_header[1] = dp->index << 3; dsa_header[2] = 0x00; dsa_header[3] = 0x00; } @@ -67,8 +67,6 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *dsa_header; int source_device; int source_port; @@ -93,18 +91,8 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = dsa_header[0] & 0x1f; source_port = (dsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. - */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_find_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -153,8 +141,6 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 76367ba1b2e2..f38a626b3a05 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -19,7 +19,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *edsa_header; /* @@ -43,8 +43,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x60 | p->dp->ds->index; - edsa_header[5] = p->dp->index << 3; + edsa_header[4] = 0x60 | dp->ds->index; + edsa_header[5] = dp->index << 3; /* * Move CFI field from byte 6 to byte 5. @@ -68,8 +68,8 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; edsa_header[3] = 0x00; - edsa_header[4] = 0x40 | p->dp->ds->index; - edsa_header[5] = p->dp->index << 3; + edsa_header[4] = 0x40 | dp->ds->index; + edsa_header[5] = dp->index << 3; edsa_header[6] = 0x00; edsa_header[7] = 0x00; } @@ -80,8 +80,6 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; u8 *edsa_header; int source_device; int source_port; @@ -106,18 +104,8 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, source_device = edsa_header[0] & 0x1f; source_port = (edsa_header[1] >> 3) & 0x1f; - /* - * Check that the source device exists and that the source - * port is a registered DSA port. 
- */ - if (source_device >= DSA_MAX_SWITCHES) - return NULL; - - ds = dst->ds[source_device]; - if (!ds) - return NULL; - - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + skb->dev = dsa_master_find_slave(dev, source_device, source_port); + if (!skb->dev) return NULL; /* @@ -172,8 +160,6 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, 2 * ETH_ALEN); } - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 010ca0a336c4..0f62effad88f 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -34,7 +34,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; u8 *tag; @@ -72,7 +72,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); tag[0] = 0; - tag[1] = 1 << p->dp->index; /* destination port */ + tag[1] = 1 << dp->index; /* destination port */ return nskb; } @@ -80,22 +80,19 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *tag; int source_port; tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; source_port = tag[0] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 0b9826105e42..57519597c6fc 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -42,7 +42,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 *lan9303_tag; /* insert a special VLAN tag between the MAC addresses @@ -62,7 +62,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); lan9303_tag[0] = htons(ETH_P_8021Q); - lan9303_tag[1] = htons(p->dp->index | BIT(4)); + lan9303_tag[1] = htons(dp->index | BIT(4)); return skb; } @@ -71,17 +71,8 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { u16 *lan9303_tag; - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; unsigned int source_port; - ds = dst->ds[0]; - - if (unlikely(!ds)) { - dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n"); - return NULL; - } - if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull\n"); @@ -103,16 +94,12 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, source_port = ntohs(lan9303_tag[1]) & 0x3; - if (source_port >= ds->num_ports) { + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); return NULL; } - if (!ds->ports[source_port].netdev) { - dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); - return NULL; - } - 
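The rcv hooks above all repeat one conversion: the open-coded bounds checks against ds->num_ports and the ds->ports[source_port].netdev lookups give way to a single dsa_master_find_slave() call, so every tagger resolves the ingress slave the same way and a NULL return uniformly means "drop the frame". A minimal sketch of such a helper, inferred from the call sites in these hunks (the real definition lives in net/dsa/dsa_priv.h; the exact field spellings below are assumptions based on the surrounding hunks):

static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
							int device, int port)
{
	/* dev is the CPU/master netdev; dsa_ptr points at its CPU port */
	struct dsa_port *cpu_dp = dev->dsa_ptr;
	struct dsa_switch_tree *dst = cpu_dp->ds->dst;
	struct dsa_switch *ds;

	if (device < 0 || device >= DSA_MAX_SWITCHES)
		return NULL;

	ds = dst->ds[device];
	if (!ds)
		return NULL;

	if (port < 0 || port >= ds->num_ports)
		return NULL;

	/* NULL when no slave netdev was created for this port */
	return ds->ports[port].slave;
}

Centralizing the lookup is also what lets tag_lan9303 drop its per-driver "missing DSA switch device" warning below: the validation now happens in one place for every tagging protocol.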
/* remove the special VLAN tag between the MAC addresses * and the current ethertype field. */ @@ -120,9 +107,6 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), 2 * ETH_ALEN); - /* forward the packet to the dedicated interface */ - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ec8ee5f43255..8475434af7d5 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -23,7 +23,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u8 *mtk_tag; if (skb_cow_head(skb, MTK_HDR_LEN) < 0) @@ -36,7 +36,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, /* Build the tag after the MAC Source Address */ mtk_tag = skb->data + 2 * ETH_ALEN; mtk_tag[0] = 0; - mtk_tag[1] = (1 << p->dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; mtk_tag[2] = 0; mtk_tag[3] = 0; @@ -46,8 +46,6 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_switch *ds; int port; __be16 *phdr, hdr; @@ -68,20 +66,12 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - MTK_HDR_LEN, 2 * ETH_ALEN); - /* This protocol doesn't support cascading multiple - * switches so it's safe to assume the switch is first - * in the tree. - */ - ds = dst->ds[0]; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1d4c70711c0f..613f4ee97771 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -38,7 +38,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); u16 *phdr, hdr; dev->stats.tx_packets++; @@ -54,8 +54,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | - QCA_HDR_XMIT_FROM_CPU | - BIT(p->dp->index); + QCA_HDR_XMIT_FROM_CPU | BIT(dp->index); *phdr = htons(hdr); @@ -65,9 +64,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds; u8 ver; int port; __be16 *phdr, hdr; @@ -92,20 +88,12 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, ETH_HLEN - QCA_HDR_LEN); - /* This protocol doesn't support cascading multiple switches so it's - * safe to assume the switch is first in the tree - */ - ds = cpu_dp->ds; - if (!ds) - return NULL; - /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); - if (!ds->ports[port].netdev) - return NULL; - /* Update 
skb & forward the frame accordingly */ - skb->dev = ds->ports[port].netdev; + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) + return NULL; return skb; } diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d2fd4923aa3e..7d20e1f3de28 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -16,7 +16,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); struct sk_buff *nskb; int padlen; u8 *trailer; @@ -48,7 +48,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer = skb_put(nskb, 4); trailer[0] = 0x80; - trailer[1] = 1 << p->dp->index; + trailer[1] = 1 << dp->index; trailer[2] = 0x10; trailer[3] = 0x00; @@ -58,9 +58,6 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) { - struct dsa_switch_tree *dst = dev->dsa_ptr; - struct dsa_port *cpu_dp = dsa_get_cpu_port(dst); - struct dsa_switch *ds = cpu_dp->ds; u8 *trailer; int source_port; @@ -73,13 +70,13 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; source_port = trailer[1] & 7; - if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) return NULL; pskb_trim_rcsum(skb, skb->len - 4); - skb->dev = ds->ports[source_port].netdev; - return skb; } diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index f85b08baff16..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -80,12 +80,13 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) fq->daddr = *arg->dst; } -static void lowpan_frag_expire(unsigned long data) +static void lowpan_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ieee802154_lowpan.frags); spin_lock(&fq->q.lock); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e31108e5ef79..ce4aa827be05 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -195,7 +195,7 @@ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; - int err; + int err, tcp_fastopen; lock_sock(sk); @@ -217,11 +217,12 @@ int inet_listen(struct socket *sock, int backlog) * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ - if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && - (sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && + (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk, backlog); @@ -826,6 +827,7 @@ int inet_shutdown(struct socket *sock, int how) err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for POLLHUP, even on eg. 
unconnected UDP sockets -- RR */ + /* fall through */ default: sk->sk_shutdown |= how; if (sk->sk_prot->shutdown) @@ -839,7 +841,7 @@ int inet_shutdown(struct socket *sock, int how) case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; - /* Fall through */ + /* fall through */ case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7c45b8896709..a8d7c5a9fb05 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1180,6 +1180,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; + /* fall through */ case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7ce22a2c07ce..e1e2ec0525e6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1522,6 +1522,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ + /* fall through */ case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; @@ -1757,7 +1758,7 @@ static int inet_validate_link_af(const struct net_device *dev, struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; - if (dev && !__in_dev_get_rtnl(dev)) + if (dev && !__in_dev_get_rcu(dev)) return -EAFNOSUPPORT; err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); @@ -1781,7 +1782,7 @@ static int inet_validate_link_af(const struct net_device *dev, static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) { - struct in_device *in_dev = __in_dev_get_rtnl(dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 57a5d48acee8..467b3c15395f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -601,17 +601,9 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); } - -static inline void fib_add_weight(struct fib_info *fi, - const struct fib_nh *nh) -{ - fi->fib_weight += nh->nh_weight; -} - #else /* CONFIG_IP_ROUTE_MULTIPATH */ #define fib_rebalance(fi) do { } while (0) -#define fib_add_weight(fi, nh) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -774,8 +766,8 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) * | * |-> {local prefix} (terminal node) */ -static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, - struct fib_nh *nh, struct netlink_ext_ack *extack) +static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh, + struct netlink_ext_ack *extack) { int err = 0; struct net *net; @@ -1258,7 +1250,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int linkdown = 0; change_nexthops(fi) { - err = fib_check_nh(cfg, fi, nexthop_nh, extack); + err = fib_check_nh(cfg, nexthop_nh, extack); if (err != 0) goto failure; if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN) @@ -1275,7 +1267,6 @@ struct fib_info *fib_create_info(struct fib_config *cfg, change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); - fib_add_weight(fi, nexthop_nh); } endfor_nexthops(fi) fib_rebalance(fi); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, 
greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..3c1570d3e22f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 8a91ebbf0c01..5c965ecc96a0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -494,17 +494,15 @@ EXPORT_SYMBOL(inet_csk_accept); * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) + void (*retransmit_handler)(struct timer_list *t), + void (*delack_handler)(struct timer_list *t), + void (*keepalive_handler)(struct timer_list *t)) { struct inet_connection_sock *icsk = inet_csk(sk); - setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, - (unsigned long)sk); - setup_timer(&icsk->icsk_delack_timer, delack_handler, - (unsigned long)sk); - setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); + timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); + timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } EXPORT_SYMBOL(inet_csk_init_xmit_timers); @@ -676,9 +674,9 @@ void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req } EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); -static void reqsk_timer_handler(unsigned long data) +static void reqsk_timer_handler(struct timer_list *t) { - struct request_sock *req = (struct request_sock *)data; + struct request_sock *req = from_timer(req, t, rsk_timer); struct sock *sk_listener = req->rsk_listener; struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); @@ -749,8 +747,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, req->num_timeout = 0; req->sk = NULL; - setup_pinned_timer(&req->rsk_timer, reqsk_timer_handler, - (unsigned long)req); + timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); inet_ehash_insert(req_to_sk(req), NULL); diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index af74d0433453..7f3ef5c287a1 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -147,7 +147,7 @@ inet_evict_bucket(struct inet_frags *f, 
struct inet_frag_bucket *hb) spin_unlock(&hb->chain_lock); hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire((unsigned long) fq); + f->frag_expire(&fq->timer); return evicted; } @@ -366,7 +366,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, f->constructor(q, arg); add_frag_mem_limit(nf, f->qsize); - setup_timer(&q->timer, f->frag_expire, (unsigned long)q); + timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 1); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..a4bab81f1462 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -142,9 +142,9 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); -static void tw_timer_handler(unsigned long data) +static void tw_timer_handler(struct timer_list *t) { - struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; + struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); @@ -188,8 +188,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); - setup_pinned_timer(&tw->tw_timer, tw_timer_handler, - (unsigned long)tw); + timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e7eb590c86ce..914d56928578 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -128,9 +128,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, break; } if (cmp == -1) - pp = &(*pp)->rb_left; + pp = &next->rb_left; else - pp = &(*pp)->rb_right; + pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; @@ -284,14 +284,17 @@ EXPORT_SYMBOL(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { - struct inet_peer *p, *n; + struct rb_node *p = rb_first(&base->rb_root); - rbtree_postorder_for_each_entry_safe(p, n, &base->rb_root, rb_node) { - inet_putpeer(p); + while (p) { + struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); + + p = rb_next(p); + rb_erase(&peer->rb_node, &base->rb_root); + inet_putpeer(peer); cond_resched(); } - base->rb_root = RB_ROOT; base->total = 0; } EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 46408c220d9d..9215654a401f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -190,12 +190,13 @@ static bool frag_expire_skip_icmp(u32 user) /* * Oops, a fragment queue timed out. Kill it and send an ICMP reply. 
*/ -static void ip_expire(unsigned long arg) +static void ip_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct ipq *qp; struct net *net; - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); rcu_read_lock(); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9cee986ac6b8..c105a315b1a3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -259,7 +259,6 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, struct ip_tunnel *tunnel; struct erspanhdr *ershdr; const struct iphdr *iph; - __be32 session_id; __be32 index; int len; @@ -275,8 +274,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, /* The original GRE header does not have key field, * Use ERSPAN 10-bit session ID as key. */ - session_id = cpu_to_be32(ntohs(ershdr->session_id)); - tpi->key = session_id; + tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK); index = ershdr->md.index; tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags | TUNNEL_KEY, @@ -581,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -733,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -1222,6 +1220,7 @@ static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } @@ -1245,13 +1244,16 @@ static int erspan_tunnel_init(struct net_device *dev) tunnel->tun_hlen = 8; tunnel->parms.iph.protocol = IPPROTO_GRE; - t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + + sizeof(struct erspanhdr); + t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); return ip_tunnel_init(dev); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fa2dc8f692c6..57fc13c6ab2b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -311,9 +311,10 @@ drop: static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt; + int (*edemux)(struct sk_buff *skb); struct net_device *dev = skb->dev; - void (*edemux)(struct sk_buff *skb); + struct rtable *rt; + int err; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -331,7 +332,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { - edemux(skb); + err = edemux(skb); + if (unlikely(err)) + goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } @@ -342,13 +345,10 @@ static int ip_rcv_finish(struct net *net, 
struct sock *sk, struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, dev); - if (unlikely(err)) { - if (err == -EXDEV) - __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); - goto drop; - } + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, dev); + if (unlikely(err)) + goto drop_error; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -399,6 +399,11 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) drop: kfree_skb(skb); return NET_RX_DROP; + +drop_error: + if (err == -EXDEV) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; } /* diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 02d70ca99db1..58465c0a8682 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -168,6 +168,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ + int pkt_len = skb->len; int err; int mtu; @@ -229,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) - err = skb->len; + err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c9b3e6e069ae..40a43ad294cb 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -67,6 +67,7 @@ #include <net/fib_rules.h> #include <linux/netconf.h> #include <net/nexthop.h> +#include <net/switchdev.h> struct ipmr_rule { struct fib_rule common; @@ -264,6 +265,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) fib_rules_unregister(net->ipv4.mr_rules_ops); rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR); +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IPMR); +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT; +} +EXPORT_SYMBOL(ipmr_rule_default); #else #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) @@ -298,6 +315,22 @@ static void __net_exit ipmr_rules_exit(struct net *net) net->ipv4.mrt = NULL; rtnl_unlock(); } + +static int ipmr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ipmr_rules_seq_read(struct net *net) +{ + return 0; +} + +bool ipmr_rule_default(const struct fib_rule *rule) +{ + return true; +} +EXPORT_SYMBOL(ipmr_rule_default); #endif static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -587,6 +620,82 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif +static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + vifi_t vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = 
RTNL_FAMILY_IPMR, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, + struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static int call_ipmr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc_cache *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = RTNL_FAMILY_IPMR, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + net->ipv4.ipmr_seq++; + return call_fib_notifiers(net, event_type, &info.info); +} + /** * vif_delete - Delete a VIF entry * @notify: Set to 1, if the caller is a notifier_call @@ -594,6 +703,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) static int vif_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { + struct net *net = read_pnet(&mrt->net); struct vif_device *v; struct net_device *dev; struct in_device *in_dev; @@ -603,6 +713,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -652,10 +766,11 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, c); } -static inline void ipmr_cache_free(struct mfc_cache *c) +void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->rcu, ipmr_cache_free_rcu); } +EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. 
@@ -754,6 +869,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, struct vifctl *vifc, int mrtsock) { int vifi = vifc->vifc_vifi; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, + }; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; @@ -828,6 +946,13 @@ static int vif_add(struct net *net, struct mr_table *mrt, /* Fill in the VIF structures */ + attr.orig_dev = dev; + if (!switchdev_port_attr_get(dev, &attr)) { + memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len); + v->dev_parent_id.id_len = attr.u.ppid.id_len; + } else { + v->dev_parent_id.id_len = 0; + } v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -851,6 +976,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, if (vifi+1 > mrt->maxvif) mrt->maxvif = vifi+1; write_unlock_bh(&mrt_lock); + call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id); return 0; } @@ -949,6 +1075,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->mfc_un.res.refcount, 1); } return c; } @@ -1150,6 +1277,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; /* The entries are added/deleted only under RTNL */ @@ -1161,8 +1289,9 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); return 0; } @@ -1189,6 +1318,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1238,6 +1369,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, ipmr_cache_resolve(net, mrt, uc, c); ipmr_cache_free(uc); } + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mroute_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1245,6 +1377,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { + struct net *net = read_pnet(&mrt->net); struct mfc_cache *c, *tmp; LIST_HEAD(list); int i; @@ -1263,8 +1396,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); + ipmr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { @@ -1393,6 +1528,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, case MRT_ADD_MFC: case MRT_DEL_MFC: parent = -1; + /* fall through */ case MRT_ADD_MFC_PROXY: case MRT_DEL_MFC_PROXY: if (optlen != sizeof(mfc)) { @@ -1724,10 +1860,33 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk, return dst_output(net, sk, skb); } 
+#ifdef CONFIG_NET_SWITCHDEV +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + struct vif_device *out_vif = &mrt->vif_table[out_vifi]; + struct vif_device *in_vif = &mrt->vif_table[in_vifi]; + + if (!skb->offload_mr_fwd_mark) + return false; + if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len) + return false; + return netdev_phys_item_id_same(&out_vif->dev_parent_id, + &in_vif->dev_parent_id); +} +#else +static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt, + int in_vifi, int out_vifi) +{ + return false; +} +#endif + /* Processing handlers for ipmr_forward */ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc_cache *c, int vifi) + int in_vifi, struct sk_buff *skb, + struct mfc_cache *c, int vifi) { const struct iphdr *iph = ip_hdr(skb); struct vif_device *vif = &mrt->vif_table[vifi]; @@ -1748,6 +1907,9 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, goto out_free; } + if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi)) + goto out_free; + if (vif->flags & VIFF_TUNNEL) { rt = ip_route_output_ports(net, &fl4, NULL, vif->remote, vif->local, @@ -1925,8 +2087,8 @@ forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, - psend); + ipmr_queue_xmit(net, mrt, true_vifi, + skb2, cache, psend); } psend = ct; } @@ -1937,9 +2099,10 @@ last_forward: struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ipmr_queue_xmit(net, mrt, skb2, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb2, + cache, psend); } else { - ipmr_queue_xmit(net, mrt, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); return; } } @@ -2156,6 +2319,9 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) return -EMSGSIZE; @@ -3048,14 +3214,87 @@ static const struct net_protocol pim_protocol = { }; #endif +static unsigned int ipmr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net); +} + +static int ipmr_dump(struct net *net, struct notifier_block *nb) +{ + struct mr_table *mrt; + int err; + + err = ipmr_rules_dump(net, nb); + if (err) + return err; + + ipmr_for_each_table(mrt, net) { + struct vif_device *v = &mrt->vif_table[0]; + struct mfc_cache *mfc; + int vifi; + + /* Notify on table VIF entries */ + read_lock(&mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(&mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + call_ipmr_mfc_entry_notifier(nb, net, + FIB_EVENT_ENTRY_ADD, mfc, + mrt->id); + } + + return 0; +} + +static const struct fib_notifier_ops ipmr_notifier_ops_template = { + .family = RTNL_FAMILY_IPMR, + .fib_seq_read = ipmr_seq_read, + .fib_dump = ipmr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ipmr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv4.ipmr_seq = 0; + + ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + net->ipv4.ipmr_notifier_ops = ops; + + return 0; +} + 
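The ipmr_forward_offloaded() check added above only suppresses software forwarding when both VIFs sit behind the same physical switch, as identified by the switchdev parent IDs captured in vif_add(). The netdev_phys_item_id_same() comparison it relies on presumably reduces to a length check plus a byte-wise memcmp over the opaque ID, along these lines (a sketch, not the authoritative definition):

static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a,
					    struct netdev_phys_item_id *b)
{
	/* Parent IDs are opaque byte strings: lengths must match first */
	return a->id_len == b->id_len &&
	       memcmp(a->id, b->id, a->id_len) == 0;
}

Together with skb->offload_mr_fwd_mark, set on ingress when the hardware already replicated the frame, this keeps the kernel transmitting clones toward VIFs the ASIC cannot reach while skipping ports it has already served.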
+static void __net_exit ipmr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops); + net->ipv4.ipmr_notifier_ops = NULL; +} + /* Setup for IP multicast routing */ static int __net_init ipmr_net_init(struct net *net) { int err; + err = ipmr_notifier_init(net); + if (err) + goto ipmr_notifier_fail; + err = ipmr_rules_init(net); if (err < 0) - goto fail; + goto ipmr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -3072,7 +3311,9 @@ proc_cache_fail: proc_vif_fail: ipmr_rules_exit(net); #endif -fail: +ipmr_rules_fail: + ipmr_notifier_exit(net); +ipmr_notifier_fail: return err; } @@ -3082,6 +3323,7 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_cache", net->proc_net); remove_proc_entry("ip_mr_vif", net->proc_net); #endif + ipmr_notifier_exit(net); ipmr_rules_exit(net); } diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 811689e523c3..f75fc6b53115 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (synproxy == NULL) return NF_ACCEPT; - if (nf_is_loopback_packet(skb)) + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index a0f37b208268..0443ca4120b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -276,7 +276,8 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 94d4cd2d5ea4..4306db827374 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1520,43 +1520,56 @@ struct rtable *rt_dst_alloc(struct net_device *dev, EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - struct rtable *rth; - struct in_device *in_dev = __in_dev_get_rcu(dev); - unsigned int flags = RTCF_MULTICAST; - u32 itag = 0; int err; /* Primary sanity checks. 
*/ - if (!in_dev) return -EINVAL; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || skb->protocol != htons(ETH_P_IP)) - goto e_inval; + return -EINVAL; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - goto e_inval; + return -EINVAL; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr)) - goto e_inval; + return -EINVAL; } else { err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); + in_dev, itag); if (err < 0) - goto e_err; + return err; } + return 0; +} + +/* called in rcu_read_lock() section */ +static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev, int our) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + struct rtable *rth; + u32 itag = 0; + int err; + + err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); + if (err) + return err; + if (our) flags |= RTCF_LOCAL; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); if (!rth) - goto e_nobufs; + return -ENOBUFS; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1572,13 +1585,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_set(skb, &rth->dst); return 0; - -e_nobufs: - return -ENOBUFS; -e_inval: - return -EINVAL; -e_err: - return err; } @@ -2507,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; @@ -3032,7 +3038,6 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; int __init ip_rt_init(void) { - int rc = 0; int cpu; ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); @@ -3089,7 +3094,7 @@ int __init ip_rt_init(void) #endif register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); - return rc; + return 0; } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0d3c038d7b04..cac8dd309f39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -251,10 +251,12 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl, return ret; } -static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, +static int proc_tcp_fastopen_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; int ret; @@ -265,7 +267,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return -ENOMEM; rcu_read_lock(); - ctxt = rcu_dereference(tcp_fastopen_ctx); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else @@ -282,12 +284,7 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, ret = -EINVAL; goto bad_key; } - /* Generate a dummy secret but don't publish it. 
This - * is needed so we don't regenerate a new key on the - * first invocation of tcp_fastopen_cookie_gen - */ - tcp_fastopen_init_key_once(false); - tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH); + tcp_fastopen_reset_cipher(net, user_key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: @@ -358,11 +355,13 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_fastopen_blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - tcp_fastopen_active_timeout_reset(); + atomic_set(&net->ipv4.tfo_active_disable_times, 0); return ret; } @@ -401,27 +400,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_fastopen", - .data = &sysctl_tcp_fastopen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "tcp_fastopen_key", - .mode = 0600, - .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), - .proc_handler = proc_tcp_fastopen_key, - }, - { - .procname = "tcp_fastopen_blackhole_timeout_sec", - .data = &sysctl_tcp_fastopen_blackhole_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_tfo_blackhole_detect_timeout, - .extra1 = &zero, - }, - { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -1085,6 +1063,28 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_fastopen", + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_fastopen_key", + .mode = 0600, + .data = &init_net.ipv4.sysctl_tcp_fastopen, + .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), + .proc_handler = proc_tcp_fastopen_key, + }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &init_net.ipv4.sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5091402720ab..3b34850d361f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,8 +413,10 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); + INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); @@ -456,8 +458,22 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +void tcp_init_transfer(struct sock *sk, int bpf_op) { + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_mtup_init(sk); + icsk->icsk_af_ops->rebuild_header(sk); + tcp_init_metrics(sk); + tcp_call_bpf(sk, bpf_op); + tcp_init_congestion_control(sk); + tcp_init_buffer_space(sk); +} + +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) +{ + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -686,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if 
(!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -869,6 +884,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, * available to the caller, no more, no less. */ skb->reserved_tailroom = skb->end - skb->tail - size; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -948,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1027,7 +1043,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1126,7 +1142,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, struct sockaddr *uaddr = msg->msg_name; int err, flags; - if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || + if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; @@ -1183,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1259,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1279,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1404,7 +1420,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: @@ -1505,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2304,6 +2327,37 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = 
__skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2362,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); @@ -2749,7 +2802,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { - tcp_fastopen_init_key_once(true); + tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { @@ -2759,7 +2812,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; - } else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { + } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 66ac69f7bd19..06fbe102a425 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -389,7 +389,7 @@ static void tcp_cdg_release(struct sock *sk) kfree(ca->gradients); } -struct tcp_congestion_ops tcp_cdg __read_mostly = { +static struct tcp_congestion_ops tcp_cdg __read_mostly = { .cong_avoid = tcp_cdg_cong_avoid, .cwnd_event = tcp_cdg_cwnd_event, .pkts_acked = tcp_cdg_acked, diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index e3c33220c418..7ee4aadcdd71 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -9,15 +9,18 @@ #include <net/inetpeer.h> #include <net/tcp.h> -int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE; - -struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; - -static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock); - -void tcp_fastopen_init_key_once(bool publish) +void tcp_fastopen_init_key_once(struct net *net) { - static u8 key[TCP_FASTOPEN_KEY_LENGTH]; + u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct tcp_fastopen_context *ctxt; + + rcu_read_lock(); + ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); + if (ctxt) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. @@ -25,8 +28,8 @@ void tcp_fastopen_init_key_once(bool publish) * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. 
*/ - if (net_get_random_once(key, sizeof(key)) && publish) - tcp_fastopen_reset_cipher(key, sizeof(key)); + get_random_bytes(key, sizeof(key)); + tcp_fastopen_reset_cipher(net, key, sizeof(key)); } static void tcp_fastopen_ctx_free(struct rcu_head *head) @@ -37,7 +40,22 @@ static void tcp_fastopen_ctx_free(struct rcu_head *head) kfree(ctx); } -int tcp_fastopen_reset_cipher(void *key, unsigned int len) +void tcp_fastopen_ctx_destroy(struct net *net) +{ + struct tcp_fastopen_context *ctxt; + + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); + + ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + + if (ctxt) + call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); +} + +int tcp_fastopen_reset_cipher(struct net *net, void *key, unsigned int len) { int err; struct tcp_fastopen_context *ctx, *octx; @@ -61,26 +79,27 @@ error: kfree(ctx); } memcpy(ctx->key, key, len); - spin_lock(&tcp_fastopen_ctx_lock); + spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - octx = rcu_dereference_protected(tcp_fastopen_ctx, - lockdep_is_held(&tcp_fastopen_ctx_lock)); - rcu_assign_pointer(tcp_fastopen_ctx, ctx); - spin_unlock(&tcp_fastopen_ctx_lock); + octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, + lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); + rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); return err; } -static bool __tcp_fastopen_cookie_gen(const void *path, +static bool __tcp_fastopen_cookie_gen(struct net *net, + const void *path, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; bool ok = false; rcu_read_lock(); - ctx = rcu_dereference(tcp_fastopen_ctx); + ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { crypto_cipher_encrypt_one(ctx->tfm, foc->val, path); foc->len = TCP_FASTOPEN_COOKIE_SIZE; @@ -96,7 +115,8 @@ static bool __tcp_fastopen_cookie_gen(const void *path, * * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE. */ -static bool tcp_fastopen_cookie_gen(struct request_sock *req, +static bool tcp_fastopen_cookie_gen(struct net *net, + struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { @@ -104,7 +124,7 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct iphdr *iph = ip_hdr(syn); __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 }; - return __tcp_fastopen_cookie_gen(path, foc); + return __tcp_fastopen_cookie_gen(net, path, foc); } #if IS_ENABLED(CONFIG_IPV6) @@ -112,13 +132,13 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, const struct ipv6hdr *ip6h = ipv6_hdr(syn); struct tcp_fastopen_cookie tmp; - if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) { + if (__tcp_fastopen_cookie_gen(net, &ip6h->saddr, &tmp)) { struct in6_addr *buf = &tmp.addr; int i; for (i = 0; i < 4; i++) buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i]; - return __tcp_fastopen_cookie_gen(buf, foc); + return __tcp_fastopen_cookie_gen(net, buf, foc); } } #endif @@ -216,12 +236,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. 
*/ - inet_csk(child)->icsk_af_ops->rebuild_header(child); - tcp_init_congestion_control(child); - tcp_mtup_init(child); - tcp_init_metrics(child); - tcp_call_bpf(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_buffer_space(child); + tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; @@ -279,25 +294,26 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc) { - struct tcp_fastopen_cookie valid_foc = { .len = -1 }; bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; + int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen; + struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); - if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && + if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } - if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) + if (syn_data && (tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len >= 0 && /* Client presents or requests a cookie */ - tcp_fastopen_cookie_gen(req, skb, &valid_foc) && + tcp_fastopen_cookie_gen(sock_net(sk), req, skb, &valid_foc) && foc->len == TCP_FASTOPEN_COOKIE_SIZE && foc->len == valid_foc.len && !memcmp(foc->val, valid_foc.val, foc->len)) { @@ -347,7 +363,7 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, return false; } - if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { + if (sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; } @@ -401,25 +417,16 @@ EXPORT_SYMBOL(tcp_fastopen_defer_connect); * TFO connection with data exchanges. 
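The cookie that tcp_try_fastopen() validates is generated above as one cipher block over the IPv4 address pair, and for IPv6 as two chained blocks: encrypt the source address, XOR in the destination, encrypt again, so both 128-bit addresses influence the 128-bit cookie. A sketch of that fold with a placeholder block function — the XOR "cipher" below is emphatically not AES, it only marks where crypto_cipher_encrypt_one() sits:

    #include <stdint.h>

    #define COOKIE_SIZE 16

    /* placeholder for crypto_cipher_encrypt_one(); NOT a real cipher */
    static void blk_encrypt(uint8_t out[COOKIE_SIZE], const uint8_t in[COOKIE_SIZE],
                            const uint8_t key[COOKIE_SIZE])
    {
        for (int i = 0; i < COOKIE_SIZE; i++)
            out[i] = in[i] ^ key[i];
    }

    /* IPv6 path of __tcp_fastopen_cookie_gen() */
    static void cookie_gen_v6(uint8_t foc[COOKIE_SIZE],
                              const uint8_t saddr[16], const uint8_t daddr[16],
                              const uint8_t key[COOKIE_SIZE])
    {
        uint8_t tmp[COOKIE_SIZE];

        blk_encrypt(tmp, saddr, key);
        for (int i = 0; i < 16; i++)
            tmp[i] ^= daddr[i];
        blk_encrypt(foc, tmp, key);
    }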
*/ -/* Default to 1hr */ -unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; -static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); -static unsigned long tfo_active_disable_stamp __read_mostly; - /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { - atomic_inc(&tfo_active_disable_times); - tfo_active_disable_stamp = jiffies; - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); -} + struct net *net = sock_net(sk); -/* Reset tfo_active_disable_times to 0 */ -void tcp_fastopen_active_timeout_reset(void) -{ - atomic_set(&tfo_active_disable_times, 0); + atomic_inc(&net->ipv4.tfo_active_disable_times); + net->ipv4.tfo_active_disable_stamp = jiffies; + NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable @@ -428,17 +435,18 @@ void tcp_fastopen_active_timeout_reset(void) */ bool tcp_fastopen_active_should_disable(struct sock *sk) { - int tfo_da_times = atomic_read(&tfo_active_disable_times); - int multiplier; + unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout; + int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); unsigned long timeout; + int multiplier; if (!tfo_da_times) return false; /* Limit timout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); - timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; - if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + timeout = multiplier * tfo_bh_timeout * HZ; + if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout)) return true; /* Mark check bit so we can check for successful active TFO @@ -457,27 +465,25 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && - atomic_read(&tfo_active_disable_times)) { + atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) - tcp_fastopen_active_timeout_reset(); + atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db9bb46b5776..b2390bfdc68f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1142,6 +1142,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1288,13 +1290,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. 
Mark the newly-SACKed bytes as such. */ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1495,18 +1496,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: @@ -1538,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1593,6 +1591,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_skb_pcount(skb), skb->skb_mstamp); tcp_rate_skb_delivered(sk, skb, state->rate); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1604,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + int unack_bytes; + + while (*p) { + parent = *p; + skb = 
rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + state->fack_count = 0; + unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; + if (state->mss_now && unack_bytes > 0) + state->fack_count = unack_bytes / state->mss_now; + + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1742,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); + state->mss_now = tcp_current_mss(sk); state->fack_count = 0; + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1967,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); @@ -1976,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk) } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2205,20 +2224,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2242,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. 
*/ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2326,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; @@ -2367,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2614,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2710,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -2802,9 +2816,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. @@ -2871,6 +2885,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ + /* fall through */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -3054,8 +3069,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if (!before(shinfo->tskey, prior_snd_una) && - before(shinfo->tskey, tcp_sk(sk)->snd_una)) - __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + before(shinfo->tskey, tcp_sk(sk)->snd_una)) { + tcp_skb_tsorted_save(skb) { + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); + } tcp_skb_tsorted_restore(skb); + } } /* Remove acknowledged frames from the retransmission queue. 
If our packet @@ -3071,11 +3089,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3083,7 +3101,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; @@ -3101,8 +3119,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3154,12 +3170,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3251,12 +3267,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3376,7 +3394,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3561,8 +3579,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3676,8 +3694,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
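tcp_clean_rtx_queue() above now walks the rbtree with skb_rb_first()/skb_rb_next() instead of repeatedly popping the write-queue head; it frees every fully acknowledged skb and stops at the first one the cumulative ACK only partially covers. The shape of that walk over a seq-sorted array, with TCP's wraparound-safe comparison; all names are invented for the sketch:

    #include <stdint.h>
    #include <string.h>

    struct seg { uint32_t seq, end_seq; };

    static int before32(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) < 0;
    }

    /* Drop segments with end_seq <= snd_una; the first survivor is at most
     * partially acked, so the walk stops there. Returns the new count. */
    static size_t clean_rtx(struct seg *q, size_t n, uint32_t snd_una)
    {
        size_t i = 0;

        while (i < n && !before32(snd_una, q[i].end_seq))
            i++;  /* fully acked: the kernel unlinks and frees it */
        memmove(q, q + i, (n - i) * sizeof(*q));
        return n - i;
    }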
*/ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4330,7 +4347,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4394,7 +4411,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4453,7 +4470,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4498,9 +4515,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4515,7 +4530,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4701,7 +4716,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4722,7 +4737,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -4730,7 +4745,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4849,26 +4864,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. - */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. 
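In the tcp_data_queue_ofo() merge_right path above, once a new segment [seq, end_seq) is in the out-of-order tree, skb_rb_next() successors that it wholly covers are dropped, and a partially covered successor ends the scan (the kernel also records a DSACK for each casualty). The same pruning over a seq-sorted array, reusing the wraparound idiom; names are stand-ins:

    #include <stdint.h>
    #include <string.h>

    struct seg { uint32_t seq, end_seq; };

    static int before32(uint32_t a, uint32_t b)
    {
        return (int32_t)(a - b) < 0;
    }

    /* Prune entries after position pos that [.., end_seq) fully covers. */
    static size_t merge_right(struct seg *q, size_t n, size_t pos, uint32_t end_seq)
    {
        size_t j = pos + 1;

        while (j < n) {
            if (!before32(q[j].seq, end_seq))
                break;  /* no overlap at all: done */
            if (before32(end_seq, q[j].end_seq))
                break;  /* partial overlap: keep it; kernel extends the DSACK */
            j++;        /* fully covered: drop */
        }
        memmove(q + pos + 1, q + j, (n - j) * sizeof(*q));
        return n - (j - (pos + 1));
    }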
@@ -4911,14 +4919,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -5513,20 +5521,13 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) security_inet_conn_established(sk, skb); } - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - tcp_init_metrics(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ tp->lsndtime = tcp_jiffies32; - tcp_init_buffer_space(sk); - if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); @@ -5540,7 +5541,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5575,9 +5576,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); @@ -5693,7 +5693,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_is_sack(tp) && sysctl_tcp_fack) tcp_enable_fack(tp); - tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); @@ -5919,15 +5918,18 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (req) { inet_csk(sk)->icsk_retransmits = 0; reqsk_fastopen_remove(sk, req, false); + /* Re-arm the timer because data may have been sent out. + * This is similar to the regular data transmission case + * when new data has just been ack'ed. + * + * (TFO) - we could try to be more aggressive and + * retransmitting any data sooner based on when they + * are sent out. + */ + tcp_rearm_rto(sk); } else { - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - tcp_call_bpf(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); - tcp_init_congestion_control(sk); - - tcp_mtup_init(sk); + tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); tp->copied_seq = tp->rcv_nxt; - tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5947,19 +5949,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; - if (req) { - /* Re-arm the timer because data may have been sent out. - * This is similar to the regular data transmission case - * when new data has just been ack'ed. 
- * - * (TFO) - we could try to be more aggressive and - * retransmitting any data sooner based on when they - * are sent out. - */ - tcp_rearm_rto(sk); - } else - tcp_init_metrics(sk); - if (!inet_csk(sk)->icsk_ca_ops->cong_control) tcp_update_pacing_rate(sk); @@ -6056,6 +6045,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_LAST_ACK: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; + /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: /* RFC 793 says to queue data in these states, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d9416b5162bc..ecee4ddb24c5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); @@ -1503,23 +1503,23 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, @@ -1538,6 +1538,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) @@ -1778,8 +1779,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb); break; @@ -2472,6 +2474,11 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); + net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; + atomic_set(&net->ipv4.tfo_active_disable_times, 0); + return 0; fail: tcp_sk_exit(net); @@ -2481,7 +2488,12 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { + struct net *net; + inet_twsk_purge(&tcp_hashinfo, AF_INET); + + list_for_each_entry(net, net_exit_list, exit_list) + tcp_fastopen_ctx_destroy(net); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 188a6f31356d..2341b9f857b6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -446,6 +446,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; INIT_LIST_HEAD(&newtp->tsq_node); + INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0bc9e46a5369..53dc1267c85e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -42,6 +42,8 @@ #include <linux/gfp.h> #include <linux/module.h> +#include <trace/events/tcp.h> + /* People can turn this off for buggy TCP's found in printers etc. 
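The tcp_sk_init() hunk above moves the Fast Open knobs that used to be file-scope globals into struct netns_ipv4, one copy per network namespace, torn down again in tcp_sk_exit_batch() via tcp_fastopen_ctx_destroy(). A compile-time sketch of the per-namespace state and its defaults — the wrapper struct and init function are invented here, only the field names and values mirror the patch:

    #include <stdatomic.h>

    #define TFO_CLIENT_ENABLE 1

    struct netns_tfo {
        int sysctl_tcp_fastopen;                             /* bitmask of TFO_* flags */
        unsigned int sysctl_tcp_fastopen_blackhole_timeout;  /* seconds */
        atomic_int tfo_active_disable_times;
        unsigned long tfo_active_disable_stamp;              /* jiffies of last disable */
    };

    static void netns_tfo_init(struct netns_tfo *ns)
    {
        ns->sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
        ns->sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;  /* 1 hour */
        atomic_init(&ns->tfo_active_disable_times, 0);
        ns->tfo_active_disable_stamp = 0;
    }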
*/ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -66,15 +68,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -971,6 +975,12 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb) HRTIMER_MODE_ABS_PINNED); } +static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb) +{ + skb->skb_mstamp = tp->tcp_mstamp; + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); +} + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1003,10 +1013,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; - if (unlikely(skb_cloned(skb))) - skb = pskb_copy(skb, gfp_mask); - else - skb = skb_clone(skb, gfp_mask); + + tcp_skb_tsorted_save(oskb) { + if (unlikely(skb_cloned(oskb))) + skb = pskb_copy(oskb, gfp_mask); + else + skb = skb_clone(oskb, gfp_mask); + } tcp_skb_tsorted_restore(oskb); + if (unlikely(!skb)) return -ENOBUFS; } @@ -1127,7 +1141,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = net_xmit_eval(err); } if (!err && oskb) { - oskb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, oskb); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1239,12 +1253,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1327,7 +1354,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. 
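The clone in tcp_transmit_skb() above is bracketed by tcp_skb_tsorted_save()/tcp_skb_tsorted_restore() because the new tcp_tsorted_anchor list head shares storage with skb fields (notably the dst reference word) that generic skb code reads during the clone. A rough model of that brace-pair macro shape; the union layout and names are simplified assumptions, not the kernel's exact definition:

    struct list_node { struct list_node *prev, *next; };

    struct skbuff {
        union {
            unsigned long refdst;       /* what generic skb code expects here */
            struct list_node tsorted;   /* what TCP stores there meanwhile */
        };
    };

    /* The pair expands around a block: stash the overlapping word, hand the
     * skb to code that treats it as an ordinary skb, then put the saved
     * value back. The kernel macros open and close a scope the same way:
     *     tsorted_save(oskb) { skb = skb_clone(oskb, ...); } tsorted_restore(oskb);
     */
    #define tsorted_save(skb)     { unsigned long _save = (skb)->refdst; \
                                    (skb)->refdst = 0UL;
    #define tsorted_restore(skb)    (skb)->refdst = _save; }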
*/ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); + list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; } @@ -1614,10 +1642,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1813,7 +1841,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1822,7 +1851,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1858,7 +1887,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1928,8 +1957,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2147,13 +2178,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2204,7 +2234,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. 
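tcp_fragment() and tso_fragment() now carry an explicit enum tcp_queue because the skb being split can live in either container: behind the send head on the linked-list write queue, or already sent and therefore in the rtx tree. A runnable toy of the dispatch — a seq-sorted singly linked list stands in for the rbtree, and every other name is invented:

    #include <stdint.h>

    struct skb { uint32_t seq; struct skb *next; };

    struct sock_model { struct skb *write_queue; struct skb *rtx_queue; };

    enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE };

    static void list_insert_after(struct skb *skb, struct skb *buff)
    {
        buff->next = skb->next;
        skb->next = buff;
    }

    static void rtx_insert_sorted(struct skb **link, struct skb *buff)
    {
        while (*link && (int32_t)((*link)->seq - buff->seq) < 0)
            link = &(*link)->next;
        buff->next = *link;
        *link = buff;
    }

    static void insert_write_queue_after(struct sock_model *sk, struct skb *skb,
                                         struct skb *buff, enum tcp_queue queue)
    {
        if (queue == TCP_FRAG_IN_WRITE_QUEUE)
            list_insert_after(skb, buff);            /* keeps send order */
        else
            rtx_insert_sorted(&sk->rtx_queue, buff); /* seq-keyed, like the rbtree */
    }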
*/ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2260,7 +2290,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); goto repair; /* Skip network transmission */ } @@ -2299,7 +2329,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2339,7 +2370,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2363,7 +2394,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2416,18 +2447,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. 
*/ if (tp->tlp_high_seq) @@ -2445,10 +2472,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2648,7 +2676,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2665,8 +2693,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_combine(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2694,7 +2720,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2705,8 +2731,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2729,7 +2753,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2804,7 +2828,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2838,17 +2863,21 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); - err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : - -ENOBUFS; + tcp_skb_tsorted_save(skb) { + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); + err = nskb ? 
tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : + -ENOBUFS; + } tcp_skb_tsorted_restore(skb); + if (!err) - skb->skb_mstamp = tp->tcp_mstamp; + tcp_update_skb_after_send(tp, skb); } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } if (likely(!err)) { TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } @@ -2892,29 +2921,21 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); - } - + rtx_head = tcp_rtx_queue_head(sk); + skb = tp->retransmit_skb_hint ?: rtx_head; max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2959,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3001,12 +3022,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3023,6 +3047,7 @@ coalesce: goto coalesce; return; } + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). 
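tcp_send_fin() above gains a second coalescing source: with an empty write queue under memory pressure it now piggybacks the FIN on the last skb in the rtx tree rather than allocating. A condensed sketch of the decision; types and names are placeholders:

    #include <stdint.h>
    #include <stddef.h>

    #define TCPHDR_FIN 0x01

    struct skb_model { unsigned tcp_flags; uint32_t end_seq; };

    /* Returns 0 if the FIN was coalesced, 1 if the caller must allocate a
     * fresh skb that carries only the FIN. */
    static int send_fin(struct skb_model *write_tail, struct skb_model *rtx_last,
                        int mem_pressure, uint32_t *write_seq)
    {
        struct skb_model *tskb = write_tail;

        if (!tskb && mem_pressure)
            tskb = rtx_last;               /* coalesce onto already-sent data */
        if (tskb) {
            tskb->tcp_flags |= TCPHDR_FIN;
            tskb->end_seq++;               /* FIN consumes one sequence number */
            (*write_seq)++;
            return 0;
        }
        return 1;
    }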
*/ @@ -3071,20 +3096,24 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + struct sk_buff *nskb; + + tcp_skb_tsorted_save(skb) { + nskb = skb_copy(skb, GFP_ATOMIC); + } tcp_skb_tsorted_restore(skb); if (!nskb) return -ENOMEM; - tcp_unlink_write_queue(skb, sk); + INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3307,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3385,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3433,6 +3462,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3627,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3657,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. 
*/ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 449cd914d58e..cda6074a429a 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,7 +45,7 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; + struct sk_buff *skb, *n; u32 reo_wnd; *reo_timeout = 0; @@ -58,45 +58,31 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); - tcp_for_write_queue(skb, sk) { + list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue, + tcp_tsorted_anchor) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + s32 remaining; - if (skb == tcp_send_head(sk)) - break; - - /* Skip ones already (s)acked */ - if (!after(scb->end_seq, tp->snd_una) || - scb->sacked & TCPCB_SACKED_ACKED) + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) continue; - if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, - tp->rack.end_seq, scb->end_seq)) { - /* Step 3 in draft-cheng-tcpm-rack-00.txt: - * A packet is lost if its elapsed time is beyond - * the recent RTT plus the reordering window. - */ - u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp, - skb->skb_mstamp); - s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - - if (remaining < 0) { - tcp_rack_mark_skb_lost(sk, skb); - continue; - } - - /* Skip ones marked lost but not yet retransmitted */ - if ((scb->sacked & TCPCB_LOST) && - !(scb->sacked & TCPCB_SACKED_RETRANS)) - continue; + if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) + break; + /* A packet is lost if it has not been s/acked beyond + * the recent RTT plus the reordering window. 
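The rewritten tcp_rack_detect_loss() walks the new tsorted_sent_queue in transmit-time order, so it can stop at the first skb sent after the most recently (s)acked one; for the rest, as the tail of the hunk just below computes, a packet is declared lost once rack.rtt_us plus the reordering window has elapsed since it was sent, and otherwise the shortfall feeds the reordering timer. A simplified model over an array kept oldest-first — the early-exit and the lost-but-not-retransmitted filtering shown in the diff are deliberately elided:

    #include <stdint.h>
    #include <stddef.h>

    struct sent_pkt { int64_t sent_us; int lost; };

    /* Returns 0 if nothing is pending, else the delay (+1 to avoid 0, as in
     * the patch) after which the reordering timer should re-run detection. */
    static int64_t rack_detect_loss(struct sent_pkt *p, size_t n, int64_t now_us,
                                    int64_t rtt_us, int64_t reo_wnd_us)
    {
        int64_t timeout = 0;

        for (size_t i = 0; i < n; i++) {
            int64_t remaining = rtt_us + reo_wnd_us - (now_us - p[i].sent_us);

            if (remaining < 0)
                p[i].lost = 1;  /* kernel also unlinks it from the list */
            else if (remaining + 1 > timeout)
                timeout = remaining + 1;
        }
        return timeout;
    }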
+ */ + remaining = tp->rack.rtt_us + reo_wnd - + tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp); + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); + list_del_init(&skb->tcp_tsorted_anchor); + } else { /* Record maximum wait time (+1 to avoid 0) */ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); - - } else if (!(scb->sacked & TCPCB_RETRANS)) { - /* Original data are sent sequentially so stop early - * b/c the rest are all sent after rack_sent - */ - break; } } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..804a8d34ce86 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -283,15 +288,17 @@ out: * * Returns: Nothing (void) */ -static void tcp_delack_timer(unsigned long data) +static void tcp_delack_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_delack_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { tcp_delack_timer_handler(sk); } else { - inet_csk(sk)->icsk_ack.blocked = 1; + icsk->icsk_ack.blocked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) @@ -304,11 +311,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +329,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +416,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +449,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +481,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. 
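The tcp_timer.c conversions below follow the tree-wide timer API change visible throughout this pile (see also atm/clip.c and atm/lec.c earlier): callbacks take the struct timer_list pointer itself and recover their containing object with from_timer(), a container_of wrapper, instead of casting an unsigned long cookie. A self-contained model of the idiom (GNU C, for typeof; the struct layout is invented):

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct timer_list { void (*function)(struct timer_list *); };

    #define from_timer(var, timer, field) \
        container_of(timer, typeof(*var), field)

    struct conn { int retransmits; struct timer_list retransmit_timer; };

    static void write_timer(struct timer_list *t)
    {
        struct conn *c = from_timer(c, t, retransmit_timer);

        printf("retransmits=%d\n", c->retransmits);
    }

    int main(void)
    {
        struct conn c = { .retransmits = 3 };

        c.retransmit_timer.function = write_timer;
        c.retransmit_timer.function(&c.retransmit_timer);  /* simulate expiry */
        return 0;
    }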
*/ @@ -570,9 +578,11 @@ out: sk_mem_reclaim(sk); } -static void tcp_write_timer(unsigned long data) +static void tcp_write_timer(struct timer_list *t) { - struct sock *sk = (struct sock *)data; + struct inet_connection_sock *icsk = + from_timer(icsk, t, icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { @@ -607,9 +617,9 @@ void tcp_set_keepalive(struct sock *sk, int val) EXPORT_SYMBOL_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (unsigned long data) +static void tcp_keepalive_timer (struct timer_list *t) { - struct sock *sk = (struct sock *) data; + struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; @@ -647,7 +657,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 218cfcc77650..ee113ff15fd0 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd-1); + return min(tp->snd_ssthresh, tp->snd_cwnd); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 784ced0b9150..7c9a6e4a7253 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2220,9 +2220,10 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -void udp_v4_early_demux(struct sk_buff *skb) +int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; @@ -2233,25 +2234,21 @@ void udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return; + return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + if (skb->pkt_type == PACKET_MULTICAST) { + in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return; + return 0; - /* we are supposed to accept bcast packets */ - if (skb->pkt_type == PACKET_MULTICAST) { - ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, - iph->protocol); - if (!ours) - return; - } + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2262,7 +2259,7 @@ void udp_v4_early_demux(struct sk_buff *skb) } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) - return; + return 0; skb->sk = sk; skb->destructor = sock_efree; @@ -2271,12 +2268,23 @@ void udp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst) { + u32 itag = 0; + /* set noref for now. 
* any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); + + /* for unconnected multicast sockets we need to validate + * the source on each packet + */ + if (!inet_sk(sk)->inet_daddr && in_dev) + return ip_mc_validate_source(skb, iph->daddr, + iph->saddr, iph->tos, + skb->dev, in_dev, &itag); } + return 0; } int udp_rcv(struct sk_buff *skb) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 96861c702c06..4603aa488f4f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev); static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); -static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_count_addresses(const struct inet6_dev *idev); static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); @@ -303,10 +303,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_policy = 0, }; -/* Check if a valid qdisc is available */ -static inline bool addrconf_qdisc_ok(const struct net_device *dev) +/* Check if link is ready: is it up and is a valid qdisc available */ +static inline bool addrconf_link_ready(const struct net_device *dev) { - return !qdisc_tx_is_noop(dev); + return netif_oper_up(dev) && !qdisc_tx_is_noop(dev); } static void addrconf_del_rs_timer(struct inet6_dev *idev) @@ -451,7 +451,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->token = in6addr_any; - if (netif_running(dev) && addrconf_qdisc_ok(dev)) + if (netif_running(dev) && addrconf_link_ready(dev)) ndev->if_flags |= IF_READY; ipv6_mc_init_dev(ndev); @@ -616,23 +616,23 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; + struct inet6_dev *in6_dev = NULL; + struct net_device *dev = NULL; struct netconfmsg *ncm; struct sk_buff *skb; struct ipv6_devconf *devconf; - struct inet6_dev *in6_dev; - struct net_device *dev; int ifindex; int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, devconf_ipv6_policy, extack); if (err < 0) - goto errout; + return err; - err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) - goto errout; + return -EINVAL; + err = -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: @@ -642,10 +642,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, devconf = net->ipv6.devconf_dflt; break; default: - dev = __dev_get_by_index(net, ifindex); + dev = dev_get_by_index(net, ifindex); if (!dev) - goto errout; - in6_dev = __in6_dev_get(dev); + return -EINVAL; + in6_dev = in6_dev_get(dev); if (!in6_dev) goto errout; devconf = &in6_dev->cnf; @@ -653,7 +653,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = 
nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -669,6 +669,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: + if (in6_dev) + in6_dev_put(in6_dev); + if (dev) + dev_put(dev); return err; } @@ -945,7 +949,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) break; } - list_add_tail(&ifp->if_list, p); + list_add_tail_rcu(&ifp->if_list, p); } static u32 inet6_addr_hash(const struct in6_addr *addr) @@ -1204,7 +1208,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); - list_del_init(&ifp->if_list); + list_del_rcu(&ifp->if_list); __in6_ifa_put(ifp); write_unlock_bh(&ifp->idev->lock); @@ -1558,8 +1562,7 @@ static int __ipv6_dev_get_saddr(struct net *net, { struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) { int i; /* @@ -1609,11 +1612,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } break; } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - swap(hiscore, score); hiscore_idx = 1 - hiscore_idx; @@ -1625,7 +1623,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } } out: - read_unlock_bh(&idev->lock); return hiscore_idx; } @@ -1662,6 +1659,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, int dst_type; bool use_oif_addr = false; int hiscore_idx = 0; + int ret = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1737,15 +1735,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, } out: - rcu_read_unlock(); - hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + else + *saddr = hiscore->ifa->addr; - *saddr = hiscore->ifa->addr; - in6_ifa_put(hiscore->ifa); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ipv6_dev_get_saddr); @@ -1785,15 +1782,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } -static int ipv6_count_addresses(struct inet6_dev *idev) +static int ipv6_count_addresses(const struct inet6_dev *idev) { + const struct inet6_ifaddr *ifp; int cnt = 0; - struct inet6_ifaddr *ifp; - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) + rcu_read_lock(); + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) cnt++; - read_unlock_bh(&idev->lock); + rcu_read_unlock(); return cnt; } @@ -1859,20 +1856,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; bool ret = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); if (ret) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); @@ -1882,22 +1877,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix); int ipv6_chk_prefix(const struct in6_addr *addr, 
struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; int onlink; onlink = 0; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { onlink = ipv6_prefix_equal(addr, &ifa->addr, ifa->prefix_len); if (onlink) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return onlink; @@ -2321,24 +2314,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + rcu_read_lock(); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - noflags |= RTF_CACHE; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) continue; if ((rt->rt6i_flags & noflags) != 0) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + rt = NULL; break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3297,7 +3290,7 @@ static int fixup_permanent_addr(struct inet6_dev *idev, struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); - if (unlikely(IS_ERR(rt))) + if (IS_ERR(rt)) return PTR_ERR(rt); /* ifp->rt can be accessed outside of rtnl */ @@ -3403,7 +3396,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, /* restore routes for permanent addresses */ addrconf_permanent_addr(dev); - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is not ready yet. */ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", dev->name); @@ -3418,7 +3411,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } } else if (event == NETDEV_CHANGE) { - if (!addrconf_qdisc_ok(dev)) { + if (!addrconf_link_ready(dev)) { /* device is still not ready. 
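
addrconf_get_prefix_route() above now walks the table under rcu_read_lock() instead of tb6_lock, so a matching route may be dropping its final reference concurrently; dst_hold_safe(), an increment-unless-zero, is what makes taking a reference legal there, where an unconditional dst_hold() could revive a dying dst. The shape, with route_matches() standing in for the flag and ifindex filters:

    struct rt6_info *rt = NULL;

    rcu_read_lock();
    for_each_fib6_node_rt_rcu(fn) {
            if (!route_matches(rt))         /* illustrative filter */
                    continue;
            if (!dst_hold_safe(&rt->dst))   /* refcount hit zero */
                    rt = NULL;              /* lost the race */
            break;
    }
    rcu_read_unlock();
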
*/ break; } @@ -3562,7 +3555,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; - struct list_head del_list; int _keep_addr; bool keep_addr; int state, i; @@ -3654,7 +3646,6 @@ restart: */ keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); - INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { struct rt6_info *rt = NULL; bool keep; @@ -3663,8 +3654,6 @@ restart: keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) && !addr_is_local(&ifa->addr); - if (!keep) - list_move(&ifa->if_list, &del_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -3698,19 +3687,14 @@ restart: } write_lock_bh(&idev->lock); + if (!keep) { + list_del_rcu(&ifa->if_list); + in6_ifa_put(ifa); + } } write_unlock_bh(&idev->lock); - /* now clean up addresses to be removed */ - while (!list_empty(&del_list)) { - ifa = list_first_entry(&del_list, - struct inet6_ifaddr, if_list); - list_del(&ifa->if_list); - - in6_ifa_put(ifa); - } - /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -3820,8 +3804,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; @@ -4906,17 +4890,15 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); if (err < 0) - goto errout; + return err; addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) { - err = -EINVAL; - goto errout; - } + if (!addr) + return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(net, ifm->ifa_index); ifa = ipv6_get_ifaddr(net, addr, dev, 1); if (!ifa) { @@ -4942,6 +4924,8 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: + if (dev) + dev_put(dev); return err; } @@ -5898,10 +5882,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5911,7 +5894,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } @@ -6585,13 +6568,13 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, 0); + inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, 0); + inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); ipv6_addr_label_rtnl_register(); @@ -6618,9 +6601,9 @@ void 
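
With RTM_GETADDR and RTM_GETNETCONF registered as RTNL_FLAG_DOIT_UNLOCKED at the end of these addrconf hunks, the doit handlers can no longer count on RTNL pinning the device, hence the switch from __dev_get_by_index() (no reference) to dev_get_by_index() plus a dev_put() on every exit path. The generic shape, with do_lookup() as a hypothetical stand-in for the real work:

    struct net_device *dev = NULL;
    int err;

    if (ifindex) {
            dev = dev_get_by_index(net, ifindex); /* takes a ref */
            if (!dev)
                    return -ENODEV;
    }

    err = do_lookup(net, dev);      /* runs without RTNL */

    if (dev)
            dev_put(dev);           /* balanced on all returns */
    return err;
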
addrconf_cleanup(void) unregister_pernet_subsys(&addrconf_ops); ipv6_addr_label_cleanup(); - rtnl_lock(); + rtnl_af_unregister(&inet6_ops); - __rtnl_af_unregister(&inet6_ops); + rtnl_lock(); /* clean dev list */ for_each_netdev(&init_net, dev) { diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index c6311d7108f6..2606d2fa3ac4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -18,7 +18,6 @@ #include <linux/if_addrlabel.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> -#include <linux/refcount.h> #if 0 #define ADDRLABEL(x...) printk(x) @@ -36,7 +35,6 @@ struct ip6addrlbl_entry { int addrtype; u32 label; struct hlist_node list; - refcount_t refcnt; struct rcu_head rcu; }; @@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table } }; -/* Object management */ -static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) -{ - kfree(p); -} - -static void ip6addrlbl_free_rcu(struct rcu_head *h) -{ - ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); -} - -static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) -{ - return refcount_inc_not_zero(&p->refcnt); -} - -static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) -{ - if (refcount_dec_and_test(&p->refcnt)) - call_rcu(&p->rcu, ip6addrlbl_free_rcu); -} - /* Find label */ static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, @@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - refcount_set(&newp->refcnt, 1); return newp; } @@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, goto out; } hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); goto out; } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || (p->prefixlen < newp->prefixlen)) { @@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net, ret = __ip6addrlbl_add(net, newp, replace); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) - ip6addrlbl_free(newp); + kfree(newp); return ret; } @@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net, p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); ret = 0; break; } @@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) spin_lock(&net->ipv6.ip6addrlbl_table.lock); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); } spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } @@ -546,38 +521,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; addr = nla_data(tb[IFAL_ADDRESS]); - rcu_read_lock(); - p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && !ip6addrlbl_hold(p)) - p = NULL; - lseq = net->ipv6.ip6addrlbl_table.seq; - rcu_read_unlock(); - - if (!p) { - err = -ESRCH; - goto out; - } - skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); - if (!skb) { - ip6addrlbl_put(p); + if (!skb) return -ENOBUFS; - } - err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWADDRLABEL, 0); + err = -ESRCH; - ip6addrlbl_put(p); + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + lseq = net->ipv6.ip6addrlbl_table.seq; + if (p) + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).portid, + 
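
The addrlabel conversion above deletes a refcount_t and an explicit call_rcu() callback: once no lookup result escapes the RCU read-side critical section, an unlinked entry needs no reference count at all and can go through kfree_rcu(), which only needs a struct rcu_head in the object. Condensed pattern, with table->lock standing in for net->ipv6.ip6addrlbl_table.lock:

    struct ip6addrlbl_entry {
            struct hlist_node list;
            struct rcu_head rcu;    /* storage for the deferred free */
            /* ... payload ... */
    };

    /* writer: unlink under the spinlock, then defer the free */
    spin_lock(&table->lock);
    hlist_del_rcu(&p->list);
    spin_unlock(&table->lock);
    kfree_rcu(p, rcu);              /* kfree(p) after readers drain */
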
nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + rcu_read_unlock(); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - goto out; + } else { + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } - - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); -out: return err; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 7802b72196f3..37bb33fbc742 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -271,6 +271,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir) case NEXTHDR_DEST: if (dir == XFRM_POLICY_OUT) ipv6_rearrange_destopt(iph, exthdr.opth); + /* fall through */ case NEXTHDR_HOP: if (!zero_out_mutable_opts(exthdr.opth)) { net_dbg_ratelimited("overrun %sopts\n", diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 95516138e861..7835dea930b4 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -89,6 +89,7 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; + /* fall through */ case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); return false; diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 115d60919f72..11025f8d124b 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -187,7 +187,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; bool found; if (fragoff) @@ -204,7 +203,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, start = *offset + sizeof(struct ipv6hdr); nexthdr = ip6->nexthdr; } - len = skb->len - start; do { struct ipv6_opt_hdr _hdr, *hp; @@ -273,7 +271,6 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, if (!found) { nexthdr = hp->nexthdr; - len -= hdrlen; start += hdrlen; } } while (!found); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 5acb54405b10..6ae5dd3f4d0d 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -250,16 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len) +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; - int err = 0; skb = skb_peek(&sk->sk_write_queue); if (!skb) - goto out; + return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); @@ -287,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, tmp_csum); } ip6_push_pending_frames(sk); -out: - return err; } struct icmpv6_msg { @@ -438,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || @@ -575,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - err = ip6_append_data(sk, icmpv6_getfrag, &msg, - len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused); - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused)) { ICMP6_INC_STATS(net, idev, 
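
The /* fall through */ comments added to ah6.c and exthdrs.c here (and to several files below) are machine-checked annotations: under gcc's -Wimplicit-fallthrough, a case that silently drops into the next one warns unless marked by such a comment or by __attribute__((fallthrough)). A self-contained userspace demo; build with gcc -Wextra -Wimplicit-fallthrough and the comment suppresses the warning:

    #include <stdio.h>

    static const char *handle(int opt)
    {
            switch (opt) {
            case 1:
                    puts("case-1 specific preparation");
                    /* fall through */
            case 2:
                    return "shared tail for cases 1 and 2";
            default:
                    return "unhandled";
            }
    }

    int main(void)
    {
            printf("%s\n", handle(1));
            printf("%s\n", handle(2));
            return 0;
    }
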
ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: @@ -682,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -719,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(net, sk, &dst, &fl6); - if (err) + if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) @@ -737,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; - err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused); - - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), &ipc6, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + &sockc_unused)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - skb->len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } dst_release(dst); out: @@ -872,10 +864,8 @@ static int icmpv6_rcv(struct sk_buff *skb) goto discard_it; hdr = icmp6_hdr(skb); - /* - * Drop through to notify - */ - + /* to notify */ + /* fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5308d7cbd75..1ada9672d198 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -62,9 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -110,6 +105,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + spin_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + spin_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. 
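
The "auxiliary address test functions" introduced by this comment center on addr_bit_set(), which picks left or right at each level of the fib6 radix tree: bit fn_bit of the address, counted from the most significant bit, with the address held as big-endian 32-bit words. A standalone userspace mirror; the in-kernel version open-codes the htonl() with a swizzle constant:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Is bit fn_bit of the address set? Bit 0 is the MSB of the
     * first word, matching the tree's descent order. */
    static int addr_bit_set(const struct in6_addr *addr, int fn_bit)
    {
            const uint32_t *w = (const uint32_t *)addr;

            return !!(htonl(1u << (~fn_bit & 0x1f)) & w[fn_bit >> 5]);
    }

    int main(void)
    {
            struct in6_addr a;
            int bit;

            inet_pton(AF_INET6, "2001:db8::1", &a);
            for (bit = 0; bit < 8; bit++)   /* first byte is 0x20 */
                    printf("bit %d = %d\n", bit, addr_bit_set(&a, bit));
            return 0;
    }
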
* @@ -140,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -161,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -185,9 +198,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } - - free_percpu(non_pcpu_rt->rt6i_pcpu); - non_pcpu_rt->rt6i_pcpu = NULL; } EXPORT_SYMBOL_GPL(rt6_free_pcpu); @@ -205,8 +215,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -225,7 +234,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -322,11 +332,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -372,7 +379,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -382,9 +389,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -421,7 +428,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -480,9 +487,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -497,9 +504,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); 
res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -580,11 +587,13 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, int sernum, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -599,7 +608,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -625,12 +636,10 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } - fn->fn_sernum = sernum; - return fn; } @@ -639,10 +648,13 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -668,19 +680,17 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -694,7 +704,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. 
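
fib6_add_1() and the rest of the rewritten tree code load the now-__rcu node pointers on writer paths through rcu_dereference_protected(..., lockdep_is_held(&table->tb6_lock)): it compiles to a plain load, keeps sparse happy about the __rcu annotation, and lets lockdep prove the serializing lock really is held; pure readers use rcu_dereference() instead. Side by side:

    /* writer, tb6_lock held: annotated plain load, lockdep-checked */
    struct fib6_node *right = rcu_dereference_protected(fn->right,
                            lockdep_is_held(&table->tb6_lock));

    /* reader, RCU only: dependency-ordered load */
    rcu_read_lock();
    right = rcu_dereference(fn->right);
    /* ... use right ... */
    rcu_read_unlock();
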
@@ -710,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -731,31 +742,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); - - in->fn_sernum = sernum; + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; - - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -765,28 +773,26 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - ln->parent = pn; - - ln->fn_sernum = sernum; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); - fn->parent = ln; + rcu_assign_pointer(fn->parent, ln); + + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -832,6 +838,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -840,12 +848,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. 
*/ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -857,9 +870,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -874,7 +889,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -936,7 +953,8 @@ next_iter: if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -950,7 +968,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -958,7 +976,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. 
BUG() if counters does not match, list of siblings @@ -987,10 +1006,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1016,10 +1035,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1031,14 +1050,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1046,14 +1066,16 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1077,16 +1099,33 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } +} + /* * Add routing information to the routing tree. 
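
The reshuffled insert sequence above is deliberate: rt6_next, the reference count and the rt6i_node back-pointer are all written before the final rcu_assign_pointer(*ins, rt) that makes the route reachable. rcu_assign_pointer() is a release-ordered store, so a lockless walker that sees the new entry is guaranteed to see it fully initialized. The generic publish pattern, field names illustrative:

    /* fill in the object while it is still private ... */
    new->next = old_first;
    new->owner = fn;
    atomic_inc(&new->refcnt);

    /* ... then publish it with the release-ordered store, last */
    rcu_assign_pointer(*ins, new);
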
* <destination addr>/<source addr> * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1095,6 +1134,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1105,9 +1146,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(info->nl_net, table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1120,7 +1162,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1134,42 +1176,40 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1177,9 +1217,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1187,9 +1227,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc); if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); } out: @@ -1199,19 +1238,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. 
*/ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1226,7 +1269,7 @@ failure: * fn->leaf. */ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1264,7 +1307,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1274,18 +1318,22 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1295,18 +1343,18 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, return fn; } } -#ifdef CONFIG_IPV6_SUBTREES backtrack: -#endif if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1337,54 +1385,87 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. 
*/ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + +next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + if (fn) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree) { + fn = fib6_locate_1(subtree, saddr, src_len, + offsetof(struct rt6_info, rt6i_src), + exact_match); + } + } } #endif @@ -1400,16 +1481,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1419,31 +1510,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. 
+ * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1451,36 +1560,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1489,19 +1598,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1516,33 +1618,39 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - 
pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1561,20 +1669,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1585,12 +1692,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1603,28 +1713,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1651,22 +1755,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. 
*/ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; + + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1676,21 +1780,26 @@ static int fib6_walk_continue(struct fib6_walker *w) } w->state = FWS_L; #endif + /* fall through */ case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; + /* fall through */ case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); + /* fall through */ case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1709,10 +1818,13 @@ static int fib6_walk_continue(struct fib6_walker *w) } skip: w->state = FWS_U; + /* fall through */ case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1721,13 +1833,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1770,7 +1882,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1798,20 +1910,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. 
*/ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1834,10 +1942,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); - write_unlock_bh(&table->tb6_lock); + func, sernum, arg); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1849,22 +1957,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1876,12 +1968,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1890,9 +1976,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1901,31 +1984,14 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. 
+ */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } @@ -1993,7 +2059,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2004,7 +2071,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2134,7 +2202,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2199,7 +2269,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2207,9 +2277,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4b58f964da20..241841fb479c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1313,6 +1313,7 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netif_keep_dst(dev); } static bool ip6gre_netlink_encap_parms(struct nlattr *data[], diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index e0f9bc0421e0..4212879ff35e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -593,6 +593,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case NDISC_REDIRECT: rel_type = ICMP_REDIRECT; rel_code = ICMP_REDIR_HOST; + /* fall through */ default: return 0; } @@ -1043,6 +1044,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; + unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? 
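
The ip6_tunnel.c change that begins here folds eth_hlen into the MTU and length checks: on ARPHRD_ETHER tunnels (ip6gretap and friends) the inner Ethernet header travels inside the tunnel payload, so leaving it out of the budget overshoots the path MTU by 14 bytes. A runnable back-of-the-envelope with illustrative numbers; real values depend on the tunnel configuration:

    #include <stdio.h>

    int main(void)
    {
            int dst_mtu  = 1500; /* MTU of the underlay route */
            int eth_hlen = 14;   /* inner Ethernet, ARPHRD_ETHER only */
            int psh_hlen = 40;   /* outer IPv6 header, no extra encap */
            int tun_hlen = 8;    /* GRE header, no key/csum options */

            printf("effective tunnel MTU = %d\n",
                   dst_mtu - eth_hlen - psh_hlen - tun_hlen);
            return 0;            /* prints 1438 */
    }
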
ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; bool use_cache = false; @@ -1124,7 +1126,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - psh_hlen - t->tun_hlen; + mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1133,7 +1135,7 @@ route_lookup: mtu = IPV6_MIN_MTU; if (skb_dst(skb) && !t->parms.collect_md) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len - t->tun_hlen > mtu && !skb_is_gso(skb)) { + if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 714914d1bb98..dbb74f3c57a7 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -445,6 +445,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; + int pkt_len = skb->len; int err = -1; int mtu; @@ -502,7 +503,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += skb->len; + tstats->tx_bytes += pkt_len; tstats->tx_packets++; u64_stats_update_end(&tstats->syncp); } else { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index f5500f5444e9..59fad81e5f7a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1722,6 +1722,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; + /* fall through */ case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a5e466d4e093..b9404feabd78 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -377,6 +377,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_FREEBIND: + if (optlen < sizeof(int)) + goto e_inval; + /* we also don't have a separate freebind bit for IPV6 */ + inet_sk(sk)->freebind = valbool; + retv = 0; + break; + case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; @@ -1214,6 +1222,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = inet_sk(sk)->transparent; break; + case IPV6_FREEBIND: + val = inet_sk(sk)->freebind; + break; + case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index a5cd43d75393..437af8c95277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (thoff < 0) + if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b263bf3a19f7..977d8900cfd1 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -169,12 +169,13 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q) return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); } -static void nf_ct_frag6_expire(unsigned long data) +static void 
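
The new IPV6_FREEBIND option above mirrors IPv4's IP_FREEBIND by reusing the same inet_sk(sk)->freebind bit, so a socket can bind to an address not (yet) configured on any interface. A userspace demo; IPV6_FREEBIND is 78 in the uapi headers this series targets, and the bind only succeeds on a kernel carrying this change:

    #include <stdio.h>
    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    #ifndef IPV6_FREEBIND
    #define IPV6_FREEBIND 78        /* value from the patched uapi */
    #endif

    int main(void)
    {
            int one = 1;
            int fd = socket(AF_INET6, SOCK_DGRAM, 0);
            struct sockaddr_in6 sa = {
                    .sin6_family = AF_INET6,
                    .sin6_port   = htons(5000),
            };

            if (fd < 0)
                    return 1;

            /* documentation prefix: presumably not a local address */
            inet_pton(AF_INET6, "2001:db8::1234", &sa.sin6_addr);

            if (setsockopt(fd, IPPROTO_IPV6, IPV6_FREEBIND,
                           &one, sizeof(one)) < 0)
                    perror("setsockopt(IPV6_FREEBIND)");

            if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                    perror("bind");  /* EADDRNOTAVAIL without it */
            else
                    puts("bound to a not-yet-assigned address");

            close(fd);
            return 0;
    }
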
nf_ct_frag6_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); ip6_expire_frag_queue(net, fq, &nf_frags); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index 46d6dba50698..1d2fb9267d6f 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -290,7 +290,8 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, else return NF_ACCEPT; } - /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + /* Only ICMPs can be IP_CT_IS_REPLY: */ + /* fall through */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ac826dd338ff..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, - (struct icmp6hdr *) &pfh.icmph, - len); + icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e4462b0ff801..761a473a07c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1055,6 +1055,7 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } @@ -1077,6 +1078,7 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_setsockopt(sk, level, optname, optval, optlen); @@ -1138,6 +1140,7 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } @@ -1160,6 +1163,7 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname, if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; + /* fall through */ default: return compat_ipv6_getsockopt(sk, level, optname, optval, optlen); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 846012eae526..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -170,12 +170,13 @@ out: } EXPORT_SYMBOL(ip6_expire_frag_queue); -static void ip6_frag_expire(unsigned long data) +static void ip6_frag_expire(struct timer_list *t) { + struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; struct net *net; - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); + fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); ip6_expire_frag_queue(net, fq, &ip6_frags); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..2e8842fa6450 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> +#include <linux/jhash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> @@ -104,6 +105,9 @@ static int rt6_fill_node(struct net 
*net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -369,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (rt->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **p; - - p = per_cpu_ptr(rt->rt6i_pcpu, cpu); - /* no one shares rt */ - *p = NULL; - } - } else { + if (!rt->rt6i_pcpu) { dst_release_immediate(&rt->dst); return NULL; } @@ -392,6 +390,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -404,6 +403,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -478,7 +482,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. 
*/ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -493,7 +497,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -702,6 +706,7 @@ out: } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *leaf, struct rt6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) @@ -711,7 +716,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -720,7 +725,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -732,37 +738,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } -static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, + int oif, int strict) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; - struct net *net; bool do_rr = false; + int key_plen; - rt0 = fn->rr_ptr; + if (!leaf) + return net->ipv6.ip6_null_entry; + + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = fn->leaf; + rt0 = leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + /* Double check to make sure fn is not an intermediate node + * and fn->leaf does not point to its child's leaf + * (This might happen if all routes under fn are deleted from + * the tree and fib6_repair_tree() is called on the node.) + */ + key_plen = rt0->rt6i_dst.plen; +#ifdef CONFIG_IPV6_SUBTREES + if (rt0->rt6i_src.plen) + key_plen = rt0->rt6i_src.plen; +#endif + if (fn->fn_bit != key_plen) + return net->ipv6.ip6_null_entry; + + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) - next = fn->leaf; - - if (next != rt0) - fn->rr_ptr = next; + next = leaf; + + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } - net = dev_net(rt0->dst.dev); return match ?
match : net->ipv6.ip6_null_entry; } @@ -850,13 +878,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -864,27 +893,57 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, + bool null_fallback) +{ + struct rt6_info *rt = *prt; + + if (dst_hold_safe(&rt->dst)) + return true; + if (null_fallback) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = NULL; + } + *prt = rt; + return false; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -938,9 +997,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1038,7 +1097,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1046,16 +1105,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) { - dst_hold(&pcpu_rt->dst); + if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) rt6_dst_from_metrics_check(pcpu_rt); - } + return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); @@ -1066,36 +1123,514 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - 
read_lock_bh(&table->tb6_lock); - if (rt->rt6i_pcpu) { - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = prev; - } - } else { - /* rt has been removed from the fib6 tree - * before we have a chance to acquire the read_lock. - * In this case, don't brother to create a pcpu rt - * since rt is going away anyway. The next - * dst_check() will trigger a re-lookup. - */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = rt; - } dst_hold(&pcpu_rt->dst); + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + BUG_ON(prev); + rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + struct net *net; + + if (!bucket || !rt6_ex) + return; + + net = dev_net(rt6_ex->rt6i->dst.dev); + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; +} + +/* Remove oldest rt6_ex in bucket and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = 
rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct net *net = dev_net(ort->dst.dev); + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + + /* Update rt6i_prefsrc as it could be changed + * in rt6_remove_prefsrc() + */ + nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + /* rt6_mtu_change() might lower mtu on ort. + * Only insert this exception route if its mtu + * is less than ort's mtu value. 
+ */ + if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + err = -EINVAL; + goto out; + } + + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() from recreating the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst.
+ */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags & RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + +static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + rt6_ex->rt6i->rt6i_prefsrc.plen = 0; + } + bucket++; + } + } +} + +static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. 
+ */ + if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) + entry->rt6i_pmtu = mtu; + } + bucket++; + } + } +} + +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +static void rt6_exceptions_clean_tohost(struct rt6_info *rt, + struct in6_addr *gateway) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == + RTF_CACHE_GATEWAY && + ipv6_addr_equal(gateway, + &entry->rt6i_gateway)) { + rt6_remove_exception(bucket, rt6_ex); + } + } + bucket++; + } + } + + spin_unlock_bh(&rt6_exception_lock); +} + +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1103,7 +1638,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1112,7 +1647,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(fn, oif, strict); + rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(rt, fl6, oif, strict); if (rt == net->ipv6.ip6_null_entry) { @@ -1127,13 +1662,22 @@ redo_rt6_select: } } + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; - if (rt 
== net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); - - rt6_dst_from_metrics_check(rt); - + if (rt == net->ipv6.ip6_null_entry) { + rcu_read_unlock(); + dst_hold(&rt->dst); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + rt6_dst_from_metrics_check(rt); + } + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1146,8 +1690,14 @@ redo_rt6_select: struct rt6_info *uncached_rt; - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + } else { + rcu_read_unlock(); + uncached_rt = rt; + goto uncached_rt_out; + } + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1157,11 +1707,13 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); } +uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1170,26 +1722,28 @@ redo_rt6_select: struct rt6_info *pcpu_rt; - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_use_noref(&rt->dst, jiffies); + local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(rt); - if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); - } else { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - */ - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - pcpu_rt = rt6_make_pcpu_route(rt); - dst_release(&rt->dst); + if (!pcpu_rt) { + /* atomic_inc_not_zero() is needed when using rcu */ + if (atomic_inc_not_zero(&rt->rt6i_ref)) { + /* No dst_hold() on rt is needed because grabbing + * rt->rt6i_ref makes sure rt can't be released. + */ + pcpu_rt = rt6_make_pcpu_route(rt); + rt6_release(rt); + } else { + /* rt is already removed from tree */ + pcpu_rt = net->ipv6.ip6_null_entry; + dst_hold(&pcpu_rt->dst); + } } - + local_bh_enable(); + rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; - } } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -1325,9 +1879,10 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -1491,23 +2046,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. 
- */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -1571,7 +2120,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -1584,10 +2133,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -1596,8 +2145,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -1615,9 +2179,9 @@ restart: } out: - dst_hold(&rt->dst); + ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1766,6 +2330,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -2216,9 +2781,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2244,7 +2809,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2274,7 +2839,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2288,9 +2853,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2299,17 +2864,22 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; 
+ for_each_fib6_node_rt_rcu(fn) { + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2321,8 +2891,9 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + if (!dst_hold_safe(&rt->dst)) + break; + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2331,7 +2902,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -2435,8 +3006,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * take care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2444,17 +3021,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -2511,23 +3077,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + rcu_read_lock(); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -2573,16 +3139,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) break; } if (rt) - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + ip6_hold_safe(NULL, &rt, false); + rcu_read_unlock(); return rt; } @@ -2620,17 +3186,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - dst_hold(&rt->dst); - 
read_unlock_bh(&table->tb6_lock); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) { + rcu_read_unlock(); + ip6_del_rt(rt); + } else { + rcu_read_unlock(); + } goto restart; } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } @@ -2818,8 +3387,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) if (((void *)rt->dst.dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->rt6i_prefsrc.plen = 0; + /* need to update cache as well */ + rt6_exceptions_remove_prefsrc(rt); + spin_unlock_bh(&rt6_exception_lock); } return 0; } @@ -2836,18 +3409,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) -#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } + + /* Further clean up cached routes in exception table. + * This is needed because cached route may have a different + * gateway than its 'parent' in the case of an ip redirect. + */ + rt6_exceptions_clean_tohost(rt, gateway); + return 0; } @@ -2926,19 +3504,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. 
- */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + spin_lock_bh(&rt6_exception_lock); + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } + rt6_exceptions_update_pmtu(rt, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); } return 0; } @@ -3839,7 +4412,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 64d94afa427f..ae83615b7f6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1577,8 +1577,9 @@ do_time_wait: refcounted = false; goto process; } - /* Fall through to ACK */ } + /* to ACK */ + /* fall through */ case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 11d1314ab6c5..4ed9f8cc3b6a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -152,6 +152,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; + /* fall through */ case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index ac598ec90589..d21a9d128d3e 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1867,6 +1867,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; + /* fall through */ case SIOCGIFADDR: rc = ipxitf_ioctl(cmd, argp); break; diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index af4e76ac88ff..0b750a22c4b9 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1650,7 +1650,7 @@ static int kcm_clone(struct socket *osock, struct kcm_clone *info, } newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); - if (unlikely(IS_ERR(newfile))) { + if (IS_ERR(newfile)) { err = PTR_ERR(newfile); goto out_sock_alloc_fail; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ee485df73ccd..02d61101b108 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1314,6 +1314,9 @@ again: hlist_del_init(&session->hlist); + if (test_and_set_bit(0, &session->dead)) + goto again; + if (session->ref != NULL) (*session->ref)(session); @@ -1685,14 +1688,12 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create); /* This function is used by the netlink TUNNEL_DELETE command. 
*/ -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel) { - l2tp_tunnel_inc_refcount(tunnel); - if (false == queue_work(l2tp_wq, &tunnel->del_work)) { - l2tp_tunnel_dec_refcount(tunnel); - return 1; + if (!test_and_set_bit(0, &tunnel->dead)) { + l2tp_tunnel_inc_refcount(tunnel); + queue_work(l2tp_wq, &tunnel->del_work); } - return 0; } EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); @@ -1750,6 +1751,9 @@ EXPORT_SYMBOL_GPL(__l2tp_session_unhash); */ int l2tp_session_delete(struct l2tp_session *session) { + if (test_and_set_bit(0, &session->dead)) + return 0; + if (session->ref) (*session->ref)(session); __l2tp_session_unhash(session); diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a305e0c5925a..67c79d9b5c6c 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -76,6 +76,7 @@ struct l2tp_session_cfg { struct l2tp_session { int magic; /* should be * L2TP_SESSION_MAGIC */ + long dead; struct l2tp_tunnel *tunnel; /* back pointer to tunnel * context */ @@ -160,6 +161,9 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + + unsigned long dead; + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ bool acpt_newsess; /* Indicates whether this @@ -254,7 +258,7 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp); void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel); -int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); +void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel); struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 87da9ef61860..014a7bc2a872 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -44,7 +44,6 @@ struct l2tp_eth { struct net_device *dev; struct sock *tunnel_sock; struct l2tp_session *session; - struct list_head list; atomic_long_t tx_bytes; atomic_long_t tx_packets; atomic_long_t tx_dropped; @@ -58,17 +57,6 @@ struct l2tp_eth_sess { struct net_device *dev; }; -/* per-net private data for this module */ -static unsigned int l2tp_eth_net_id; -struct l2tp_eth_net { - struct list_head l2tp_eth_dev_list; - spinlock_t l2tp_eth_lock; -}; - -static inline struct l2tp_eth_net *l2tp_eth_pernet(struct net *net) -{ - return net_generic(net, l2tp_eth_net_id); -} static int l2tp_eth_dev_init(struct net_device *dev) { @@ -84,12 +72,6 @@ static int l2tp_eth_dev_init(struct net_device *dev) static void l2tp_eth_dev_uninit(struct net_device *dev) { - struct l2tp_eth *priv = netdev_priv(dev); - struct l2tp_eth_net *pn = l2tp_eth_pernet(dev_net(dev)); - - spin_lock(&pn->l2tp_eth_lock); - list_del_init(&priv->list); - spin_unlock(&pn->l2tp_eth_lock); dev_put(dev); } @@ -273,7 +255,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; - struct l2tp_eth_net *pn; if (cfg->ifname) { strlcpy(name, cfg->ifname, IFNAMSIZ); @@ -305,7 +286,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, priv = netdev_priv(dev); priv->dev = dev; priv->session = session; - INIT_LIST_HEAD(&priv->list); priv->tunnel_sock = tunnel->sock; session->recv_skb = l2tp_eth_dev_recv; @@ -326,10 +306,6 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, strlcpy(session->ifname, dev->name, IFNAMSIZ); dev_hold(dev); - pn = 
l2tp_eth_pernet(dev_net(dev)); - spin_lock(&pn->l2tp_eth_lock); - list_add(&priv->list, &pn->l2tp_eth_dev_list); - spin_unlock(&pn->l2tp_eth_lock); return 0; @@ -342,22 +318,6 @@ out: return rc; } -static __net_init int l2tp_eth_init_net(struct net *net) -{ - struct l2tp_eth_net *pn = net_generic(net, l2tp_eth_net_id); - - INIT_LIST_HEAD(&pn->l2tp_eth_dev_list); - spin_lock_init(&pn->l2tp_eth_lock); - - return 0; -} - -static struct pernet_operations l2tp_eth_net_ops = { - .init = l2tp_eth_init_net, - .id = &l2tp_eth_net_id, - .size = sizeof(struct l2tp_eth_net), -}; - static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, @@ -371,25 +331,18 @@ static int __init l2tp_eth_init(void) err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) - goto out; - - err = register_pernet_device(&l2tp_eth_net_ops); - if (err) - goto out_unreg; + goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; -out_unreg: - l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); -out: +err: return err; } static void __exit l2tp_eth_exit(void) { - unregister_pernet_device(&l2tp_eth_net_ops); l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 50e3ee9a9d61..bc6e8bfc5be4 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -437,11 +437,11 @@ static void pppol2tp_session_close(struct l2tp_session *session) BUG_ON(session->magic != L2TP_SESSION_MAGIC); - if (sock) { + if (sock) inet_shutdown(sock, SEND_SHUTDOWN); - /* Don't let the session go away before our socket does */ - l2tp_session_inc_refcount(session); - } + + /* Don't let the session go away before our socket does */ + l2tp_session_inc_refcount(session); } /* Really kill the session socket. (Called from sock_put() if diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index e15314e3b464..db6e0afe3a20 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -127,8 +127,8 @@ static struct lapb_cb *lapb_create_cb(void) skb_queue_head_init(&lapb->write_queue); skb_queue_head_init(&lapb->ack_queue); - init_timer(&lapb->t1timer); - init_timer(&lapb->t2timer); + timer_setup(&lapb->t1timer, NULL, 0); + timer_setup(&lapb->t2timer, NULL, 0); lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 1a5535bc3b8d..8bb469cb3abe 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -35,15 +35,14 @@ #include <linux/interrupt.h> #include <net/lapb.h> -static void lapb_t1timer_expiry(unsigned long); -static void lapb_t2timer_expiry(unsigned long); +static void lapb_t1timer_expiry(struct timer_list *); +static void lapb_t2timer_expiry(struct timer_list *); void lapb_start_t1timer(struct lapb_cb *lapb) { del_timer(&lapb->t1timer); - lapb->t1timer.data = (unsigned long)lapb; - lapb->t1timer.function = &lapb_t1timer_expiry; + lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; add_timer(&lapb->t1timer); @@ -53,8 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb) { del_timer(&lapb->t2timer); - lapb->t2timer.data = (unsigned long)lapb; - lapb->t2timer.function = &lapb_t2timer_expiry; + lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; add_timer(&lapb->t2timer); @@ -75,9 +73,9 @@ int lapb_t1timer_running(struct lapb_cb *lapb) return timer_pending(&lapb->t1timer); } -static void lapb_t2timer_expiry(unsigned long param) +static void lapb_t2timer_expiry(struct 
timer_list *t) { - struct lapb_cb *lapb = (struct lapb_cb *)param; + struct lapb_cb *lapb = from_timer(lapb, t, t2timer); if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; @@ -85,9 +83,9 @@ static void lapb_t2timer_expiry(unsigned long param) } } -static void lapb_t1timer_expiry(unsigned long param) +static void lapb_t1timer_expiry(struct timer_list *t) { - struct lapb_cb *lapb = (struct lapb_cb *)param; + struct lapb_cb *lapb = from_timer(lapb, t, t1timer); switch (lapb->state) { diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 282912245938..80f25ff2f24b 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -6,6 +6,7 @@ mac80211-y := \ driver-ops.o \ sta_info.o \ wep.o \ + aead_api.o \ wpa.o \ scan.o offchannel.o \ ht.o agg-tx.o agg-rx.o \ @@ -15,8 +16,6 @@ mac80211-y := \ rate.o \ michael.o \ tkip.o \ - aes_ccm.o \ - aes_gcm.o \ aes_cmac.o \ aes_gmac.o \ fils_aead.o \ diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aead_api.c index a4e0d59a40dd..160f9df30402 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aead_api.c @@ -1,6 +1,7 @@ /* * Copyright 2003-2004, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. + * Copyright 2014-2015, Qualcomm Atheros, Inc. * * Rewrite: Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> * @@ -12,30 +13,29 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/err.h> +#include <linux/scatterlist.h> #include <crypto/aead.h> -#include <net/mac80211.h> -#include "key.h" -#include "aes_ccm.h" +#include "aead_api.h" -int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len) +int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, + u8 *data, size_t data_len, u8 *mic) { + size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); u8 *__aad; - aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC); + aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); if (!aead_req) return -ENOMEM; __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, CCM_AAD_LEN); + memcpy(__aad, aad, aad_len); sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); + sg_set_buf(&sg[0], __aad, aad_len); sg_set_buf(&sg[1], data, data_len); sg_set_buf(&sg[2], mic, mic_len); @@ -49,10 +49,10 @@ int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, return 0; } -int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len) +int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len, + u8 *data, size_t data_len, u8 *mic) { + size_t mic_len = crypto_aead_authsize(tfm); struct scatterlist sg[3]; struct aead_request *aead_req; int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); @@ -62,15 +62,15 @@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, if (data_len == 0) return -EINVAL; - aead_req = kzalloc(reqsize + CCM_AAD_LEN, GFP_ATOMIC); + aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC); if (!aead_req) return -ENOMEM; __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, CCM_AAD_LEN); + memcpy(__aad, aad, aad_len); sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); + sg_set_buf(&sg[0], __aad, aad_len); sg_set_buf(&sg[1], data, data_len); sg_set_buf(&sg[2], mic, mic_len); @@ -84,14 +84,14 
@@ int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, return err; } -struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[], - size_t key_len, - size_t mic_len) +struct crypto_aead * +aead_key_setup_encrypt(const char *alg, const u8 key[], + size_t key_len, size_t mic_len) { struct crypto_aead *tfm; int err; - tfm = crypto_alloc_aead("ccm(aes)", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_aead(alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) return tfm; @@ -109,7 +109,7 @@ free_aead: return ERR_PTR(err); } -void ieee80211_aes_key_free(struct crypto_aead *tfm) +void aead_key_free(struct crypto_aead *tfm) { crypto_free_aead(tfm); } diff --git a/net/mac80211/aead_api.h b/net/mac80211/aead_api.h new file mode 100644 index 000000000000..5e39ea843bbf --- /dev/null +++ b/net/mac80211/aead_api.h @@ -0,0 +1,27 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _AEAD_API_H +#define _AEAD_API_H + +#include <crypto/aead.h> +#include <linux/crypto.h> + +struct crypto_aead * +aead_key_setup_encrypt(const char *alg, const u8 key[], + size_t key_len, size_t mic_len); + +int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, + size_t aad_len, u8 *data, + size_t data_len, u8 *mic); + +int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, + size_t aad_len, u8 *data, + size_t data_len, u8 *mic); + +void aead_key_free(struct crypto_aead *tfm); + +#endif /* _AEAD_API_H */ diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h index fcd3254c5cf0..e9b7ca0bde5b 100644 --- a/net/mac80211/aes_ccm.h +++ b/net/mac80211/aes_ccm.h @@ -10,19 +10,39 @@ #ifndef AES_CCM_H #define AES_CCM_H -#include <linux/crypto.h> +#include "aead_api.h" #define CCM_AAD_LEN 32 -struct crypto_aead *ieee80211_aes_key_setup_encrypt(const u8 key[], - size_t key_len, - size_t mic_len); -int ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len); -int ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic, - size_t mic_len); -void ieee80211_aes_key_free(struct crypto_aead *tfm); +static inline struct crypto_aead * +ieee80211_aes_key_setup_encrypt(const u8 key[], size_t key_len, size_t mic_len) +{ + return aead_key_setup_encrypt("ccm(aes)", key, key_len, mic_len); +} + +static inline int +ieee80211_aes_ccm_encrypt(struct crypto_aead *tfm, + u8 *b_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_encrypt(tfm, b_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline int +ieee80211_aes_ccm_decrypt(struct crypto_aead *tfm, + u8 *b_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_decrypt(tfm, b_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline void ieee80211_aes_key_free(struct crypto_aead *tfm) +{ + return aead_key_free(tfm); +} #endif /* AES_CCM_H */ diff --git a/net/mac80211/aes_gcm.c b/net/mac80211/aes_gcm.c deleted file mode 100644 index 8a4397cc1b08..000000000000 --- a/net/mac80211/aes_gcm.c +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2014-2015, Qualcomm Atheros, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/err.h> -#include <crypto/aead.h> - -#include <net/mac80211.h> -#include "key.h" -#include "aes_gcm.h" - -int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - - aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, GCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, data_len, j_0); - aead_request_set_ad(aead_req, sg[0].length); - - crypto_aead_encrypt(aead_req); - kzfree(aead_req); - return 0; -} - -int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic) -{ - struct scatterlist sg[3]; - struct aead_request *aead_req; - int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm); - u8 *__aad; - int err; - - if (data_len == 0) - return -EINVAL; - - aead_req = kzalloc(reqsize + GCM_AAD_LEN, GFP_ATOMIC); - if (!aead_req) - return -ENOMEM; - - __aad = (u8 *)aead_req + reqsize; - memcpy(__aad, aad, GCM_AAD_LEN); - - sg_init_table(sg, 3); - sg_set_buf(&sg[0], &__aad[2], be16_to_cpup((__be16 *)__aad)); - sg_set_buf(&sg[1], data, data_len); - sg_set_buf(&sg[2], mic, IEEE80211_GCMP_MIC_LEN); - - aead_request_set_tfm(aead_req, tfm); - aead_request_set_crypt(aead_req, sg, sg, - data_len + IEEE80211_GCMP_MIC_LEN, j_0); - aead_request_set_ad(aead_req, sg[0].length); - - err = crypto_aead_decrypt(aead_req); - kzfree(aead_req); - - return err; -} - -struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], - size_t key_len) -{ - struct crypto_aead *tfm; - int err; - - tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return tfm; - - err = crypto_aead_setkey(tfm, key, key_len); - if (err) - goto free_aead; - err = crypto_aead_setauthsize(tfm, IEEE80211_GCMP_MIC_LEN); - if (err) - goto free_aead; - - return tfm; - -free_aead: - crypto_free_aead(tfm); - return ERR_PTR(err); -} - -void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm) -{ - crypto_free_aead(tfm); -} diff --git a/net/mac80211/aes_gcm.h b/net/mac80211/aes_gcm.h index 55aed5352494..d2b096033009 100644 --- a/net/mac80211/aes_gcm.h +++ b/net/mac80211/aes_gcm.h @@ -9,16 +9,38 @@ #ifndef AES_GCM_H #define AES_GCM_H -#include <linux/crypto.h> +#include "aead_api.h" #define GCM_AAD_LEN 32 -int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic); -int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, u8 *j_0, u8 *aad, - u8 *data, size_t data_len, u8 *mic); -struct crypto_aead *ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], - size_t key_len); -void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm); +static inline int ieee80211_aes_gcm_encrypt(struct crypto_aead *tfm, + u8 *j_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_encrypt(tfm, j_0, aad + 2, + be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline int ieee80211_aes_gcm_decrypt(struct crypto_aead *tfm, + u8 *j_0, u8 *aad, u8 *data, + size_t data_len, u8 *mic) +{ + return aead_decrypt(tfm, j_0, aad + 2, + 
be16_to_cpup((__be16 *)aad), + data, data_len, mic); +} + +static inline struct crypto_aead * +ieee80211_aes_gcm_key_setup_encrypt(const u8 key[], size_t key_len) +{ + return aead_key_setup_encrypt("gcm(aes)", key, + key_len, IEEE80211_GCMP_MIC_LEN); +} + +static inline void ieee80211_aes_gcm_key_free(struct crypto_aead *tfm) +{ + return aead_key_free(tfm); +} #endif /* AES_GCM_H */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 2849a1fc41c5..88cc1ae935ea 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -459,7 +459,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, - const u8 *addr, unsigned int bit) + const u8 *addr, unsigned int tid) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; @@ -470,7 +470,7 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, if (!sta) goto unlock; - set_bit(bit, sta->ampdu_mlme.tid_rx_manage_offl); + set_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl); ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); unlock: rcu_read_unlock(); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d6d0b4201e40..41f5e48f8021 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -290,13 +290,15 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, { int i; + mutex_lock(&sta->ampdu_mlme.mtx); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { - __ieee80211_stop_tx_ba_session(sta, i, reason); - __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_LEAVE_QBSS, - reason != AGG_STOP_DESTROY_STA && - reason != AGG_STOP_PEER_REQUEST); + ___ieee80211_stop_tx_ba_session(sta, i, reason); + ___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, + WLAN_REASON_QSTA_LEAVE_QBSS, + reason != AGG_STOP_DESTROY_STA && + reason != AGG_STOP_PEER_REQUEST); } + mutex_unlock(&sta->ampdu_mlme.mtx); /* stopping might queue the work again - so cancel only afterwards */ cancel_work_sync(&sta->ampdu_mlme.work); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9675814f64db..68f874e73561 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2009,6 +2009,8 @@ void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct txq_info *txq, int tid); void ieee80211_txq_purge(struct ieee80211_local *local, struct txq_info *txqi); +void ieee80211_txq_remove_vlan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata); void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, u16 transaction, u16 auth_alg, u16 status, const u8 *extra, size_t extra_len, const u8 *bssid, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index f75029abf728..13b16f90e1cf 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -793,9 +793,7 @@ static int ieee80211_open(struct net_device *dev) static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) { - struct ieee80211_sub_if_data *txq_sdata = sdata; struct ieee80211_local *local = sdata->local; - struct fq *fq = &local->fq; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; @@ -939,9 +937,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: - txq_sdata = container_of(sdata->bss, - struct ieee80211_sub_if_data, u.ap); - mutex_lock(&local->mtx); list_del(&sdata->u.vlan.list); mutex_unlock(&local->mtx); @@ -998,8 +993,6 @@ static void ieee80211_do_stop(struct 
ieee80211_sub_if_data *sdata, skb_queue_purge(&sdata->skb_queue); } - sdata->bss = NULL; - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { skb_queue_walk_safe(&local->pending[i], skb, tmp) { @@ -1012,22 +1005,10 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - if (txq_sdata->vif.txq) { - struct txq_info *txqi = to_txq_info(txq_sdata->vif.txq); - - /* - * FIXME FIXME - * - * We really shouldn't purge the *entire* txqi since that - * contains frames for the other AP_VLANs (and possibly - * the AP itself) as well, but there's no API in FQ now - * to be able to filter. - */ + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + ieee80211_txq_remove_vlan(local, sdata); - spin_lock_bh(&fq->lock); - ieee80211_txq_purge(local, txqi); - spin_unlock_bh(&fq->lock); - } + sdata->bss = NULL; if (local->open_count == 0) ieee80211_clear_tx_pending(local); @@ -1772,7 +1753,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sizeof(void *)); int txq_size = 0; - if (local->ops->wake_tx_queue) + if (local->ops->wake_tx_queue && + type != NL80211_IFTYPE_AP_VLAN && + type != NL80211_IFTYPE_MONITOR) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a550c707cd8a..7a76c4a6df30 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -675,8 +675,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) enum nl80211_band band; u8 *pos; struct ieee80211_sub_if_data *sdata; - int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + - sizeof(mgmt->u.beacon); + int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); rcu_read_lock(); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 7e5f271e3c30..465b7853edc0 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -275,6 +275,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, u8 *hw_addr, struct ieee802_11_elems *ie); bool mesh_peer_accepts_plinks(struct ieee802_11_elems *ie); u32 mesh_accept_plinks_update(struct ieee80211_sub_if_data *sdata); +void mesh_plink_timer(struct timer_list *t); void mesh_plink_broken(struct sta_info *sta); u32 mesh_plink_deactivate(struct sta_info *sta); u32 mesh_plink_open(struct sta_info *sta); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d8bbd0d2225a..146ec6c0f12f 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -111,8 +111,8 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) + - sizeof(mgmt->u.action.u.mesh_action); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.mesh_action); skb = dev_alloc_skb(local->tx_headroom + hdr_len + @@ -242,8 +242,8 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) + - sizeof(mgmt->u.action.u.mesh_action); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.mesh_action); if (time_before(jiffies, ifmsh->next_perr)) return -EAGAIN; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index f69c6c38ca43..e2d00cce3c17 100644 --- 
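The offsetof() + sizeof() pairs being replaced throughout compute "one past the end of a member", which is exactly what offsetofend() in <linux/stddef.h> expresses:

#include <linux/stddef.h>

/* offsetofend(TYPE, MEMBER) == offsetof(TYPE, MEMBER) + sizeof(MEMBER),
 * i.e. the offset of the first byte after MEMBER. For a management
 * frame whose fixed part ends at u.beacon, that is the header length: */
int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon);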
a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -220,8 +220,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, bool include_plid = false; u16 peering_proto = 0; u8 *pos, ie_len = 4; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.self_prot) + - sizeof(mgmt->u.action.u.self_prot); + int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; skb = dev_alloc_skb(local->tx_headroom + @@ -604,8 +603,9 @@ out: ieee80211_mbss_info_change_notify(sdata, changed); } -static void mesh_plink_timer(unsigned long data) +void mesh_plink_timer(struct timer_list *t) { + struct mesh_sta *mesh = from_timer(mesh, t, plink_timer); struct sta_info *sta; u16 reason = 0; struct ieee80211_sub_if_data *sdata; @@ -617,7 +617,7 @@ static void mesh_plink_timer(unsigned long data) * del_timer_sync() this timer after having made sure * it cannot be readded (by deleting the plink.) */ - sta = (struct sta_info *) data; + sta = mesh->plink_sta; if (sta->sdata->local->quiescing) return; @@ -697,11 +697,8 @@ static void mesh_plink_timer(unsigned long data) static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout) { - sta->mesh->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); - sta->mesh->plink_timer.data = (unsigned long) sta; - sta->mesh->plink_timer.function = mesh_plink_timer; sta->mesh->plink_timeout = timeout; - add_timer(&sta->mesh->plink_timer); + mod_timer(&sta->mesh->plink_timer, jiffies + msecs_to_jiffies(timeout)); } static bool llid_in_use(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 3b8e2709d8de..e4ededa1909d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,7 +145,6 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, - const struct ieee80211_ht_cap *ht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, struct cfg80211_chan_def *chandef, bool tracking) @@ -163,20 +162,13 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, chandef->center_freq1 = channel->center_freq; chandef->center_freq2 = 0; - if (!ht_cap || !ht_oper || !sta_ht_cap.ht_supported) { + if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT; goto out; } chandef->width = NL80211_CHAN_WIDTH_20; - if (!(ht_cap->cap_info & - cpu_to_le16(IEEE80211_HT_CAP_SUP_WIDTH_20_40))) { - ret = IEEE80211_STA_DISABLE_40MHZ; - vht_chandef = *chandef; - goto out; - } - ht_cfreq = ieee80211_channel_to_frequency(ht_oper->primary_chan, channel->band); /* check that channel matches the right operating channel */ @@ -344,7 +336,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, /* calculate new channel (type) based on HT/VHT operation IEs */ flags = ieee80211_determine_chantype(sdata, sband, chan, - ht_cap, ht_oper, vht_oper, + ht_oper, vht_oper, &chandef, true); /* @@ -780,11 +772,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_HT_CAPABILITY, WLAN_EID_BSS_COEX_2040, + /* luckily this is almost always there */ WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, - /* 60GHz doesn't happen right now */ + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ WLAN_EID_VHT_CAPABILITY, WLAN_EID_OPMODE_NOTIF, }; @@ -811,22 +804,16 @@ static void 
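The plink timer follows the general timer_list conversion recipe: the callback receives the timer itself, from_timer() recovers the structure embedding it, and an explicit back-pointer (plink_sta here) reaches any outer object that the old (unsigned long)data cast used to carry. A generic sketch with illustrative names:

#include <linux/timer.h>
#include <linux/jiffies.h>

struct outer;				/* whatever the handler really needs */

struct inner {
	struct timer_list timer;
	struct outer *owner;		/* set once at init, like plink_sta */
};

static void inner_timeout(struct timer_list *t)
{
	struct inner *in = from_timer(in, t, timer);
	struct outer *o = in->owner;	/* replaces the old data cast */

	/* ... handle expiry ... */
	(void)o;
}

static void inner_init(struct inner *in, struct outer *o)
{
	in->owner = o;
	timer_setup(&in->timer, inner_timeout, 0);
	mod_timer(&in->timer, jiffies + HZ);	/* arm or rearm in one call */
}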
ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) /* if present, add any custom IEs that go before VHT */ if (assoc_data->ie_len) { static const u8 before_vht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_EXT_SUPP_RATES, - WLAN_EID_PWR_CAPABILITY, - WLAN_EID_SUPPORTED_CHANNELS, - WLAN_EID_RSN, - WLAN_EID_QOS_CAPA, - WLAN_EID_RRM_ENABLED_CAPABILITIES, - WLAN_EID_MOBILITY_DOMAIN, - WLAN_EID_SUPPORTED_REGULATORY_CLASSES, - WLAN_EID_HT_CAPABILITY, + /* + * no need to list the ones split off before HT + * or generated here + */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_QOS_TRAFFIC_CAPA, WLAN_EID_TIM_BCAST_REQ, WLAN_EID_INTERWORKING, + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; /* RIC already taken above, so no need to handle here anymore */ @@ -4317,7 +4304,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, - ht_cap, ht_oper, vht_oper, + ht_oper, vht_oper, &chandef, false); sdata->needed_rx_chains = min(ieee80211_ht_vht_rx_chains(sdata, cbss), diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 47d2ed570470..ef2becaade50 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright 2016 Intel Deutschland GmbH + * Copyright 2016-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -183,6 +183,20 @@ ieee80211_bss_info_update(struct ieee80211_local *local, return bss; } +static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, + u32 scan_flags, const u8 *da) +{ + if (!sdata) + return false; + /* accept broadcast for OCE */ + if (scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP && + is_broadcast_ether_addr(da)) + return true; + if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) + return true; + return ether_addr_equal(da, sdata->vif.addr); +} + void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); @@ -208,19 +222,24 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; + u32 scan_req_flags = 0, sched_scan_req_flags = 0; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); - /* ignore ProbeResp to foreign address unless scanning - * with randomised address + if (scan_req) + scan_req_flags = scan_req->flags; + + if (sched_scan_req) + sched_scan_req_flags = sched_scan_req->flags; + + /* ignore ProbeResp to foreign address or non-bcast (OCE) + * unless scanning with randomised address */ - if (!(sdata1 && - (ether_addr_equal(mgmt->da, sdata1->vif.addr) || - scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR)) && - !(sdata2 && - (ether_addr_equal(mgmt->da, sdata2->vif.addr) || - sched_scan_req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR))) + if (!ieee80211_scan_accept_presp(sdata1, scan_req_flags, + mgmt->da) && + !ieee80211_scan_accept_presp(sdata2, sched_scan_req_flags, + mgmt->da)) return; elements = mgmt->u.probe_resp.variable; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 69615016d5bf..9673e157bf8f 100644 --- 
a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -329,10 +329,12 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; + sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.user_mpm) - init_timer(&sta->mesh->plink_timer); + timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, + 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif @@ -515,6 +517,31 @@ static int sta_info_insert_drv_state(struct ieee80211_local *local, return err; } +static void +ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + bool allow_p2p_go_ps = sdata->vif.p2p; + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata || + !test_sta_flag(sta, WLAN_STA_ASSOC)) + continue; + if (!sta->sta.support_p2p_ps) { + allow_p2p_go_ps = false; + break; + } + } + rcu_read_unlock(); + + if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { + sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); + } +} + /* * should be called with sta_mtx locked * this function replaces the mutex lock @@ -561,6 +588,13 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); + + if (sta->sta_state >= IEEE80211_STA_ASSOC) { + ieee80211_recalc_min_chandef(sta->sdata); + if (!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); + } + /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); @@ -1788,31 +1822,6 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(ieee80211_sta_set_buffered); -static void -ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) -{ - struct ieee80211_local *local = sdata->local; - bool allow_p2p_go_ps = sdata->vif.p2p; - struct sta_info *sta; - - rcu_read_lock(); - list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (sdata != sta->sdata || - !test_sta_flag(sta, WLAN_STA_ASSOC)) - continue; - if (!sta->sta.support_p2p_ps) { - allow_p2p_go_ps = false; - break; - } - } - rcu_read_unlock(); - - if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { - sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; - ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); - } -} - int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 3acbdfa9f649..5c54acd10562 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -344,6 +344,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8) * @plink_state: peer link state * @plink_timeout: timeout of peer link * @plink_timer: peer link watch timer + * @plink_sta: peer link watch timer's sta_info * @t_offset: timing offset relative to this host * @t_offset_setpoint: reference timing offset of this sta to be used when * calculating clockdrift @@ -356,6 +357,7 @@ DECLARE_EWMA(mesh_fail_avg, 20, 8) */ struct mesh_sta { struct timer_list plink_timer; + struct sta_info *plink_sta; s64 t_offset; s64 t_offset_setpoint; @@ -398,7 +400,7 @@ struct ieee80211_sta_rx_stats { u64 msdu[IEEE80211_NUM_TIDS + 1]; }; -/** +/* * The bandwidth threshold below which the per-station CoDel parameters will be * scaled to be more lenient (to prevent starvation of 
slow stations). This * value will be scaled by the number of active stations when it is being diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 94826680cf2b..7b8154474b9e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1396,6 +1396,40 @@ static void ieee80211_txq_enqueue(struct ieee80211_local *local, fq_flow_get_default_func); } +static bool fq_vlan_filter_func(struct fq *fq, struct fq_tin *tin, + struct fq_flow *flow, struct sk_buff *skb, + void *data) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + + return info->control.vif == data; +} + +void ieee80211_txq_remove_vlan(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata) +{ + struct fq *fq = &local->fq; + struct txq_info *txqi; + struct fq_tin *tin; + struct ieee80211_sub_if_data *ap; + + if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN)) + return; + + ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); + + if (!ap->vif.txq) + return; + + txqi = to_txq_info(ap->vif.txq); + tin = &txqi->tin; + + spin_lock_bh(&fq->lock); + fq_tin_filter(fq, tin, fq_vlan_filter_func, &sdata->vif, + fq_skb_free_func); + spin_unlock_bh(&fq->lock); +} + void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct txq_info *txqi, int tid) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 6aef6793d052..d57e5f6bd8b6 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1392,10 +1392,10 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_REQUEST, - WLAN_EID_EXT_SUPP_RATES, + /* + * no need to list the ones split off already + * (or generated here) + */ WLAN_EID_DS_PARAMS, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, }; @@ -1424,20 +1424,17 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, /* insert custom IEs that go before VHT */ if (ie && ie_len) { static const u8 before_vht[] = { - WLAN_EID_SSID, - WLAN_EID_SUPP_RATES, - WLAN_EID_REQUEST, - WLAN_EID_EXT_SUPP_RATES, - WLAN_EID_DS_PARAMS, - WLAN_EID_SUPPORTED_REGULATORY_CLASSES, - WLAN_EID_HT_CAPABILITY, + /* + * no need to list the ones split off already + * (or generated here) + */ WLAN_EID_BSS_COEX_2040, WLAN_EID_EXT_CAPABILITY, WLAN_EID_SSID_LIST, WLAN_EID_CHANNEL_USAGE, WLAN_EID_INTERWORKING, WLAN_EID_MESH_ID, - /* 60 GHz can't happen here right now */ + /* 60 GHz (Multi-band, DMG, MMS) can't happen */ }; noffset = ieee80211_ie_split(ie, ie_len, before_vht, ARRAY_SIZE(before_vht), @@ -2980,8 +2977,8 @@ int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; - int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch) + - sizeof(mgmt->u.action.u.chan_switch); + int hdr_len = offsetofend(struct ieee80211_mgmt, + u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 19ec2189d3ac..b9276ac849fa 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -386,6 +386,16 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); + + /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of + * IEEE80211-2016 specification makes higher bandwidth operation + * possible on the TDLS link if the peers have wider bandwidth + * 
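Since AP_VLAN interfaces share the AP's txq, tearing one down must drop only that VLAN's frames; the new fq_tin_filter() call does that with a predicate plus a free callback instead of purging the whole tin. The shape of the pattern, as a standalone sketch on a hypothetical singly linked queue (not the kernel fq types):

#include <stdbool.h>
#include <stdlib.h>

struct pkt {
	struct pkt *next;
	void *vif;			/* stand-in for info->control.vif */
};

typedef bool (*pkt_victim_t)(const struct pkt *p, void *data);

static void queue_filter(struct pkt **head, pkt_victim_t victim, void *data)
{
	while (*head) {
		if (victim(*head, data)) {	/* predicate picks victims */
			struct pkt *dead = *head;

			*head = dead->next;
			free(dead);		/* fq_skb_free_func's job */
		} else {
			head = &(*head)->next;
		}
	}
}

static bool owned_by_vif(const struct pkt *p, void *vif)
{
	return p->vif == vif;		/* mirrors fq_vlan_filter_func */
}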
capability. + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + return bw; + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 0d722ea98a1b..b58722d9de37 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -464,7 +464,7 @@ static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, pos += IEEE80211_CCMP_HDR_LEN; ccmp_special_blocks(skb, pn, b_0, aad); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, - skb_put(skb, mic_len), mic_len); + skb_put(skb, mic_len)); } @@ -543,7 +543,7 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, key->u.ccmp.tfm, b_0, aad, skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, data_len, - skb->data + skb->len - mic_len, mic_len)) + skb->data + skb->len - mic_len)) return RX_DROP_UNUSABLE; } diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 5c467ef97311..801ea9098387 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,6 +24,7 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" + depends on NET_IP_TUNNEL || NET_IP_TUNNEL=n ---help--- Add support for forwarding of mpls packets. diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c5b9ce41d66f..8ca9915befc8 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -16,6 +16,7 @@ #include <net/arp.h> #include <net/ip_fib.h> #include <net/netevent.h> +#include <net/ip_tunnels.h> #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -39,6 +40,36 @@ static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL) +static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct mpls_shim_hdr); +} + +static const struct ip_tunnel_encap_ops mpls_iptun_ops = { + .encap_hlen = ipgre_mpls_encap_hlen, +}; + +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ + ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} +#else +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return 0; +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ +} +#endif + static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); @@ -2485,6 +2516,10 @@ static int __init mpls_init(void) 0); rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, 0); + err = ipgre_tunnel_encap_add_mpls_ops(); + if (err) + pr_err("Can't add mpls over gre tunnel ops\n"); + err = 0; out: return err; @@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void) dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); + ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 8ad2b52a0b32..5ca18f07683b 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -37,11 +37,11 @@ #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void -mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; - setup_timer(&map->gc, gc, (unsigned 
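Two things cooperate in the MPLS-over-GRE hunk: the Kconfig line "depends on NET_IP_TUNNEL || NET_IP_TUNNEL=n" keeps a built-in MPLS_ROUTING from linking against a modular tunnel core, and the IS_ENABLED() block gives callers one unconditional API with no #ifdefs at the call sites. The stub idiom, with a hypothetical feature name and placeholder registration calls:

#if IS_ENABLED(CONFIG_FOO)
static int foo_encap_ops_add(void)
{
	return register_the_real_ops();	/* only built when FOO is y or m */
}

static void foo_encap_ops_del(void)
{
	unregister_the_real_ops();
}
#else
static int foo_encap_ops_add(void) { return 0; }	/* quiet no-ops */
static void foo_encap_ops_del(void) { }
#endif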
long)set); + timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } @@ -272,10 +272,10 @@ out: } static void -mtype_gc(unsigned long ul_set) +mtype_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct mtype *map = set->data; + struct mtype *map = from_timer(map, t, gc); + struct ip_set *set = map->set; void *x; u32 id; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 4783efff0bde..d8975a0b4282 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -48,6 +48,7 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -232,6 +233,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->netmask = netmask; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index 9a065f672d3a..4c279fbd2d5d 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -52,6 +52,7 @@ struct bitmap_ipmac { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; @@ -307,6 +308,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index 7f0c733358a4..7f9bbd7c98b5 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -40,6 +40,7 @@ struct bitmap_port { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -214,6 +215,7 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->last_port = last_port; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_UNSPEC; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e495b5e484b1..cf84f7b37cd9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; - if (from->ref_netlink || to->ref_netlink) + write_lock_bh(&ip_set_ref_lock); + + if (from->ref_netlink || to->ref_netlink) { + write_unlock_bh(&ip_set_ref_lock); return -EBUSY; + } strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); - write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; @@ -2072,25 +2075,28 @@ static struct pernet_operations ip_set_net_ops = { static int __init ip_set_init(void) { - int ret = 
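The swap fix above is the check-then-act rule: the ref_netlink busy test and the actual swap must sit in one critical section, or another CPU can take a netlink reference between the check and the rename. Schematically, with the types and name juggling elided:

write_lock_bh(&ip_set_ref_lock);
if (from->ref_netlink || to->ref_netlink) {
	write_unlock_bh(&ip_set_ref_lock);
	return -EBUSY;		/* decision and action stay atomic */
}
/* ... swap names, references and slots ... */
write_unlock_bh(&ip_set_ref_lock);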
nfnetlink_subsys_register(&ip_set_netlink_subsys); + int ret = register_pernet_subsys(&ip_set_net_ops); + + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); + return ret; + } + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } + ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } - ret = register_pernet_subsys(&ip_set_net_ops); - if (ret) { - pr_err("ip_set: cannot register pernet_subsys.\n"); - nf_unregister_sockopt(&so_set); - nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - return ret; - } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2098,9 +2104,10 @@ ip_set_init(void) static void __exit ip_set_fini(void) { - unregister_pernet_subsys(&ip_set_net_ops); nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + + unregister_pernet_subsys(&ip_set_net_ops); pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 51063d9ed0f7..efffc8eabafe 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -280,6 +280,7 @@ htable_bits(u32 hashsize) struct htype { struct htable __rcu *table; /* the hash table */ struct timer_list gc; /* garbage collection when timeout enabled */ + struct ip_set *set; /* attached to this ip_set */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -429,11 +430,11 @@ mtype_destroy(struct ip_set *set) } static void -mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct htype *h = set->data; - setup_timer(&h->gc, gc, (unsigned long)set); + timer_setup(&h->gc, gc, 0); mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); pr_debug("gc initialized, run in every %u\n", IPSET_GC_PERIOD(set->timeout)); @@ -526,10 +527,10 @@ mtype_expire(struct ip_set *set, struct htype *h) } static void -mtype_gc(unsigned long ul_set) +mtype_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct htype *h = set->data; + struct htype *h = from_timer(h, t, gc); + struct ip_set *set = h->set; pr_debug("called\n"); spin_lock_bh(&set->lock); @@ -1314,6 +1315,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, t->htable_bits = hbits; RCU_INIT_POINTER(h->table, t); + h->set = set; set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 20bfbd315f61..613eb212cb48 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ip &= ip_set_hostmask(h->netmask); + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; - if (adt == IPSET_TEST) { - e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); - } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { @@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 
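The ip_set_init() reshuffle restores the usual module init discipline: take resources in dependency order, unwind in exact reverse on any failure, and mirror that reverse order in the exit path. A minimal self-contained sketch; the step functions are placeholders for the real registrations:

#include <linux/module.h>

static int step_a(void) { return 0; }	/* e.g. register_pernet_subsys() */
static int step_b(void) { return 0; }	/* e.g. nfnetlink_subsys_register() */
static int step_c(void) { return 0; }	/* e.g. nf_register_sockopt() */
static void undo_a(void) { }
static void undo_b(void) { }
static void undo_c(void) { }

static int __init demo_init(void)
{
	int ret = step_a();

	if (ret)
		return ret;
	ret = step_b();
	if (ret)
		goto out_a;
	ret = step_c();
	if (ret)
		goto out_b;
	return 0;
out_b:
	undo_b();
out_a:
	undo_a();
	return ret;
}

static void __exit demo_exit(void)
{
	undo_c();			/* strict reverse of demo_init() */
	undo_b();
	undo_a();
}

module_init(demo_init);
module_exit(demo_exit);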
1 : 2 << (32 - h->netmask - 1); - if (retried) + if (retried) { ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip += hosts) { e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + } + for (; ip <= ip_to;) { ret = adtfn(set, &e, &ext, &ext, flags); - if (ret && !ip_set_eexist(ret, flags)) return ret; + ip += hosts; + e.ip = htonl(ip); + if (e.ip == 0) + return 0; + ret = 0; } return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index b64cf14e8352..f3ba8348cf9d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f438740e6c6a..ddb8039ec1d2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6215fb898c50..a7f4d7a85420 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5ab1b99a53c2..a2f19b9906e9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; @@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? 
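This loop rewrite, and the before()/after() removals in the sibling files, guard against 32-bit wraparound: when a range ends at 255.255.255.255, "ip <= ip_to" can never go false, and the old "!before(ip_to, ip)" condition spun forever once "ip += hosts" wrapped. Advancing first and treating a wrap to zero as end-of-range terminates cleanly. A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ip = 0xfffffffc, ip_to = 0xffffffff, hosts = 2;

	for (; ip <= ip_to;) {		/* always true when ip_to is ~0 */
		printf("add %08x\n", ip);
		ip += hosts;		/* may wrap past 0xffffffff */
		if (ip == 0)		/* wrapped: the range is done */
			break;
	}
	return 0;
}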
ntohl(h->next.ip2) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5d9e895452e7..1c67a1761e45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 44cf11939c91..d417074f1c1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index db614e13b193..7f9ae2e9645b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 54b64b6cd0cd..e6ef382febe4 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index aff846960ac4..8602f2595a1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) @@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = (retried && ip == ntohl(h->next.ip[0]) && p == ntohs(h->next.port)) ? 
ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index 178d4eba013b..c9b4e05ad940 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -44,6 +44,7 @@ struct set_adt_elem { struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ struct net *net; /* namespace */ struct list_head members; /* the set members */ }; @@ -571,10 +572,10 @@ static const struct ip_set_type_variant set_variant = { }; static void -list_set_gc(unsigned long ul_set) +list_set_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct list_set *map = set->data; + struct list_set *map = from_timer(map, t, gc); + struct ip_set *set = map->set; spin_lock_bh(&set->lock); set_cleanup_entries(set); @@ -585,11 +586,11 @@ list_set_gc(unsigned long ul_set) } static void -list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +list_set_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct list_set *map = set->data; - setup_timer(&map->gc, gc, (unsigned long)set); + timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } @@ -606,6 +607,7 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) map->size = size; map->net = net; + map->set = set; INIT_LIST_HEAD(&map->members); set->data = map; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 90d396814798..4527921b1c3a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, { struct sk_buff *new_skb = NULL; struct iphdr *old_iph = NULL; + __u8 old_dsfield; #ifdef CONFIG_IP_VS_IPV6 struct ipv6hdr *old_ipv6h = NULL; #endif @@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *payload_len = ntohs(old_ipv6h->payload_len) + sizeof(*old_ipv6h); - *dsfield = ipv6_get_dsfield(old_ipv6h); + old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; if (df) *df = 0; @@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, /* fix old IP header checksum */ ip_send_check(old_iph); - *dsfield = ipv4_get_dsfield(old_iph); + old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) *payload_len = ntohs(old_iph->tot_len); } + /* Implement full-functionality option for ECN encapsulation */ + *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); + return skb; error: kfree_skb(skb); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 929927171426..64e1ee091225 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + if (basechain->stats && nft_dump_stats(skb, basechain->stats)) goto nla_put_failure; } @@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain2)) - return PTR_ERR(chain2); + if 
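INET_ECN_encapsulate() (from <net/inet_ecn.h>) builds an outer TOS byte from an outer DSCP and an inner ECN codepoint; called with old_dsfield for both arguments, as here, it preserves the DSCP bits and copies the ECN bits through, except that CE is downgraded to ECT(0) so a congestion mark set before encapsulation is not replayed on the outer header. A schematic reimplementation with local constants standing in for the kernel's:

#define ECN_MASK	0x03	/* low two bits of the TOS byte */
#define ECN_ECT_0	0x02
#define ECN_CE		0x03

static unsigned char ecn_encapsulate(unsigned char outer, unsigned char inner)
{
	outer &= ~ECN_MASK;			/* keep outer DSCP */
	outer |= (inner & ECN_MASK) == ECN_CE ?
		 ECN_ECT_0 :			/* CE -> ECT(0) */
		 (inner & ECN_MASK);		/* else copy through */
	return outer;
}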
(!IS_ERR(chain2)) + return -EEXIST; } if (nla[NFTA_CHAIN_COUNTERS]) { @@ -2741,8 +2741,10 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; - if (!strcmp(set->name, i->name)) + if (!strcmp(set->name, i->name)) { + kfree(set->name); return -ENFILE; + } } return 0; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c83a3b5e1c6c..d8571f414208 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); - strlcpy(info->name, compat_tmp.name, sizeof(info->name)); + memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; user += sizeof(compat_tmp); } else @@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(info, user, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - info->name[sizeof(info->name) - 1] = '\0'; user += sizeof(*info); } + info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 38986a95216c..29123934887b 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,6 +8,7 @@ */ #include <linux/module.h> +#include <linux/syscalls.h> #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) return 0; } +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + mm_segment_t oldfs = get_fs(); + int retval, fd; + + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); + if (fd < 0) + return fd; + + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; +} + static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) return __bpf_mt_check_bytecode(info->bpf_program, info->bpf_program_num_elem, &info->filter); - else if (info->mode == XT_BPF_MODE_FD_PINNED || - info->mode == XT_BPF_MODE_FD_ELF) + else if (info->mode == XT_BPF_MODE_FD_ELF) return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); else return -EINVAL; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index e75ef39669c5..575d2153e3b8 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) @@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 327807731b44..336d9c6dcad9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2266,14 +2266,18 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = 
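The strlcpy() to memcpy() change is about the source, not the destination: strlcpy() keeps reading until it finds a NUL, and compat_tmp.name was copied in from user space with no terminator guaranteed, so it could read past the end of the buffer. Copying a fixed sizeof() - 1 bytes and terminating once, after both branches, bounds the read. The shape of the fix, with a stand-in length:

#include <string.h>

#define NAME_LEN 32	/* stands in for sizeof(info->name) */

/* src is a fixed-size, possibly unterminated buffer of >= NAME_LEN - 1 */
static void copy_table_name(char name[NAME_LEN], const char *src)
{
	memcpy(name, src, NAME_LEN - 1);	/* bounded, no NUL scan */
	name[NAME_LEN - 1] = '\0';		/* terminate exactly once */
}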
control->min_dump_alloc; cb->skb = skb; + if (cb->start) { + ret = cb->start(cb); + if (ret) + goto error_unlock; + } + nlk->cb_running = true; mutex_unlock(nlk->cb_mutex); - if (cb->start) - cb->start(cb); - ret = netlink_dump(sk); + sock_put(sk); if (ret) @@ -2332,16 +2336,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { - struct sock *sk; - - sk = netlink_lookup(sock_net(in_skb->sk), - in_skb->sk->sk_protocol, - NETLINK_CB(in_skb).portid); - if (sk) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - sock_put(sk); - } + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + NETLINK_CB(in_skb).sk->sk_error_report(NETLINK_CB(in_skb).sk); return; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index ebf16f7f9089..2dec3583c97d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -241,9 +241,9 @@ void nr_destroy_socket(struct sock *); /* * Handler for deferred kills. */ -static void nr_destroy_timer(unsigned long data) +static void nr_destroy_timer(struct timer_list *t) { - struct sock *sk=(struct sock *)data; + struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); sock_hold(sk); nr_destroy_socket(sk); @@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - sk->sk_timer.function = nr_destroy_timer; + sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer; sk->sk_timer.expires = jiffies + 2 * HZ; add_timer(&sk->sk_timer); } else diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 94d05806a9a2..43569aea0f5e 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -29,24 +29,23 @@ #include <linux/interrupt.h> #include <net/netrom.h> -static void nr_heartbeat_expiry(unsigned long); -static void nr_t1timer_expiry(unsigned long); -static void nr_t2timer_expiry(unsigned long); -static void nr_t4timer_expiry(unsigned long); -static void nr_idletimer_expiry(unsigned long); +static void nr_heartbeat_expiry(struct timer_list *); +static void nr_t1timer_expiry(struct timer_list *); +static void nr_t2timer_expiry(struct timer_list *); +static void nr_t4timer_expiry(struct timer_list *); +static void nr_idletimer_expiry(struct timer_list *); void nr_init_timers(struct sock *sk) { struct nr_sock *nr = nr_sk(sk); - setup_timer(&nr->t1timer, nr_t1timer_expiry, (unsigned long)sk); - setup_timer(&nr->t2timer, nr_t2timer_expiry, (unsigned long)sk); - setup_timer(&nr->t4timer, nr_t4timer_expiry, (unsigned long)sk); - setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); + timer_setup(&nr->t1timer, nr_t1timer_expiry, 0); + timer_setup(&nr->t2timer, nr_t2timer_expiry, 0); + timer_setup(&nr->t4timer, nr_t4timer_expiry, 0); + timer_setup(&nr->idletimer, nr_idletimer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; - sk->sk_timer.function = &nr_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry; } void nr_start_t1timer(struct sock *sk) @@ -113,9 +112,9 @@ int nr_t1timer_running(struct sock *sk) return timer_pending(&nr_sk(sk)->t1timer); } -static void nr_heartbeat_expiry(unsigned long param) +static void nr_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); struct nr_sock *nr = nr_sk(sk); bh_lock_sock(sk); @@ -152,10 +151,10 @@ static void nr_heartbeat_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t2timer_expiry(unsigned long 
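The __netlink_dump_start() reorder follows "initialize fully, then publish": ->start() now runs, and may fail, before cb_running is set and the socket becomes visible as dumping; previously a failing ->start() left a half-armed dump behind. Reduced to its shape:

ret = cb->start ? cb->start(cb) : 0;
if (ret)
	goto error_unlock;	/* nothing was published, clean abort */

nlk->cb_running = true;		/* publish only after setup succeeded */
mutex_unlock(nlk->cb_mutex);
ret = netlink_dump(sk);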
param) +static void nr_t2timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, t2timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); if (nr->condition & NR_COND_ACK_PENDING) { @@ -165,19 +164,20 @@ static void nr_t2timer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t4timer_expiry(unsigned long param) +static void nr_t4timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct nr_sock *nr = from_timer(nr, t, t4timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; bh_unlock_sock(sk); } -static void nr_idletimer_expiry(unsigned long param) +static void nr_idletimer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, idletimer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); @@ -202,10 +202,10 @@ static void nr_idletimer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void nr_t1timer_expiry(unsigned long param) +static void nr_t1timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct nr_sock *nr = nr_sk(sk); + struct nr_sock *nr = from_timer(nr, t, t1timer); + struct sock *sk = &nr->sock; bh_lock_sock(sk); switch (nr->state) { diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a54a556fcdb5..a551232daf61 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err == -EINPROGRESS ? 0 : err; break; + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + case OVS_ACTION_ATTR_PUSH_ETH: err = push_eth(skb, key, nla_data(a)); break; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d558e882ca0c..fe861e2f0deb 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + if (skb_nfct(skb)) { + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key); + } + + return 0; +} + static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, const struct sw_flow_key *key, bool log) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..399dfdd2c4f9 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, return -ENOTSUPP; } +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..dc0d79092e74 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ 
#include <net/ndisc.h> #include <net/mpls.h> #include <net/vxlan.h> +#include <net/erspan.h> #include "flow_netlink.h" @@ -75,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) break; case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: @@ -319,7 +321,8 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } size_t ovs_key_attr_size(void) @@ -371,6 +374,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -593,6 +597,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + struct erspan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + opts.index = nla_get_be32(attr); + + /* Index has only 20-bit */ + if (ntohl(opts.index) & ~INDEX_MASK) { + OVS_NLERR(log, "ERSPAN index number %x too large.", + ntohl(opts.index)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +731,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +868,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + ((struct erspan_metadata *)tun_opts)->index)) + return -EMSGSIZE; } return 0; @@ -2195,6 +2243,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; @@ -2479,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, @@ -2620,6 +2671,9 @@ static int 
__ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_CT_CLEAR: + break; + case OVS_ACTION_ATTR_PUSH_ETH: /* Disallow pushing an Ethernet header if one * is already present */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 0389398fa4ab..2e5e7a41d8ef 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -108,7 +108,8 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), NULL, NULL); + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b0f221885dfd..3f5caa3fbd06 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2838,6 +2838,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); + bool has_vnet_hdr = false; int hlen, tlen, linear; int extra_len = 0; @@ -2881,6 +2882,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); if (err) goto out_unlock; + has_vnet_hdr = true; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { @@ -2939,7 +2941,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - if (po->has_vnet_hdr) { + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; @@ -3067,13 +3069,15 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, int ret = 0; bool unlisted = false; - if (po->fanout) - return -EINVAL; - lock_sock(sk); spin_lock(&po->bind_lock); rcu_read_lock(); + if (po->fanout) { + ret = -EINVAL; + goto out_unlock; + } + if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index f925753668a7..3b0ef691f5b1 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -35,11 +35,11 @@ #include <net/phonet/pn_dev.h> /* Transport protocol registration */ -static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; -static struct phonet_protocol *phonet_proto_get(unsigned int protocol) +static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { - struct phonet_protocol *pp; + const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; @@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol) return pp; } -static inline void phonet_proto_put(struct phonet_protocol *pp) +static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } @@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; struct pn_sock *pn; - struct phonet_protocol *pnp; + const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) @@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) return 1; } -struct header_ops phonet_header_ops = { +const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; @@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = { static DEFINE_MUTEX(proto_tab_lock); int 
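The has_vnet_hdr local in packet_snd() latches a flag that a concurrent setsockopt(PACKET_VNET_HDR) could flip mid-send: the header is parsed only if the flag was set at that moment, so the later virtio header fixup must key off the same observation, not a second read. The general shape, read a racy flag once and act on the copy:

bool has_vnet_hdr = false;

if (po->has_vnet_hdr) {			/* single racy read */
	err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
	if (err)
		goto out_unlock;
	has_vnet_hdr = true;		/* remember what we acted on */
}
/* ... build and fill the skb ... */
if (has_vnet_hdr)			/* consistent with the parse above */
	err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());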
__init_or_module phonet_proto_register(unsigned int protocol, - struct phonet_protocol *pp) + const struct phonet_protocol *pp) { int err = 0; @@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol, } EXPORT_SYMBOL(phonet_proto_register); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp) +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 5e710435ffa9..b44fb9018fb8 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -195,7 +195,7 @@ static struct proto pn_proto = { .name = "PHONET", }; -static struct phonet_protocol pn_dgram_proto = { +static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e81537991ddf..9fc76b19cd3c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1351,7 +1351,7 @@ static struct proto pep_proto = { .name = "PNPIPE", }; -static struct phonet_protocol pep_pn_proto = { +static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c2f5c13550c0..e458ece96d3d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -20,26 +20,15 @@ #include "qrtr.h" -#define QRTR_PROTO_VER 1 +#define QRTR_PROTO_VER_1 1 +#define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff -enum qrtr_pkt_type { - QRTR_TYPE_DATA = 1, - QRTR_TYPE_HELLO = 2, - QRTR_TYPE_BYE = 3, - QRTR_TYPE_NEW_SERVER = 4, - QRTR_TYPE_DEL_SERVER = 5, - QRTR_TYPE_DEL_CLIENT = 6, - QRTR_TYPE_RESUME_TX = 7, - QRTR_TYPE_EXIT = 8, - QRTR_TYPE_PING = 9, -}; - /** - * struct qrtr_hdr - (I|R)PCrouter packet header + * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node @@ -49,7 +38,7 @@ enum qrtr_pkt_type { * @dst_node_id: destination node * @dst_port_id: destination port */ -struct qrtr_hdr { +struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; @@ -60,9 +49,44 @@ struct qrtr_hdr { __le32 dst_port_id; } __packed; -#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) -#define QRTR_NODE_BCAST ((unsigned int)-1) -#define QRTR_PORT_CTRL ((unsigned int)-2) +/** + * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions + * @version: protocol version + * @type: packet type; one of QRTR_TYPE_* + * @flags: bitmask of QRTR_FLAGS_* + * @optlen: length of optional header data + * @size: length of packet, excluding this header and optlen + * @src_node_id: source node + * @src_port_id: source port + * @dst_node_id: destination node + * @dst_port_id: destination port + */ +struct qrtr_hdr_v2 { + u8 version; + u8 type; + u8 flags; + u8 optlen; + __le32 size; + __le16 src_node_id; + __le16 src_port_id; + __le16 dst_node_id; + __le16 dst_port_id; +}; + +#define QRTR_FLAGS_CONFIRM_RX BIT(0) + +struct qrtr_cb { + u32 src_node; + u32 src_port; + u32 dst_node; + u32 dst_port; + + u8 type; + u8 confirm_rx; +}; + +#define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ + sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ @@ -111,8 +135,12 @@ struct qrtr_node { struct list_head item; }; -static int qrtr_local_enqueue(struct qrtr_node *node, 
struct sk_buff *skb); -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb); +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); /* Release node resources and free the node. * @@ -150,10 +178,27 @@ static void qrtr_node_release(struct qrtr_node *node) } /* Pass an outgoing packet socket buffer to the endpoint driver. */ -static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { + struct qrtr_hdr_v1 *hdr; + size_t len = skb->len; int rc = -ENODEV; + hdr = skb_push(skb, sizeof(*hdr)); + hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); + hdr->type = cpu_to_le32(type); + hdr->src_node_id = cpu_to_le32(from->sq_node); + hdr->src_port_id = cpu_to_le32(from->sq_port); + hdr->dst_node_id = cpu_to_le32(to->sq_node); + hdr->dst_port_id = cpu_to_le32(to->sq_port); + + hdr->size = cpu_to_le32(len); + hdr->confirm_rx = 0; + + skb_put_padto(skb, ALIGN(len, 4)); + mutex_lock(&node->ep_lock); if (node->ep) rc = node->ep->xmit(node->ep, skb); @@ -207,125 +252,103 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; - const struct qrtr_hdr *phdr = data; + const struct qrtr_hdr_v1 *v1; + const struct qrtr_hdr_v2 *v2; struct sk_buff *skb; - unsigned int psize; + struct qrtr_cb *cb; unsigned int size; - unsigned int type; unsigned int ver; - unsigned int dst; - - if (len < QRTR_HDR_SIZE || len & 3) - return -EINVAL; - - ver = le32_to_cpu(phdr->version); - size = le32_to_cpu(phdr->size); - type = le32_to_cpu(phdr->type); - dst = le32_to_cpu(phdr->dst_port_id); - - psize = (size + 3) & ~3; + size_t hdrlen; - if (ver != QRTR_PROTO_VER) - return -EINVAL; - - if (len != psize + QRTR_HDR_SIZE) - return -EINVAL; - - if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) + if (len & 3) return -EINVAL; skb = netdev_alloc_skb(NULL, len); if (!skb) return -ENOMEM; - skb_reset_transport_header(skb); - skb_put_data(skb, data, len); - - skb_queue_tail(&node->rx_queue, skb); - schedule_work(&node->work); - - return 0; -} -EXPORT_SYMBOL_GPL(qrtr_endpoint_post); + cb = (struct qrtr_cb *)skb->cb; -static struct sk_buff *qrtr_alloc_ctrl_packet(u32 type, size_t pkt_len, - u32 src_node, u32 dst_node) -{ - struct qrtr_hdr *hdr; - struct sk_buff *skb; - - skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); - if (!skb) - return NULL; - skb_reset_transport_header(skb); + /* Version field in v1 is little endian, so this works for both cases */ + ver = *(u8*)data; - hdr = skb_put(skb, QRTR_HDR_SIZE); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->type = cpu_to_le32(type); - hdr->src_node_id = cpu_to_le32(src_node); - hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(pkt_len); - hdr->dst_node_id = cpu_to_le32(dst_node); - hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); + switch (ver) { + case QRTR_PROTO_VER_1: + v1 = data; + hdrlen = sizeof(*v1); - return skb; -} + cb->type = le32_to_cpu(v1->type); + cb->src_node = le32_to_cpu(v1->src_node_id); + cb->src_port = le32_to_cpu(v1->src_port_id); + cb->confirm_rx = !!v1->confirm_rx; + 
cb->dst_node = le32_to_cpu(v1->dst_node_id); + cb->dst_port = le32_to_cpu(v1->dst_port_id); -/* Allocate and construct a resume-tx packet. */ -static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, - u32 dst_node, u32 port) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; + size = le32_to_cpu(v1->size); + break; + case QRTR_PROTO_VER_2: + v2 = data; + hdrlen = sizeof(*v2) + v2->optlen; + + cb->type = v2->type; + cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); + cb->src_node = le16_to_cpu(v2->src_node_id); + cb->src_port = le16_to_cpu(v2->src_port_id); + cb->dst_node = le16_to_cpu(v2->dst_node_id); + cb->dst_port = le16_to_cpu(v2->dst_port_id); + + if (cb->src_port == (u16)QRTR_PORT_CTRL) + cb->src_port = QRTR_PORT_CTRL; + if (cb->dst_port == (u16)QRTR_PORT_CTRL) + cb->dst_port = QRTR_PORT_CTRL; + + size = le32_to_cpu(v2->size); + break; + default: + pr_err("qrtr: Invalid version %d\n", ver); + goto err; + } - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_RESUME_TX, pkt_len, - src_node, dst_node); - if (!skb) - return NULL; + if (len != ALIGN(size, 4) + hdrlen) + goto err; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); - buf[1] = cpu_to_le32(src_node); - buf[2] = cpu_to_le32(port); + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + goto err; - return skb; -} + skb_put_data(skb, data + hdrlen, size); -/* Allocate and construct a BYE message to signal remote termination */ -static struct sk_buff *qrtr_alloc_local_bye(u32 src_node) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; + skb_queue_tail(&node->rx_queue, skb); + schedule_work(&node->work); - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_BYE, pkt_len, - src_node, qrtr_local_nid); - if (!skb) - return NULL; + return 0; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_BYE); +err: + kfree_skb(skb); + return -EINVAL; - return skb; } +EXPORT_SYMBOL_GPL(qrtr_endpoint_post); -static struct sk_buff *qrtr_alloc_del_client(struct sockaddr_qrtr *sq) +/** + * qrtr_alloc_ctrl_packet() - allocate control packet skb + * @pkt: reference to qrtr_ctrl_pkt pointer + * + * Returns newly allocated sk_buff, or NULL on failure + * + * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and + * on success returns a reference to the control packet in @pkt. 
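 * A typical caller pattern, mirroring qrtr_endpoint_unregister() further
 * down in this patch (illustrative sketch only, not an added hunk; "src"
 * and "dst" stand for filled-in struct sockaddr_qrtr values):
 *
 *	struct qrtr_ctrl_pkt *pkt;
 *	struct sk_buff *skb;
 *
 *	skb = qrtr_alloc_ctrl_packet(&pkt);
 *	if (skb) {
 *		pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE);
 *		qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst);
 *	}
 *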
+ */ +static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) { - const int pkt_len = 20; + const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; - __le32 *buf; - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_DEL_CLIENT, pkt_len, - sq->sq_node, QRTR_NODE_BCAST); + skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL); if (!skb) return NULL; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); - buf[1] = cpu_to_le32(sq->sq_node); - buf[2] = cpu_to_le32(sq->sq_port); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); + *pkt = skb_put_zero(skb, pkt_len); return skb; } @@ -340,24 +363,26 @@ static void qrtr_port_put(struct qrtr_sock *ipc); static void qrtr_node_rx_work(struct work_struct *work) { struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct qrtr_ctrl_pkt *pkt; + struct sockaddr_qrtr dst; + struct sockaddr_qrtr src; struct sk_buff *skb; while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - const struct qrtr_hdr *phdr; - u32 dst_node, dst_port; struct qrtr_sock *ipc; - u32 src_node; + struct qrtr_cb *cb; int confirm; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - src_node = le32_to_cpu(phdr->src_node_id); - dst_node = le32_to_cpu(phdr->dst_node_id); - dst_port = le32_to_cpu(phdr->dst_port_id); - confirm = !!phdr->confirm_rx; + cb = (struct qrtr_cb *)skb->cb; + src.sq_node = cb->src_node; + src.sq_port = cb->src_port; + dst.sq_node = cb->dst_node; + dst.sq_port = cb->dst_port; + confirm = !!cb->confirm_rx; - qrtr_node_assign(node, src_node); + qrtr_node_assign(node, cb->src_node); - ipc = qrtr_port_lookup(dst_port); + ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) { kfree_skb(skb); } else { @@ -368,10 +393,16 @@ static void qrtr_node_rx_work(struct work_struct *work) } if (confirm) { - skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); + skb = qrtr_alloc_ctrl_packet(&pkt); if (!skb) break; - if (qrtr_node_enqueue(node, skb)) + + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(dst.sq_node); + pkt->client.port = cpu_to_le32(dst.sq_port); + + if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, + &dst, &src)) break; } } @@ -421,6 +452,9 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_register); void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; + struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; + struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; mutex_lock(&node->ep_lock); @@ -428,9 +462,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ - skb = qrtr_alloc_local_bye(node->nid); - if (skb) - qrtr_local_enqueue(NULL, skb); + skb = qrtr_alloc_ctrl_packet(&pkt); + if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); + qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + } qrtr_node_release(node); ep->node = NULL; @@ -466,13 +502,24 @@ static void qrtr_port_put(struct qrtr_sock *ipc) /* Remove port assignment. 
*/ static void qrtr_port_remove(struct qrtr_sock *ipc) { + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; + struct sockaddr_qrtr to; - skb = qrtr_alloc_del_client(&ipc->us); + to.sq_family = AF_QIPCRTR; + to.sq_node = QRTR_NODE_BCAST; + to.sq_port = QRTR_PORT_CTRL; + + skb = qrtr_alloc_ctrl_packet(&pkt); if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); + pkt->client.node = cpu_to_le32(ipc->us.sq_node); + pkt->client.port = cpu_to_le32(ipc->us.sq_port); + skb_set_owner_w(skb, &ipc->sk); - qrtr_bcast_enqueue(NULL, skb); + qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, + &to); } if (port == QRTR_PORT_CTRL) @@ -541,7 +588,7 @@ static void qrtr_reset_ports(void) sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; - wake_up_interruptible(sk_sleep(&ipc->sk)); + ipc->sk.sk_error_report(&ipc->sk); sock_put(&ipc->sk); } mutex_unlock(&qrtr_port_lock); @@ -620,19 +667,23 @@ static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) } /* Queue packet to local peer socket. */ -static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { - const struct qrtr_hdr *phdr; struct qrtr_sock *ipc; + struct qrtr_cb *cb; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - - ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id)); + ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ kfree_skb(skb); return -ENODEV; } + cb = (struct qrtr_cb *)skb->cb; + cb->src_node = from->sq_node; + cb->src_port = from->sq_port; + if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); @@ -645,7 +696,9 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) } /* Queue packet for broadcast. 
*/ -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { struct sk_buff *skbn; @@ -655,11 +708,11 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) if (!skbn) break; skb_set_owner_w(skbn, skb->sk); - qrtr_node_enqueue(node, skbn); + qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); - qrtr_local_enqueue(node, skb); + qrtr_local_enqueue(node, skb, type, from, to); return 0; } @@ -667,13 +720,14 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *); + int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, + struct sockaddr_qrtr *, struct sockaddr_qrtr *); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; - struct qrtr_hdr *hdr; struct sk_buff *skb; size_t plen; + u32 type = QRTR_TYPE_DATA; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) @@ -722,37 +776,19 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } plen = (len + 3) & ~3; - skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE, + skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) goto out_node; - skb_reset_transport_header(skb); - skb_put(skb, len + QRTR_HDR_SIZE); - - hdr = (struct qrtr_hdr *)skb_transport_header(skb); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->src_node_id = cpu_to_le32(ipc->us.sq_node); - hdr->src_port_id = cpu_to_le32(ipc->us.sq_port); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(len); - hdr->dst_node_id = cpu_to_le32(addr->sq_node); - hdr->dst_port_id = cpu_to_le32(addr->sq_port); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); - rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE, - &msg->msg_iter, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } - if (plen != len) { - rc = skb_pad(skb, plen - len); - if (rc) - goto out_node; - skb_put(skb, plen - len); - } - if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; @@ -761,12 +797,11 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } /* control messages already require the type as 'command' */ - skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4); - } else { - hdr->type = cpu_to_le32(QRTR_TYPE_DATA); + skb_copy_bits(skb, 0, &type, 4); + type = le32_to_cpu(type); } - rc = enqueue_fn(node, skb); + rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; @@ -781,9 +816,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - const struct qrtr_hdr *phdr; struct sock *sk = sock->sk; struct sk_buff *skb; + struct qrtr_cb *cb; int copied, rc; lock_sock(sk); @@ -800,22 +835,22 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, return rc; } - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - copied = le32_to_cpu(phdr->size); + copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = 
copied; if (addr) { + cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; - addr->sq_node = le32_to_cpu(phdr->src_node_id); - addr->sq_port = le32_to_cpu(phdr->src_port_id); + addr->sq_node = cb->src_node; + addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } @@ -908,7 +943,7 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) - len = skb->len - QRTR_HDR_SIZE; + len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: diff --git a/net/rds/ib.c b/net/rds/ib.c index a0954ace3774..36dd2099048a 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -126,6 +126,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; + bool has_fr, has_fmr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) @@ -143,11 +144,11 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr); + has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? diff --git a/net/rds/ib.h b/net/rds/ib.h index bf4822407567..6ea6a27891b0 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -215,8 +215,6 @@ struct rds_ib_device { struct list_head conn_list; struct ib_device *dev; struct ib_pd *pd; - bool has_fmr; - bool has_fr; bool use_fastreg; unsigned int max_mrs; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 9a3c54e659e9..e678699268a2 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -601,11 +601,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; - pool->max_items = RDS_MR_1M_POOL_SIZE; + pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; - pool->max_items = RDS_MR_8K_POOL_SIZE; + pool->max_items = rds_ibdev->max_8k_mrs; } pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4a9729257023..6a5c4992cf61 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -318,9 +318,11 @@ void rose_destroy_socket(struct sock *); /* * Handler for deferred kills. 
*/ -static void rose_destroy_timer(unsigned long data) +static void rose_destroy_timer(struct timer_list *t) { - rose_destroy_socket((struct sock *)data); + struct sock *sk = from_timer(sk, t, sk_timer); + + rose_destroy_socket(sk); } /* @@ -353,8 +355,7 @@ void rose_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ - setup_timer(&sk->sk_timer, rose_destroy_timer, - (unsigned long)sk); + timer_setup(&sk->sk_timer, rose_destroy_timer, 0); sk->sk_timer.expires = jiffies + 10 * HZ; add_timer(&sk->sk_timer); } else @@ -538,8 +539,8 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rose_proto_ops; sk->sk_protocol = protocol; - init_timer(&rose->timer); - init_timer(&rose->idletimer); + timer_setup(&rose->timer, NULL, 0); + timer_setup(&rose->idletimer, NULL, 0); rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); @@ -582,8 +583,8 @@ static struct sock *rose_make_new(struct sock *osk) sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); - init_timer(&rose->timer); - init_timer(&rose->idletimer); + timer_setup(&rose->timer, NULL, 0); + timer_setup(&rose->idletimer, NULL, 0); orose = rose_sk(osk); rose->t1 = orose->t1; diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index c76638cc2cd5..cda4c6678ef1 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -27,8 +27,8 @@ #include <linux/interrupt.h> #include <net/rose.h> -static void rose_ftimer_expiry(unsigned long); -static void rose_t0timer_expiry(unsigned long); +static void rose_ftimer_expiry(struct timer_list *); +static void rose_t0timer_expiry(struct timer_list *); static void rose_transmit_restart_confirmation(struct rose_neigh *neigh); static void rose_transmit_restart_request(struct rose_neigh *neigh); @@ -37,8 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh) { del_timer(&neigh->ftimer); - neigh->ftimer.data = (unsigned long)neigh; - neigh->ftimer.function = &rose_ftimer_expiry; + neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry; neigh->ftimer.expires = jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout); @@ -49,8 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh) { del_timer(&neigh->t0timer); - neigh->t0timer.data = (unsigned long)neigh; - neigh->t0timer.function = &rose_t0timer_expiry; + neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry; neigh->t0timer.expires = jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout); @@ -77,13 +75,13 @@ static int rose_t0timer_running(struct rose_neigh *neigh) return timer_pending(&neigh->t0timer); } -static void rose_ftimer_expiry(unsigned long param) +static void rose_ftimer_expiry(struct timer_list *t) { } -static void rose_t0timer_expiry(unsigned long param) +static void rose_t0timer_expiry(struct timer_list *t) { - struct rose_neigh *neigh = (struct rose_neigh *)param; + struct rose_neigh *neigh = from_timer(neigh, t, t0timer); rose_transmit_restart_request(neigh); diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 344456206b70..7af4f99c4a93 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -19,12 +19,13 @@ static struct sk_buff_head loopback_queue; static struct timer_list loopback_timer; static void rose_set_loopback_timer(void); +static void rose_loopback_timer(struct timer_list *unused); void rose_loopback_init(void) { skb_queue_head_init(&loopback_queue); - init_timer(&loopback_timer); + 
timer_setup(&loopback_timer, rose_loopback_timer, 0); } static int rose_loopback_running(void) @@ -50,20 +51,16 @@ int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) return 1; } -static void rose_loopback_timer(unsigned long); static void rose_set_loopback_timer(void) { del_timer(&loopback_timer); - loopback_timer.data = 0; - loopback_timer.function = &rose_loopback_timer; loopback_timer.expires = jiffies + 10; - add_timer(&loopback_timer); } -static void rose_loopback_timer(unsigned long param) +static void rose_loopback_timer(struct timer_list *unused) { struct sk_buff *skb; struct net_device *dev; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 452bbb38d943..65921cd10323 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -104,8 +104,8 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, skb_queue_head_init(&rose_neigh->queue); - init_timer(&rose_neigh->ftimer); - init_timer(&rose_neigh->t0timer); + timer_setup(&rose_neigh->ftimer, NULL, 0); + timer_setup(&rose_neigh->t0timer, NULL, 0); if (rose_route->ndigis != 0) { rose_neigh->digipeat = @@ -390,8 +390,8 @@ void rose_add_loopback_neigh(void) skb_queue_head_init(&sn->queue); - init_timer(&sn->ftimer); - init_timer(&sn->t0timer); + timer_setup(&sn->ftimer, NULL, 0); + timer_setup(&sn->t0timer, NULL, 0); spin_lock_bh(&rose_neigh_list_lock); sn->next = rose_neigh_list; diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index bc5469d6d9cb..ea613b2a9735 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -28,16 +28,15 @@ #include <linux/interrupt.h> #include <net/rose.h> -static void rose_heartbeat_expiry(unsigned long); -static void rose_timer_expiry(unsigned long); -static void rose_idletimer_expiry(unsigned long); +static void rose_heartbeat_expiry(struct timer_list *t); +static void rose_timer_expiry(struct timer_list *); +static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.data = (unsigned long)sk; - sk->sk_timer.function = &rose_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; add_timer(&sk->sk_timer); @@ -49,8 +48,7 @@ void rose_start_t1timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; add_timer(&rose->timer); @@ -62,8 +60,7 @@ void rose_start_t2timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; add_timer(&rose->timer); @@ -75,8 +72,7 @@ void rose_start_t3timer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; add_timer(&rose->timer); @@ -88,8 +84,7 @@ void rose_start_hbtimer(struct sock *sk) del_timer(&rose->timer); - rose->timer.data = (unsigned long)sk; - rose->timer.function = &rose_timer_expiry; + rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; add_timer(&rose->timer); @@ -102,8 +97,7 @@ void rose_start_idletimer(struct sock *sk) del_timer(&rose->idletimer); if (rose->idle > 
0) { - rose->idletimer.data = (unsigned long)sk; - rose->idletimer.function = &rose_idletimer_expiry; + rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; add_timer(&rose->idletimer); @@ -125,9 +119,9 @@ void rose_stop_idletimer(struct sock *sk) del_timer(&rose_sk(sk)->idletimer); } -static void rose_heartbeat_expiry(unsigned long param) +static void rose_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); @@ -163,10 +157,10 @@ static void rose_heartbeat_expiry(unsigned long param) bh_unlock_sock(sk); } -static void rose_timer_expiry(unsigned long param) +static void rose_timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; - struct rose_sock *rose = rose_sk(sk); + struct rose_sock *rose = from_timer(rose, t, timer); + struct sock *sk = &rose->sock; bh_lock_sock(sk); switch (rose->state) { @@ -192,9 +186,10 @@ static void rose_timer_expiry(unsigned long param) bh_unlock_sock(sk); } -static void rose_idletimer_expiry(unsigned long param) +static void rose_idletimer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct rose_sock *rose = from_timer(rose, t, idletimer); + struct sock *sk = &rose->sock; bh_lock_sock(sk); rose_clear_queues(sk); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da6fa82c98a8..ac97db92ab68 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,6 +21,8 @@ #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/rhashtable.h> +#include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> @@ -1249,8 +1251,226 @@ out_module_put: return skb->len; } +struct tcf_action_net { + struct rhashtable egdev_ht; +}; + +static unsigned int tcf_action_net_id; + +struct tcf_action_egdev_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_priv; +}; + +struct tcf_action_egdev { + struct rhash_head ht_node; + const struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params tcf_action_egdev_ht_params = { + .key_offset = offsetof(struct tcf_action_egdev, dev), + .head_offset = offsetof(struct tcf_action_egdev, ht_node), + .key_len = sizeof(const struct net_device *), +}; + +static struct tcf_action_egdev * +tcf_action_egdev_lookup(const struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_lookup_fast(&tan->egdev_ht, &dev, + tcf_action_egdev_ht_params); +} + +static struct tcf_action_egdev * +tcf_action_egdev_get(const struct net_device *dev) +{ + struct tcf_action_egdev *egdev; + struct tcf_action_net *tan; + + egdev = tcf_action_egdev_lookup(dev); + if (egdev) + goto inc_ref; + + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); + if (!egdev) + return NULL; + INIT_LIST_HEAD(&egdev->cb_list); + tan = net_generic(dev_net(dev), tcf_action_net_id); + rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + +inc_ref: + egdev->refcnt++; + return egdev; +} + +static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) +{ + struct tcf_action_net *tan; + + if (--egdev->refcnt) + return; + tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); + rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + kfree(egdev); +} 
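/* A minimal sketch (not part of this patch) of how an offload driver could
 * consume the egdev callback API added in this hunk. "foo_setup_tc_cb",
 * "struct foo_priv" and "foo_flower_replace" are hypothetical names used
 * only for illustration; the register/unregister entry points and the
 * tc_setup_cb_t signature are the ones introduced above.
 */
#include <linux/netdevice.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>

struct foo_priv;

static int foo_flower_replace(struct foo_priv *fp, void *cls_flower);

static int foo_setup_tc_cb(enum tc_setup_type type, void *type_data,
			   void *cb_priv)
{
	struct foo_priv *fp = cb_priv;

	switch (type) {
	case TC_SETUP_CLSFLOWER:
		return foo_flower_replace(fp, type_data);
	default:
		/* a non-zero return is counted as a failure by
		 * tcf_action_egdev_cb_call(), and aborts the walk
		 * when err_stop is set
		 */
		return -EOPNOTSUPP;
	}
}

static int foo_bind_egdev(struct foo_priv *fp, struct net_device *egress_dev)
{
	/* takes the RTNL lock internally, as shown above */
	return tc_setup_cb_egdev_register(egress_dev, foo_setup_tc_cb, fp);
}

static void foo_unbind_egdev(struct foo_priv *fp,
			     struct net_device *egress_dev)
{
	tc_setup_cb_egdev_unregister(egress_dev, foo_setup_tc_cb, fp);
}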
+ +static struct tcf_action_egdev_cb * +tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) + if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) + return egdev_cb; + return NULL; +} + +static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, + enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_action_egdev_cb *egdev_cb; + int ok_count = 0; + int err; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) { + err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + +static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(egdev_cb)) + return -EEXIST; + egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); + if (!egdev_cb) + return -ENOMEM; + egdev_cb->cb = cb; + egdev_cb->cb_priv = cb_priv; + list_add(&egdev_cb->list, &egdev->cb_list); + return 0; +} + +static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(!egdev_cb)) + return; + list_del(&egdev_cb->list); + kfree(egdev_cb); +} + +static int __tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); + int err; + + if (!egdev) + return -ENOMEM; + err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); + if (err) + goto err_cb_add; + return 0; + +err_cb_add: + tcf_action_egdev_put(egdev); + return err; +} +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + int err; + + rtnl_lock(); + err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); + +static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (WARN_ON(!egdev)) + return; + tcf_action_egdev_cb_del(egdev, cb, cb_priv); + tcf_action_egdev_put(egdev); +} +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + rtnl_lock(); + __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); + +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (!egdev) + return 0; + return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); + +static __net_init int tcf_action_net_init(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); +} + +static void __net_exit tcf_action_net_exit(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + rhashtable_destroy(&tan->egdev_ht); +} + +static struct pernet_operations tcf_action_net_ops = { + .init = tcf_action_net_init, + .exit = tcf_action_net_exit, + .id = &tcf_action_net_id, 
+ .size = sizeof(struct tcf_action_net), +}; + static int __init tc_action_init(void) { + int err; + + err = register_pernet_subsys(&tcf_action_net_ops); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c0c707eb2c96..5ef8ce8c83d4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8ccd35825b6b..3007cb1310ea 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -248,6 +248,22 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) return ret; } +#ifdef CONFIG_MODULES +static const char *ife_meta_id2name(u32 metaid) +{ + switch (metaid) { + case IFE_META_SKBMARK: + return "skbmark"; + case IFE_META_PRIO: + return "skbprio"; + case IFE_META_TCINDEX: + return "tcindex"; + default: + return "unknown"; + } +} +#endif + /* called when adding new meta information * under ife->tcf_lock for existing action */ @@ -263,7 +279,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ifemeta%u", metaid); + request_module("ife-meta-%s", ife_meta_id2name(metaid)); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); @@ -392,10 +408,14 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); spin_unlock_bh(&ife->tcf_lock); + + p = rcu_dereference_protected(ife->params, 1); + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -432,6 +452,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_params *p, *p_old; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -450,24 +471,41 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); + /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because + * they cannot run as the same time. Check on all other values which + * are not supported right now. 
+ */ + if (parm->flags & ~IFE_ENCODE) + return -EINVAL; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + exists = tcf_idr_check(tn, parm->index, a, bind); - if (exists && bind) + if (exists && bind) { + kfree(p); return 0; + } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, - bind, false); - if (ret) + bind, true); + if (ret) { + kfree(p); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + kfree(p); return -EEXIST; + } } ife = to_ife(*a); - ife->flags = parm->flags; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) @@ -478,24 +516,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - if (exists) - spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { if (daddr) - ether_addr_copy(ife->eth_dst, daddr); + ether_addr_copy(p->eth_dst, daddr); else - eth_zero_addr(ife->eth_dst); + eth_zero_addr(p->eth_dst); if (saddr) - ether_addr_copy(ife->eth_src, saddr); + ether_addr_copy(p->eth_src, saddr); else - eth_zero_addr(ife->eth_src); + eth_zero_addr(p->eth_src); - ife->eth_type = ife_type; + p->eth_type = ife_type; } + if (exists) + spin_lock_bh(&ife->tcf_lock); + if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); @@ -511,6 +550,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } @@ -531,6 +571,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } } @@ -538,6 +579,11 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + p_old = rtnl_dereference(ife->params); + rcu_assign_pointer(ife->params, p); + if (p_old) + kfree_rcu(p_old, rcu); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -549,12 +595,13 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, .bindcnt = ife->tcf_bindcnt - bind, .action = ife->tcf_action, - .flags = ife->flags, + .flags = p->flags, }; struct tcf_t t; @@ -565,17 +612,17 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; - if (!is_zero_ether_addr(ife->eth_dst)) { - if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + if (!is_zero_ether_addr(p->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } - if (!is_zero_ether_addr(ife->eth_src)) { - if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + if (!is_zero_ether_addr(p->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } - if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { @@ -617,19 +664,15 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); - spin_unlock(&ife->tcf_lock); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { - spin_lock(&ife->tcf_lock); - 
ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -647,14 +690,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); - ife->tcf_qstats.overlimits++; + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -683,7 +724,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) + struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; @@ -706,23 +747,20 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ - ife->tcf_qstats.overlimits++; - spin_unlock(&ife->tcf_lock); + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so let's be conservative. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -731,6 +769,8 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ife_meta = ife_encode(skb, metalen); + spin_lock(&ife->tcf_lock); + /* XXX: we don't have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... 
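/* The act_ife hunks around here replace spinlocked per-action fields with an
 * RCU-managed parameter block plus per-CPU stats. A condensed sketch of that
 * RCU swap pattern (the "demo_*" names are illustrative, not code from this
 * patch): readers dereference the block under rcu_read_lock(), while the
 * update side, serialized by RTNL, publishes a replacement and frees the old
 * block only after a grace period.
 */
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

struct demo_params {
	u16 eth_type;
	struct rcu_head rcu;
};

struct demo_action {
	struct demo_params __rcu *params;
};

static u16 demo_fast_path(struct demo_action *a)
{
	struct demo_params *p;
	u16 eth_type;

	rcu_read_lock();
	p = rcu_dereference(a->params);	/* stable until rcu_read_unlock() */
	eth_type = p->eth_type;		/* lock-free read on the hot path */
	rcu_read_unlock();

	return eth_type;
}

static void demo_update(struct demo_action *a, struct demo_params *new)
{
	struct demo_params *old;

	old = rtnl_dereference(a->params);	/* update side holds RTNL */
	rcu_assign_pointer(a->params, new);
	if (old)
		kfree_rcu(old, rcu);	/* freed after a grace period */
}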
@@ -742,25 +782,24 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } if (err < 0) { /* too corrupt to keep around if overwritten */ - ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } + spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; - if (!is_zero_ether_addr(ife->eth_src)) - ether_addr_copy(oethh->h_source, ife->eth_src); - if (!is_zero_ether_addr(ife->eth_dst)) - ether_addr_copy(oethh->h_dest, ife->eth_dst); - oethh->h_proto = htons(ife->eth_type); + if (!is_zero_ether_addr(p->eth_src)) + ether_addr_copy(oethh->h_source, p->eth_src); + if (!is_zero_ether_addr(p->eth_dst)) + ether_addr_copy(oethh->h_dest, p->eth_dst); + oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); - spin_unlock(&ife->tcf_lock); - return action; } @@ -768,21 +807,19 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; + int ret; + + rcu_read_lock(); + p = rcu_dereference(ife->params); + if (p->flags & IFE_ENCODE) { + ret = tcf_ife_encode(skb, a, res, p); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); - if (ife->flags & IFE_ENCODE) - return tcf_ife_encode(skb, a, res); - - if (!(ife->flags & IFE_ENCODE)) - return tcf_ife_decode(skb, a, res); - - pr_info_ratelimited("unknown failure(policy neither de/encode\n"); - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); - tcf_lastuse_update(&ife->tcf_tm); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - - return TC_ACT_SHOT; + return tcf_ife_decode(skb, a, res); } static int tcf_ife_walker(struct net *net, struct sk_buff *skb, diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 82892170ce4f..1e3f10e5da99 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -76,4 +76,4 @@ module_exit(ifemark_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); +MODULE_ALIAS_IFE_META("skbmark"); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c index 26bf4d86030b..4033f9fc4d4a 100644 --- a/net/sched/act_meta_skbprio.c +++ b/net/sched/act_meta_skbprio.c @@ -73,4 +73,4 @@ module_exit(ifeprio_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_PRIO); +MODULE_ALIAS_IFE_META("skbprio"); diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 3b35774ce890..2ea1f26c9e96 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -76,4 +76,4 @@ module_exit(ifetc_index_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2016)"); MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); +MODULE_ALIAS_IFE_META("tcindex"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 416627c66f08..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -140,6 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; + m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 
1)); dev_hold(dev); @@ -313,15 +314,11 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static int tcf_mirred_device(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev) +static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { - int ifindex = tcf_mirred_ifindex(a); + struct tcf_mirred *m = to_mirred(a); - *mirred_dev = __dev_get_by_index(net, ifindex); - if (!*mirred_dev) - return -EINVAL; - return 0; + return __dev_get_by_index(m->net, m->tcfm_ifindex); } static struct tc_action_ops act_mirred_ops = { @@ -336,7 +333,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), - .get_dev = tcf_mirred_device, + .get_dev = tcf_mirred_get_dev, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..2e8e87fd9d97 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -241,7 +241,7 @@ tcf_chain_filter_chain_ptr_set(struct tcf_chain *chain, } int tcf_block_get(struct tcf_block **p_block, - struct tcf_proto __rcu **p_filter_chain) + struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q) { struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL); struct tcf_chain *chain; @@ -257,6 +257,8 @@ int tcf_block_get(struct tcf_block **p_block, goto err_chain_create; } tcf_chain_filter_chain_ptr_set(chain, p_filter_chain); + block->net = qdisc_net(q); + block->q = q; *p_block = block; return 0; @@ -418,8 +420,8 @@ static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, } static int tcf_fill_node(struct net *net, struct sk_buff *skb, - struct tcf_proto *tp, void *fh, u32 portid, - u32 seq, u16 flags, int event) + struct tcf_proto *tp, struct Qdisc *q, u32 parent, + void *fh, u32 portid, u32 seq, u16 flags, int event) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -432,8 +434,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; - tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex; - tcm->tcm_parent = tp->classid; + tcm->tcm_ifindex = qdisc_dev(q)->ifindex; + tcm->tcm_parent = parent; tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); if (nla_put_string(skb, TCA_KIND, tp->ops->kind)) goto nla_put_failure; @@ -456,6 +458,7 @@ nla_put_failure: static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, int event, bool unicast) { struct sk_buff *skb; @@ -465,7 +468,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event) <= 0) { kfree_skb(skb); return -EINVAL; @@ -480,6 +483,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, + struct Qdisc *q, u32 parent, void *fh, bool unicast, bool *last) { struct sk_buff *skb; @@ -490,7 +494,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, + if (tcf_fill_node(net, skb, tp, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER) <= 0) { kfree_skb(skb); return -EINVAL; @@ -510,6 +514,7 @@ static int 
tfilter_del_notify(struct net *net, struct sk_buff *oskb, } static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, + struct Qdisc *q, u32 parent, struct nlmsghdr *n, struct tcf_chain *chain, int event) { @@ -517,7 +522,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) - tfilter_notify(net, oskb, n, tp, 0, event, false); + tfilter_notify(net, oskb, n, tp, q, parent, 0, event, false); } /* Add/change/delete/get a filter node */ @@ -636,7 +641,8 @@ replay: } if (n->nlmsg_type == RTM_DELTFILTER && prio == 0) { - tfilter_notify_chain(net, skb, n, chain, RTM_DELTFILTER); + tfilter_notify_chain(net, skb, q, parent, n, + chain, RTM_DELTFILTER); tcf_chain_flush(chain); err = 0; goto errout; @@ -683,7 +689,7 @@ replay: if (!fh) { if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { tcf_chain_tp_remove(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, + tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_DELTFILTER, false); tcf_proto_destroy(tp); err = 0; @@ -708,8 +714,8 @@ replay: } break; case RTM_DELTFILTER: - err = tfilter_del_notify(net, skb, n, tp, fh, false, - &last); + err = tfilter_del_notify(net, skb, n, tp, q, parent, + fh, false, &last); if (err) goto errout; if (last) { @@ -718,7 +724,7 @@ replay: } goto errout; case RTM_GETTFILTER: - err = tfilter_notify(net, skb, n, tp, fh, + err = tfilter_notify(net, skb, n, tp, q, parent, fh, RTM_NEWTFILTER, true); goto errout; default: @@ -732,7 +738,8 @@ replay: if (err == 0) { if (tp_created) tcf_chain_tp_insert(chain, &chain_info, tp); - tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); + tfilter_notify(net, skb, n, tp, q, parent, fh, + RTM_NEWTFILTER, false); } else { if (tp_created) tcf_proto_destroy(tp); @@ -751,6 +758,8 @@ struct tcf_dump_args { struct tcf_walker w; struct sk_buff *skb; struct netlink_callback *cb; + struct Qdisc *q; + u32 parent; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -758,13 +767,14 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_dump_args *a = (void *)arg; struct net *net = sock_net(a->skb->sk); - return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid, + return tcf_fill_node(net, a->skb, tp, a->q, a->parent, + n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); } -static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, - struct netlink_callback *cb, +static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, + struct sk_buff *skb, struct netlink_callback *cb, long index_start, long *p_index) { struct net *net = sock_net(skb->sk); @@ -786,7 +796,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, 0, + if (tcf_fill_node(net, skb, tp, q, parent, 0, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) @@ -799,6 +809,8 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct sk_buff *skb, arg.w.fn = tcf_node_dump; arg.skb = skb; arg.cb = cb; + arg.q = q; + arg.parent = parent; arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; @@ -824,6 +836,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) const struct Qdisc_class_ops *cops; long index_start; long index; + u32 
parent; int err; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) @@ -837,10 +850,13 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (!dev) return skb->len; - if (!tcm->tcm_parent) + parent = tcm->tcm_parent; + if (!parent) { q = dev->qdisc; - else + parent = q->handle; + } else { q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } if (!q) goto out; cops = q->ops->cl_ops; @@ -864,7 +880,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (tca[TCA_CHAIN] && nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; - if (!tcf_chain_dump(chain, skb, cb, index_start, &index)) + if (!tcf_chain_dump(chain, q, parent, skb, cb, + index_start, &index)) break; } @@ -1004,29 +1021,42 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev) +static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, + enum tc_setup_type type, + void *type_data, bool err_stop) { + int ok_count = 0; #ifdef CONFIG_NET_CLS_ACT const struct tc_action *a; + struct net_device *dev; LIST_HEAD(actions); + int ret; if (!tcf_exts_has_actions(exts)) - return -EINVAL; + return 0; tcf_exts_to_list(exts, &actions); list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) { - a->ops->get_dev(a, dev_net(dev), hw_dev); - break; - } + if (!a->ops->get_dev) + continue; + dev = a->ops->get_dev(a); + if (!dev || !tc_can_offload(dev)) + continue; + ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; } - if (*hw_dev) - return 0; #endif - return -EOPNOTSUPP; + return ok_count; +} + +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); } -EXPORT_SYMBOL(tcf_exts_get_dev); +EXPORT_SYMBOL(tc_setup_cb_call); static int __init tc_filter_init(void) { diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d89ebafd2239..700b345b07f9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -17,13 +17,14 @@ #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/idr.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> struct basic_head { - u32 hgenerator; struct list_head flist; + struct idr handle_idr; struct rcu_head rcu; }; @@ -78,6 +79,7 @@ static int basic_init(struct tcf_proto *tp) if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } @@ -99,8 +101,10 @@ static void basic_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); } + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -111,6 +115,7 @@ static int basic_delete(struct tcf_proto *tp, void *arg, bool *last) list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); + idr_remove_ext(&head->handle_idr, f->handle); call_rcu(&f->rcu, basic_delete_filter); *last = list_empty(&head->flist); return 0; @@ -154,6 +159,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; + unsigned long idr_index; if (tca[TCA_OPTIONS] == 
NULL) return -EINVAL; @@ -176,33 +182,33 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; if (handle) { fnew->handle = handle; - } else if (fold) { - fnew->handle = fold->handle; + if (!fold) { + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; + } } else { - unsigned int i = 0x80000000; - do { - if (++head->hgenerator == 0x7FFFFFFF) - head->hgenerator = 1; - } while (--i > 0 && basic_get(tp, head->hgenerator)); - - if (i <= 0) { - pr_err("Insufficient number of handles\n"); + err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (err) goto errout; - } - - fnew->handle = head->hgenerator; + fnew->handle = idr_index; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr); - if (err < 0) + if (err < 0) { + if (!fold) + idr_remove_ext(&head->handle_idr, fnew->handle); goto errout; + } *arg = fnew; if (fold) { + idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); call_rcu(&fold->rcu, basic_delete_filter); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 520c5027646a..6c6b21f6ba62 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -17,6 +17,7 @@ #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/idr.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> @@ -32,7 +33,7 @@ MODULE_DESCRIPTION("TC BPF based classifier"); struct cls_bpf_head { struct list_head plist; - u32 hgen; + struct idr handle_idr; struct rcu_head rcu; }; @@ -99,11 +100,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_end(skb); + bpf_compute_data_pointers(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } @@ -238,6 +239,7 @@ static int cls_bpf_init(struct tcf_proto *tp) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); + idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; @@ -264,6 +266,9 @@ static void cls_bpf_delete_prog_rcu(struct rcu_head *rcu) static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + + idr_remove_ext(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); @@ -287,6 +292,7 @@ static void cls_bpf_destroy(struct tcf_proto *tp) list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); + idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } @@ -421,27 +427,6 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, - struct cls_bpf_head *head) -{ - unsigned int i = 0x80000000; - u32 handle; - - do { - if (++head->hgen == 0x7FFFFFFF) - head->hgen = 1; - } while (--i > 0 && cls_bpf_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -451,6 
+436,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; struct cls_bpf_prog *prog; + unsigned long idr_index; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -476,21 +462,30 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, } } - if (handle == 0) - prog->handle = cls_bpf_grab_new_handle(tp, head); - else + if (handle == 0) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + 1, 0x7FFFFFFF, GFP_KERNEL); + if (ret) + goto errout; + prog->handle = idr_index; + } else { + if (!oldprog) { + ret = idr_alloc_ext(&head->handle_idr, prog, &idr_index, + handle, handle + 1, GFP_KERNEL); + if (ret) + goto errout; + } prog->handle = handle; - if (prog->handle == 0) { - ret = -EINVAL; - goto errout; } ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr); if (ret < 0) - goto errout; + goto errout_idr; ret = cls_bpf_offload(tp, prog, oldprog); if (ret) { + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); __cls_bpf_delete_prog(prog); return ret; } @@ -499,6 +494,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (oldprog) { + idr_replace_ext(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); call_rcu(&oldprog->rcu, cls_bpf_delete_prog_rcu); @@ -509,6 +505,9 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, *arg = prog; return 0; +errout_idr: + if (!oldprog) + idr_remove_ext(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 2a3a60ec5b86..6b29cef90bec 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -345,9 +345,9 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } -static void flow_perturbation(unsigned long arg) +static void flow_perturbation(struct timer_list *t) { - struct flow_filter *f = (struct flow_filter *)arg; + struct flow_filter *f = from_timer(f, t, perturb_timer); get_random_bytes(&f->hashrnd, 4); if (f->perturb_period) @@ -491,8 +491,11 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ; } - if (TC_H_MAJ(baseclass) == 0) - baseclass = TC_H_MAKE(tp->q->handle, baseclass); + if (TC_H_MAJ(baseclass) == 0) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + + baseclass = TC_H_MAKE(q->handle, baseclass); + } if (TC_H_MIN(baseclass) == 0) baseclass = TC_H_MAKE(baseclass, 1); @@ -502,8 +505,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, get_random_bytes(&fnew->hashrnd, 4); } - setup_deferrable_timer(&fnew->perturb_timer, flow_perturbation, - (unsigned long)fnew); + timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE); netif_keep_dst(qdisc_dev(tp->q)); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d230cb4c8094..5b7bb968d1d4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -88,7 +88,6 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; - struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -152,37 +151,12 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; - struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return 
-1; fl_clear_masked_range(&skb_key, &head->mask); - info = skb_tunnel_info(skb); - if (info) { - struct ip_tunnel_key *key = &info->key; - - switch (ip_tunnel_info_af(info)) { - case AF_INET: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV4_ADDRS; - skb_key.enc_ipv4.src = key->u.ipv4.src; - skb_key.enc_ipv4.dst = key->u.ipv4.dst; - break; - case AF_INET6: - skb_key.enc_control.addr_type = - FLOW_DISSECTOR_KEY_IPV6_ADDRS; - skb_key.enc_ipv6.src = key->u.ipv6.src; - skb_key.enc_ipv6.dst = key->u.ipv6.dst; - break; - } - - skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); - skb_key.enc_tp.src = key->tp_src; - skb_key.enc_tp.dst = key->tp_dst; - } - skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. @@ -226,16 +200,17 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -245,20 +220,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + bool skip_sw = tc_skip_sw(f->flags); int err; - if (!tc_can_offload(dev)) { - if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev))) { - f->hw_dev = dev; - return tc_skip_sw(f->flags) ? 
-EINVAL : 0; - } - dev = f->hw_dev; - cls_flower.egress_dev = true; - } else { - f->hw_dev = dev; - } - tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; @@ -267,31 +231,47 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (!err) - f->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + if (err) { + if (skip_sw) + return err; + } else { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + } - if (tc_skip_sw(f->flags)) + err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, skip_sw); + if (err < 0) { + fl_hw_destroy_filter(tp, f); return err; + } else if (err > 0) { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 941245ad07fd..aa1e1f3bd20c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -28,6 +28,7 @@ #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/sch_generic.h> #define HTSIZE 256 @@ -83,9 +84,11 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, } } } else { + struct Qdisc *q = tcf_block_q(tp->chain->block); + /* Old method: classify the packet using its skb mark. 
*/ if (id && (TC_H_MAJ(id) == 0 || - !(TC_H_MAJ(id ^ tp->q->handle)))) { + !(TC_H_MAJ(id ^ q->handle)))) { res->classid = id; res->class = 0; return 0; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 14a7e08b2fa9..d732b5474a4d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -13,6 +13,7 @@ #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> +#include <net/sch_generic.h> /* * Passing parameters to the root seems to be done more awkwardly than really @@ -90,9 +91,11 @@ static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, f = tcindex_lookup(p, key); if (!f) { + struct Qdisc *q = tcf_block_q(tp->chain->block); + if (!p->fall_through) return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); res->class = 0; pr_debug("alg 0x%x\n", res->classid); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 10b8d851fc6b..b6d46065f661 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -46,6 +46,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/netdevice.h> +#include <linux/idr.h> struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -82,6 +83,7 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; + struct idr handle_idr; struct rcu_head rcu; /* The 'ht' field MUST be the last field in structure to allow for * more entries allocated at end of structure. @@ -91,9 +93,9 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; - struct Qdisc *q; + struct tcf_block *block; int refcnt; - u32 hgenerator; + struct idr handle_idr; struct hlist_node hnode; struct rcu_head rcu; }; @@ -311,19 +313,19 @@ static void *u32_get(struct tcf_proto *tp, u32 handle) return u32_lookup_key(ht, handle); } -static u32 gen_new_htid(struct tc_u_common *tp_c) +static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) { - int i = 0x800; + unsigned long idr_index; + int err; - /* hgenerator only used inside rtnl lock it is safe to increment + /* This is only used inside rtnl lock it is safe to increment * without read _copy_ update semantics */ - do { - if (++tp_c->hgenerator == 0x7FF) - tp_c->hgenerator = 1; - } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); - - return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; + err = idr_alloc_ext(&tp_c->handle_idr, ptr, &idr_index, + 1, 0x7FF, GFP_KERNEL); + if (err) + return 0; + return (u32)(idr_index | 0x800) << 20; } static struct hlist_head *tc_u_common_hash; @@ -333,11 +335,7 @@ static struct hlist_head *tc_u_common_hash; static unsigned int tc_u_hash(const struct tcf_proto *tp) { - struct net_device *dev = tp->q->dev_queue->dev; - u32 qhandle = tp->q->handle; - int ifindex = dev->ifindex; - - return hash_64((u64)ifindex << 32 | qhandle, U32_HASH_SHIFT); + return hash_64((u64) tp->chain->block, U32_HASH_SHIFT); } static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) @@ -347,7 +345,7 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp) h = tc_u_hash(tp); hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) { - if (tc->q == tp->q) + if (tc->block == tp->chain->block) return tc; } return NULL; @@ -366,8 +364,9 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; root_ht->refcnt++; - root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; + idr_init(&root_ht->handle_idr); if (tp_c == NULL) { tp_c = kzalloc(sizeof(*tp_c), GFP_KERNEL); @@ -375,8 +374,9 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } - tp_c->q = tp->q; + tp_c->block = tp->chain->block; INIT_HLIST_NODE(&tp_c->hnode); + idr_init(&tp_c->handle_idr); h = tc_u_hash(tp); hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]); @@ -565,6 +565,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) rtnl_dereference(n->next)); tcf_unbind_filter(tp, &n->res); u32_remove_hw_knode(tp, n->handle); + idr_remove_ext(&ht->handle_idr, n->handle); call_rcu(&n->rcu, u32_delete_key_freepf_rcu); } } @@ -586,6 +587,8 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) hn = &phn->next, phn = rtnl_dereference(*hn)) { if (phn == ht) { u32_clear_hw_hnode(tp, ht); + idr_destroy(&ht->handle_idr); + idr_remove_ext(&tp_c->handle_idr, ht->handle); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -633,6 +636,7 @@ static void u32_destroy(struct tcf_proto *tp) kfree_rcu(ht, rcu); } + idr_destroy(&tp_c->handle_idr); kfree(tp_c); } @@ -701,27 +705,21 @@ ret: return ret; } -#define NR_U32_NODE (1<<12) -static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 htid) { - struct tc_u_knode *n; - unsigned long i; - unsigned long *bitmap = kzalloc(BITS_TO_LONGS(NR_U32_NODE) * sizeof(unsigned long), - GFP_KERNEL); - if (!bitmap) - return handle | 0xFFF; - - for (n = rtnl_dereference(ht->ht[TC_U32_HASH(handle)]); - n; - n = rtnl_dereference(n->next)) - set_bit(TC_U32_NODE(n->handle), bitmap); - - i = find_next_zero_bit(bitmap, NR_U32_NODE, 0x800); - if (i >= NR_U32_NODE) - i = find_next_zero_bit(bitmap, NR_U32_NODE, 1); + unsigned long idr_index; + u32 start = htid | 0x800; + u32 max = htid | 0xFFF; + u32 min = htid; + + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + start, max + 1, GFP_KERNEL)) { + if (idr_alloc_ext(&ht->handle_idr, NULL, &idr_index, + min + 1, max + 1, GFP_KERNEL)) + return max; + } - kfree(bitmap); - return handle | (i >= NR_U32_NODE ? 
0xFFF : i); + return (u32)idr_index; } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -806,6 +804,7 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c, if (pins->handle == n->handle) break; + idr_replace_ext(&ht->handle_idr, n, n->handle); RCU_INIT_POINTER(n->next, pins->next); rcu_assign_pointer(*ins, n); } @@ -937,22 +936,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; if (TC_U32_KEY(handle)) return -EINVAL; - if (handle == 0) { - handle = gen_new_htid(tp->data); - if (handle == 0) - return -ENOMEM; - } ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; + if (handle == 0) { + handle = gen_new_htid(tp->data, ht); + if (handle == 0) { + kfree(ht); + return -ENOMEM; + } + } else { + err = idr_alloc_ext(&tp_c->handle_idr, ht, NULL, + handle, handle + 1, GFP_KERNEL); + if (err) { + kfree(ht); + return err; + } + } ht->tp_c = tp_c; ht->refcnt = 1; ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + idr_init(&ht->handle_idr); err = u32_replace_hw_hnode(tp, ht, flags); if (err) { + idr_remove_ext(&tp_c->handle_idr, handle); kfree(ht); return err; } @@ -986,24 +996,33 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) return -EINVAL; handle = htid | TC_U32_NODE(handle); + err = idr_alloc_ext(&ht->handle_idr, NULL, NULL, + handle, handle + 1, + GFP_KERNEL); + if (err) + return err; } else handle = gen_new_kid(ht, htid); - if (tb[TCA_U32_SEL] == NULL) - return -EINVAL; + if (tb[TCA_U32_SEL] == NULL) { + err = -EINVAL; + goto erridr; + } s = nla_data(tb[TCA_U32_SEL]); n = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); - if (n == NULL) - return -ENOBUFS; + if (n == NULL) { + err = -ENOBUFS; + goto erridr; + } #ifdef CONFIG_CLS_U32_PERF size = sizeof(struct tc_u32_pcnt) + s->nkeys * sizeof(u64); n->pf = __alloc_percpu(size, __alignof__(struct tc_u32_pcnt)); if (!n->pf) { - kfree(n); - return -ENOBUFS; + err = -ENOBUFS; + goto errfree; } #endif @@ -1066,9 +1085,12 @@ errhw: errout: tcf_exts_destroy(&n->exts); #ifdef CONFIG_CLS_U32_PERF +errfree: free_percpu(n->pf); #endif kfree(n); +erridr: + idr_remove_ext(&ht->handle_idr, handle); return err; } diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 03b677bc0700..1331a4c2d8ff 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -178,7 +178,7 @@ static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); - struct net *net = dev_net(qdisc_dev(tp->q)); + struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index aa82116ed10c..a9ac912f1d67 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1661,9 +1661,11 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) struct tcf_bind_args *a = (void *)arg; if (tp->ops->bind_class) { - tcf_tree_lock(tp); + struct Qdisc *q = tcf_block_q(tp->chain->block); + + sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl); - tcf_tree_unlock(tp); + sch_tree_unlock(q); } return 0; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index c5fcdf1a58a0..2dbd249c0b2f 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -281,7 +281,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto 
err_out; } - error = tcf_block_get(&flow->block, &flow->filter_list); + error = tcf_block_get(&flow->block, &flow->filter_list, sch); if (error) { kfree(flow); goto err_out; @@ -546,7 +546,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) p->link.q = &noop_qdisc; pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - err = tcf_block_get(&p->link.block, &p->link.filter_list); + err = tcf_block_get(&p->link.block, &p->link.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index dcef97fa8047..c3b92d62190e 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1566,7 +1566,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 2d0e8d4bdc29..753dc7a77b60 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -412,7 +412,7 @@ static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct drr_sched *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; err = qdisc_class_hash_init(&q->clhash); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 2836c80c7aa5..fb4fb71c68cf 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -344,7 +344,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = tcf_block_get(&p->block, &p->filter_list); + err = tcf_block_get(&p->block, &p->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index de3b57ceca7b..3c40edeff1e8 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -481,7 +481,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) return err; } - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a0a198768aad..6ced7c89c141 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -288,9 +288,9 @@ unsigned long dev_trans_start(struct net_device *dev) } EXPORT_SYMBOL(dev_trans_start); -static void dev_watchdog(unsigned long arg) +static void dev_watchdog(struct timer_list *t) { - struct net_device *dev = (struct net_device *)arg; + struct net_device *dev = from_timer(dev, t, watchdog_timer); netif_tx_lock(dev); if (!qdisc_tx_is_noop(dev)) { @@ -954,7 +954,7 @@ void dev_init_scheduler(struct net_device *dev) if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); + timer_setup(&dev->watchdog_timer, dev_watchdog, 0); } static void shutdown_scheduler_queue(struct net_device *dev, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3f88b75488b0..a692184bd333 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1033,7 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); return err; @@ -1405,7 +1405,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; 
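A recurring change across these sch_* hunks: tcf_block_get() now takes the owning Qdisc as a third argument, so a filter block records which qdisc it belongs to, and classifiers can recover that qdisc via tcf_block_q(tp->chain->block) instead of the old tp->q back-pointer (see the cls_flow, cls_fw and cls_tcindex hunks below). A minimal sketch of the new pattern, assuming a hypothetical "example" qdisc and classifier; this is illustrative only, not part of the patch:

	struct example_sched_data {
		struct tcf_proto __rcu *filter_list;
		struct tcf_block *block;
	};

	static int example_init(struct Qdisc *sch, struct nlattr *opt)
	{
		struct example_sched_data *q = qdisc_priv(sch);

		/* Third argument ties the block to its owning qdisc. */
		return tcf_block_get(&q->block, &q->filter_list, sch);
	}

	static int example_classify(struct sk_buff *skb, const struct tcf_proto *tp,
				    struct tcf_result *res)
	{
		/* Classifiers reach the qdisc through the block now. */
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), 1);
		res->class = 0;
		return 0;
	}

Decoupling classifiers from tp->q is what lets shared blocks and the egdev callbacks introduced earlier in this series work without a per-qdisc back-pointer.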
q->eligible = RB_ROOT; - err = tcf_block_get(&q->root.block, &q->root.filter_list); + err = tcf_block_get(&q->root.block, &q->root.filter_list, sch); if (err) return err; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c6d7ae81b41f..57be73c0e1d2 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1030,7 +1030,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; @@ -1393,7 +1393,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; - err = tcf_block_get(&cl->block, &cl->filter_list); + err = tcf_block_get(&cl->block, &cl->filter_list, sch); if (err) { kfree(cl); goto failure; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 44de4ee51ce9..9ccc1b89b0d9 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -59,7 +59,7 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->block, &dev->ingress_cl_list); + err = tcf_block_get(&q->block, &dev->ingress_cl_list, sch); if (err) return err; @@ -153,11 +153,11 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt) struct net_device *dev = qdisc_dev(sch); int err; - err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list); + err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list, sch); if (err) return err; - err = tcf_block_get(&q->egress_block, &dev->egress_cl_list); + err = tcf_block_get(&q->egress_block, &dev->egress_cl_list, sch); if (err) return err; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 6bcdfe6e7b63..cae91b4b08a6 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -18,10 +18,16 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> +#include <net/pkt_cls.h> struct mqprio_sched { struct Qdisc **qdiscs; + u16 mode; + u16 shaper; int hw_offload; + u32 flags; + u64 min_rate[TC_QOPT_MAX_QUEUE]; + u64 max_rate[TC_QOPT_MAX_QUEUE]; }; static void mqprio_destroy(struct Qdisc *sch) @@ -39,9 +45,17 @@ static void mqprio_destroy(struct Qdisc *sch) } if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt mqprio = {}; + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, &mqprio); + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + &mqprio); + break; + default: + return; + } } else { netdev_set_num_tc(dev, 0); } @@ -97,6 +111,26 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return 0; } +static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = { + [TCA_MQPRIO_MODE] = { .len = sizeof(u16) }, + [TCA_MQPRIO_SHAPER] = { .len = sizeof(u16) }, + [TCA_MQPRIO_MIN_RATE64] = { .type = NLA_NESTED }, + [TCA_MQPRIO_MAX_RATE64] = { .type = NLA_NESTED }, +}; + +static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy, int len) +{ + int nested_len = nla_len(nla) - NLA_ALIGN(len); + + if (nested_len >= nla_attr_size(0)) + return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), + nested_len, policy, NULL); + + memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = 
qdisc_dev(sch); @@ -105,6 +139,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int rem; + int len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); @@ -115,6 +153,10 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; + /* make certain can allocate enough classids to handle queues */ + if (dev->num_tx_queues >= TC_H_MIN_PRIORITY) + return -ENOMEM; + if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; @@ -122,6 +164,58 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) if (mqprio_parse_opt(dev, qopt)) return -EINVAL; + if (len > 0) { + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + } + /* pre-allocate qdisc, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); @@ -146,14 +240,36 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt mqprio = *qopt; + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_MQPRIO, + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + err = dev->netdev_ops->ndo_setup_tc(dev, + TC_SETUP_MQPRIO, &mqprio); if (err) return err; - priv->hw_offload = mqprio.hw; + priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -193,7 +309,7 @@ static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); - unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + unsigned long ntx = cl - 1; if (ntx 
>= dev->num_tx_queues) return NULL; @@ -223,11 +339,51 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } +static int dump_rates(struct mqprio_sched *priv, + struct tc_mqprio_qopt *opt, struct sk_buff *skb) +{ + struct nlattr *nest; + int i; + + if (priv->flags & TC_MQPRIO_F_MIN_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MIN_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MIN_RATE64, + sizeof(priv->min_rate[i]), + &priv->min_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + + if (priv->flags & TC_MQPRIO_F_MAX_RATE) { + nest = nla_nest_start(skb, TCA_MQPRIO_MAX_RATE64); + if (!nest) + goto nla_put_failure; + + for (i = 0; i < opt->num_tc; i++) { + if (nla_put(skb, TCA_MQPRIO_MAX_RATE64, + sizeof(priv->max_rate[i]), + &priv->max_rate[i])) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; unsigned int i; @@ -258,12 +414,25 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.offset[i] = dev->tc_to_txq[i].offset; } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) goto nla_put_failure; - return skb->len; + if ((priv->flags & TC_MQPRIO_F_MODE) && + nla_put_u16(skb, TCA_MQPRIO_MODE, priv->mode)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_SHAPER) && + nla_put_u16(skb, TCA_MQPRIO_SHAPER, priv->shaper)) + goto nla_put_failure; + + if ((priv->flags & TC_MQPRIO_F_MIN_RATE || + priv->flags & TC_MQPRIO_F_MAX_RATE) && + (dump_rates(priv, &opt, skb) != 0)) + goto nla_put_failure; + + return nla_nest_end(skb, nla); nla_put_failure: - nlmsg_trim(skb, b); + nlmsg_trim(skb, nla); return -1; } @@ -282,38 +451,35 @@ static unsigned long mqprio_find(struct Qdisc *sch, u32 classid) struct net_device *dev = qdisc_dev(sch); unsigned int ntx = TC_H_MIN(classid); - if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) - return 0; - return ntx; + /* There are essentially two regions here that have valid classid + * values. The first region will have a classid value of 1 through + * num_tx_queues. All of these are backed by actual Qdiscs. + */ + if (ntx < TC_H_MIN_PRIORITY) + return (ntx <= dev->num_tx_queues) ? ntx : 0; + + /* The second region represents the hardware traffic classes. These + * are represented by classid values of TC_H_MIN_PRIORITY through + * TC_H_MIN_PRIORITY + netdev_get_num_tc - 1 + */ + return ((ntx - TC_H_MIN_PRIORITY) < netdev_get_num_tc(dev)) ? ntx : 0; } static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { - struct net_device *dev = qdisc_dev(sch); + if (cl < TC_H_MIN_PRIORITY) { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + struct net_device *dev = qdisc_dev(sch); + int tc = netdev_txq_to_tc(dev, cl - 1); - if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = (tc < 0) ? 
0 : + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(tc + TC_H_MIN_PRIORITY)); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } else { tcm->tcm_parent = TC_H_ROOT; tcm->tcm_info = 0; - } else { - int i; - struct netdev_queue *dev_queue; - - dev_queue = mqprio_queue_get(sch, cl); - tcm->tcm_parent = 0; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - struct netdev_tc_txq tc = dev->tc_to_txq[i]; - int q_idx = cl - netdev_get_num_tc(dev); - - if (q_idx > tc.offset && - q_idx <= tc.offset + tc.count) { - tcm->tcm_parent = - TC_H_MAKE(TC_H_MAJ(sch->handle), - TC_H_MIN(i + 1)); - break; - } - } - tcm->tcm_info = dev_queue->qdisc_sleeping->handle; } tcm->tcm_handle |= TC_H_MIN(cl); return 0; @@ -324,15 +490,14 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, __releases(d->lock) __acquires(d->lock) { - struct net_device *dev = qdisc_dev(sch); - - if (cl <= netdev_get_num_tc(dev)) { + if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; - struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we @@ -385,12 +550,25 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; - for (ntx = arg->skip; - ntx < dev->num_tx_queues + netdev_get_num_tc(dev); - ntx++) { + for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { + if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + + /* Pad the values and skip over unused traffic classes */ + if (ntx < TC_MAX_QUEUE) { + arg->count = TC_MAX_QUEUE; + ntx = TC_MAX_QUEUE; + } + + /* Reset offset, sort out remaining per-queue qdiscs */ + for (ntx -= TC_MAX_QUEUE; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; - break; + return; } arg->count++; } diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index ff4fc3e0facd..31e0a284eeff 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -245,7 +245,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5a4f10080290..db0228a65e8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -148,12 +148,6 @@ struct netem_skb_cb { psched_time_t time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch) struct rb_node *p = rb_first(&q->t_root); while (p) { - struct sk_buff *skb = netem_rb_to_skb(p); + struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); @@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -538,7 +532,7 @@ static int 
netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *t_skb; struct netem_skb_cb *t_last; - t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) { @@ -617,7 +611,7 @@ deliver: if (p) { psched_time_t time_to_send; - skb = netem_rb_to_skb(p); + skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 6c2791d6102d..776c694c77c7 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -74,6 +74,7 @@ struct pie_sched_data { struct pie_vars vars; struct pie_stats stats; struct timer_list adapt_timer; + struct Qdisc *sch; }; static void pie_params_init(struct pie_params *params) @@ -422,10 +423,10 @@ static void calculate_probability(struct Qdisc *sch) pie_vars_init(&q->vars); } -static void pie_timer(unsigned long arg) +static void pie_timer(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct pie_sched_data *q = qdisc_priv(sch); + struct pie_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -446,7 +447,8 @@ static int pie_init(struct Qdisc *sch, struct nlattr *opt) pie_vars_init(&q->vars); sch->limit = q->params.limit; - setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch); + q->sch = sch; + timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { int err = pie_change(sch, opt); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 2dd6c68ae91e..95fad348c8d7 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -212,7 +212,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6ddfd4991108..8694c7b6d2b1 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1419,7 +1419,7 @@ static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 93b9d70a9b28..fdfdb56aaae2 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -40,6 +40,7 @@ struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; struct timer_list adapt_timer; + struct Qdisc *sch; struct red_parms parms; struct red_vars vars; struct red_stats stats; @@ -221,10 +222,10 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static inline void red_adaptative_timer(unsigned long arg) +static inline void red_adaptative_timer(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct red_sched_data *q = qdisc_priv(sch); + struct red_sched_data *q = from_timer(q, t, adapt_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -238,7 +239,8 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt) struct red_sched_data *q = qdisc_priv(sch); q->qdisc = &noop_qdisc; - setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch); + q->sch = sch; + timer_setup(&q->adapt_timer, red_adaptative_timer, 0); return red_change(sch, 
opt); } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index cc39e170b4aa..487d375f5a06 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -553,7 +553,7 @@ static int sfb_init(struct Qdisc *sch, struct nlattr *opt) struct sfb_sched_data *q = qdisc_priv(sch); int err; - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 74ea863b8240..5a9c95149c5c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -145,6 +145,7 @@ struct sfq_sched_data { int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; + struct Qdisc *sch; }; /* @@ -604,10 +605,10 @@ drop: qdisc_tree_reduce_backlog(sch, dropped, drop_len); } -static void sfq_perturbation(unsigned long arg) +static void sfq_perturbation(struct timer_list *t) { - struct Qdisc *sch = (struct Qdisc *)arg; - struct sfq_sched_data *q = qdisc_priv(sch); + struct sfq_sched_data *q = from_timer(q, t, perturb_timer); + struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); @@ -722,10 +723,9 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) int i; int err; - setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, - (unsigned long)sch); + timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); - err = tcf_block_get(&q->block, &q->filter_list); + err = tcf_block_get(&q->block, &q->filter_list, sch); if (err) return err; diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 70f1b570bab9..bf90c5397719 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -12,7 +12,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ output.o input.o debug.o stream.o auth.o \ - offload.o + offload.o stream_sched.o stream_sched_prio.o \ + stream_sched_rr.o sctp_probe-y := probe.o diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 3afac275ee82..7b261afc47b9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -311,10 +311,10 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; - streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && @@ -323,7 +323,7 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) &chunk->asoc->stream.out[chunk->sinfo.sinfo_stream]; chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 2966ff400755..4db012aa25f7 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -50,6 +50,7 @@ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> /* Declare internal functions here. */ static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); @@ -72,32 +73,38 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp); /* Add data to the front of the queue. 
*/ static inline void sctp_outq_head_data(struct sctp_outq *q, - struct sctp_chunk *ch) + struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add(&ch->stream_list, &oute->outq); } /* Take data from the front of the queue. */ static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) { - struct sctp_chunk *ch = NULL; - - if (!list_empty(&q->out_chunk_list)) { - struct list_head *entry = q->out_chunk_list.next; - - ch = list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); - q->out_qlen -= ch->skb->len; - } - return ch; + return q->sched->dequeue(q); } + /* Add data chunk to the end of the queue. */ static inline void sctp_outq_tail_data(struct sctp_outq *q, struct sctp_chunk *ch) { + struct sctp_stream_out_ext *oute; + __u16 stream; + list_add_tail(&ch->list, &q->out_chunk_list); q->out_qlen += ch->skb->len; + + stream = sctp_chunk_stream_no(ch); + oute = q->asoc->stream.out[stream].ext; + list_add_tail(&ch->stream_list, &oute->outq); } /* @@ -207,6 +214,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->retransmit); INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + sctp_sched_set_sched(asoc, SCTP_SS_FCFS); } /* Free the outqueue structure and any related pending chunks. @@ -258,6 +266,7 @@ static void __sctp_outq_teardown(struct sctp_outq *q) /* Throw away any leftover data chunks. */ while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + sctp_sched_dequeue_done(q, chunk); /* Mark as send failure. */ sctp_chunk_fail(chunk, q->error); @@ -366,7 +375,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; - streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -391,20 +400,21 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, struct sctp_outq *q = &asoc->outqueue; struct sctp_chunk *chk, *temp; + q->sched->unsched_all(&asoc->stream); + list_for_each_entry_safe(chk, temp, &q->out_chunk_list, list) { if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; - list_del_init(&chk->list); - q->out_qlen -= chk->skb->len; + sctp_sched_dequeue_common(q, chk); asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; if (chk->sinfo.sinfo_stream < asoc->stream.outcnt) { struct sctp_stream_out *streamout = &asoc->stream.out[chk->sinfo.sinfo_stream]; - streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; } msg_len -= SCTP_DATA_SNDSIZE(chk) + @@ -415,6 +425,8 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, break; } + q->sched->sched_all(&asoc->stream); + return msg_len; } @@ -1033,22 +1045,9 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { __u32 sid = ntohs(chunk->subh.data_hdr->stream); - /* RFC 2960 6.5 Every DATA chunk MUST carry a valid - * stream identifier. - */ - if (chunk->sinfo.sinfo_stream >= asoc->stream.outcnt) { - - /* Mark as failed send. 
*/ - sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); - if (asoc->peer.prsctp_capable && - SCTP_PR_PRIO_ENABLED(chunk->sinfo.sinfo_flags)) - asoc->sent_cnt_removable--; - sctp_chunk_free(chunk); - continue; - } - /* Has this chunk expired? */ if (sctp_chunk_abandoned(chunk)) { + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1070,6 +1069,7 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) new_transport = asoc->peer.active_path; if (new_transport->state == SCTP_UNCONFIRMED) { WARN_ONCE(1, "Attempt to send packet on unconfirmed path."); + sctp_sched_dequeue_done(q, chunk); sctp_chunk_fail(chunk, 0); sctp_chunk_free(chunk); continue; @@ -1133,6 +1133,11 @@ static void sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp) else asoc->stats.oodchunks++; + /* Only now it's safe to consider this + * chunk as sent, sched-wise. + */ + sctp_sched_dequeue_done(q, chunk); + break; default: diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 22ed01a76b19..a72a7d925d46 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -463,6 +463,7 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, .r = r, .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), }; + int pos = cb->args[2]; /* eps hashtable dumps * args: @@ -493,7 +494,8 @@ skip: goto done; sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, (int *)&cb->args[2], &commp); + net, &pos, &commp); + cb->args[2] = pos; done: cb->args[1] = cb->args[4]; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e6a2974e020e..402bfbb888cd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -50,6 +50,7 @@ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> static int sctp_cmd_interpreter(enum sctp_event event_type, union sctp_subtype subtype, @@ -1089,6 +1090,8 @@ static void sctp_cmd_send_msg(struct sctp_association *asoc, list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); + + asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d4730ada7f32..88c28421ec15 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -79,6 +79,7 @@ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> /* Forward declarations for internal helper functions. 
*/ static int sctp_writeable(struct sock *sk); @@ -1927,6 +1928,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } + /* Allocate sctp_stream_out_ext if not already done */ + if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto out_free; + } + if (sctp_wspace(asoc) < msg_len) sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); @@ -3907,6 +3915,64 @@ out: return retval; } +static int sctp_setsockopt_scheduler(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_assoc_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + if (params.assoc_value > SCTP_SS_MAX) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_sched(asoc, params.assoc_value); + +out: + return retval; +} + +static int sctp_setsockopt_scheduler_value(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_association *asoc; + struct sctp_stream_value params; + int retval = -EINVAL; + + if (optlen < sizeof(params)) + goto out; + + optlen = sizeof(params); + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) + goto out; + + retval = sctp_sched_set_value(asoc, params.stream_id, + params.stream_value, GFP_KERNEL); + +out: + return retval; +} + /* API 6.2 setsockopt(), getsockopt() * * Applications use setsockopt() and getsockopt() to set or retrieve @@ -4088,6 +4154,12 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_ADD_STREAMS: retval = sctp_setsockopt_add_streams(sk, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_setsockopt_scheduler(sk, optval, optlen); + break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_setsockopt_scheduler_value(sk, optval, optlen); + break; default: retval = -ENOPROTOOPT; break; @@ -6645,7 +6717,7 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, char __user *optval, int __user *optlen) { - struct sctp_stream_out *streamout; + struct sctp_stream_out_ext *streamoute; struct sctp_association *asoc; struct sctp_prstatus params; int retval = -EINVAL; @@ -6668,21 +6740,29 @@ static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, if (!asoc || params.sprstat_sid >= asoc->stream.outcnt) goto out; - streamout = &asoc->stream.out[params.sprstat_sid]; + streamoute = asoc->stream.out[params.sprstat_sid].ext; + if (!streamoute) { + /* Not allocated yet, means all stats are 0 */ + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + retval = 0; + goto out; + } + if (policy == SCTP_PR_SCTP_NONE) { params.sprstat_abandoned_unsent = 0; params.sprstat_abandoned_sent = 0; for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { params.sprstat_abandoned_unsent += - streamout->abandoned_unsent[policy]; + streamoute->abandoned_unsent[policy]; params.sprstat_abandoned_sent += - streamout->abandoned_sent[policy]; + streamoute->abandoned_sent[policy]; } } else { params.sprstat_abandoned_unsent = - streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + streamoute->abandoned_unsent[__SCTP_PR_INDEX(policy)]; params.sprstat_abandoned_sent = - streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; +
streamoute->abandoned_sent[__SCTP_PR_INDEX(policy)]; } if (put_user(len, optlen) || copy_to_user(optval, &params, len)) { @@ -6778,6 +6858,85 @@ out: return retval; } +static int sctp_getsockopt_scheduler(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + params.assoc_value = sctp_sched_get_sched(asoc); + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + +static int sctp_getsockopt_scheduler_value(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + retval = sctp_sched_get_value(asoc, params.stream_id, + &params.stream_value); + if (retval) + goto out; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &params, len)) { + retval = -EFAULT; + goto out; + } + +out: + return retval; +} + static int sctp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -6960,6 +7119,14 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); break; + case SCTP_STREAM_SCHEDULER: + retval = sctp_getsockopt_scheduler(sk, len, optval, + optlen); + break; + case SCTP_STREAM_SCHEDULER_VALUE: + retval = sctp_getsockopt_scheduler_value(sk, len, optval, + optlen); + break; default: retval = -ENOPROTOOPT; break; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 63ea15503714..5ea33a2c453b 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -32,44 +32,181 @@ * Xin Long <lucien.xin@gmail.com> */ +#include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Migrates chunks from stream queues to new stream queues if needed, + * but not across associations. Also, removes those chunks to streams + * higher than the new max. + */ +static void sctp_stream_outq_migrate(struct sctp_stream *stream, + struct sctp_stream *new, __u16 outcnt) +{ + struct sctp_association *asoc; + struct sctp_chunk *ch, *temp; + struct sctp_outq *outq; + int i; + + asoc = container_of(stream, struct sctp_association, stream); + outq = &asoc->outqueue; + + list_for_each_entry_safe(ch, temp, &outq->out_chunk_list, list) { + __u16 sid = sctp_chunk_stream_no(ch); + + if (sid < outcnt) + continue; + + sctp_sched_dequeue_common(outq, ch); + /* No need to call dequeue_done here because + * the chunks are not scheduled by now. + */ + + /* Mark as failed send. */ + sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM); + if (asoc->peer.prsctp_capable && + SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags)) + asoc->sent_cnt_removable--; + + sctp_chunk_free(ch); + } + + if (new) { + /* Here we actually move the old ext stuff into the new + * buffer, because we want to keep it. Then + * sctp_stream_update will swap ->out pointers.
+ */ + for (i = 0; i < outcnt; i++) { + kfree(new->out[i].ext); + new->out[i].ext = stream->out[i].ext; + stream->out[i].ext = NULL; + } + } + + for (i = outcnt; i < stream->outcnt; i++) + kfree(stream->out[i].ext); +} + +static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, + gfp_t gfp) +{ + struct sctp_stream_out *out; + + out = kmalloc_array(outcnt, sizeof(*out), gfp); + if (!out) + return -ENOMEM; + + if (stream->out) { + memcpy(out, stream->out, min(outcnt, stream->outcnt) * + sizeof(*out)); + kfree(stream->out); + } + + if (outcnt > stream->outcnt) + memset(out + stream->outcnt, 0, + (outcnt - stream->outcnt) * sizeof(*out)); + + stream->out = out; + + return 0; +} + +static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt, + gfp_t gfp) +{ + struct sctp_stream_in *in; + + in = kmalloc_array(incnt, sizeof(*stream->in), gfp); + + if (!in) + return -ENOMEM; + + if (stream->in) { + memcpy(in, stream->in, min(incnt, stream->incnt) * + sizeof(*in)); + kfree(stream->in); + } + + if (incnt > stream->incnt) + memset(in + stream->incnt, 0, + (incnt - stream->incnt) * sizeof(*in)); + + stream->in = in; + + return 0; +} int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - int i; + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i, ret = 0; + + gfp |= __GFP_NOWARN; /* Initial stream->out size may be very big, so free it and alloc - * a new one with new outcnt to save memory. + * a new one with new outcnt to save memory if needed. */ - kfree(stream->out); + if (outcnt == stream->outcnt) + goto in; - stream->out = kcalloc(outcnt, sizeof(*stream->out), gfp); - if (!stream->out) - return -ENOMEM; + /* Filter out chunks queued on streams that won't exist anymore */ + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, NULL, outcnt); + sched->sched_all(stream); + + i = sctp_stream_alloc_out(stream, outcnt, gfp); + if (i) + return i; stream->outcnt = outcnt; for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; + sched->init(stream); + +in: if (!incnt) - return 0; + goto out; - stream->in = kcalloc(incnt, sizeof(*stream->in), gfp); - if (!stream->in) { - kfree(stream->out); - stream->out = NULL; - return -ENOMEM; + i = sctp_stream_alloc_in(stream, incnt, gfp); + if (i) { + ret = -ENOMEM; + goto free; } stream->incnt = incnt; + goto out; - return 0; +free: + sched->free(stream); + kfree(stream->out); + stream->out = NULL; +out: + return ret; +} + +int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_out_ext *soute; + + soute = kzalloc(sizeof(*soute), GFP_KERNEL); + if (!soute) + return -ENOMEM; + stream->out[sid].ext = soute; + + return sctp_sched_init_sid(stream, sid, GFP_KERNEL); } void sctp_stream_free(struct sctp_stream *stream) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + int i; + + sched->free(stream); + for (i = 0; i < stream->outcnt; i++) + kfree(stream->out[i].ext); kfree(stream->out); kfree(stream->in); } @@ -87,6 +224,10 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + sched->unsched_all(stream); + sctp_stream_outq_migrate(stream, new, new->outcnt); sctp_stream_free(stream); stream->out = new->out; @@ -94,6 +235,8 @@ void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) stream->outcnt = new->outcnt; stream->incnt = 
new->incnt; + sched->sched_all(stream); + new->out = NULL; new->in = NULL; } @@ -270,15 +413,9 @@ int sctp_send_add_streams(struct sctp_association *asoc, } if (out) { - struct sctp_stream_out *streamout; - - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_KERNEL); - if (!streamout) + retval = sctp_stream_alloc_out(stream, outcnt, GFP_KERNEL); + if (retval) goto out; - - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; } chunk = sctp_make_strreset_addstrm(asoc, out, in); @@ -601,7 +738,6 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_in *streamin; __u32 request_seq, incnt; __u16 in, i; @@ -648,13 +784,9 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( if (!in || incnt > SCTP_MAX_STREAM) goto out; - streamin = krealloc(stream->in, incnt * sizeof(*streamin), - GFP_ATOMIC); - if (!streamin) + if (sctp_stream_alloc_in(stream, incnt, GFP_ATOMIC)) goto out; - memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); - stream->in = streamin; stream->incnt = incnt; result = SCTP_STRRESET_PERFORMED; @@ -676,10 +808,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( struct sctp_strreset_addstrm *addstrm = param.v; struct sctp_stream *stream = &asoc->stream; __u32 result = SCTP_STRRESET_DENIED; - struct sctp_stream_out *streamout; struct sctp_chunk *chunk = NULL; __u32 request_seq, outcnt; __u16 out, i; + int ret; request_seq = ntohl(addstrm->request_seq); if (TSN_lt(asoc->strreset_inseq, request_seq) || @@ -708,14 +840,10 @@ struct sctp_chunk *sctp_process_strreset_addstrm_in( if (!out || outcnt > SCTP_MAX_STREAM) goto out; - streamout = krealloc(stream->out, outcnt * sizeof(*streamout), - GFP_ATOMIC); - if (!streamout) + ret = sctp_stream_alloc_out(stream, outcnt, GFP_ATOMIC); + if (ret) goto out; - memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); - stream->out = streamout; - chunk = sctp_make_strreset_addstrm(asoc, out, 0); if (!chunk) goto out; diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c new file mode 100644 index 000000000000..0b83ec51e43b --- /dev/null +++ b/net/sctp/stream_sched.c @@ -0,0 +1,275 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* First Come First Serve (a.k.a. FIFO) + * RFC DRAFT ndata Section 3.1 + */ +static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, + __u16 value, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = 0; + return 0; +} + +static int sctp_sched_fcfs_init(struct sctp_stream *stream) +{ + return 0; +} + +static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + return 0; +} + +static void sctp_sched_fcfs_free(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ +} + +static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_chunk *ch = NULL; + struct list_head *entry; + + if (list_empty(&q->out_chunk_list)) + goto out; + + if (stream->out_curr) { + ch = list_entry(stream->out_curr->ext->outq.next, + struct sctp_chunk, stream_list); + } else { + entry = q->out_chunk_list.next; + ch = list_entry(entry, struct sctp_chunk, list); + } + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *chunk) +{ +} + +static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) +{ +} + +static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) +{ +} + +static struct sctp_sched_ops sctp_sched_fcfs = { + .set = sctp_sched_fcfs_set, + .get = sctp_sched_fcfs_get, + .init = sctp_sched_fcfs_init, + .init_sid = sctp_sched_fcfs_init_sid, + .free = sctp_sched_fcfs_free, + .enqueue = sctp_sched_fcfs_enqueue, + .dequeue = sctp_sched_fcfs_dequeue, + .dequeue_done = sctp_sched_fcfs_dequeue_done, + .sched_all = sctp_sched_fcfs_sched_all, + .unsched_all = sctp_sched_fcfs_unsched_all, +}; + +/* API to other parts of the stack */ + +extern struct sctp_sched_ops sctp_sched_prio; +extern struct sctp_sched_ops sctp_sched_rr; + +static struct sctp_sched_ops *sctp_sched_ops[] = { + &sctp_sched_fcfs, + &sctp_sched_prio, + &sctp_sched_rr, +}; + +int sctp_sched_set_sched(struct sctp_association *asoc, + enum sctp_sched_type sched) +{ + struct sctp_sched_ops *old = asoc->outqueue.sched; + struct sctp_datamsg *msg = NULL; + struct sctp_sched_ops *n; + struct sctp_chunk *ch; + int i, ret = 0; + + if (sched > SCTP_SS_MAX) + return -EINVAL; + + n = sctp_sched_ops[sched]; + if (old == n) + return ret; + + if (old) { + old->free(&asoc->stream); + + /* Give the next scheduler a clean slate. */ + for (i = 0; i < asoc->stream.outcnt; i++) { + void *p = asoc->stream.out[i].ext; + + if (!p) + continue; + + p += offsetofend(struct sctp_stream_out_ext, outq); + memset(p, 0, sizeof(struct sctp_stream_out_ext) - + offsetofend(struct sctp_stream_out_ext, outq)); + } + } + + asoc->outqueue.sched = n; + n->init(&asoc->stream); + for (i = 0; i < asoc->stream.outcnt; i++) { + if (!asoc->stream.out[i].ext) + continue; + + ret = n->init_sid(&asoc->stream, i, GFP_KERNEL); + if (ret) + goto err; + } + + /* We have to requeue all chunks already queued.
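/* Editor's note, not part of the patch: the "clean slate" loop in
 * sctp_sched_set_sched() above preserves the generic head of each
 * sctp_stream_out_ext (everything up to and including ->outq, which still
 * holds queued chunks) and zeroes only the scheduler-private tail. A
 * standalone sketch of that offsetofend() idiom, on a hypothetical struct:
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/string.h>

struct demo_ext {
	unsigned long generic_state;	/* preserved across scheduler changes */
	struct list_head outq;		/* preserved; last generic member */
	struct list_head sched_list;	/* scheduler-private, wiped */
	unsigned int sched_value;	/* scheduler-private, wiped */
};

static void demo_wipe_sched_tail(struct demo_ext *e)
{
	void *p = (void *)e + offsetofend(struct demo_ext, outq);

	memset(p, 0, sizeof(*e) - offsetofend(struct demo_ext, outq));
}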
*/ + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + if (ch->msg == msg) + continue; + msg = ch->msg; + n->enqueue(&asoc->outqueue, msg); + } + + return ret; + +err: + n->free(&asoc->stream); + asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ + + return ret; +} + +int sctp_sched_get_sched(struct sctp_association *asoc) +{ + int i; + + for (i = 0; i <= SCTP_SS_MAX; i++) + if (asoc->outqueue.sched == sctp_sched_ops[i]) + return i; + + return 0; +} + +int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, + __u16 value, gfp_t gfp) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) { + int ret; + + ret = sctp_stream_init_ext(&asoc->stream, sid); + if (ret) + return ret; + } + + return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); +} + +int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, + __u16 *value) +{ + if (sid >= asoc->stream.outcnt) + return -EINVAL; + + if (!asoc->stream.out[sid].ext) + return 0; + + return asoc->outqueue.sched->get(&asoc->stream, sid, value); +} + +void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) +{ + if (!list_is_last(&ch->frag_list, &ch->msg->chunks)) { + struct sctp_stream_out *sout; + __u16 sid; + + /* datamsg is not finished, so save it as the current one, + * in case the application switches schedulers or a higher + * priority stream comes in. + */ + sid = sctp_chunk_stream_no(ch); + sout = &q->asoc->stream.out[sid]; + q->asoc->stream.out_curr = sout; + return; + } + + q->asoc->stream.out_curr = NULL; + q->sched->dequeue_done(q, ch); +} + +/* Auxiliary functions for the schedulers */ +void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) +{ + list_del_init(&ch->list); + list_del_init(&ch->stream_list); + q->out_qlen -= ch->skb->len; +} + +int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + + INIT_LIST_HEAD(&stream->out[sid].ext->outq); + return sched->init_sid(stream, sid, gfp); +} + +struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + + asoc = container_of(stream, struct sctp_association, stream); + + return asoc->outqueue.sched; +} diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c new file mode 100644 index 000000000000..384dbf3c8760 --- /dev/null +++ b/net/sctp/stream_sched_prio.c @@ -0,0 +1,347 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>.
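/* Editor's notes on the helpers above, not part of the patch:
 * - the requeue loop in sctp_sched_set_sched() relies on all chunks of one
 *   datamsg sitting adjacently in out_chunk_list, so comparing ch->msg with
 *   the last seen msg calls n->enqueue() once per datamsg, not once per chunk;
 * - sctp_sched_dequeue_done() keeps stream.out_curr pointing at a stream whose
 *   fragmented datamsg is still in flight, which is why every scheduler's
 *   dequeue path checks out_curr first: a message is never interleaved
 *   mid-flight, even if the scheduler is switched or a higher-priority stream
 *   becomes ready.
 */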
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Priority handling + * RFC DRAFT ndata section 3.4 + */ + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream); + +static struct sctp_stream_priorities *sctp_sched_prio_new_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + + p = kmalloc(sizeof(*p), gfp); + if (!p) + return NULL; + + INIT_LIST_HEAD(&p->prio_sched); + INIT_LIST_HEAD(&p->active); + p->next = NULL; + p->prio = prio; + + return p; +} + +static struct sctp_stream_priorities *sctp_sched_prio_get_head( + struct sctp_stream *stream, int prio, gfp_t gfp) +{ + struct sctp_stream_priorities *p; + int i; + + /* Look into scheduled priorities first, as they are sorted and + * we can find it fast IF it's scheduled. + */ + list_for_each_entry(p, &stream->prio_list, prio_sched) { + if (p->prio == prio) + return p; + if (p->prio > prio) + break; + } + + /* No luck. So we search on all streams now. */ + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + + p = stream->out[i].ext->prio_head; + if (!p) + /* Means all other streams won't be initialized + * either. + */ + break; + if (p->prio == prio) + return p; + } + + /* If not even there, allocate a new one. */ + return sctp_sched_prio_new_head(stream, prio, gfp); +} + +static void sctp_sched_prio_next_stream(struct sctp_stream_priorities *p) +{ + struct list_head *pos; + + pos = p->next->prio_list.next; + if (pos == &p->active) + pos = pos->next; + p->next = list_entry(pos, struct sctp_stream_out_ext, prio_list); +} + +static bool sctp_sched_prio_unsched(struct sctp_stream_out_ext *soute) +{ + bool scheduled = false; + + if (!list_empty(&soute->prio_list)) { + struct sctp_stream_priorities *prio_head = soute->prio_head; + + /* Scheduled */ + scheduled = true; + + if (prio_head->next == soute) + /* Try to move to the next stream */ + sctp_sched_prio_next_stream(prio_head); + + list_del_init(&soute->prio_list); + + /* Also unsched the priority if this was the last stream */ + if (list_empty(&prio_head->active)) { + list_del_init(&prio_head->prio_sched); + /* If there is no stream left, clear next */ + prio_head->next = NULL; + } + } + + return scheduled; +} + +static void sctp_sched_prio_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + struct sctp_stream_priorities *prio, *prio_head; + + prio_head = soute->prio_head; + + /* Nothing to do if already scheduled */ + if (!list_empty(&soute->prio_list)) + return; + + /* Schedule the stream. If there is a next, we schedule the new + * one before it, so it's the last in round robin order. + * If there isn't, we also have to schedule the priority.
+ */ + if (prio_head->next) { + list_add(&soute->prio_list, prio_head->next->prio_list.prev); + return; + } + + list_add(&soute->prio_list, &prio_head->active); + prio_head->next = soute; + + list_for_each_entry(prio, &stream->prio_list, prio_sched) { + if (prio->prio > prio_head->prio) { + list_add(&prio_head->prio_sched, prio->prio_sched.prev); + return; + } + } + + list_add_tail(&prio_head->prio_sched, &stream->prio_list); +} + +static int sctp_sched_prio_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + struct sctp_stream_out *sout = &stream->out[sid]; + struct sctp_stream_out_ext *soute = sout->ext; + struct sctp_stream_priorities *prio_head, *old; + bool reschedule = false; + int i; + + prio_head = sctp_sched_prio_get_head(stream, prio, gfp); + if (!prio_head) + return -ENOMEM; + + reschedule = sctp_sched_prio_unsched(soute); + old = soute->prio_head; + soute->prio_head = prio_head; + if (reschedule) + sctp_sched_prio_sched(stream, soute); + + if (!old) + /* Happens when we set the priority for the first time */ + return 0; + + for (i = 0; i < stream->outcnt; i++) { + soute = stream->out[i].ext; + if (soute && soute->prio_head == old) + /* It's still in use, nothing else to do here. */ + return 0; + } + + /* No hits, we are good to free it. */ + kfree(old); + + return 0; +} + +static int sctp_sched_prio_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + *value = stream->out[sid].ext->prio_head->prio; + return 0; +} + +static int sctp_sched_prio_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->prio_list); + + return 0; +} + +static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->prio_list); + return sctp_sched_prio_set(stream, sid, 0, gfp); +} + +static void sctp_sched_prio_free(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *prio, *n; + LIST_HEAD(list); + int i; + + /* As we don't keep a list of priorities, to avoid multiple + * frees we have to do it in 3 steps: + * 1. unsched everyone, so the lists are free to use in 2. + * 2. build the list of the priorities + * 3. free the list + */ + sctp_sched_prio_unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + if (!stream->out[i].ext) + continue; + prio = stream->out[i].ext->prio_head; + if (prio && list_empty(&prio->prio_sched)) + list_add(&prio->prio_sched, &list); + } + list_for_each_entry_safe(prio, n, &list, prio_sched) { + list_del_init(&prio->prio_sched); + kfree(prio); + } +} + +static void sctp_sched_prio_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_prio_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_prio_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next. It's easy, it's either the current + * one or the first chunk on the next active stream. 
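/* Editor's note, not part of the patch: a generic sketch of the sorted-insert
 * pattern sctp_sched_prio_sched() uses to keep stream->prio_list ascending by
 * prio (the head, i.e. the lowest value, is served first). With keys {0, 2, 7}
 * scheduled, inserting 5 walks to 7 and links in before it: {0, 2, 5, 7}.
 */
#include <linux/list.h>

struct demo_prio {
	struct list_head node;
	int key;
};

static void demo_sorted_insert(struct list_head *head, struct demo_prio *new)
{
	struct demo_prio *p;

	list_for_each_entry(p, head, node) {
		if (p->key > new->key) {
			/* list_add() onto p's predecessor == insert before p */
			list_add(&new->node, p->node.prev);
			return;
		}
	}
	list_add_tail(&new->node, head);
}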
+ */ + if (stream->out_curr) { + soute = stream->out_curr->ext; + } else { + prio = list_entry(stream->prio_list.next, + struct sctp_stream_priorities, prio_sched); + soute = prio->next; + } + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_prio_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_priorities *prio; + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream on + * this priority. + */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + prio = soute->prio_head; + + sctp_sched_prio_next_stream(prio); + + if (list_empty(&soute->outq)) + sctp_sched_prio_unsched(soute); +} + +static void sctp_sched_prio_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out *sout; + struct sctp_chunk *ch; + + asoc = container_of(stream, struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + sout = &stream->out[sid]; + if (sout->ext) + sctp_sched_prio_sched(stream, sout->ext); + } +} + +static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_priorities *p, *tmp; + struct sctp_stream_out_ext *soute, *souttmp; + + list_for_each_entry_safe(p, tmp, &stream->prio_list, prio_sched) + list_for_each_entry_safe(soute, souttmp, &p->active, prio_list) + sctp_sched_prio_unsched(soute); +} + +struct sctp_sched_ops sctp_sched_prio = { + .set = sctp_sched_prio_set, + .get = sctp_sched_prio_get, + .init = sctp_sched_prio_init, + .init_sid = sctp_sched_prio_init_sid, + .free = sctp_sched_prio_free, + .enqueue = sctp_sched_prio_enqueue, + .dequeue = sctp_sched_prio_dequeue, + .dequeue_done = sctp_sched_prio_dequeue_done, + .sched_all = sctp_sched_prio_sched_all, + .unsched_all = sctp_sched_prio_unsched_all, +}; diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c new file mode 100644 index 000000000000..7612a438c5b9 --- /dev/null +++ b/net/sctp/stream_sched_rr.c @@ -0,0 +1,201 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp stream queue/scheduling. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> + */ + +#include <linux/list.h> +#include <net/sctp/sctp.h> +#include <net/sctp/sm.h> +#include <net/sctp/stream_sched.h> + +/* Round-robin handling + * RFC DRAFT ndata section 3.2 + */ +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream); + +static void sctp_sched_rr_next_stream(struct sctp_stream *stream) +{ + struct list_head *pos; + + pos = stream->rr_next->rr_list.next; + if (pos == &stream->rr_list) + pos = pos->next; + stream->rr_next = list_entry(pos, struct sctp_stream_out_ext, rr_list); +} + +static void sctp_sched_rr_unsched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (stream->rr_next == soute) + /* Try to move to the next stream */ + sctp_sched_rr_next_stream(stream); + + list_del_init(&soute->rr_list); + + /* If we have no other stream queued, clear next */ + if (list_empty(&stream->rr_list)) + stream->rr_next = NULL; +} + +static void sctp_sched_rr_sched(struct sctp_stream *stream, + struct sctp_stream_out_ext *soute) +{ + if (!list_empty(&soute->rr_list)) + /* Already scheduled. */ + return; + + /* Schedule the stream */ + list_add_tail(&soute->rr_list, &stream->rr_list); + + if (!stream->rr_next) + stream->rr_next = soute; +} + +static int sctp_sched_rr_set(struct sctp_stream *stream, __u16 sid, + __u16 prio, gfp_t gfp) +{ + return 0; +} + +static int sctp_sched_rr_get(struct sctp_stream *stream, __u16 sid, + __u16 *value) +{ + return 0; +} + +static int sctp_sched_rr_init(struct sctp_stream *stream) +{ + INIT_LIST_HEAD(&stream->rr_list); + stream->rr_next = NULL; + + return 0; +} + +static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, + gfp_t gfp) +{ + INIT_LIST_HEAD(&stream->out[sid].ext->rr_list); + + return 0; +} + +static void sctp_sched_rr_free(struct sctp_stream *stream) +{ + sctp_sched_rr_unsched_all(stream); +} + +static void sctp_sched_rr_enqueue(struct sctp_outq *q, + struct sctp_datamsg *msg) +{ + struct sctp_stream *stream; + struct sctp_chunk *ch; + __u16 sid; + + ch = list_first_entry(&msg->chunks, struct sctp_chunk, frag_list); + sid = sctp_chunk_stream_no(ch); + stream = &q->asoc->stream; + sctp_sched_rr_sched(stream, stream->out[sid].ext); +} + +static struct sctp_chunk *sctp_sched_rr_dequeue(struct sctp_outq *q) +{ + struct sctp_stream *stream = &q->asoc->stream; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch = NULL; + + /* Bail out quickly if queue is empty */ + if (list_empty(&q->out_chunk_list)) + goto out; + + /* Find which chunk is next */ + if (stream->out_curr) + soute = stream->out_curr->ext; + else + soute = stream->rr_next; + ch = list_entry(soute->outq.next, struct sctp_chunk, stream_list); + + sctp_sched_dequeue_common(q, ch); + +out: + return ch; +} + +static void sctp_sched_rr_dequeue_done(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + struct sctp_stream_out_ext *soute; + __u16 sid; + + /* Last chunk on that msg, move to the next stream */ + sid = sctp_chunk_stream_no(ch); + soute = q->asoc->stream.out[sid].ext; + + sctp_sched_rr_next_stream(&q->asoc->stream); + + if (list_empty(&soute->outq)) + sctp_sched_rr_unsched(&q->asoc->stream, soute); +} + +static void sctp_sched_rr_sched_all(struct sctp_stream *stream) +{ + struct sctp_association *asoc; + struct sctp_stream_out_ext *soute; + struct sctp_chunk *ch; + + asoc = container_of(stream,
struct sctp_association, stream); + list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { + __u16 sid; + + sid = sctp_chunk_stream_no(ch); + soute = stream->out[sid].ext; + if (soute) + sctp_sched_rr_sched(stream, soute); + } +} + +static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) +{ + struct sctp_stream_out_ext *soute, *tmp; + + list_for_each_entry_safe(soute, tmp, &stream->rr_list, rr_list) + sctp_sched_rr_unsched(stream, soute); +} + +struct sctp_sched_ops sctp_sched_rr = { + .set = sctp_sched_rr_set, + .get = sctp_sched_rr_get, + .init = sctp_sched_rr_init, + .init_sid = sctp_sched_rr_init_sid, + .free = sctp_sched_rr_free, + .enqueue = sctp_sched_rr_enqueue, + .dequeue = sctp_sched_rr_dequeue, + .dequeue_done = sctp_sched_rr_dequeue_done, + .sched_all = sctp_sched_rr_sched_all, + .unsched_all = sctp_sched_rr_unsched_all, +}; diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20b66e79c5d6..5f6a20084157 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -380,10 +380,14 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, &gattr)) continue; - if (gattr.ndev && - (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) { - lnk->gid = gid; - return 0; + if (gattr.ndev) { + if (is_vlan_dev(gattr.ndev) && + vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { + lnk->gid = gid; + dev_put(gattr.ndev); + return 0; + } + dev_put(gattr.ndev); } } return -ENODEV; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0b5852299158..468e1d725d97 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -369,26 +369,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) { - struct net_device *ndev; + struct ib_gid_attr gattr; int rc; rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], NULL); - /* the SMC protocol requires specification of the roce MAC address; - * if net_device cannot be determined, it can be derived from gid 0 - */ - ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); - if (ndev) { - memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); - dev_put(ndev); - } else if (!rc) { - memcpy(&smcibdev->mac[ibport - 1][0], - &smcibdev->gid[ibport - 1].raw[8], 3); - memcpy(&smcibdev->mac[ibport - 1][3], - &smcibdev->gid[ibport - 1].raw[13], 3); - smcibdev->mac[ibport - 1][0] &= ~0x02; - } - return rc; + &smcibdev->gid[ibport - 1], &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; } /* Create an identifier unique for this instance of SMC-R. @@ -419,6 +410,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) &smcibdev->pattr[ibport - 1]); if (rc) goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); if (rc) goto out; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 0cc83839c13c..5dea47eb31bb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -44,7 +44,7 @@ static mempool_t *rpc_buffer_mempool __read_mostly; static void rpc_async_schedule(struct work_struct *); static void rpc_release_task(struct rpc_task *task); -static void __rpc_queue_timer_fn(unsigned long ptr); +static void __rpc_queue_timer_fn(struct timer_list *t); /* * RPC tasks sit here while waiting for conditions to improve. 
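/* Editor's note, not part of the patch: the sunrpc and atm hunks in this
 * series all follow the same timer API conversion. A minimal sketch of the
 * new pattern, with a hypothetical object: the callback receives the
 * struct timer_list itself and recovers its container via from_timer()
 * instead of casting an unsigned long cookie.
 */
#include <linux/jiffies.h>
#include <linux/timer.h>

struct demo_obj {
	struct timer_list timer;
	int hits;
};

static void demo_timeout(struct timer_list *t)
{
	struct demo_obj *obj = from_timer(obj, t, timer);

	obj->hits++;
}

static void demo_arm(struct demo_obj *obj)
{
	timer_setup(&obj->timer, demo_timeout, 0);
	mod_timer(&obj->timer, jiffies + HZ);
}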
@@ -228,7 +228,7 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c queue->maxpriority = nr_queues - 1; rpc_reset_waitqueue_priority(queue); queue->qlen = 0; - setup_timer(&queue->timer_list.timer, __rpc_queue_timer_fn, (unsigned long)queue); + timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0); INIT_LIST_HEAD(&queue->timer_list.list); rpc_assign_waitqueue_name(queue, qname); } @@ -635,9 +635,9 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) } EXPORT_SYMBOL_GPL(rpc_wake_up_status); -static void __rpc_queue_timer_fn(unsigned long ptr) +static void __rpc_queue_timer_fn(struct timer_list *t) { - struct rpc_wait_queue *queue = (struct rpc_wait_queue *)ptr; + struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer); struct rpc_task *task, *n; unsigned long expires, now, timeo; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index aa04666f929d..33f4ae68426d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -455,7 +455,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, serv->sv_xdrsize = xdrsize; INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); - init_timer(&serv->sv_temptimer); + timer_setup(&serv->sv_temptimer, NULL, 0); spin_lock_init(&serv->sv_lock); __svc_init_bc(serv); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d16a8b423c20..71de77bd4423 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -28,7 +28,7 @@ module_param(svc_rpc_per_connection_limit, uint, 0644); static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); -static void svc_age_temp_xprts(unsigned long closure); +static void svc_age_temp_xprts(struct timer_list *t); static void svc_delete_xprt(struct svc_xprt *xprt); /* apparently the "standard" is that clients close @@ -785,8 +785,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp transports */ - setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, - (unsigned long)serv); + serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts; mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } @@ -960,9 +959,9 @@ out: * Timer function to close old temporary transports, using * a mark-and-sweep algorithm. 
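/* Editor's note, not part of the patch: a generic sketch of the mark-and-sweep
 * aging that svc_age_temp_xprts() implements (hypothetical types; the real
 * code uses an XPT_OLD bit on the transport). Each timer pass sweeps entries
 * still marked from the previous pass and marks the rest; activity on an
 * entry clears its mark in between.
 */
#include <linux/list.h>
#include <linux/types.h>

struct demo_conn {
	struct list_head list;
	bool old;		/* cleared whenever the connection shows activity */
};

static void demo_age_pass(struct list_head *conns, struct list_head *reap)
{
	struct demo_conn *c, *next;

	list_for_each_entry_safe(c, next, conns, list) {
		if (c->old)				/* idle for a whole period */
			list_move(&c->list, reap);	/* sweep: close later */
		else
			c->old = true;			/* mark for the next pass */
	}
}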
*/ -static void svc_age_temp_xprts(unsigned long closure) +static void svc_age_temp_xprts(struct timer_list *t) { - struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_serv *serv = from_timer(serv, t, sv_temptimer); struct svc_xprt *xprt; struct list_head *le, *next; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e741ec2b4d8e..4b00302e1867 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -696,9 +696,9 @@ xprt_schedule_autodisconnect(struct rpc_xprt *xprt) } static void -xprt_init_autodisconnect(unsigned long data) +xprt_init_autodisconnect(struct timer_list *t) { - struct rpc_xprt *xprt = (struct rpc_xprt *)data; + struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv)) @@ -1422,10 +1422,9 @@ found: xprt->idle_timeout = 0; INIT_WORK(&xprt->task_cleanup, xprt_autoclose); if (xprt_has_timer(xprt)) - setup_timer(&xprt->timer, xprt_init_autodisconnect, - (unsigned long)xprt); + timer_setup(&xprt->timer, xprt_init_autodisconnect, 0); else - init_timer(&xprt->timer); + timer_setup(&xprt->timer, NULL, 0); if (strlen(args->servername) > RPC_MAXNETNAMELEN) { xprt_destroy(xprt); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 5a936a6a31a3..df062e086bdb 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -401,7 +401,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (unlikely(n != mw->mw_nents)) goto out_mapmr_err; - dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", + dprintk("RPC: %s: Using frmr %p to map %u segments (%llu bytes)\n", __func__, frmr, mw->mw_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9b5de31aa429..c1841f234a71 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work) struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; + struct socket *sock; int status = -EIO; sock = xs_create_sock(xprt, transport, diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 31b9f9c52974..a3af73ec0b78 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -8,7 +8,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o + server.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 7d99029df342..329325bd553e 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&xmitq); + skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -258,20 +258,20 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_nlist *dests, u16 *cong_link_cnt) { + struct tipc_dest *dst, *tmp; struct sk_buff_head _pkts; - struct u32_item *n, *tmp; - u32 dst, selector; + u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - __skb_queue_head_init(&_pkts); + 
skb_queue_head_init(&_pkts); - list_for_each_entry_safe(n, tmp, &dests->list, list) { - dst = n->value; - if (!tipc_msg_pskb_copy(dst, pkts, &_pkts)) + list_for_each_entry_safe(dst, tmp, &dests->list, list) { + dnode = dst->node; + if (!tipc_msg_pskb_copy(dnode, pkts, &_pkts)) return -ENOMEM; /* Any other return value than -ELINKCONG is ignored */ - if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG) + if (tipc_node_xmit(net, &_pkts, dnode, selector) == -ELINKCONG) (*cong_link_cnt)++; } return 0; @@ -554,7 +554,7 @@ void tipc_nlist_add(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = true; - else if (u32_push(&nl->list, node)) + else if (tipc_dest_push(&nl->list, node, 0)) nl->remote++; } @@ -562,13 +562,13 @@ void tipc_nlist_del(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = false; - else if (u32_del(&nl->list, node)) + else if (tipc_dest_del(&nl->list, node, 0)) nl->remote--; } void tipc_nlist_purge(struct tipc_nlist *nl) { - u32_list_purge(&nl->list); + tipc_dest_list_purge(&nl->list); nl->remote = 0; nl->local = 0; } diff --git a/net/tipc/core.h b/net/tipc/core.h index 5cc5398be722..964342689f2c 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -132,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline struct tipc_server *tipc_topsrv(struct net *net) +{ + return tipc_net(net)->topsrv; +} + static inline unsigned int tipc_hashfn(u32 addr) { return addr & (NODE_HTABLE_SIZE - 1); diff --git a/net/tipc/group.c b/net/tipc/group.c new file mode 100644 index 000000000000..7821085a7dd8 --- /dev/null +++ b/net/tipc/group.c @@ -0,0 +1,871 @@ +/* + * net/tipc/group.c: TIPC group messaging code + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
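/* Editor's note, worked example for tipc_group_rcvbuf_limit() below, not part
 * of the patch: with 199 other members, mcnt = 200, so max_active =
 * max(min(200 / 8, 64), 16) = 25. Since ADV_ACTIVE = 12 * ADV_UNIT and
 * ADV_IDLE = ADV_UNIT, the pools add up to 25 * 12 + 175 = 475 ADV_UNIT
 * advertisement blocks, and the returned limit is
 * 475 * ADV_UNIT * FLOWCTL_BLK_SZ * 4 bytes, the factor 4 covering the
 * worst-case truesize/msgsize ratio noted in the code.
 */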
+ */ + +#include "core.h" +#include "addr.h" +#include "group.h" +#include "bcast.h" +#include "server.h" +#include "msg.h" +#include "socket.h" +#include "node.h" +#include "name_table.h" +#include "subscr.h" + +#define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) +#define ADV_IDLE ADV_UNIT +#define ADV_ACTIVE (ADV_UNIT * 12) + +enum mbr_state { + MBR_QUARANTINED, + MBR_DISCOVERED, + MBR_JOINING, + MBR_PUBLISHED, + MBR_JOINED, + MBR_PENDING, + MBR_ACTIVE, + MBR_RECLAIMING, + MBR_REMITTED, + MBR_LEAVING +}; + +struct tipc_member { + struct rb_node tree_node; + struct list_head list; + struct list_head congested; + struct sk_buff *event_msg; + struct sk_buff_head deferredq; + struct tipc_group *group; + u32 node; + u32 port; + u32 instance; + enum mbr_state state; + u16 advertised; + u16 window; + u16 bc_rcv_nxt; + u16 bc_syncpt; + u16 bc_acked; + bool usr_pending; +}; + +struct tipc_group { + struct rb_root members; + struct list_head congested; + struct list_head pending; + struct list_head active; + struct list_head reclaiming; + struct tipc_nlist dests; + struct net *net; + int subid; + u32 type; + u32 instance; + u32 domain; + u32 scope; + u32 portid; + u16 member_cnt; + u16 active_cnt; + u16 max_active; + u16 bc_snd_nxt; + u16 bc_ackers; + bool loopback; + bool events; +}; + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq); + +static void tipc_group_decr_active(struct tipc_group *grp, + struct tipc_member *m) +{ + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + grp->active_cnt--; +} + +static int tipc_group_rcvbuf_limit(struct tipc_group *grp) +{ + int max_active, active_pool, idle_pool; + int mcnt = grp->member_cnt + 1; + + /* Limit simultaneous reception from other members */ + max_active = min(mcnt / 8, 64); + max_active = max(max_active, 16); + grp->max_active = max_active; + + /* Reserve blocks for active and idle members */ + active_pool = max_active * ADV_ACTIVE; + idle_pool = (mcnt - max_active) * ADV_IDLE; + + /* Scale to bytes, considering worst-case truesize/msgsize ratio */ + return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; +} + +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) +{ + return grp->bc_snd_nxt; +} + +static bool tipc_group_is_enabled(struct tipc_member *m) +{ + return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING; +} + +static bool tipc_group_is_receiver(struct tipc_member *m) +{ + return m && m->state >= MBR_JOINED; +} + +u32 tipc_group_exclude(struct tipc_group *grp) +{ + if (!grp->loopback) + return grp->portid; + return 0; +} + +int tipc_group_size(struct tipc_group *grp) +{ + return grp->member_cnt; +} + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq) +{ + struct tipc_group *grp; + u32 type = mreq->type; + + grp = kzalloc(sizeof(*grp), GFP_ATOMIC); + if (!grp) + return NULL; + tipc_nlist_init(&grp->dests, tipc_own_addr(net)); + INIT_LIST_HEAD(&grp->congested); + INIT_LIST_HEAD(&grp->active); + INIT_LIST_HEAD(&grp->pending); + INIT_LIST_HEAD(&grp->reclaiming); + grp->members = RB_ROOT; + grp->net = net; + grp->portid = portid; + grp->domain = addr_domain(net, mreq->scope); + grp->type = type; + grp->instance = mreq->instance; + grp->scope = mreq->scope; + grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + return grp; + kfree(grp); + return NULL; +} + +void 
tipc_group_delete(struct net *net, struct tipc_group *grp) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + __skb_queue_head_init(&xmitq); + + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); + list_del(&m->list); + kfree(m); + } + tipc_node_distr_xmit(net, &xmitq); + tipc_nlist_purge(&grp->dests); + tipc_topsrv_kern_unsubscr(net, grp->subid); + kfree(grp); +} + +struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) +{ + struct rb_node *n = grp->members.rb_node; + u64 nkey, key = (u64)node << 32 | port; + struct tipc_member *m; + + while (n) { + m = container_of(n, struct tipc_member, tree_node); + nkey = (u64)m->node << 32 | m->port; + if (key < nkey) + n = n->rb_left; + else if (key > nkey) + n = n->rb_right; + else + return m; + } + return NULL; +} + +static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, + u32 node, u32 port) +{ + struct tipc_member *m; + + m = tipc_group_find_member(grp, node, port); + if (m && tipc_group_is_enabled(m)) + return m; + return NULL; +} + +static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, + u32 node) +{ + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (m->node == node) + return m; + } + return NULL; +} + +static void tipc_group_add_to_tree(struct tipc_group *grp, + struct tipc_member *m) +{ + u64 nkey, key = (u64)m->node << 32 | m->port; + struct rb_node **n, *parent = NULL; + struct tipc_member *tmp; + + n = &grp->members.rb_node; + while (*n) { + tmp = container_of(*n, struct tipc_member, tree_node); + parent = *n; + tmp = container_of(parent, struct tipc_member, tree_node); + nkey = (u64)tmp->node << 32 | tmp->port; + if (key < nkey) + n = &(*n)->rb_left; + else if (key > nkey) + n = &(*n)->rb_right; + else + return; + } + rb_link_node(&m->tree_node, parent, n); + rb_insert_color(&m->tree_node, &grp->members); +} + +static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, + u32 node, u32 port, + int state) +{ + struct tipc_member *m; + + m = kzalloc(sizeof(*m), GFP_ATOMIC); + if (!m) + return NULL; + INIT_LIST_HEAD(&m->list); + INIT_LIST_HEAD(&m->congested); + __skb_queue_head_init(&m->deferredq); + m->group = grp; + m->node = node; + m->port = port; + m->bc_acked = grp->bc_snd_nxt - 1; + grp->member_cnt++; + tipc_group_add_to_tree(grp, m); + tipc_nlist_add(&grp->dests, m->node); + m->state = state; + return m; +} + +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +{ + tipc_group_create_member(grp, node, port, MBR_DISCOVERED); +} + +static void tipc_group_delete_member(struct tipc_group *grp, + struct tipc_member *m) +{ + rb_erase(&m->tree_node, &grp->members); + grp->member_cnt--; + + /* Check if we were waiting for replicast ack from this member */ + if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) + grp->bc_ackers--; + + list_del_init(&m->list); + list_del_init(&m->congested); + tipc_group_decr_active(grp, m); + + /* If last member on a node, remove node from dest list */ + if (!tipc_group_find_node(grp, m->node)) + tipc_nlist_del(&grp->dests, m->node); + + kfree(m); +} + +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) +{ + return &grp->dests; +} + +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope) +{ + seq->type = grp->type; + 
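/* Editor's note, not part of the patch: the member rb-tree above orders
 * entries by one 64-bit key, node in the high word and port in the low word,
 * so members sort by node first, then port, and each lookup step compares a
 * single u64. A standalone sketch:
 */
#include <linux/types.h>

static inline u64 demo_member_key(u32 node, u32 port)
{
	return (u64)node << 32 | port;
}
/* e.g. demo_member_key(0x01001001, 5) == 0x0100100100000005 */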
seq->lower = grp->instance; + seq->upper = grp->instance; + *scope = grp->scope; +} + +void tipc_group_update_member(struct tipc_member *m, int len) +{ + struct tipc_group *grp = m->group; + struct tipc_member *_m, *tmp; + + if (!tipc_group_is_enabled(m)) + return; + + m->window -= len; + + if (m->window >= ADV_IDLE) + return; + + if (!list_empty(&m->congested)) + return; + + /* Sort member into congested members' list */ + list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { + if (m->window > _m->window) + continue; + list_add_tail(&m->congested, &_m->congested); + return; + } + list_add_tail(&m->congested, &grp->congested); +} + +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) +{ + u16 prev = grp->bc_snd_nxt - 1; + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (tipc_group_is_enabled(m)) { + tipc_group_update_member(m, len); + m->bc_acked = prev; + } + } + + /* Mark number of acknowledges to expect, if any */ + if (ack) + grp->bc_ackers = grp->member_cnt; + grp->bc_snd_nxt++; +} + +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **mbr) +{ + struct sk_buff_head xmitq; + struct tipc_member *m; + int adv, state; + + m = tipc_group_find_dest(grp, dnode, dport); + *mbr = m; + if (!m) + return false; + if (m->usr_pending) + return true; + if (m->window >= len) + return false; + m->usr_pending = true; + + /* If not fully advertised, do it now to prevent mutual blocking */ + adv = m->advertised; + state = m->state; + if (state < MBR_JOINED) + return true; + if (state == MBR_JOINED && adv == ADV_IDLE) + return true; + if (state == MBR_ACTIVE && adv == ADV_ACTIVE) + return true; + if (state == MBR_PENDING && adv == ADV_IDLE) + return true; + skb_queue_head_init(&xmitq); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); + tipc_node_distr_xmit(grp->net, &xmitq); + return true; +} + +bool tipc_group_bc_cong(struct tipc_group *grp, int len) +{ + struct tipc_member *m = NULL; + + /* If prev bcast was replicast, reject until all receivers have acked */ + if (grp->bc_ackers) + return true; + + if (list_empty(&grp->congested)) + return false; + + m = list_first_entry(&grp->congested, struct tipc_member, congested); + if (m->window >= len) + return false; + + return tipc_group_cong(grp, m->node, m->port, len, &m); +} + +/* tipc_group_sort_msg() - sort msg into queue by bcast sequence number + */ +static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) +{ + struct tipc_msg *_hdr, *hdr = buf_msg(skb); + u16 bc_seqno = msg_grp_bc_seqno(hdr); + struct sk_buff *_skb, *tmp; + int mtyp = msg_type(hdr); + + /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ + if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { + skb_queue_walk_safe(defq, _skb, tmp) { + _hdr = buf_msg(_skb); + if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) + continue; + __skb_queue_before(defq, _skb, skb); + return; + } + /* Bcast was not bypassed, - add to tail */ + } + /* Unicasts are never bypassed, - always add to tail */ + __skb_queue_tail(defq, skb); +} + +/* tipc_group_filter_msg() - determine if we should accept arriving message + */ +void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + bool ack, deliver, update, leave = false; + struct sk_buff_head *defq; + struct 
tipc_member *m; + struct tipc_msg *hdr; + u32 node, port; + int mtyp, blks; + + if (!skb) + return; + + hdr = buf_msg(skb); + node = msg_orignode(hdr); + port = msg_origport(hdr); + + if (!msg_in_group(hdr)) + goto drop; + + m = tipc_group_find_member(grp, node, port); + if (!tipc_group_is_receiver(m)) + goto drop; + + if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + goto drop; + + TIPC_SKB_CB(skb)->orig_member = m->instance; + defq = &m->deferredq; + tipc_group_sort_msg(skb, defq); + + while ((skb = skb_peek(defq))) { + hdr = buf_msg(skb); + mtyp = msg_type(hdr); + deliver = true; + ack = false; + update = false; + + if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + break; + + /* Decide what to do with message */ + switch (mtyp) { + case TIPC_GRP_MCAST_MSG: + if (msg_nameinst(hdr) != grp->instance) { + update = true; + deliver = false; + } + /* Fall thru */ + case TIPC_GRP_BCAST_MSG: + m->bc_rcv_nxt++; + ack = msg_grp_bc_ack_req(hdr); + break; + case TIPC_GRP_UCAST_MSG: + break; + case TIPC_GRP_MEMBER_EVT: + if (m->state == MBR_LEAVING) + leave = true; + if (!grp->events) + deliver = false; + break; + default: + break; + } + + /* Execute decisions */ + __skb_dequeue(defq); + if (deliver) + __skb_queue_tail(inputq, skb); + else + kfree_skb(skb); + + if (ack) + tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); + + if (leave) { + tipc_group_delete_member(grp, m); + __skb_queue_purge(defq); + break; + } + if (!update) + continue; + + blks = msg_blocks(hdr); + tipc_group_update_rcv_win(grp, blks, node, port, xmitq); + } + return; +drop: + kfree_skb(skb); +} + +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq) +{ + struct list_head *active = &grp->active; + int max_active = grp->max_active; + int reclaim_limit = max_active * 3 / 4; + int active_cnt = grp->active_cnt; + struct tipc_member *m, *rm; + + m = tipc_group_find_member(grp, node, port); + if (!m) + return; + + m->advertised -= blks; + + switch (m->state) { + case MBR_JOINED: + /* Reclaim advertised space from least active member */ + if (!list_empty(active) && active_cnt >= reclaim_limit) { + rm = list_first_entry(active, struct tipc_member, list); + rm->state = MBR_RECLAIMING; + list_move_tail(&rm->list, &grp->reclaiming); + tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); + } + /* If max active, become pending and wait for reclaimed space */ + if (active_cnt >= max_active) { + m->state = MBR_PENDING; + list_add_tail(&m->list, &grp->pending); + break; + } + /* Otherwise become active */ + m->state = MBR_ACTIVE; + list_add_tail(&m->list, &grp->active); + grp->active_cnt++; + /* Fall through */ + case MBR_ACTIVE: + if (!list_is_last(&m->list, &grp->active)) + list_move_tail(&m->list, &grp->active); + if (m->advertised > (ADV_ACTIVE * 3 / 4)) + break; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + break; + case MBR_REMITTED: + if (m->advertised > ADV_IDLE) + break; + m->state = MBR_JOINED; + if (m->advertised < ADV_IDLE) { + pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + } + break; + case MBR_RECLAIMING: + case MBR_DISCOVERED: + case MBR_JOINING: + case MBR_LEAVING: + default: + break; + } +} + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr; + struct sk_buff *skb; + int adv = 0; + + skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, + m->node, tipc_own_addr(grp->net), + m->port, 
grp->portid, 0); + if (!skb) + return; + + if (m->state == MBR_ACTIVE) + adv = ADV_ACTIVE - m->advertised; + else if (m->state == MBR_JOINED || m->state == MBR_PENDING) + adv = ADV_IDLE - m->advertised; + + hdr = buf_msg(skb); + + if (mtyp == GRP_JOIN_MSG) { + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } else if (mtyp == GRP_LEAVE_MSG) { + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + } else if (mtyp == GRP_ADV_MSG) { + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } else if (mtyp == GRP_ACK_MSG) { + msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); + } else if (mtyp == GRP_REMIT_MSG) { + msg_set_grp_remitted(hdr, m->window); + } + __skb_queue_tail(xmitq, skb); +} + +void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, + struct tipc_msg *hdr, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + u32 node = msg_orignode(hdr); + u32 port = msg_origport(hdr); + struct tipc_member *m, *pm; + struct tipc_msg *ehdr; + u16 remitted, in_flight; + + if (!grp) + return; + + m = tipc_group_find_member(grp, node, port); + + switch (msg_type(hdr)) { + case GRP_JOIN_MSG: + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_QUARANTINED); + if (!m) + return; + m->bc_syncpt = msg_grp_bc_syncpt(hdr); + m->bc_rcv_nxt = m->bc_syncpt; + m->window += msg_adv_win(hdr); + + /* Wait until PUBLISH event is received */ + if (m->state == MBR_DISCOVERED) { + m->state = MBR_JOINING; + } else if (m->state == MBR_PUBLISHED) { + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); + __skb_queue_tail(inputq, m->event_msg); + } + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); + return; + case GRP_LEAVE_MSG: + if (!m) + return; + m->bc_syncpt = msg_grp_bc_syncpt(hdr); + + /* Wait until WITHDRAW event is received */ + if (m->state != MBR_LEAVING) { + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + return; + } + /* Otherwise deliver already received WITHDRAW event */ + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); + __skb_queue_tail(inputq, m->event_msg); + *usr_wakeup = true; + list_del_init(&m->congested); + return; + case GRP_ADV_MSG: + if (!m) + return; + m->window += msg_adv_win(hdr); + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + list_del_init(&m->congested); + return; + case GRP_ACK_MSG: + if (!m) + return; + m->bc_acked = msg_grp_bc_acked(hdr); + if (--grp->bc_ackers) + break; + *usr_wakeup = true; + m->usr_pending = false; + return; + case GRP_RECLAIM_MSG: + if (!m) + return; + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); + m->window = ADV_IDLE; + return; + case GRP_REMIT_MSG: + if (!m || m->state != MBR_RECLAIMING) + return; + + list_del_init(&m->list); + grp->active_cnt--; + remitted = msg_grp_remitted(hdr); + + /* Messages preceding the REMIT still in receive queue */ + if (m->advertised > remitted) { + m->state = MBR_REMITTED; + in_flight = m->advertised - remitted; + } + /* All messages preceding the REMIT have been read */ + if (m->advertised <= remitted) { + m->state = MBR_JOINED; + in_flight = 0; + } + /* ..and the REMIT overtaken by more messages => re-advertise */ + if (m->advertised < remitted) + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + + m->advertised = ADV_IDLE + in_flight; + + /* Set 
oldest pending member to active and advertise */ + if (list_empty(&grp->pending)) + return; + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); + return; + default: + pr_warn("Received unknown GROUP_PROTO message\n"); + } +} + +/* tipc_group_member_evt() - receive and handle a member up/down event + */ +void tipc_group_member_evt(struct tipc_group *grp, + bool *usr_wakeup, + int *sk_rcvbuf, + struct sk_buff *skb, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_event *evt = (void *)msg_data(hdr); + u32 instance = evt->found_lower; + u32 node = evt->port.node; + u32 port = evt->port.ref; + int event = evt->event; + struct tipc_member *m; + struct net *net; + bool node_up; + u32 self; + + if (!grp) + goto drop; + + net = grp->net; + self = tipc_own_addr(net); + if (!grp->loopback && node == self && port == grp->portid) + goto drop; + + /* Convert message before delivery to user */ + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); + msg_set_origport(hdr, port); + msg_set_orignode(hdr, node); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + + m = tipc_group_find_member(grp, node, port); + + if (event == TIPC_PUBLISHED) { + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_DISCOVERED); + if (!m) + goto drop; + + /* Hold back event if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) { + m->event_msg = skb; + m->state = MBR_PUBLISHED; + } else { + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + __skb_queue_tail(inputq, skb); + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + } + m->instance = instance; + TIPC_SKB_CB(skb)->orig_member = m->instance; + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); + } else if (event == TIPC_WITHDRAWN) { + if (!m) + goto drop; + + TIPC_SKB_CB(skb)->orig_member = m->instance; + + *usr_wakeup = true; + m->usr_pending = false; + node_up = tipc_node_is_up(net, node); + + /* Hold back event if more messages might be expected */ + if (m->state != MBR_LEAVING && node_up) { + m->event_msg = skb; + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + } else { + if (node_up) + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + else + msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); + __skb_queue_tail(inputq, skb); + } + list_del_init(&m->congested); + } + *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); + return; +drop: + kfree_skb(skb); +} diff --git a/net/tipc/group.h b/net/tipc/group.h new file mode 100644 index 000000000000..d525e1cd7de5 --- /dev/null +++ b/net/tipc/group.h @@ -0,0 +1,73 @@ +/* + * net/tipc/group.h: Include file for TIPC group unicast/multicast functions + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_GROUP_H +#define _TIPC_GROUP_H + +#include "core.h" + +struct tipc_group; +struct tipc_member; +struct tipc_msg; + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq); +void tipc_group_delete(struct net *net, struct tipc_group *grp); +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope); +u32 tipc_group_exclude(struct tipc_group *grp); +void tipc_group_filter_msg(struct tipc_group *grp, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup, + int *sk_rcvbuf, struct sk_buff *skb, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, + struct tipc_msg *hdr, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack); +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **m); +bool tipc_group_bc_cong(struct tipc_group *grp, int len); +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq); +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); +void tipc_group_update_member(struct tipc_member *m, int len); +int tipc_group_size(struct tipc_group *grp); +#endif diff --git a/net/tipc/link.c b/net/tipc/link.c index ac0144f532aa..870b9b8f877a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1039,6 +1039,7 @@ int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { + struct sk_buff_head *mc_inputq = l->bc_rcvlink->inputq; struct tipc_msg *hdr = buf_msg(skb); switch (msg_user(hdr)) { @@ -1046,13 +1047,16 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case 
TIPC_CRITICAL_IMPORTANCE: - if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) { - skb_queue_tail(l->bc_rcvlink->inputq, skb); + if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) { + skb_queue_tail(mc_inputq, skb); return true; } case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; + case GROUP_PROTOCOL: + skb_queue_tail(mc_inputq, skb); + return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; skb_queue_tail(l->namedq, skb); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 6ef379f004ac..1649d456e22d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -551,7 +551,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) return false; if (msg_errcode(msg)) return false; - *err = -TIPC_ERR_NO_NAME; + *err = TIPC_ERR_NO_NAME; if (skb_linearize(skb)) return false; msg = buf_msg(skb); @@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; + + if (!skb_cloned(skb)) + return true; + + /* Unclone buffer in case it was bundled */ + if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) + return false; + return true; } @@ -658,3 +666,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, } kfree_skb(skb); } + +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) + __skb_queue_tail(xmitq, skb); +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index c843fd2bc48d..cedf811317fb 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014-2015 Ericsson AB + * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. 
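
[A note on the tipc_msg_lookup_dest() change above: the buffer may be a clone left over from message bundling, so its header must be made private before the destination fields are rewritten. A minimal sketch of that guard, assuming it were split into a helper (hypothetical name; BUF_HEADROOM and BUF_TAILROOM are the existing msg.c constants, skb helpers from <linux/skbuff.h>):

static bool tipc_skb_make_writable(struct sk_buff *skb)
{
	/* A cloned skb shares its data area with its sibling; rewriting
	 * the header in place would corrupt the other copy, so take a
	 * private copy first. GFP_ATOMIC: this can run in softirq.
	 */
	if (!skb_cloned(skb))
		return true;
	return pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM,
				GFP_ATOMIC) == 0;
}
]
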
* @@ -61,10 +61,14 @@ struct plist; /* * Payload message types */ -#define TIPC_CONN_MSG 0 -#define TIPC_MCAST_MSG 1 -#define TIPC_NAMED_MSG 2 -#define TIPC_DIRECT_MSG 3 +#define TIPC_CONN_MSG 0 +#define TIPC_MCAST_MSG 1 +#define TIPC_NAMED_MSG 2 +#define TIPC_DIRECT_MSG 3 +#define TIPC_GRP_MEMBER_EVT 4 +#define TIPC_GRP_BCAST_MSG 5 +#define TIPC_GRP_MCAST_MSG 6 +#define TIPC_GRP_UCAST_MSG 7 /* * Internal message users @@ -73,11 +77,13 @@ struct plist; #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 +#define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define SOCK_WAKEUP 14 /* pseudo user */ +#define TOP_SRV 15 /* pseudo user */ /* * Message header sizes @@ -86,6 +92,7 @@ struct plist; #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ +#define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ @@ -96,6 +103,7 @@ struct plist; struct tipc_skb_cb { u32 bytes_read; + u32 orig_member; struct sk_buff *tail; bool validated; u16 chain_imp; @@ -188,6 +196,11 @@ static inline u32 msg_size(struct tipc_msg *m) return msg_bits(m, 0, 0, 0x1ffff); } +static inline u32 msg_blocks(struct tipc_msg *m) +{ + return (msg_size(m) / 1024) + 1; +} + static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); @@ -251,6 +264,18 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 29, 0x7, n); } +static inline int msg_in_group(struct tipc_msg *m) +{ + int mtyp = msg_type(m); + + return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; +} + +static inline bool msg_is_grp_evt(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_GRP_MEMBER_EVT; +} + static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; @@ -258,7 +283,10 @@ static inline u32 msg_named(struct tipc_msg *m) static inline u32 msg_mcast(struct tipc_msg *m) { - return msg_type(m) == TIPC_MCAST_MSG; + int mtyp = msg_type(m); + + return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || + (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) @@ -514,6 +542,16 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define DSC_RESP_MSG 1 /* + * Group protocol message types + */ +#define GRP_JOIN_MSG 0 +#define GRP_LEAVE_MSG 1 +#define GRP_ADV_MSG 2 +#define GRP_ACK_MSG 3 +#define GRP_RECLAIM_MSG 4 +#define GRP_REMIT_MSG 5 + +/* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) @@ -764,12 +802,12 @@ static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_adv_win(struct tipc_msg *m) +static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } -static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } @@ -794,6 +832,68 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static 
inline u16 msg_grp_bc_acked(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static inline u16 msg_grp_remitted(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +/* Word 10 + */ +static inline u16 msg_grp_evt(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x3); +} + +static inline void msg_set_grp_evt(struct tipc_msg *m, int n) +{ + msg_set_bits(m, 10, 0, 0x3, n); +} + +static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x1); +} + +static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) +{ + msg_set_bits(m, 10, 0, 0x1, n); +} + +static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 10, 16, 0xffff); +} + +static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 10, 16, 0xffff, n); +} + static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) @@ -818,6 +918,8 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bd0aac87b41a..2856e19e036e 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -43,6 +43,7 @@ #include "bcast.h" #include "addr.h" #include "node.h" +#include "group.h" #include <net/genetlink.h> #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -596,18 +597,47 @@ not_found: return ref; } -/** - * tipc_nametbl_mc_translate - find multicast destinations - * - * Creates list of all local ports that overlap the given multicast address; - * also determines if any off-node ports overlap. - * - * Note: Publications with a scope narrower than 'limit' are ignored. - * (i.e. 
local node-scope publications mustn't receive messages arriving - * from another node, even if the multcast link brought it here) - * - * Returns non-zero if any off-node ports overlap - */ +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all) +{ + u32 self = tipc_own_addr(net); + struct publication *publ; + struct name_info *info; + struct name_seq *seq; + struct sub_seq *sseq; + + if (!tipc_in_scope(domain, self)) + return false; + + *dstcnt = 0; + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (unlikely(!seq)) + goto exit; + spin_lock_bh(&seq->lock); + sseq = nameseq_find_subseq(seq, instance); + if (likely(sseq)) { + info = sseq->info; + list_for_each_entry(publ, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, publ->node)) + continue; + if (publ->ref == exclude && publ->node == self) + continue; + tipc_dest_push(dsts, publ->node, publ->ref); + (*dstcnt)++; + if (all) + continue; + list_move_tail(&publ->zone_list, &info->zone_list); + break; + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); + return !list_empty(dsts); +} + int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports) { @@ -634,7 +664,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, info = sseq->info; list_for_each_entry(publ, &info->node_list, node_list) { if (publ->scope <= limit) - u32_push(dports, publ->ref); + tipc_dest_push(dports, 0, publ->ref); } if (info->cluster_list_size != info->node_list_size) @@ -679,6 +709,37 @@ exit: rcu_read_unlock(); } +/* tipc_nametbl_build_group - build list of communication group members + */ +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain) +{ + struct sub_seq *sseq, *stop; + struct name_info *info; + struct publication *p; + struct name_seq *seq; + + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + sseq = seq->sseqs; + stop = seq->sseqs + seq->first_free; + for (; sseq != stop; sseq++) { + info = sseq->info; + list_for_each_entry(p, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, p->node)) + continue; + tipc_group_add_member(grp, p->node, p->ref); + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); +} + /* * tipc_nametbl_publish - add name publication to network name tables */ @@ -1057,78 +1118,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -bool u32_find(struct list_head *l, u32 value) +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return true; + list_for_each_entry(dst, l, list) { + if (dst->value != value) + continue; + return dst; } - return false; + return NULL; } -bool u32_push(struct list_head *l, u32 value) +bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return false; - } - item = kmalloc(sizeof(*item), GFP_ATOMIC); - if (unlikely(!item)) + if (tipc_dest_find(l, node, port)) return false; - item->value = value; - list_add(&item->list, l); + dst = kmalloc(sizeof(*dst), GFP_ATOMIC); + if (unlikely(!dst)) + return false; + 
dst->value = value; + list_add(&dst->list, l); return true; } -u32 u32_pop(struct list_head *l) +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { - struct u32_item *item; - u32 value = 0; + struct tipc_dest *dst; if (list_empty(l)) - return 0; - item = list_first_entry(l, typeof(*item), list); - value = item->value; - list_del(&item->list); - kfree(item); - return value; + return false; + dst = list_first_entry(l, typeof(*dst), list); + if (port) + *port = dst->port; + if (node) + *node = dst->node; + list_del(&dst->list); + kfree(dst); + return true; } -bool u32_del(struct list_head *l, u32 value) +bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { - struct u32_item *item, *tmp; + struct tipc_dest *dst; - list_for_each_entry_safe(item, tmp, l, list) { - if (item->value != value) - continue; - list_del(&item->list); - kfree(item); - return true; - } - return false; + dst = tipc_dest_find(l, node, port); + if (!dst) + return false; + list_del(&dst->list); + kfree(dst); + return true; } -void u32_list_purge(struct list_head *l) +void tipc_dest_list_purge(struct list_head *l) { - struct u32_item *item, *tmp; + struct tipc_dest *dst, *tmp; - list_for_each_entry_safe(item, tmp, l, list) { - list_del(&item->list); - kfree(item); + list_for_each_entry_safe(dst, tmp, l, list) { + list_del(&dst->list); + kfree(dst); } } -int u32_list_len(struct list_head *l) +int tipc_dest_list_len(struct list_head *l) { - struct u32_item *item; + struct tipc_dest *dst; int i = 0; - list_for_each_entry(item, l, list) { + list_for_each_entry(dst, l, list) { i++; } return i; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 6ebdeb1d84a5..71926e429446 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -40,6 +40,7 @@ struct tipc_subscription; struct tipc_plist; struct tipc_nlist; +struct tipc_group; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -101,9 +102,14 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports); +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, u32 domain, struct tipc_nlist *nodes); +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key); @@ -120,16 +126,22 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); -struct u32_item { +struct tipc_dest { struct list_head list; - u32 value; + union { + struct { + u32 port; + u32 node; + }; + u64 value; + }; }; -bool u32_push(struct list_head *l, u32 value); -u32 u32_pop(struct list_head *l); -bool u32_find(struct list_head *l, u32 value); -bool u32_del(struct list_head *l, u32 value); -void u32_list_purge(struct list_head *l); -int u32_list_len(struct list_head *l); +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port); +bool tipc_dest_push(struct list_head *l, u32 node, u32 port); +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port); +bool tipc_dest_del(struct list_head *l, u32 node, u32 
port); +void tipc_dest_list_purge(struct list_head *l); +int tipc_dest_list_len(struct list_head *l); #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 198dbc7adbe1..89f8ac73bf65 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -157,7 +157,7 @@ static void tipc_node_timeout(unsigned long data); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static void tipc_node_put(struct tipc_node *node); -static bool tipc_node_is_up(struct tipc_node *n); +static bool node_is_up(struct tipc_node *n); struct tipc_sock_conn { u32 port; @@ -657,7 +657,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, *slot1 = i; } - if (!tipc_node_is_up(n)) { + if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); @@ -717,11 +717,27 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_sk_rcv(n->net, &le->inputq); } -static bool tipc_node_is_up(struct tipc_node *n) +static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } +bool tipc_node_is_up(struct net *net, u32 addr) +{ + struct tipc_node *n; + bool retval = false; + + if (in_own_node(net, addr)) + return true; + + n = tipc_node_find(net, addr); + if (!n) + return false; + retval = node_is_up(n); + tipc_node_put(n); + return retval; +} + void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_bearer *b, u16 capabilities, u32 signature, @@ -1149,7 +1165,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; - if (tipc_node_is_up(node)) + if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; @@ -1238,6 +1254,22 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +/* tipc_node_distr_xmit(): send single buffer msgs to individual destinations + * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected + */ +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb; + u32 selector, dnode; + + while ((skb = __skb_dequeue(xmitq))) { + selector = msg_origport(buf_msg(skb)); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, selector); + } + return 0; +} + void tipc_node_broadcast(struct net *net, struct sk_buff *skb) { struct sk_buff *txskb; @@ -1249,7 +1281,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) dst = n->addr; if (in_own_node(net, dst)) continue; - if (!tipc_node_is_up(n)) + if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) diff --git a/net/tipc/node.h b/net/tipc/node.h index 898c22916984..acd58d23a70e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -48,7 +48,8 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), - TIPC_BCAST_RCAST = (1 << 4) + TIPC_BCAST_RCAST = (1 << 4), + TIPC_MCAST_GROUPS = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ @@ -68,6 +69,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector); +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *list); int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); void 
tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); @@ -76,6 +78,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/server.c b/net/tipc/server.c index 3cd6402e812c..713077536d0c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -36,6 +36,8 @@ #include "server.h" #include "core.h" #include "socket.h" +#include "addr.h" +#include "msg.h" #include <net/sock.h> #include <linux/module.h> @@ -105,13 +107,11 @@ static void tipc_conn_kref_release(struct kref *kref) kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); sock_release(sock); con->sock = NULL; - - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); } - + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); tipc_clean_outqueues(con); kfree(con); } @@ -197,7 +197,8 @@ static void tipc_close_conn(struct tipc_conn *con) struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - tipc_unregister_callbacks(con); + if (con->sock) + tipc_unregister_callbacks(con); if (con->conid) s->tipc_conn_release(con->conid, con->usr_data); @@ -207,8 +208,8 @@ static void tipc_close_conn(struct tipc_conn *con) * are harmless for us here as we have already deleted this * connection from server connection list. 
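
[The server changes below let kernel-internal subscribers attach to the topology server without a socket (con->sock == NULL). For comparison, the long-standing user-space way to obtain the same name-table events is to connect to the topology service and write a struct tipc_subscr to it; a minimal sketch, assuming the standard <linux/tipc.h> definitions and native byte order:

#include <linux/tipc.h>
#include <sys/socket.h>
#include <unistd.h>

/* Subscribe to port availability events for one name sequence and
 * return a socket from which struct tipc_event records can be read.
 */
static int subscribe_ports(__u32 type, __u32 lower, __u32 upper)
{
	struct sockaddr_tipc topsrv = {
		.family = AF_TIPC,
		.addrtype = TIPC_ADDR_NAME,
		.addr.name.name = { TIPC_TOP_SRV, TIPC_TOP_SRV },
	};
	struct tipc_subscr sub = {
		.seq = { type, lower, upper },
		.timeout = TIPC_WAIT_FOREVER,
		.filter = TIPC_SUB_PORTS,
	};
	int sd = socket(AF_TIPC, SOCK_SEQPACKET, 0);

	if (sd < 0)
		return -1;
	if (connect(sd, (struct sockaddr *)&topsrv, sizeof(topsrv)) < 0 ||
	    send(sd, &sub, sizeof(sub), 0) != (ssize_t)sizeof(sub)) {
		close(sd);
		return -1;
	}
	return sd;
}
]
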
*/ - kernel_sock_shutdown(con->sock, SHUT_RDWR); - + if (con->sock) + kernel_sock_shutdown(con->sock, SHUT_RDWR); conn_put(con); } } @@ -487,38 +488,104 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 lower, u32 upper, int *conid) +{ + struct tipc_subscriber *scbr; + struct tipc_subscr sub; + struct tipc_server *s; + struct tipc_conn *con; + + sub.seq.type = type; + sub.seq.lower = lower; + sub.seq.upper = upper; + sub.timeout = TIPC_WAIT_FOREVER; + sub.filter = TIPC_SUB_PORTS; + *(u32 *)&sub.usr_handle = port; + + con = tipc_alloc_conn(tipc_topsrv(net)); + if (!con) + return false; + + *conid = con->conid; + s = con->server; + scbr = s->tipc_conn_new(*conid); + if (!scbr) { + tipc_close_conn(con); + return false; + } + + con->usr_data = scbr; + con->sock = NULL; + s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub)); + return true; +} + +void tipc_topsrv_kern_unsubscr(struct net *net, int conid) +{ + struct tipc_conn *con; + + con = tipc_conn_lookup(tipc_topsrv(net), conid); + if (!con) + return; + tipc_close_conn(con); + conn_put(con); +} + +static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) +{ + u32 port = *(u32 *)&evt->s.usr_handle; + u32 self = tipc_own_addr(net); + struct sk_buff_head evtq; + struct sk_buff *skb; + + skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), + self, self, port, port, 0); + if (!skb) + return; + msg_set_dest_droppable(buf_msg(skb), true); + memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); + skb_queue_head_init(&evtq); + __skb_queue_tail(&evtq, skb); + tipc_sk_rcv(net, &evtq); +} + static void tipc_send_to_sock(struct tipc_conn *con) { - int count = 0; struct tipc_server *s = con->server; struct outqueue_entry *e; + struct tipc_event *evt; struct msghdr msg; + int count = 0; int ret; spin_lock_bh(&con->outqueue_lock); while (test_bit(CF_CONNECTED, &con->flags)) { - e = list_entry(con->outqueue.next, struct outqueue_entry, - list); + e = list_entry(con->outqueue.next, struct outqueue_entry, list); if ((struct list_head *) e == &con->outqueue) break; - spin_unlock_bh(&con->outqueue_lock); - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + spin_unlock_bh(&con->outqueue_lock); - if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { - msg.msg_name = &e->dest; - msg.msg_namelen = sizeof(struct sockaddr_tipc); - } - ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, - e->iov.iov_len); - if (ret == -EWOULDBLOCK || ret == 0) { - cond_resched(); - goto out; - } else if (ret < 0) { - goto send_err; + if (con->sock) { + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { + msg.msg_name = &e->dest; + msg.msg_namelen = sizeof(struct sockaddr_tipc); + } + ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, + e->iov.iov_len); + if (ret == -EWOULDBLOCK || ret == 0) { + cond_resched(); + goto out; + } else if (ret < 0) { + goto send_err; + } + } else { + evt = e->iov.iov_base; + tipc_send_kern_top_evt(s->net, evt); } - /* Don't starve users filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); diff --git a/net/tipc/server.h b/net/tipc/server.h index 34f8055afa3b..2113c9192633 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -83,13 +83,16 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 
lower, u32 upper, int *conid); +void tipc_topsrv_kern_unsubscr(struct net *net, int conid); + /** * tipc_conn_terminate - terminate connection with server * * Note: Must call it in process context since it might sleep */ void tipc_conn_terminate(struct tipc_server *s, int conid); - int tipc_server_start(struct tipc_server *s); void tipc_server_stop(struct tipc_server *s); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d50edd6e0019..2bbab4fe2f53 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2016, Ericsson AB + * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -45,6 +45,7 @@ #include "socket.h" #include "bcast.h" #include "netlink.h" +#include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -61,6 +62,11 @@ enum { TIPC_CONNECTING = TCP_SYN_SENT, }; +struct sockaddr_pair { + struct sockaddr_tipc sock; + struct sockaddr_tipc member; +}; + /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API @@ -78,7 +84,7 @@ enum { * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links - * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node @@ -109,9 +115,10 @@ struct tipc_sock { struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; + struct tipc_group *group; }; -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); @@ -123,6 +130,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); +static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); @@ -193,6 +201,11 @@ static bool tsk_conn_cong(struct tipc_sock *tsk) return tsk->snt_unacked > tsk->snd_win; } +static u16 tsk_blocks(int len) +{ + return ((len / FLOWCTL_BLK_SZ) + 1); +} + /* tsk_blocks(): translate a buffer size in bytes to number of * advertisable blocks, taking into account the ratio truesize(len)/len * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ @@ -453,7 +466,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; - sk->sk_backlog_rcv = tipc_backlog_rcv; + sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; @@ -559,13 +572,14 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = 
SHUTDOWN_MASK; + tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - u32_list_purge(&tsk->cong_links); + tipc_dest_list_purge(&tsk->cong_links); tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -601,7 +615,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } - + if (tsk->group) { + res = -EACCES; + goto exit; + } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; @@ -697,39 +714,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); - u32 mask = 0; + struct tipc_group *grp = tsk->group; + u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; + revents |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + revents |= POLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) - mask |= POLLOUT; + revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= (POLLIN | POLLRDNORM); + if (skb) + revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: - if (!tsk->cong_link_cnt) - mask |= POLLOUT; - if (tipc_sk_type_connectionless(sk) && - (!skb_queue_empty(&sk->sk_receive_queue))) - mask |= (POLLIN | POLLRDNORM); + if (!grp || tipc_group_size(grp)) + if (!tsk->cong_link_cnt) + revents |= POLLOUT; + if (!tipc_sk_type_connectionless(sk)) + break; + if (!skb) + break; + revents |= POLLIN | POLLRDNORM; break; case TIPC_DISCONNECTING: - mask = (POLLIN | POLLRDNORM | POLLHUP); + revents = POLLIN | POLLRDNORM | POLLHUP; break; } - - return mask; + return revents; } /** @@ -757,6 +778,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_nlist dsts; int rc; + if (tsk->group) + return -EACCES; + /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) @@ -794,6 +818,296 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, } /** + * tipc_send_group_msg - send a message to a member in the group + * @net: network namespace + * @m: message to send + * @mb: group member + * @dnode: destination node + * @dport: destination port + * @dlen: total length of message data + */ +static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, + struct msghdr *m, struct tipc_member *mb, + u32 dnode, u32 dport, int dlen) +{ + u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); + struct tipc_mc_method *method = &tsk->mc_method; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + struct sk_buff_head pkts; + int mtu, rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_UCAST_MSG); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, dport); + msg_set_destnode(hdr, dnode); + msg_set_grp_bc_seqno(hdr, bc_snd_nxt); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = 
tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tipc_dest_push(&tsk->cong_links, dnode, 0); + tsk->cong_link_cnt++; + } + + /* Update send window */ + tipc_group_update_member(mb, blks); + + /* A broadcast sent within next EXPIRE period must follow same path */ + method->rcast = true; + method->mandatory = true; + return dlen; +} + +/** + * tipc_send_group_unicast - send message to a member in the group + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + struct tipc_member *mb = NULL; + u32 node, port; + int rc; + + node = dest->addr.id.node; + port = dest->addr.id.ref; + if (!port && !node) + return -EHOSTUNREACH; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(&tsk->cong_links, node, 0) && + !tipc_group_cong(grp, node, port, blks, &mb)); + if (unlikely(rc)) + return rc; + + if (unlikely(!mb)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mb, node, port, dlen); + + return rc ? rc : dlen; +} + +/** + * tipc_send_group_anycast - send message to any member with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + struct list_head *cong_links = &tsk->cong_links; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_group *grp = tsk->group; + struct tipc_member *first = NULL; + struct tipc_member *mbr = NULL; + struct net *net = sock_net(sk); + u32 node, port, exclude; + u32 type, inst, domain; + struct list_head dsts; + int lookups = 0; + int dstcnt, rc; + bool cong; + + INIT_LIST_HEAD(&dsts); + + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + + while (++lookups < 4) { + first = NULL; + + /* Look for a non-congested destination member, if any */ + while (1) { + if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + &dstcnt, exclude, false)) + return -EHOSTUNREACH; + tipc_dest_pop(&dsts, &node, &port); + cong = tipc_group_cong(grp, node, port, blks, &mbr); + if (!cong) + break; + if (mbr == first) + break; + if (!first) + first = mbr; + } + + /* Start over if destination was not in member list */ + if (unlikely(!mbr)) + continue; + + if (likely(!cong && !tipc_dest_find(cong_links, node, 0))) + break; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(cong_links, node, 0) && + !tipc_group_cong(grp, node, port, + blks, &mbr)); + if 
(unlikely(rc)) + return rc; + + /* Send, unless destination disappeared while waiting */ + if (likely(mbr)) + break; + } + + if (unlikely(lookups >= 4)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen); + + return rc ? rc : dlen; +} + +/** + * tipc_send_group_bcast - send message to all members in communication group + * @sk: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_mc_method *method = &tsk->mc_method; + bool ack = method->mandatory && method->rcast; + int blks = tsk_blocks(MCAST_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; + int rc = -EHOSTUNREACH; + + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + + /* Block or return if any destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && + !tipc_group_bc_cong(grp, blks)); + if (unlikely(rc)) + return rc; + + /* Complete message header */ + if (dest) { + msg_set_type(hdr, TIPC_GRP_MCAST_MSG); + msg_set_nameinst(hdr, dest->addr.name.name.instance); + } else { + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_nameinst(hdr, 0); + } + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + + /* Avoid getting stuck with repeated forced replicasts */ + msg_set_grp_bc_ack_req(hdr, ack); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Update broadcast sequence number and send windows */ + tipc_group_update_bc_members(tsk->group, blks, ack); + + /* Broadcast link is now free to choose method for next broadcast */ + method->mandatory = false; + method->expires = jiffies; + + return dlen; +} + +/** + * tipc_send_group_mcast - send message to all members with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + u32 domain, exclude, dstcnt; + struct list_head dsts; + + INIT_LIST_HEAD(&dsts); + + if (seq->lower != seq->upper) + return -ENOTSUPP; + + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + if (!tipc_nametbl_lookup(net, seq->type, 
seq->lower, domain, + &dsts, &dstcnt, exclude, true)) + return -EHOSTUNREACH; + + if (dstcnt == 1) { + tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); + return tipc_send_group_unicast(sock, m, dlen, timeout); + } + + tipc_dest_list_purge(&dsts); + return tipc_send_group_bcast(sock, m, dlen, timeout); +} + +/** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup @@ -803,13 +1117,15 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - struct tipc_msg *msg; - struct list_head dports; - u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; - struct sk_buff_head tmpq; - uint hsz; + u32 self = tipc_own_addr(net); struct sk_buff *skb, *_skb; + u32 lower = 0, upper = ~0; + struct sk_buff_head tmpq; + u32 portid, oport, onode; + struct list_head dports; + struct tipc_msg *msg; + int user, mtyp, hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -817,17 +1133,32 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); + user = msg_user(msg); + mtyp = msg_type(msg); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { + spin_lock_bh(&inputq->lock); + if (skb_peek(arrvq) == skb) { + __skb_dequeue(arrvq); + __skb_queue_tail(inputq, skb); + } + refcount_dec(&skb->users); + spin_unlock_bh(&inputq->lock); + continue; + } hsz = skb_headroom(skb) + msg_hdr_sz(msg); - - if (in_own_node(net, msg_orignode(msg))) + oport = msg_origport(msg); + onode = msg_orignode(msg); + if (onode == self) scope = TIPC_NODE_SCOPE; /* Create destination port list and message clones: */ - tipc_nametbl_mc_translate(net, - msg_nametype(msg), msg_namelower(msg), - msg_nameupper(msg), scope, &dports); - portid = u32_pop(&dports); - for (; portid; portid = u32_pop(&dports)) { + if (!msg_in_group(msg)) { + lower = msg_namelower(msg); + upper = msg_nameupper(msg); + } + tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, + scope, &dports); + while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -850,16 +1181,16 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } /** - * tipc_sk_proto_rcv - receive a connection mng protocol message + * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. 
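
[Taken together, the four tipc_send_group_*() paths above mean a group member selects the transmit mode purely through the address, or absence of one, passed to sendmsg(); the dispatch itself appears in __tipc_sendmsg() further down. A user-space sketch of that mapping, assuming the <linux/tipc.h> address types:

#include <linux/tipc.h>
#include <sys/socket.h>

/* dst == NULL       -> tipc_send_group_bcast()   to all members
 * dst->addrtype:
 *   TIPC_ADDR_NAME  -> tipc_send_group_anycast() to one member bound
 *                      to that name
 *   TIPC_ADDR_ID    -> tipc_send_group_unicast() to one member socket
 *   TIPC_ADDR_MCAST -> tipc_send_group_mcast()   to all members bound
 *                      to that name
 */
static ssize_t group_send(int sd, const void *buf, size_t len,
			  const struct sockaddr_tipc *dst)
{
	return sendto(sd, buf, len, 0, (const struct sockaddr *)dst,
		      dst ? sizeof(*dst) : 0);
}
]
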
*/ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - struct sock *sk = &tsk->sk; - u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); + u32 onode = tsk_own_node(tsk); + struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); bool conn_cong; @@ -931,6 +1262,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; @@ -941,18 +1273,31 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (likely(dest)) { + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + } + + if (grp) { + if (!dest) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) + return tipc_send_group_anycast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_ID) + return tipc_send_group_unicast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_send_group_mcast(sock, m, dlen, timeout); + return -EINVAL; + } + if (unlikely(!dest)) { dest = &tsk->peer; if (!syn || dest->family != AF_TIPC) return -EDESTADDRREQ; } - if (unlikely(m->msg_namelen < sizeof(*dest))) - return -EINVAL; - - if (unlikely(dest->family != AF_TIPC)) - return -EINVAL; - if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; @@ -985,7 +1330,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(hdr, TIPC_DIRECT_MSG); @@ -996,7 +1340,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } /* Block or return if destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(clinks, dnode, 0)); if (unlikely(rc)) return rc; @@ -1008,7 +1353,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { - u32_push(clinks, dnode); + tipc_dest_push(clinks, dnode, 0); tsk->cong_link_cnt++; rc = 0; } @@ -1142,26 +1487,38 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, } /** - * set_orig_addr - capture sender's address for received message + * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @msg: received message header + * @hdr: received message header * * Note: Address is not captured if not requested by receiver. 
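
[For group traffic, tipc_sk_set_orig_addr() below fills in a struct sockaddr_pair rather than a single address, so a receiver that provides msg_name room for two sockaddr_tipc entries gets both the sending socket's port id and the sending member's name; a legacy receiver with room for one still gets the first. A user-space sketch:

#include <linux/tipc.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

static void recv_from_group(int sd)
{
	struct sockaddr_tipc addr[2];	/* [0] socket id, [1] member name */
	char buf[1024];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr m = {
		.msg_name = addr,
		.msg_namelen = sizeof(addr),
		.msg_iov = &iov,
		.msg_iovlen = 1,
	};

	if (recvmsg(sd, &m, 0) < 0)
		return;
	printf("from <%x:%u>", addr[0].addr.id.node, addr[0].addr.id.ref);
	if (m.msg_namelen == sizeof(addr))	/* group message */
		printf(", member instance %u",
		       addr[1].addr.name.name.instance);
	printf("\n");
}
]
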
*/ -static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) +static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name); + DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name); + struct tipc_msg *hdr = buf_msg(skb); - if (addr) { - addr->family = AF_TIPC; - addr->addrtype = TIPC_ADDR_ID; - memset(&addr->addr, 0, sizeof(addr->addr)); - addr->addr.id.ref = msg_origport(msg); - addr->addr.id.node = msg_orignode(msg); - addr->addr.name.domain = 0; /* could leave uninitialized */ - addr->scope = 0; /* could leave uninitialized */ - m->msg_namelen = sizeof(struct sockaddr_tipc); - } + if (!srcaddr) + return; + + srcaddr->sock.family = AF_TIPC; + srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addr.id.ref = msg_origport(hdr); + srcaddr->sock.addr.id.node = msg_orignode(hdr); + srcaddr->sock.addr.name.domain = 0; + srcaddr->sock.scope = 0; + m->msg_namelen = sizeof(struct sockaddr_tipc); + + if (!msg_in_group(hdr)) + return; + + /* Group message users may also want to know sending member's id */ + srcaddr->member.family = AF_TIPC; + srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addr.name.name.type = msg_nametype(hdr); + srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; + srcaddr->member.addr.name.domain = 0; + m->msg_namelen = sizeof(*srcaddr); } /** @@ -1318,11 +1675,13 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *skb; - struct tipc_msg *hdr; bool connected = !tipc_sk_type_connectionless(sk); + struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct sk_buff_head xmitq; + struct tipc_msg *hdr; + struct sk_buff *skb; + bool grp_evt; long timeout; /* Catch invalid receive requests */ @@ -1336,8 +1695,8 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + /* Step rcv queue to first msg with data or error; wait if necessary */ do { - /* Look at first msg in receive queue; wait if necessary */ rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) goto exit; @@ -1346,13 +1705,14 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); + grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); } while (1); /* Collect msg meta data, including error code and rejected data */ - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (unlikely(rc)) goto exit; @@ -1372,15 +1732,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; + /* Mark message as group event if applicable */ + if (unlikely(grp_evt)) { + if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) + m->msg_flags |= MSG_EOR; + m->msg_flags |= MSG_OOB; + copy = 0; + } + /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; + /* Send group flow control advertisement when applicable */ + if (tsk->group && msg_in_group(hdr) && !grp_evt) { + skb_queue_head_init(&xmitq); + tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), + msg_orignode(hdr), msg_origport(hdr), + &xmitq); + tipc_node_distr_xmit(sock_net(sk), &xmitq); + } + tsk_advance_rx_queue(sk); + if (likely(!connected)) goto exit; - /* Send connection flow control ack when 
applicable */ + /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); @@ -1446,7 +1824,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (rc) break; @@ -1532,14 +1910,51 @@ static void tipc_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); } +static void tipc_sk_proto_rcv(struct sock *sk, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_group *grp = tsk->group; + bool wakeup = false; + + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + return; + case SOCK_WAKEUP: + tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); + tsk->cong_link_cnt--; + wakeup = true; + break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); + break; + case TOP_SRV: + tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, + skb, inputq, xmitq); + skb = NULL; + break; + default: + break; + } + + if (wakeup) + sk->sk_write_space(sk); + + kfree_skb(skb); +} + /** - * filter_connect - Handle all incoming messages for a connection-based socket + * tipc_filter_connect - Handle incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * * Returns true if everything ok, false otherwise */ -static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -1643,6 +2058,9 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + if (unlikely(msg_in_group(hdr))) + return sk->sk_rcvbuf; + if (unlikely(!msg_connected(hdr))) return sk->sk_rcvbuf << msg_importance(hdr); @@ -1653,7 +2071,7 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) } /** - * filter_rcv - validate incoming message + * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. 
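
[Membership events thus reach the application as zero-length messages carrying MSG_OOB in msg_flags, with MSG_EOR added when the peer withdrew, as marked in tipc_recvmsg() above, and they are only delivered at all if events were requested at join time (grp->events). A receive-loop sketch, assuming the TIPC_GROUP_JOIN socket option, SOL_TIPC, TIPC_GROUP_MEMBER_EVTS and struct tipc_group_req introduced alongside this series:

#include <linux/tipc.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int join_and_watch(int sd, __u32 type, __u32 instance)
{
	struct tipc_group_req mreq = {
		.type = type,
		.instance = instance,
		.scope = TIPC_CLUSTER_SCOPE,
		.flags = TIPC_GROUP_MEMBER_EVTS, /* request join/leave events */
	};
	struct sockaddr_tipc member[2];	/* [1] holds the member's name */
	char buf[1024];
	struct iovec iov = { buf, sizeof(buf) };
	struct msghdr m = { .msg_name = member, .msg_iov = &iov,
			    .msg_iovlen = 1 };

	if (setsockopt(sd, SOL_TIPC, TIPC_GROUP_JOIN, &mreq, sizeof(mreq)))
		return -1;
	for (;;) {
		m.msg_namelen = sizeof(member);	/* reset before each call */
		if (recvmsg(sd, &m, 0) < 0)
			return -1;
		if (!(m.msg_flags & MSG_OOB))
			continue;		/* ordinary group data */
		printf("member %u %s\n",
		       member[1].addr.name.name.instance,
		       (m.msg_flags & MSG_EOR) ? "withdrew" : "published");
	}
}
]
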
* @@ -1662,99 +2080,71 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * * Called with socket lock already taken * - * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { + bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); - unsigned int limit = rcvbuf_limit(sk, skb); - int err = TIPC_OK; - int usr = msg_user(hdr); - u32 onode; + struct net *net = sock_net(sk); + struct sk_buff_head inputq; + int limit, err = TIPC_OK; - if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - } + TIPC_SKB_CB(skb)->bytes_read = 0; + __skb_queue_head_init(&inputq); + __skb_queue_tail(&inputq, skb); - if (unlikely(usr == SOCK_WAKEUP)) { - onode = msg_orignode(hdr); - kfree_skb(skb); - u32_del(&tsk->cong_links, onode); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); - return false; - } + if (unlikely(!msg_isdata(hdr))) + tipc_sk_proto_rcv(sk, &inputq, xmitq); - /* Drop if illegal message type */ - if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { - kfree_skb(skb); - return false; - } + if (unlikely(grp)) + tipc_group_filter_msg(grp, &inputq, xmitq); - /* Reject if wrong message type for current socket state */ - if (tipc_sk_type_connectionless(sk)) { - if (msg_connected(hdr)) { + /* Validate and add to receive buffer if there is space */ + while ((skb = __skb_dequeue(&inputq))) { + hdr = buf_msg(skb); + limit = rcvbuf_limit(sk, skb); + if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || + (!sk_conn && msg_connected(hdr)) || + (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; - goto reject; - } - } else if (unlikely(!filter_connect(tsk, skb))) { - err = TIPC_ERR_NO_PORT; - goto reject; - } + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + err = TIPC_ERR_OVERLOAD; - /* Reject message if there isn't room to queue it */ - if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { - err = TIPC_ERR_OVERLOAD; - goto reject; + if (unlikely(err)) { + tipc_skb_reject(net, err, skb, xmitq); + err = TIPC_OK; + continue; + } + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk); } - - /* Enqueue message */ - TIPC_SKB_CB(skb)->bytes_read = 0; - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - - sk->sk_data_ready(sk); - return true; - -reject: - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) - __skb_queue_tail(xmitq, skb); - return false; } /** - * tipc_backlog_rcv - handle incoming message from backlog queue + * tipc_sk_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @skb: message * * Caller must hold socket lock - * - * Returns 0 */ -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - unsigned int truesize = skb->truesize; + unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; - u32 dnode, selector; + unsigned int added; __skb_queue_head_init(&xmitq); - if (likely(filter_rcv(sk, skb, &xmitq))) { - atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); - return 0; - } - - if (skb_queue_empty(&xmitq)) - return 0; + tipc_sk_filter_rcv(sk, skb, &xmitq); + added = sk_rmem_alloc_get(sk) - before; + 
atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); - /* Send response/rejected message */ - skb = __skb_dequeue(&xmitq); - dnode = msg_destnode(buf_msg(skb)); - selector = msg_origport(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + /* Send pending response/rejected messages, if any */ + tipc_node_distr_xmit(sock_net(sk), &xmitq); return 0; } @@ -1786,7 +2176,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb, xmitq); + tipc_sk_filter_rcv(sk, skb, xmitq); continue; } @@ -1833,14 +2223,10 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) spin_unlock_bh(&sk->sk_lock.slock); } /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(net, skb, dnode, dport); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); sock_put(sk); continue; } - /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) @@ -1903,28 +2289,32 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int previous; int res = 0; + if (destlen != sizeof(struct sockaddr_tipc)) + return -EINVAL; + lock_sock(sk); - /* DGRAM/RDM connect(), just save the destaddr */ - if (tipc_sk_type_connectionless(sk)) { - if (dst->family == AF_UNSPEC) { - memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); - } else if (destlen != sizeof(struct sockaddr_tipc)) { - res = -EINVAL; - } else { - memcpy(&tsk->peer, dest, destlen); - } + if (tsk->group) { + res = -EINVAL; goto exit; } - /* - * Reject connection attempt using multicast address - * - * Note: send_msg() validates the rest of the address fields, - * so there's no need to do it here - */ - if (dst->addrtype == TIPC_ADDR_MCAST) { + if (dst->family == AF_UNSPEC) { + memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); + if (!tipc_sk_type_connectionless(sk)) + res = -EINVAL; + goto exit; + } else if (dst->family != AF_TIPC) { res = -EINVAL; + } + if (dst->addrtype != TIPC_ADDR_ID && dst->addrtype != TIPC_ADDR_NAME) + res = -EINVAL; + if (res) + goto exit; + + /* DGRAM/RDM connect(), just save the destaddr */ + if (tipc_sk_type_connectionless(sk)) { + memcpy(&tsk->peer, dest, destlen); goto exit; } @@ -2345,6 +2735,56 @@ void tipc_sk_rht_destroy(struct net *net) rhashtable_destroy(&tn->sk_rht); } +static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) +{ + struct net *net = sock_net(&tsk->sk); + u32 domain = addr_domain(net, mreq->scope); + struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq seq; + int rc; + + if (mreq->type < TIPC_RESERVED_TYPES) + return -EACCES; + if (grp) + return -EACCES; + grp = tipc_group_create(net, tsk->portid, mreq); + if (!grp) + return -ENOMEM; + tsk->group = grp; + msg_set_lookup_scope(hdr, mreq->scope); + msg_set_nametype(hdr, mreq->type); + msg_set_dest_droppable(hdr, true); + seq.type = mreq->type; + seq.lower = mreq->instance; + seq.upper = seq.lower; + tipc_nametbl_build_group(net, grp, mreq->type, domain); + rc = tipc_sk_publish(tsk, mreq->scope, &seq); + if (rc) + tipc_group_delete(net, grp); + + /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; + return rc; +} + +static int tipc_sk_leave(struct tipc_sock *tsk) +{ + struct net *net = sock_net(&tsk->sk); + struct tipc_group *grp = 
tsk->group; + struct tipc_name_seq seq; + int scope; + + if (!grp) + return -EINVAL; + tipc_group_self(grp, &seq, &scope); + tipc_group_delete(net, grp); + tsk->group = NULL; + tipc_sk_withdraw(tsk, scope, &seq); + return 0; +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -2363,6 +2803,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group_req mreq; u32 value = 0; int res = 0; @@ -2378,9 +2819,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + if (get_user(value, (u32 __user *)ov)) + return -EFAULT; + break; + case TIPC_GROUP_JOIN: + if (ol < sizeof(mreq)) + return -EINVAL; + if (copy_from_user(&mreq, ov, sizeof(mreq))) + return -EFAULT; break; default: if (ov || ol) @@ -2413,6 +2859,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; + case TIPC_GROUP_JOIN: + res = tipc_sk_join(tsk, &mreq); + break; + case TIPC_GROUP_LEAVE: + res = tipc_sk_leave(tsk); + break; default: res = -EINVAL; } @@ -2440,7 +2892,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - int len; + struct tipc_name_seq seq; + int len, scope; u32 value; int res; @@ -2474,6 +2927,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_GROUP_JOIN: + seq.type = 0; + if (tsk->group) + tipc_group_self(tsk->group, &seq, &scope); + value = seq.type; + break; default: res = -EINVAL; } diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index a24369d175fd..970f96489fe7 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -15,6 +15,16 @@ config VSOCKETS To compile this driver as a module, choose M here: the module will be called vsock. If unsure, say N. +config VSOCKETS_DIAG + tristate "Virtual Sockets monitoring interface" + depends on VSOCKETS + default y + help + Support for PF_VSOCK sockets monitoring interface used by the ss tool. + If unsure, say Y. + + Enable this module so userspace applications can query open sockets. + config VMWARE_VMCI_VSOCKETS tristate "VMware VMCI transport for Virtual Sockets" depends on VSOCKETS && VMWARE_VMCI diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index e63d574234a9..64afc06805da 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_VSOCKETS) += vsock.o +obj-$(CONFIG_VSOCKETS_DIAG) += vsock_diag.o obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o @@ -6,6 +7,8 @@ obj-$(CONFIG_HYPERV_VSOCKETS) += hv_sock.o vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o +vsock_diag-y += diag.o + vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index dfc8c51e4d74..98359c19522f 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -36,7 +36,7 @@ * not support simultaneous connects (two "client" sockets connecting). 
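tipc_sk_join() above is driven from setsockopt(); a hypothetical usage sketch, with the tipc_group_req fields as in the hunk (type/instance/scope) and everything else illustrative:

#include <linux/tipc.h>
#include <sys/socket.h>

static int join_group(int sd, unsigned int type, unsigned int instance)
{
	struct tipc_group_req mreq = {
		.type = type,		/* EACCES if below TIPC_RESERVED_TYPES */
		.instance = instance,	/* this member's identity in the group */
		.scope = TIPC_CLUSTER_SCOPE,
	};

	return setsockopt(sd, SOL_TIPC, TIPC_GROUP_JOIN, &mreq, sizeof(mreq));
}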
* * - "Server" sockets are referred to as listener sockets throughout this - * implementation because they are in the VSOCK_SS_LISTEN state. When a + * implementation because they are in the TCP_LISTEN state. When a * connection request is received (the second kind of socket mentioned above), * we create a new socket and refer to it as a pending socket. These pending * sockets are placed on the pending connection list of the listener socket. @@ -82,6 +82,15 @@ * argument, we must ensure the reference count is increased to ensure the * socket isn't freed before the function is run; the deferred function will * then drop the reference. + * + * - sk->sk_state uses the TCP state constants because they are widely used by + * other address families and exposed to userspace tools like ss(8): + * + * TCP_CLOSE - unconnected + * TCP_SYN_SENT - connecting + * TCP_ESTABLISHED - connected + * TCP_CLOSING - disconnecting + * TCP_LISTEN - listening */ #include <linux/types.h> @@ -153,7 +162,6 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); * vsock_bind_table[VSOCK_HASH_SIZE] is for unbound sockets. The hash function * mods with VSOCK_HASH_SIZE to ensure this. */ -#define VSOCK_HASH_SIZE 251 #define MAX_PORT_RETRIES 24 #define VSOCK_HASH(addr) ((addr)->svm_port % VSOCK_HASH_SIZE) @@ -168,9 +176,12 @@ EXPORT_SYMBOL_GPL(vm_sockets_get_local_cid); #define vsock_connected_sockets_vsk(vsk) \ vsock_connected_sockets(&(vsk)->remote_addr, &(vsk)->local_addr) -static struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; -static struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; -static DEFINE_SPINLOCK(vsock_table_lock); +struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; +EXPORT_SYMBOL_GPL(vsock_bind_table); +struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; +EXPORT_SYMBOL_GPL(vsock_connected_table); +DEFINE_SPINLOCK(vsock_table_lock); +EXPORT_SYMBOL_GPL(vsock_table_lock); /* Autobind this socket to the local address if necessary. */ static int vsock_auto_bind(struct vsock_sock *vsk) @@ -248,16 +259,6 @@ static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, return NULL; } -static bool __vsock_in_bound_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->bound_table); -} - -static bool __vsock_in_connected_table(struct vsock_sock *vsk) -{ - return !list_empty(&vsk->connected_table); -} - static void vsock_insert_unbound(struct vsock_sock *vsk) { spin_lock_bh(&vsock_table_lock); @@ -485,7 +486,7 @@ void vsock_pending_work(struct work_struct *work) if (vsock_in_connected_table(vsk)) vsock_remove_connected(vsk); - sk->sk_state = SS_FREE; + sk->sk_state = TCP_CLOSE; out: release_sock(sk); @@ -625,7 +626,6 @@ struct sock *__vsock_create(struct net *net, sk->sk_destruct = vsock_sk_destruct; sk->sk_backlog_rcv = vsock_queue_rcv_skb; - sk->sk_state = 0; sock_reset_flag(sk, SOCK_DONE); INIT_LIST_HEAD(&vsk->bound_table); @@ -899,7 +899,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, /* Listening sockets that have connections in their accept * queue can be read. */ - if (sk->sk_state == VSOCK_SS_LISTEN + if (sk->sk_state == TCP_LISTEN && !vsock_is_accept_queue_empty(sk)) mask |= POLLIN | POLLRDNORM; @@ -928,7 +928,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, } /* Connected sockets that can produce data can be written. 
*/ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { bool space_avail_now = false; int ret = transport->notify_poll_out( @@ -950,7 +950,7 @@ static unsigned int vsock_poll(struct file *file, struct socket *sock, * POLLOUT|POLLWRNORM when peer is closed and nothing to read, * but local send is not shutdown. */ - if (sk->sk_state == SS_UNCONNECTED) { + if (sk->sk_state == TCP_CLOSE) { if (!(sk->sk_shutdown & SEND_SHUTDOWN)) mask |= POLLOUT | POLLWRNORM; @@ -1120,9 +1120,9 @@ static void vsock_connect_timeout(struct work_struct *work) sk = sk_vsock(vsk); lock_sock(sk); - if (sk->sk_state == SS_CONNECTING && + if (sk->sk_state == TCP_SYN_SENT && (sk->sk_shutdown != SHUTDOWN_MASK)) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ETIMEDOUT; sk->sk_error_report(sk); cancel = 1; @@ -1168,7 +1168,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, err = -EALREADY; break; default: - if ((sk->sk_state == VSOCK_SS_LISTEN) || + if ((sk->sk_state == TCP_LISTEN) || vsock_addr_cast(addr, addr_len, &remote_addr) != 0) { err = -EINVAL; goto out; @@ -1191,7 +1191,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - sk->sk_state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; err = transport->connect(vsk); if (err < 0) @@ -1211,7 +1211,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, timeout = vsk->connect_timeout; prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - while (sk->sk_state != SS_CONNECTED && sk->sk_err == 0) { + while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) { if (flags & O_NONBLOCK) { /* If we're not going to block, we schedule a timeout * function to generate a timeout on the connection @@ -1234,13 +1234,13 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (signal_pending(current)) { err = sock_intr_errno(timeout); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; } else if (timeout == 0) { err = -ETIMEDOUT; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; vsock_transport_cancel_pkt(vsk); goto out_wait; @@ -1251,7 +1251,7 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr, if (sk->sk_err) { err = -sk->sk_err; - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; } else { err = 0; @@ -1284,7 +1284,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags, goto out; } - if (listener->sk_state != VSOCK_SS_LISTEN) { + if (listener->sk_state != TCP_LISTEN) { err = -EINVAL; goto out; } @@ -1374,7 +1374,7 @@ static int vsock_listen(struct socket *sock, int backlog) } sk->sk_max_ack_backlog = backlog; - sk->sk_state = VSOCK_SS_LISTEN; + sk->sk_state = TCP_LISTEN; err = 0; @@ -1554,7 +1554,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Callers should not provide a destination with stream sockets. */ if (msg->msg_namelen) { - err = sk->sk_state == SS_CONNECTED ? -EISCONN : -EOPNOTSUPP; + err = sk->sk_state == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out; } @@ -1565,7 +1565,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (sk->sk_state != SS_CONNECTED || + if (sk->sk_state != TCP_ESTABLISHED || !vsock_addr_bound(&vsk->local_addr)) { err = -ENOTCONN; goto out; } @@ -1689,7 +1689,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, lock_sock(sk); - if (sk->sk_state != SS_CONNECTED) { + if (sk->sk_state != TCP_ESTABLISHED) { /* Recvmsg is supposed to return 0 if a peer performs an * orderly shutdown. Differentiate between that case and when a * peer has not connected or a local shutdown occurred with the diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c new file mode 100644 index 000000000000..31b567652250 --- /dev/null +++ b/net/vmw_vsock/diag.c @@ -0,0 +1,186 @@ +/* + * vsock sock_diag(7) module + * + * Copyright (C) 2017 Red Hat, Inc. + * Author: Stefan Hajnoczi <stefanha@redhat.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/module.h> +#include <linux/sock_diag.h> +#include <linux/vm_sockets_diag.h> +#include <net/af_vsock.h> + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags) +{ + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_diag_msg *rep; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); + if (!nlh) + return -EMSGSIZE; + + rep = nlmsg_data(nlh); + rep->vdiag_family = AF_VSOCK; + + /* Lock order dictates that sk_lock is acquired before + * vsock_table_lock, so we cannot lock here. Simply don't take + * sk_lock; sk is guaranteed to stay alive since vsock_table_lock is + * held. + */ + rep->vdiag_type = sk->sk_type; + rep->vdiag_state = sk->sk_state; + rep->vdiag_shutdown = sk->sk_shutdown; + rep->vdiag_src_cid = vsk->local_addr.svm_cid; + rep->vdiag_src_port = vsk->local_addr.svm_port; + rep->vdiag_dst_cid = vsk->remote_addr.svm_cid; + rep->vdiag_dst_port = vsk->remote_addr.svm_port; + rep->vdiag_ino = sock_i_ino(sk); + + sock_diag_save_cookie(sk, rep->vdiag_cookie); + + return 0; +} + +static int vsock_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct vsock_diag_req *req; + struct vsock_sock *vsk; + unsigned int bucket; + unsigned int last_i; + unsigned int table; + struct net *net; + unsigned int i; + + req = nlmsg_data(cb->nlh); + net = sock_net(skb->sk); + + /* State saved between calls: */ + table = cb->args[0]; + bucket = cb->args[1]; + i = last_i = cb->args[2]; + + /* TODO VMCI pending sockets? 
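The dump side expects a vsock_diag_req selecting socket states by bitmask. A userspace sketch of issuing such a dump (assuming nlsk is an open NETLINK_SOCK_DIAG socket; field names follow <linux/vm_sockets_diag.h> as added by this patch):

#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/vm_sockets_diag.h>
#include <string.h>
#include <sys/socket.h>

static int request_vsock_dump(int nlsk)
{
	struct {
		struct nlmsghdr nlh;
		struct vsock_diag_req req;
	} msg;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.sdiag_family = AF_VSOCK;
	msg.req.vdiag_states = ~0U;	/* matched below via 1 << sk->sk_state */

	return send(nlsk, &msg, sizeof(msg), 0) < 0 ? -1 : 0;
}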
*/ + + spin_lock_bh(&vsock_table_lock); + + /* Bind table (locally created sockets) */ + if (table == 0) { + while (bucket < ARRAY_SIZE(vsock_bind_table)) { + struct list_head *head = &vsock_bind_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, bound_table) { + struct sock *sk = sk_vsock(vsk); + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_bind; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_bind; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_bind: + i++; + } + last_i = 0; + bucket++; + } + + table++; + bucket = 0; + } + + /* Connected table (accepted connections) */ + while (bucket < ARRAY_SIZE(vsock_connected_table)) { + struct list_head *head = &vsock_connected_table[bucket]; + + i = 0; + list_for_each_entry(vsk, head, connected_table) { + struct sock *sk = sk_vsock(vsk); + + /* Skip sockets we've already seen above */ + if (__vsock_in_bound_table(vsk)) + continue; + + if (!net_eq(sock_net(sk), net)) + continue; + if (i < last_i) + goto next_connected; + if (!(req->vdiag_states & (1 << sk->sk_state))) + goto next_connected; + if (sk_diag_fill(sk, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + goto done; +next_connected: + i++; + } + last_i = 0; + bucket++; + } + +done: + spin_unlock_bh(&vsock_table_lock); + + cb->args[0] = table; + cb->args[1] = bucket; + cb->args[2] = i; + + return skb->len; +} + +static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct vsock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = vsock_diag_dump, + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler vsock_diag_handler = { + .family = AF_VSOCK, + .dump = vsock_diag_handler_dump, +}; + +static int __init vsock_diag_init(void) +{ + return sock_diag_register(&vsock_diag_handler); +} + +static void __exit vsock_diag_exit(void) +{ + sock_diag_unregister(&vsock_diag_handler); +} + +module_init(vsock_diag_init); +module_exit(vsock_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, + 40 /* AF_VSOCK */); diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 14ed5a344cdf..bbac023e70d1 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -310,7 +310,7 @@ static void hvs_close_connection(struct vmbus_channel *chan) struct sock *sk = get_per_channel_state(chan); struct vsock_sock *vsk = vsock_sk(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown |= SEND_SHUTDOWN | RCV_SHUTDOWN; @@ -344,8 +344,8 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!sk) return; - if ((conn_from_host && sk->sk_state != VSOCK_SS_LISTEN) || - (!conn_from_host && sk->sk_state != SS_CONNECTING)) + if ((conn_from_host && sk->sk_state != TCP_LISTEN) || + (!conn_from_host && sk->sk_state != TCP_SYN_SENT)) goto out; if (conn_from_host) { @@ -357,7 +357,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) if (!new) goto out; - new->sk_state = SS_CONNECTING; + new->sk_state = TCP_SYN_SENT; vnew = vsock_sk(new); hvs_new = vnew->trans; hvs_new->chan = chan; @@ -384,7 +384,7 @@ static void hvs_open_connection(struct vmbus_channel 
*chan) vmbus_set_chn_rescind_callback(chan, hvs_close_connection); if (conn_from_host) { - new->sk_state = SS_CONNECTED; + new->sk_state = TCP_ESTABLISHED; sk->sk_ack_backlog++; hvs_addr_init(&vnew->local_addr, if_type); @@ -399,7 +399,7 @@ static void hvs_open_connection(struct vmbus_channel *chan) vsock_enqueue_accept(sk, new); release_sock(sk); } else { - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsock_sk(sk)); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 403d86e80162..8e03bd3f3668 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -414,7 +414,7 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { lock_sock(sk); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); release_sock(sk); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index edba7ab97563..3ae3a33da70b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -708,7 +708,7 @@ static void virtio_transport_do_close(struct vsock_sock *vsk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); if (vsk->close_work_scheduled && @@ -748,8 +748,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) { struct sock *sk = &vsk->sk; - if (!(sk->sk_state == SS_CONNECTED || - sk->sk_state == SS_DISCONNECTING)) + if (!(sk->sk_state == TCP_ESTABLISHED || + sk->sk_state == TCP_CLOSING)) return true; /* Already received SHUTDOWN from peer, reply with RST */ @@ -801,7 +801,7 @@ virtio_transport_recv_connecting(struct sock *sk, switch (le16_to_cpu(pkt->hdr.op)) { case VIRTIO_VSOCK_OP_RESPONSE: - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -821,7 +821,7 @@ virtio_transport_recv_connecting(struct sock *sk, destroy: virtio_transport_reset(vsk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -857,7 +857,7 @@ virtio_transport_recv_connected(struct sock *sk, vsk->peer_shutdown |= SEND_SHUTDOWN; if (vsk->peer_shutdown == SHUTDOWN_MASK && vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; if (le32_to_cpu(pkt->hdr.flags)) sk->sk_state_change(sk); break; @@ -928,7 +928,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt) lock_sock_nested(child, SINGLE_DEPTH_NESTING); - child->sk_state = SS_CONNECTED; + child->sk_state = TCP_ESTABLISHED; vchild = vsock_sk(child); vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), @@ -1016,18 +1016,18 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) sk->sk_write_space(sk); switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: virtio_transport_recv_listen(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: virtio_transport_recv_connecting(sk, pkt); virtio_transport_free_pkt(pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: virtio_transport_recv_connected(sk, pkt); break; - case SS_DISCONNECTING: + case TCP_CLOSING: virtio_transport_recv_disconnecting(sk, 
pkt); virtio_transport_free_pkt(pkt); break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 0206155bff53..391775e3575c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -742,7 +742,7 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) /* The local context ID may be out of date, update it. */ vsk->local_addr.svm_cid = dst.svm_cid; - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vmci_trans(vsk)->notify_ops->handle_notify_pkt( sk, pkt, true, &dst, &src, &bh_process_pkt); @@ -800,7 +800,7 @@ static void vmci_transport_handle_detach(struct sock *sk) * left in our consume queue. */ if (vsock_stream_has_data(vsk) <= 0) { - if (sk->sk_state == SS_CONNECTING) { + if (sk->sk_state == TCP_SYN_SENT) { /* The peer may detach from a queue pair while * we are still in the connecting state, i.e., * if the peer VM is killed after attaching to @@ -809,12 +809,11 @@ static void vmci_transport_handle_detach(struct sock *sk) * event like a reset. */ - sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk->sk_error_report(sk); return; } - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; } sk->sk_state_change(sk); } @@ -882,17 +882,17 @@ static void vmci_transport_recv_pkt_work(struct work_struct *work) vsock_sk(sk)->local_addr.svm_cid = pkt->dg.dst.context; switch (sk->sk_state) { - case VSOCK_SS_LISTEN: + case TCP_LISTEN: vmci_transport_recv_listen(sk, pkt); break; - case SS_CONNECTING: + case TCP_SYN_SENT: /* Processing of pending connections for servers goes through * the listening socket, so see vmci_transport_recv_listen() * for that path. */ vmci_transport_recv_connecting_client(sk, pkt); break; - case SS_CONNECTED: + case TCP_ESTABLISHED: vmci_transport_recv_connected(sk, pkt); break; default: @@ -941,7 +941,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_sk(pending)->local_addr.svm_cid = pkt->dg.dst.context; switch (pending->sk_state) { - case SS_CONNECTING: + case TCP_SYN_SENT: err = vmci_transport_recv_connecting_server(sk, pending, pkt); @@ -1071,7 +1071,7 @@ static int vmci_transport_recv_listen(struct sock *sk, vsock_add_pending(sk, pending); sk->sk_ack_backlog++; - pending->sk_state = SS_CONNECTING; + pending->sk_state = TCP_SYN_SENT; vmci_trans(vpending)->produce_size = vmci_trans(vpending)->consume_size = qp_size; vmci_trans(vpending)->queue_pair_size = qp_size; @@ -1196,11 +1196,11 @@ vmci_transport_recv_connecting_server(struct sock *listener, * the socket will be valid until it is removed from the queue. * * If we fail sending the attach below, we remove the socket from the - * connected list and move the socket to SS_UNCONNECTED before + * connected list and move the socket to TCP_CLOSE before * releasing the lock, so a pending slow path processing of an incoming * packet will not see the socket in the connected state in that case. */ - pending->sk_state = SS_CONNECTED; + pending->sk_state = TCP_ESTABLISHED; vsock_insert_connected(vpending); @@ -1231,7 +1231,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, destroy: pending->sk_err = skerr; - pending->sk_state = SS_UNCONNECTED; + pending->sk_state = TCP_CLOSE; /* As long as we drop our reference, all necessary cleanup will handle * when the cleanup function drops its reference and our destruct * implementation is called. 
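The ordering in vmci_transport_handle_detach() is delicate: the TCP_SYN_SENT test must run before the socket is moved to TCP_CLOSE, otherwise the connect-reset branch can never fire. Condensed into plain C, the detach path in the hunk above is:

static void handle_detach_condensed(struct sock *sk, struct vsock_sock *vsk)
{
	if (vsock_stream_has_data(vsk) <= 0) {
		if (sk->sk_state == TCP_SYN_SENT) {
			/* Peer detached mid-handshake: surface a reset */
			sk->sk_err = ECONNRESET;
			sk->sk_error_report(sk);
			return;
		}
		sk->sk_state = TCP_CLOSE;
	}
	sk->sk_state_change(sk);
}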
Note that since the listen handler will @@ -1269,7 +1269,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, * accounting (it can already be found since it's in the bound * table). */ - sk->sk_state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; sk->sk_socket->state = SS_CONNECTED; vsock_insert_connected(vsk); sk->sk_state_change(sk); @@ -1337,7 +1337,7 @@ vmci_transport_recv_connecting_client(struct sock *sk, destroy: vmci_transport_send_reset(sk, pkt); - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; sk->sk_err = skerr; sk->sk_error_report(sk); return err; @@ -1525,7 +1525,7 @@ static int vmci_transport_recv_connected(struct sock *sk, sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; if (vsock_stream_has_data(vsk) <= 0) - sk->sk_state = SS_DISCONNECTING; + sk->sk_state = TCP_CLOSING; sk->sk_state_change(sk); break; @@ -1789,7 +1789,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) err = vmci_transport_send_conn_request( sk, vmci_trans(vsk)->queue_pair_size); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } } else { @@ -1799,7 +1799,7 @@ static int vmci_transport_connect(struct vsock_sock *vsk) sk, vmci_trans(vsk)->queue_pair_size, supported_proto_versions); if (err < 0) { - sk->sk_state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; return err; } diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c index 1406db4d97d1..41fb427f150a 100644 --- a/net/vmw_vsock/vmci_transport_notify.c +++ b/net/vmw_vsock/vmci_transport_notify.c @@ -355,7 +355,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) { + if (sk->sk_state == TCP_ESTABLISHED) { if (!send_waiting_read(sk, 1)) return -1; diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c index f3a0afc46208..0cc84f2bb05e 100644 --- a/net/vmw_vsock/vmci_transport_notify_qstate.c +++ b/net/vmw_vsock/vmci_transport_notify_qstate.c @@ -176,7 +176,7 @@ vmci_transport_notify_pkt_poll_in(struct sock *sk, * queue. Ask for notifications when there is something to * read. */ - if (sk->sk_state == SS_CONNECTED) + if (sk->sk_state == TCP_ESTABLISHED) vsock_block_update_write_window(sk); *data_ready_now = false; } diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore index c33451b896d9..61cbc304a3d3 100644 --- a/net/wireless/.gitignore +++ b/net/wireless/.gitignore @@ -1 +1,2 @@ -regdb.c +shipped-certs.c +extra-certs.c diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 6c606120abfe..da91bb547db3 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -19,6 +19,7 @@ config WEXT_PRIV config CFG80211 tristate "cfg80211 - wireless configuration API" depends on RFKILL || !RFKILL + select FW_LOADER ---help--- cfg80211 is the Linux wireless LAN (802.11) configuration API. Enable this if you have a wireless device. @@ -82,6 +83,36 @@ config CFG80211_CERTIFICATION_ONUS you are a wireless researcher and are working in a controlled and approved environment by your local regulatory agency. +config CFG80211_REQUIRE_SIGNED_REGDB + bool "require regdb signature" if CFG80211_CERTIFICATION_ONUS + default y + select SYSTEM_DATA_VERIFICATION + help + Require that in addition to the "regulatory.db" file a + "regulatory.db.p7s" can be loaded with a valid PKCS#7 + signature for the regulatory.db file made by one of the + keys in the certs/ directory. 
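In effect this option gates "regulatory.db" on a detached signature: both blobs are loaded with request_firmware(), and the database is only accepted if the PKCS#7 signature verifies against the trusted keys. A simplified sketch of that flow (the real implementation follows in reg.c; the keyring argument stands in for the one built from the certificates below):

#include <linux/firmware.h>
#include <linux/verification.h>

static int load_and_check_regdb(struct device *dev, struct key *trusted)
{
	const struct firmware *db, *sig;
	int err = 0;

	if (request_firmware(&db, "regulatory.db", dev))
		return -ENOENT;
	if (request_firmware(&sig, "regulatory.db.p7s", dev)) {
		release_firmware(db);
		return -ENOENT;
	}

	if (verify_pkcs7_signature(db->data, db->size, sig->data, sig->size,
				   trusted, VERIFYING_UNSPECIFIED_SIGNATURE,
				   NULL, NULL))
		err = -EINVAL;

	release_firmware(sig);
	release_firmware(db);
	return err;
}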
+ +config CFG80211_USE_KERNEL_REGDB_KEYS + bool "allow regdb keys shipped with the kernel" if CFG80211_CERTIFICATION_ONUS + default y + depends on CFG80211_REQUIRE_SIGNED_REGDB + help + Allow the regulatory database to be signed by one of the keys for + which certificates are part of the kernel sources + (in net/wireless/certs/). + + This is currently only Seth Forshee's key, who is the regulatory + database maintainer. + +config CFG80211_EXTRA_REGDB_KEYDIR + string "additional regdb key directory" if CFG80211_CERTIFICATION_ONUS + depends on CFG80211_REQUIRE_SIGNED_REGDB + help + If selected, point to a directory with DER-encoded X.509 + certificates like in the kernel sources (net/wireless/certs/) + that shall be accepted for a signed regulatory database. + config CFG80211_REG_CELLULAR_HINTS bool "cfg80211 regulatory support for cellular base station hints" depends on CFG80211_CERTIFICATION_ONUS @@ -139,35 +170,14 @@ config CFG80211_DEBUGFS If unsure, say N. -config CFG80211_INTERNAL_REGDB - bool "use statically compiled regulatory rules database" if EXPERT - default n - depends on CFG80211 - ---help--- - This option generates an internal data structure representing - the wireless regulatory rules described in net/wireless/db.txt - and includes code to query that database. This is an alternative - to using CRDA for defining regulatory rules for the kernel. - - Using this option requires some parsing of the db.txt at build time, - the parser will be upkept with the latest wireless-regdb updates but - older wireless-regdb formats will be ignored. The parser may later - be replaced to avoid issues with conflicts on versions of - wireless-regdb. - - For details see: - - http://wireless.kernel.org/en/developers/Regulatory - - Most distributions have a CRDA package. So if unsure, say N. - config CFG80211_CRDA_SUPPORT - bool "support CRDA" if CFG80211_INTERNAL_REGDB + bool "support CRDA" if EXPERT default y depends on CFG80211 help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above.) + need for it, for example when using internal regdb (above) or the + database loaded as a firmware file. If unsure, say Y. 
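The Makefile rule below hexdumps every certificate under net/wireless/certs/ into a C byte array, so the generated shipped-certs.c has roughly this shape (bytes illustrative only):

#include "reg.h"

const u8 shipped_regdb_certs[] = {
	0x30, 0x82, 0x03, 0x5a,	/* ...hexdump of certs/sforshee.x509... */
};
unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);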
diff --git a/net/wireless/Makefile b/net/wireless/Makefile index d06e5015751a..e585f3f71f77 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -14,11 +14,27 @@ cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o cfg80211-$(CONFIG_OF) += of.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o -cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o CFLAGS_trace.o := -I$(src) -$(obj)/regdb.c: $(src)/db.txt $(src)/genregdb.awk - @$(AWK) -f $(srctree)/$(src)/genregdb.awk < $< > $@ +cfg80211-$(CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS) += shipped-certs.o +ifneq ($(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR),) +cfg80211-y += extra-certs.o +endif -clean-files := regdb.c +$(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.x509) + @$(kecho) " GEN $@" + @echo '#include "reg.h"' > $@ + @echo 'const u8 shipped_regdb_certs[] = {' >> $@ + @for f in $^ ; do hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ ; done + @echo '};' >> $@ + @echo 'unsigned int shipped_regdb_certs_len = sizeof(shipped_regdb_certs);' >> $@ + +$(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ + $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) + @$(kecho) " GEN $@" + @echo '#include "reg.h"' > $@ + @echo 'const u8 extra_regdb_certs[] = {' >> $@ + @for f in $^ ; do test -f $$f && hexdump -v -e '1/1 "0x%.2x," "\n"' < $$f >> $@ || true ; done + @echo '};' >> $@ + @echo 'unsigned int extra_regdb_certs_len = sizeof(extra_regdb_certs);' >> $@ diff --git a/net/wireless/certs/sforshee.x509 b/net/wireless/certs/sforshee.x509 Binary files differnew file mode 100644 index 000000000000..c6f8f9d6b988 --- /dev/null +++ b/net/wireless/certs/sforshee.x509 diff --git a/net/wireless/chan.c b/net/wireless/chan.c index b8aa5a7d5c77..eb824270f6e3 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -464,7 +464,7 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, struct ieee80211_channel *chan) { int width; - u32 cf_offset, freq; + u32 freq; if (chandef->chan->center_freq == chan->center_freq) return true; @@ -473,8 +473,6 @@ bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, if (width <= 20) return false; - cf_offset = width / 2 - 10; - for (freq = chandef->center_freq1 - width / 2 + 10; freq <= chandef->center_freq1 + width / 2 - 10; freq += 20) { if (chan->center_freq == freq) diff --git a/net/wireless/core.c b/net/wireless/core.c index 7b33e8c366bc..fdde0d98fde1 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1384,7 +1384,7 @@ out_fail_sysfs: out_fail_pernet: return err; } -subsys_initcall(cfg80211_init); +fs_initcall(cfg80211_init); static void __exit cfg80211_exit(void) { diff --git a/net/wireless/core.h b/net/wireless/core.h index 6e809325af3b..35165f42c2a8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -216,6 +216,7 @@ enum cfg80211_event_type { EVENT_DISCONNECTED, EVENT_IBSS_JOINED, EVENT_STOPPED, + EVENT_PORT_AUTHORIZED, }; struct cfg80211_event { @@ -235,6 +236,9 @@ struct cfg80211_event { u8 bssid[ETH_ALEN]; struct ieee80211_channel *channel; } ij; + struct { + u8 bssid[ETH_ALEN]; + } pa; }; }; @@ -385,6 +389,7 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void 
cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/db.txt b/net/wireless/db.txt deleted file mode 100644 index a2fc3a09ccdc..000000000000 --- a/net/wireless/db.txt +++ /dev/null @@ -1,17 +0,0 @@ -# -# This file is a placeholder to prevent accidental build breakage if someone -# enables CONFIG_CFG80211_INTERNAL_REGDB. Almost no one actually needs to -# enable that build option. -# -# You should be using CRDA instead. It is even better if you use the CRDA -# package provided by your distribution, since they will probably keep it -# up-to-date on your behalf. -# -# If you _really_ intend to use CONFIG_CFG80211_INTERNAL_REGDB then you will -# need to replace this file with one containing appropriately formatted -# regulatory rules that cover the regulatory domains you will be using. Your -# best option is to extract the db.txt file from the wireless-regdb git -# repository: -# -# git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-regdb.git -# diff --git a/net/wireless/genregdb.awk b/net/wireless/genregdb.awk deleted file mode 100644 index baf2426b555a..000000000000 --- a/net/wireless/genregdb.awk +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/awk -f -# -# genregdb.awk -- generate regdb.c from db.txt -# -# Actually, it reads from stdin (presumed to be db.txt) and writes -# to stdout (presumed to be regdb.c), but close enough... -# -# Copyright 2009 John W. Linville <linville@tuxdriver.com> -# -# Permission to use, copy, modify, and/or distribute this software for any -# purpose with or without fee is hereby granted, provided that the above -# copyright notice and this permission notice appear in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -BEGIN { - active = 0 - rules = 0; - print "/*" - print " * DO NOT EDIT -- file generated from data in db.txt" - print " */" - print "" - print "#include <linux/nl80211.h>" - print "#include <net/cfg80211.h>" - print "#include \"regdb.h\"" - print "" - regdb = "const struct ieee80211_regdomain *reg_regdb[] = {\n" -} - -function parse_country_head() { - country=$2 - sub(/:/, "", country) - printf "static const struct ieee80211_regdomain regdom_%s = {\n", country - printf "\t.alpha2 = \"%s\",\n", country - if ($NF ~ /DFS-ETSI/) - printf "\t.dfs_region = NL80211_DFS_ETSI,\n" - else if ($NF ~ /DFS-FCC/) - printf "\t.dfs_region = NL80211_DFS_FCC,\n" - else if ($NF ~ /DFS-JP/) - printf "\t.dfs_region = NL80211_DFS_JP,\n" - printf "\t.reg_rules = {\n" - active = 1 - regdb = regdb "\t®dom_" country ",\n" -} - -function parse_reg_rule() -{ - flag_starts_at = 7 - - start = $1 - sub(/\(/, "", start) - end = $3 - bw = $5 - sub(/\),/, "", bw) - gain = 0 - power = $6 - # power might be in mW... 
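The only arithmetic in the deleted parser worth noting: wireless-regdb can state power limits in mW, which the awk converted to dBm as 10*log(power)/log(10). The same conversion as a C helper (illustrative; the awk above is what actually ran at build time):

#include <math.h>

/* 10 * log10(mW): e.g. 100 mW -> 20 dBm, 1000 mW -> 30 dBm */
static double mw_to_dbm(double mw)
{
	return 10.0 * log10(mw);
}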
- units = $7 - dfs_cac = 0 - - sub(/\(/, "", power) - sub(/\),/, "", power) - sub(/\),/, "", units) - sub(/\)/, "", units) - - if (units == "mW") { - flag_starts_at = 8 - power = 10 * log(power)/log(10) - if ($8 ~ /[[:digit:]]/) { - flag_starts_at = 9 - dfs_cac = $8 - } - } else { - if ($7 ~ /[[:digit:]]/) { - flag_starts_at = 8 - dfs_cac = $7 - } - } - sub(/\(/, "", dfs_cac) - sub(/\),/, "", dfs_cac) - flagstr = "" - for (i=flag_starts_at; i<=NF; i++) - flagstr = flagstr $i - split(flagstr, flagarray, ",") - flags = "" - for (arg in flagarray) { - if (flagarray[arg] == "NO-OFDM") { - flags = flags "\n\t\t\tNL80211_RRF_NO_OFDM | " - } else if (flagarray[arg] == "NO-CCK") { - flags = flags "\n\t\t\tNL80211_RRF_NO_CCK | " - } else if (flagarray[arg] == "NO-INDOOR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_INDOOR | " - } else if (flagarray[arg] == "NO-OUTDOOR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_OUTDOOR | " - } else if (flagarray[arg] == "DFS") { - flags = flags "\n\t\t\tNL80211_RRF_DFS | " - } else if (flagarray[arg] == "PTP-ONLY") { - flags = flags "\n\t\t\tNL80211_RRF_PTP_ONLY | " - } else if (flagarray[arg] == "PTMP-ONLY") { - flags = flags "\n\t\t\tNL80211_RRF_PTMP_ONLY | " - } else if (flagarray[arg] == "PASSIVE-SCAN") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "NO-IBSS") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "NO-IR") { - flags = flags "\n\t\t\tNL80211_RRF_NO_IR | " - } else if (flagarray[arg] == "AUTO-BW") { - flags = flags "\n\t\t\tNL80211_RRF_AUTO_BW | " - } - - } - flags = flags "0" - printf "\t\tREG_RULE_EXT(%d, %d, %d, %d, %.0f, %d, %s),\n", start, end, bw, gain, power, dfs_cac, flags - rules++ -} - -function print_tail_country() -{ - active = 0 - printf "\t},\n" - printf "\t.n_reg_rules = %d\n", rules - printf "};\n\n" - rules = 0; -} - -/^[ \t]*#/ { - # Ignore -} - -!active && /^[ \t]*$/ { - # Ignore -} - -!active && /country/ { - parse_country_head() -} - -active && /^[ \t]*\(/ { - parse_reg_rule() -} - -active && /^[ \t]*$/ { - print_tail_country() -} - -END { - if (active) - print_tail_country() - print regdb "};" - print "" - print "int reg_regdb_size = ARRAY_SIZE(reg_regdb);" -} diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 690874293cfc..fce2cbe6a193 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, }; +/* policy for packet pattern attributes */ +static const struct nla_policy +nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { + [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -2122,6 +2130,15 @@ static int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, case NL80211_CHAN_HT40MINUS: cfg80211_chandef_create(chandef, chandef->chan, chantype); + /* user input for center_freq is incorrect */ + if (info->attrs[NL80211_ATTR_CENTER_FREQ1] && + chandef->center_freq1 != nla_get_u32( + info->attrs[NL80211_ATTR_CENTER_FREQ1])) + return -EINVAL; + /* center_freq2 must be zero */ + if (info->attrs[NL80211_ATTR_CENTER_FREQ2] && + nla_get_u32(info->attrs[NL80211_ATTR_CENTER_FREQ2])) + return -EINVAL; break; default: return -EINVAL; @@ -5669,6 +5686,11 @@ static int 
nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) } } +static int nl80211_reload_regdb(struct sk_buff *skb, struct genl_info *info) +{ + return reg_reload_regdb(); +} + static int nl80211_get_mesh_config(struct sk_buff *skb, struct genl_info *info) { @@ -6610,6 +6632,77 @@ static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) return regulatory_pre_cac_allowed(wdev->wiphy); } +static int +nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, + void *request, struct nlattr **attrs, + bool is_sched_scan) +{ + u8 *mac_addr, *mac_addr_mask; + u32 *flags; + enum nl80211_feature_flags randomness_flag; + + if (!attrs[NL80211_ATTR_SCAN_FLAGS]) + return 0; + + if (is_sched_scan) { + struct cfg80211_sched_scan_request *req = request; + + randomness_flag = wdev ? + NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR : + NL80211_FEATURE_ND_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } else { + struct cfg80211_scan_request *req = request; + + randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; + flags = &req->flags; + mac_addr = req->mac_addr; + mac_addr_mask = req->mac_addr_mask; + } + + *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); + + if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && + !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) + return -EOPNOTSUPP; + + if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { + int err; + + if (!(wiphy->features & randomness_flag) || + (wdev && wdev->current_bss)) + return -EOPNOTSUPP; + + err = nl80211_parse_random_mac(attrs, mac_addr, mac_addr_mask); + if (err) + return err; + } + + if ((*flags & NL80211_SCAN_FLAG_FILS_MAX_CHANNEL_TIME) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_FILS_MAX_CHANNEL_TIME)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_ACCEPT_BCAST_PROBE_RESP)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION)) + return -EOPNOTSUPP; + + if ((*flags & NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_OCE_PROBE_REQ_HIGH_TX_RATE)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6815,34 +6908,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); } - if (info->attrs[NL80211_ATTR_SCAN_FLAGS]) { - request->flags = nla_get_u32( - info->attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - if (!(wiphy->features & - NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(info->attrs, - request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, info->attrs, + false); + if (err) + goto out_free; request->no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); @@ -7290,37 +7359,9 @@ 
nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->ie_len); } - if (attrs[NL80211_ATTR_SCAN_FLAGS]) { - request->flags = nla_get_u32( - attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((request->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { - u32 flg = NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR; - - if (!wdev) /* must be net-detect */ - flg = NL80211_FEATURE_ND_RANDOM_MAC_ADDR; - - if (!(wiphy->features & flg)) { - err = -EOPNOTSUPP; - goto out_free; - } - - if (wdev && wdev->current_bss) { - err = -EOPNOTSUPP; - goto out_free; - } - - err = nl80211_parse_random_mac(attrs, request->mac_addr, - request->mac_addr_mask); - if (err) - goto out_free; - } - } + err = nl80211_check_scan_flags(wiphy, wdev, request, attrs, true); + if (err) + goto out_free; if (attrs[NL80211_ATTR_SCHED_SCAN_DELAY]) request->delay = @@ -8924,8 +8965,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_USE_MFP]) { connect.mfp = nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); + if (connect.mfp == NL80211_MFP_OPTIONAL && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MFP_OPTIONAL)) + return -EOPNOTSUPP; + if (connect.mfp != NL80211_MFP_REQUIRED && - connect.mfp != NL80211_MFP_NO) + connect.mfp != NL80211_MFP_NO && + connect.mfp != NL80211_MFP_OPTIONAL) return -EINVAL; } else { connect.mfp = NL80211_MFP_NO; @@ -10532,7 +10579,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - NULL, info->extack); + nl80211_packet_pattern_policy, + info->extack); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -10781,7 +10829,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL); + nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; @@ -12675,6 +12724,12 @@ static const struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, }, { + .cmd = NL80211_CMD_RELOAD_REGDB, + .doit = nl80211_reload_regdb, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, + { .cmd = NL80211_CMD_GET_MESH_CONFIG, .doit = nl80211_get_mesh_config, .policy = nl80211_policy, @@ -13802,9 +13857,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, info->req_ie)) || (info->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, - info->resp_ie)) || - (info->authorized && - nla_put_flag(msg, NL80211_ATTR_PORT_AUTHORIZED))) + info->resp_ie))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13818,6 +13871,36 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_PORT_AUTHORIZED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + 
NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + nlmsg_free(msg); +} + void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b96933322077..bf9e772a30b9 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -58,6 +58,8 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); +void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, + struct net_device *netdev, const u8 *bssid); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 6e94f6934a0e..3871998059de 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -53,12 +53,13 @@ #include <linux/ctype.h> #include <linux/nl80211.h> #include <linux/platform_device.h> +#include <linux/verification.h> #include <linux/moduleparam.h> +#include <linux/firmware.h> #include <net/cfg80211.h> #include "core.h" #include "reg.h" #include "rdev-ops.h" -#include "regdb.h" #include "nl80211.h" /* @@ -100,7 +101,7 @@ static struct regulatory_request core_request_world = { static struct regulatory_request __rcu *last_request = (void __force __rcu *)&core_request_world; -/* To trigger userspace events */ +/* To trigger userspace events and load firmware */ static struct platform_device *reg_pdev; /* @@ -443,7 +444,6 @@ reg_copy_regd(const struct ieee80211_regdomain *src_regd) return regd; } -#ifdef CONFIG_CFG80211_INTERNAL_REGDB struct reg_regdb_apply_request { struct list_head list; const struct ieee80211_regdomain *regdom; @@ -475,55 +475,26 @@ static void reg_regdb_apply(struct work_struct *work) static DECLARE_WORK(reg_regdb_work, reg_regdb_apply); -static int reg_query_builtin(const char *alpha2) +static int reg_schedule_apply(const struct ieee80211_regdomain *regdom) { - const struct ieee80211_regdomain *regdom = NULL; struct reg_regdb_apply_request *request; - unsigned int i; - - for (i = 0; i < reg_regdb_size; i++) { - if (alpha2_equal(alpha2, reg_regdb[i]->alpha2)) { - regdom = reg_regdb[i]; - break; - } - } - - if (!regdom) - return -ENODATA; request = kzalloc(sizeof(struct reg_regdb_apply_request), GFP_KERNEL); - if (!request) - return -ENOMEM; - - request->regdom = reg_copy_regd(regdom); - if (IS_ERR_OR_NULL(request->regdom)) { - kfree(request); + if (!request) { + kfree(regdom); return -ENOMEM; } + request->regdom = regdom; + mutex_lock(®_regdb_apply_mutex); list_add_tail(&request->list, ®_regdb_apply_list); mutex_unlock(®_regdb_apply_mutex); schedule_work(®_regdb_work); - return 0; } -/* Feel free to add any other sanity checks here */ -static void reg_regdb_size_check(void) -{ - /* We should ideally BUILD_BUG_ON() but then random builds would fail */ - WARN_ONCE(!reg_regdb_size, "db.txt is empty, you should update it..."); -} -#else -static inline void reg_regdb_size_check(void) {} -static inline int reg_query_builtin(const char *alpha2) -{ - return -ENODATA; -} -#endif /* CONFIG_CFG80211_INTERNAL_REGDB */ - #ifdef CONFIG_CFG80211_CRDA_SUPPORT /* Max number of consecutive attempts to communicate with CRDA */ #define REG_MAX_CRDA_TIMEOUTS 10 @@ -599,10 +570,402 @@ static inline int 
call_crda(const char *alpha2) } #endif /* CONFIG_CFG80211_CRDA_SUPPORT */ +/* code to directly load a firmware database through request_firmware */ +static const struct fwdb_header *regdb; + +struct fwdb_country { + u8 alpha2[2]; + __be16 coll_ptr; + /* this struct cannot be extended */ +} __packed __aligned(4); + +struct fwdb_collection { + u8 len; + u8 n_rules; + u8 dfs_region; + /* no optional data yet */ + /* aligned to 2, then followed by __be16 array of rule pointers */ +} __packed __aligned(4); + +enum fwdb_flags { + FWDB_FLAG_NO_OFDM = BIT(0), + FWDB_FLAG_NO_OUTDOOR = BIT(1), + FWDB_FLAG_DFS = BIT(2), + FWDB_FLAG_NO_IR = BIT(3), + FWDB_FLAG_AUTO_BW = BIT(4), +}; + +struct fwdb_rule { + u8 len; + u8 flags; + __be16 max_eirp; + __be32 start, end, max_bw; + /* start of optional data */ + __be16 cac_timeout; +} __packed __aligned(4); + +#define FWDB_MAGIC 0x52474442 +#define FWDB_VERSION 20 + +struct fwdb_header { + __be32 magic; + __be32 version; + struct fwdb_country country[]; +} __packed __aligned(4); + +static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) +{ + struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); + + if ((u8 *)rule + sizeof(rule->len) > data + size) + return false; + + /* mandatory fields */ + if (rule->len < offsetofend(struct fwdb_rule, max_bw)) + return false; + + return true; +} + +static bool valid_country(const u8 *data, unsigned int size, + const struct fwdb_country *country) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)(data + ptr); + __be16 *rules_ptr; + unsigned int i; + + /* make sure we can read len/n_rules */ + if ((u8 *)coll + offsetofend(typeof(*coll), n_rules) > data + size) + return false; + + /* make sure base struct and all rules fit */ + if ((u8 *)coll + ALIGN(coll->len, 2) + + (coll->n_rules * 2) > data + size) + return false; + + /* mandatory fields must exist */ + if (coll->len < offsetofend(struct fwdb_collection, dfs_region)) + return false; + + rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + + for (i = 0; i < coll->n_rules; i++) { + u16 rule_ptr = be16_to_cpu(rules_ptr[i]); + + if (!valid_rule(data, size, rule_ptr)) + return false; + } + + return true; +} + +#ifdef CONFIG_CFG80211_REQUIRE_SIGNED_REGDB +static struct key *builtin_regdb_keys; + +static void __init load_keys_from_buffer(const u8 *p, unsigned int buflen) +{ + const u8 *end = p + buflen; + size_t plen; + key_ref_t key; + + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. 
+ */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 || + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(builtin_regdb_keys, 1), + "asymmetric", NULL, p, plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); +} + +static int __init load_builtin_regdb_keys(void) +{ + builtin_regdb_keys = + keyring_alloc(".builtin_regdb_keys", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(builtin_regdb_keys)) + return PTR_ERR(builtin_regdb_keys); + + pr_notice("Loading compiled-in X.509 certificates for regulatory database\n"); + +#ifdef CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS + load_keys_from_buffer(shipped_regdb_certs, shipped_regdb_certs_len); +#endif +#ifdef CONFIG_CFG80211_EXTRA_REGDB_KEYDIR + if (CONFIG_CFG80211_EXTRA_REGDB_KEYDIR[0] != '\0') + load_keys_from_buffer(extra_regdb_certs, extra_regdb_certs_len); +#endif + + return 0; +} + +static bool regdb_has_valid_signature(const u8 *data, unsigned int size) +{ + const struct firmware *sig; + bool result; + + if (request_firmware(&sig, "regulatory.db.p7s", &reg_pdev->dev)) + return false; + + result = verify_pkcs7_signature(data, size, sig->data, sig->size, + builtin_regdb_keys, + VERIFYING_UNSPECIFIED_SIGNATURE, + NULL, NULL) == 0; + + release_firmware(sig); + + return result; +} + +static void free_regdb_keyring(void) +{ + key_put(builtin_regdb_keys); +} +#else +static int load_builtin_regdb_keys(void) +{ + return 0; +} + +static bool regdb_has_valid_signature(const u8 *data, unsigned int size) +{ + return true; +} + +static void free_regdb_keyring(void) +{ +} +#endif /* CONFIG_CFG80211_REQUIRE_SIGNED_REGDB */ + +static bool valid_regdb(const u8 *data, unsigned int size) +{ + const struct fwdb_header *hdr = (void *)data; + const struct fwdb_country *country; + + if (size < sizeof(*hdr)) + return false; + + if (hdr->magic != cpu_to_be32(FWDB_MAGIC)) + return false; + + if (hdr->version != cpu_to_be32(FWDB_VERSION)) + return false; + + if (!regdb_has_valid_signature(data, size)) + return false; + + country = &hdr->country[0]; + while ((u8 *)(country + 1) <= data + size) { + if (!country->coll_ptr) + break; + if (!valid_country(data, size, country)) + return false; + country++; + } + + return true; +} + +static int regdb_query_country(const struct fwdb_header *db, + const struct fwdb_country *country) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)((u8 *)db + ptr); + struct ieee80211_regdomain *regdom; + unsigned int size_of_regd; + unsigned int i; + + size_of_regd = + sizeof(struct ieee80211_regdomain) + + coll->n_rules * sizeof(struct ieee80211_reg_rule); + + regdom = kzalloc(size_of_regd, GFP_KERNEL); + if (!regdom) + return -ENOMEM; + + regdom->n_reg_rules = coll->n_rules; + regdom->alpha2[0] = country->alpha2[0]; + regdom->alpha2[1] = country->alpha2[1]; + regdom->dfs_region = coll->dfs_region; + + for (i = 0; i <
regdom->n_reg_rules; i++) { + __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; + struct fwdb_rule *rule = (void *)((u8 *)db + rule_ptr); + struct ieee80211_reg_rule *rrule = &regdom->reg_rules[i]; + + rrule->freq_range.start_freq_khz = be32_to_cpu(rule->start); + rrule->freq_range.end_freq_khz = be32_to_cpu(rule->end); + rrule->freq_range.max_bandwidth_khz = be32_to_cpu(rule->max_bw); + + rrule->power_rule.max_antenna_gain = 0; + rrule->power_rule.max_eirp = be16_to_cpu(rule->max_eirp); + + rrule->flags = 0; + if (rule->flags & FWDB_FLAG_NO_OFDM) + rrule->flags |= NL80211_RRF_NO_OFDM; + if (rule->flags & FWDB_FLAG_NO_OUTDOOR) + rrule->flags |= NL80211_RRF_NO_OUTDOOR; + if (rule->flags & FWDB_FLAG_DFS) + rrule->flags |= NL80211_RRF_DFS; + if (rule->flags & FWDB_FLAG_NO_IR) + rrule->flags |= NL80211_RRF_NO_IR; + if (rule->flags & FWDB_FLAG_AUTO_BW) + rrule->flags |= NL80211_RRF_AUTO_BW; + + rrule->dfs_cac_ms = 0; + + /* handle optional data */ + if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) + rrule->dfs_cac_ms = + 1000 * be16_to_cpu(rule->cac_timeout); + } + + return reg_schedule_apply(regdom); +} + +static int query_regdb(const char *alpha2) +{ + const struct fwdb_header *hdr = regdb; + const struct fwdb_country *country; + + ASSERT_RTNL(); + + if (IS_ERR(regdb)) + return PTR_ERR(regdb); + + country = &hdr->country[0]; + while (country->coll_ptr) { + if (alpha2_equal(alpha2, country->alpha2)) + return regdb_query_country(regdb, country); + country++; + } + + return -ENODATA; +} + +static void regdb_fw_cb(const struct firmware *fw, void *context) +{ + int set_error = 0; + bool restore = true; + void *db; + + if (!fw) { + pr_info("failed to load regulatory.db\n"); + set_error = -ENODATA; + } else if (!valid_regdb(fw->data, fw->size)) { + pr_info("loaded regulatory.db is malformed or signature is missing/invalid\n"); + set_error = -EINVAL; + } + + rtnl_lock(); + if (WARN_ON(regdb && !IS_ERR(regdb))) { + /* just restore and free new db */ + } else if (set_error) { + regdb = ERR_PTR(set_error); + } else if (fw) { + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (db) { + regdb = db; + restore = context && query_regdb(context); + } else { + restore = true; + } + } + + if (restore) + restore_regulatory_settings(true); + + rtnl_unlock(); + + kfree(context); + + release_firmware(fw); +} + +static int query_regdb_file(const char *alpha2) +{ + ASSERT_RTNL(); + + if (regdb) + return query_regdb(alpha2); + + alpha2 = kmemdup(alpha2, 2, GFP_KERNEL); + if (!alpha2) + return -ENOMEM; + + return request_firmware_nowait(THIS_MODULE, true, "regulatory.db", + &reg_pdev->dev, GFP_KERNEL, + (void *)alpha2, regdb_fw_cb); +} + +int reg_reload_regdb(void) +{ + const struct firmware *fw; + void *db; + int err; + + err = request_firmware(&fw, "regulatory.db", &reg_pdev->dev); + if (err) + return err; + + if (!valid_regdb(fw->data, fw->size)) { + err = -ENODATA; + goto out; + } + + db = kmemdup(fw->data, fw->size, GFP_KERNEL); + if (!db) { + err = -ENOMEM; + goto out; + } + + rtnl_lock(); + if (!IS_ERR_OR_NULL(regdb)) + kfree(regdb); + regdb = db; + rtnl_unlock(); + + out: + release_firmware(fw); + return err; +} + static bool reg_query_database(struct regulatory_request *request) { - /* query internal regulatory database (if it exists) */ - if (reg_query_builtin(request->alpha2) == 0) + if (query_regdb_file(request->alpha2) == 0) return true; if (call_crda(request->alpha2) == 0) @@ -3285,6 +3648,10 @@ int __init regulatory_init(void) {
int err = 0; + err = load_builtin_regdb_keys(); + if (err) + return err; + reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); if (IS_ERR(reg_pdev)) return PTR_ERR(reg_pdev); @@ -3293,8 +3660,6 @@ int __init regulatory_init(void) spin_lock_init(&reg_pending_beacons_lock); spin_lock_init(&reg_indoor_lock); - reg_regdb_size_check(); - rcu_assign_pointer(cfg80211_regdomain, cfg80211_world_regdom); user_alpha2[0] = '9'; @@ -3360,4 +3725,9 @@ void regulatory_exit(void) list_del(&reg_request->list); kfree(reg_request); } + + if (!IS_ERR_OR_NULL(regdb)) + kfree(regdb); + + free_regdb_keyring(); } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index ca7fedf2e7a1..9ceeb5f3a7cb 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -1,5 +1,8 @@ #ifndef __NET_WIRELESS_REG_H #define __NET_WIRELESS_REG_H + +#include <net/cfg80211.h> + /* * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * @@ -179,4 +182,15 @@ void regulatory_propagate_dfs_state(struct wiphy *wiphy, * @wiphy2 - wiphy whose dfs_region is to be checked against that of wiphy1 */ bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2); + +/** + * reg_reload_regdb - reload the regulatory.db firmware file + */ +int reg_reload_regdb(void); + +extern const u8 shipped_regdb_certs[]; +extern unsigned int shipped_regdb_certs_len; +extern const u8 extra_regdb_certs[]; +extern unsigned int extra_regdb_certs_len; + #endif /* __NET_WIRELESS_REG_H */ diff --git a/net/wireless/regdb.h b/net/wireless/regdb.h deleted file mode 100644 index 3279cfcefb0c..000000000000 --- a/net/wireless/regdb.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __REGDB_H__ -#define __REGDB_H__ - -/* - * Copyright 2009 John W. Linville <linville@tuxdriver.com> - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */ - -extern const struct ieee80211_regdomain *reg_regdb[]; -extern int reg_regdb_size; - -#endif /* __REGDB_H__ */ diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 0a49b88070d0..f38ed490e42b 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -960,7 +960,6 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); ev->rm.bss = info->bss; - ev->rm.authorized = info->authorized; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -969,6 +968,50 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, } EXPORT_SYMBOL(cfg80211_roamed); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +{ + ASSERT_WDEV_LOCK(wdev); + + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(!wdev->current_bss) || + WARN_ON(!ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + return; + + nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, + bssid); +} + +void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_event *ev; + unsigned long flags; + + if (WARN_ON(!bssid)) + return; + + ev = kzalloc(sizeof(*ev), gfp); + if (!ev) + return; + + ev->type = EVENT_PORT_AUTHORIZED; + memcpy(ev->pa.bssid, bssid, ETH_ALEN); + + /* + * Use the wdev event list so that if there are pending + * connected/roamed events, they will be reported first. + */ + spin_lock_irqsave(&wdev->event_lock, flags); + list_add_tail(&ev->list, &wdev->event_list); + spin_unlock_irqrestore(&wdev->event_lock, flags); + queue_work(cfg80211_wq, &rdev->event_work); +} +EXPORT_SYMBOL(cfg80211_port_authorized); + void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { diff --git a/net/wireless/util.c b/net/wireless/util.c index bcb1284c3415..ff21c314a609 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -157,32 +157,30 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) case NL80211_BAND_2GHZ: want = 7; for (i = 0; i < sband->n_bitrates; i++) { - if (sband->bitrates[i].bitrate == 10) { + switch (sband->bitrates[i].bitrate) { + case 10: + case 20: + case 55: + case 110: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; want--; - } - - if (sband->bitrates[i].bitrate == 20 || - sband->bitrates[i].bitrate == 55 || - sband->bitrates[i].bitrate == 110 || - sband->bitrates[i].bitrate == 60 || - sband->bitrates[i].bitrate == 120 || - sband->bitrates[i].bitrate == 240) { + break; + case 60: + case 120: + case 240: sband->bitrates[i].flags |= IEEE80211_RATE_MANDATORY_G; want--; - } - - if (sband->bitrates[i].bitrate != 10 && - sband->bitrates[i].bitrate != 20 && - sband->bitrates[i].bitrate != 55 && - sband->bitrates[i].bitrate != 110) + /* fall through */ + default: sband->bitrates[i].flags |= IEEE80211_RATE_ERP_G; + break; + } } - WARN_ON(want != 0 && want != 3 && want != 6); + WARN_ON(want != 0 && want != 3); break; case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ @@ -529,121 +527,6 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, } EXPORT_SYMBOL(ieee80211_data_to_8023_exthdr); -int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, 
- enum nl80211_iftype iftype, - const u8 *bssid, bool qos) -{ - struct ieee80211_hdr hdr; - u16 hdrlen, ethertype; - __le16 fc; - const u8 *encaps_data; - int encaps_len, skip_header_bytes; - int nh_pos, h_pos; - int head_need; - - if (unlikely(skb->len < ETH_HLEN)) - return -EINVAL; - - nh_pos = skb_network_header(skb) - skb->data; - h_pos = skb_transport_header(skb) - skb->data; - - /* convert Ethernet header to proper 802.11 header (based on - * operation mode) */ - ethertype = (skb->data[12] << 8) | skb->data[13]; - fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_DATA); - - switch (iftype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_P2P_GO: - fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); - /* DA BSSID SA */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, addr, ETH_ALEN); - memcpy(hdr.addr3, skb->data + ETH_ALEN, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - fc |= cpu_to_le16(IEEE80211_FCTL_TODS); - /* BSSID SA DA */ - memcpy(hdr.addr1, bssid, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, skb->data, ETH_ALEN); - hdrlen = 24; - break; - case NL80211_IFTYPE_OCB: - case NL80211_IFTYPE_ADHOC: - /* DA SA BSSID */ - memcpy(hdr.addr1, skb->data, ETH_ALEN); - memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); - memcpy(hdr.addr3, bssid, ETH_ALEN); - hdrlen = 24; - break; - default: - return -EOPNOTSUPP; - } - - if (qos) { - fc |= cpu_to_le16(IEEE80211_STYPE_QOS_DATA); - hdrlen += 2; - } - - hdr.frame_control = fc; - hdr.duration_id = 0; - hdr.seq_ctrl = 0; - - skip_header_bytes = ETH_HLEN; - if (ethertype == ETH_P_AARP || ethertype == ETH_P_IPX) { - encaps_data = bridge_tunnel_header; - encaps_len = sizeof(bridge_tunnel_header); - skip_header_bytes -= 2; - } else if (ethertype >= ETH_P_802_3_MIN) { - encaps_data = rfc1042_header; - encaps_len = sizeof(rfc1042_header); - skip_header_bytes -= 2; - } else { - encaps_data = NULL; - encaps_len = 0; - } - - skb_pull(skb, skip_header_bytes); - nh_pos -= skip_header_bytes; - h_pos -= skip_header_bytes; - - head_need = hdrlen + encaps_len - skb_headroom(skb); - - if (head_need > 0 || skb_cloned(skb)) { - head_need = max(head_need, 0); - if (head_need) - skb_orphan(skb); - - if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC)) - return -ENOMEM; - } - - if (encaps_data) { - memcpy(skb_push(skb, encaps_len), encaps_data, encaps_len); - nh_pos += encaps_len; - h_pos += encaps_len; - } - - memcpy(skb_push(skb, hdrlen), &hdr, hdrlen); - - nh_pos += hdrlen; - h_pos += hdrlen; - - /* Update skb pointers to various headers since this modified frame - * is going to go through Linux networking code that may potentially - * need things like pointer to IP header. 
*/ - skb_reset_mac_header(skb); - skb_set_network_header(skb, nh_pos); - skb_set_transport_header(skb, h_pos); - - return 0; -} -EXPORT_SYMBOL(ieee80211_data_from_8023); - static void __frame_add_frag(struct sk_buff *skb, struct page *page, void *ptr, int len, int size) @@ -963,6 +846,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) case EVENT_STOPPED: __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; + case EVENT_PORT_AUTHORIZED: + __cfg80211_port_authorized(wdev, ev->pa.bssid); + break; } wdev_unlock(wdev); @@ -1367,13 +1253,29 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len, } EXPORT_SYMBOL(cfg80211_get_p2p_attr); -static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id) +static bool ieee80211_id_in_list(const u8 *ids, int n_ids, u8 id, bool id_ext) { int i; - for (i = 0; i < n_ids; i++) - if (ids[i] == id) + /* Make sure array values are legal */ + if (WARN_ON(ids[n_ids - 1] == WLAN_EID_EXTENSION)) + return false; + + i = 0; + while (i < n_ids) { + if (ids[i] == WLAN_EID_EXTENSION) { + if (id_ext && (ids[i + 1] == id)) + return true; + + i += 2; + continue; + } + + if (ids[i] == id && !id_ext) return true; + + i++; + } return false; } @@ -1403,14 +1305,36 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, { size_t pos = offset; - while (pos < ielen && ieee80211_id_in_list(ids, n_ids, ies[pos])) { + while (pos < ielen) { + u8 ext = 0; + + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(ids, n_ids, ies[pos + ext], + ies[pos] == WLAN_EID_EXTENSION)) + break; + if (ies[pos] == WLAN_EID_RIC_DATA && n_after_ric) { pos = skip_ie(ies, ielen, pos); - while (pos < ielen && - !ieee80211_id_in_list(after_ric, n_after_ric, - ies[pos])) - pos = skip_ie(ies, ielen, pos); + while (pos < ielen) { + if (ies[pos] == WLAN_EID_EXTENSION) + ext = 2; + else + ext = 0; + + if ((pos + ext) >= ielen) + break; + + if (!ieee80211_id_in_list(after_ric, + n_after_ric, + ies[pos + ext], + ext == 2)) + pos = skip_ie(ies, ielen, pos); + else + break; + } } else { pos = skip_ie(ies, ielen, pos); } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ac095936552d..ea87143314f3 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -374,9 +374,11 @@ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills.
*/ -static void x25_destroy_timer(unsigned long data) +static void x25_destroy_timer(struct timer_list *t) { - x25_destroy_socket_from_timer((struct sock *)data); + struct sock *sk = from_timer(sk, t, sk_timer); + + x25_destroy_socket_from_timer(sk); } /* @@ -413,8 +415,7 @@ static void __x25_destroy_socket(struct sock *sk) if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; - sk->sk_timer.function = x25_destroy_timer; - sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 5c5db1a36399..1dfba3c23459 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -26,18 +26,17 @@ #include <net/tcp_states.h> #include <net/x25.h> -static void x25_heartbeat_expiry(unsigned long); -static void x25_timer_expiry(unsigned long); +static void x25_heartbeat_expiry(struct timer_list *t); +static void x25_timer_expiry(struct timer_list *t); void x25_init_timers(struct sock *sk) { struct x25_sock *x25 = x25_sk(sk); - setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); + timer_setup(&x25->timer, x25_timer_expiry, 0); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; - sk->sk_timer.function = &x25_heartbeat_expiry; + sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry; } void x25_start_heartbeat(struct sock *sk) @@ -93,9 +92,9 @@ unsigned long x25_display_timer(struct sock *sk) return x25->timer.expires - jiffies; } -static void x25_heartbeat_expiry(unsigned long param) +static void x25_heartbeat_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct sock *sk = from_timer(sk, t, sk_timer); bh_lock_sock(sk); if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ @@ -160,9 +159,10 @@ static inline void x25_do_timer_expiry(struct sock * sk) } } -static void x25_timer_expiry(unsigned long param) +static void x25_timer_expiry(struct timer_list *t) { - struct sock *sk = (struct sock *)param; + struct x25_sock *x25 = from_timer(x25, t, timer); + struct sock *sk = &x25->sk; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */ diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index acf00104ef31..30e5746085b8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + xso->dev = NULL; dev_put(dev); return 0; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2515cd2bc5db..8ac9d32fb79d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -429,7 +429,8 @@ resume: nf_reset(skb); if (decaps) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -440,7 +441,8 @@ resume: err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index f06253969972..4838329bb43a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -57,7 +57,7 @@ static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct 
dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); -static void xfrm_policy_queue_process(unsigned long arg); +static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, @@ -179,9 +179,9 @@ static inline unsigned long make_jiffies(long secs) return secs*HZ; } -static void xfrm_policy_timer(unsigned long data) +static void xfrm_policy_timer(struct timer_list *t) { - struct xfrm_policy *xp = (struct xfrm_policy *)data; + struct xfrm_policy *xp = from_timer(xp, t, timer); unsigned long now = get_seconds(); long next = LONG_MAX; int warn = 0; @@ -267,10 +267,9 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); - setup_timer(&policy->timer, xfrm_policy_timer, - (unsigned long)policy); - setup_timer(&policy->polq.hold_timer, xfrm_policy_queue_process, - (unsigned long)policy); + timer_setup(&policy->timer, xfrm_policy_timer, 0); + timer_setup(&policy->polq.hold_timer, + xfrm_policy_queue_process, 0); } return policy; } @@ -1852,12 +1851,12 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, return xdst; } -static void xfrm_policy_queue_process(unsigned long arg) +static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; - struct xfrm_policy *pol = (struct xfrm_policy *)arg; + struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0dab1cd79ce4..12213477cd3a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -732,12 +732,12 @@ restart: } } } +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) { err = 0; xfrm_policy_cache_flush(); } -out: - spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_flush); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..b997f1395357 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) { x->km.state = XFRM_STATE_DEAD; + xfrm_dev_state_delete(x); __xfrm_state_put(x); goto out; }
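The fwdb_* definitions in the reg.c hunks above describe regulatory.db as a big-endian file: a header carrying the "RGDB" magic and version 20, a country table terminated by an entry with a zero collection pointer, and collections and rules addressed by offsets counted in 4-byte words. As an illustrative userspace sketch only (structs restated locally, signature verification omitted, file name taken from the diff), a tool that lists the countries in such a file could look like this:

/* fwdb_dump.c - list the country entries of a regulatory.db file.
 * Illustrative sketch; mirrors the fwdb_header/fwdb_country layout above.
 * Build: cc -o fwdb_dump fwdb_dump.c    Run: ./fwdb_dump regulatory.db
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FWDB_MAGIC   0x52474442u  /* "RGDB" */
#define FWDB_VERSION 20u

struct fwdb_header {
	uint32_t magic;    /* __be32 on disk */
	uint32_t version;  /* __be32 on disk */
} __attribute__((packed, aligned(4)));

struct fwdb_country {
	uint8_t alpha2[2];
	uint16_t coll_ptr; /* __be16; collection offset in 4-byte words */
} __attribute__((packed, aligned(4)));

int main(int argc, char **argv)
{
	const struct fwdb_header *hdr;
	const struct fwdb_country *country;
	unsigned char *data;
	long size;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;

	fseek(f, 0, SEEK_END);
	size = ftell(f);
	rewind(f);

	data = malloc(size);
	if (!data || fread(data, 1, size, f) != (size_t)size)
		return 1;
	fclose(f);

	hdr = (const void *)data;
	if (size < (long)sizeof(*hdr) ||
	    be32toh(hdr->magic) != FWDB_MAGIC ||
	    be32toh(hdr->version) != FWDB_VERSION) {
		fprintf(stderr, "not a version-20 fwdb file\n");
		return 1;
	}

	/* The country array starts right after the header and ends at
	 * the first entry whose collection pointer is zero. */
	country = (const void *)(data + sizeof(*hdr));
	while ((const unsigned char *)(country + 1) <= data + size &&
	       country->coll_ptr) {
		printf("%c%c: collection at offset %u\n",
		       country->alpha2[0], country->alpha2[1],
		       (unsigned)(be16toh(country->coll_ptr) << 2));
		country++;
	}

	free(data);
	return 0;
}

The kernel side additionally refuses to trust such a file unless the detached PKCS#7 signature in regulatory.db.p7s verifies against the built-in keyring, as regdb_has_valid_signature() above shows.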
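Note how valid_rule() and regdb_query_country() version each rule record by its length byte: mandatory fields must fit inside rule->len, and optional trailing fields such as cac_timeout are consumed only when the length covers them, tested with offsetofend(). A self-contained restatement of that pattern, with struct record invented purely for illustration:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* offsetofend() as used above: first byte past a struct member */
#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct record {
	uint8_t len;    /* how many bytes of this record are valid */
	uint8_t flags;
	uint16_t value; /* mandatory field */
	uint16_t extra; /* optional: only present if len covers it */
};

static void parse(const struct record *r)
{
	if (r->len < offsetofend(struct record, value)) {
		puts("record too short");
		return;
	}
	printf("value=%u", (unsigned)r->value);
	if (r->len >= offsetofend(struct record, extra))
		printf(" extra=%u", (unsigned)r->extra);
	putchar('\n');
}

int main(void)
{
	struct record old_rec = { .len = 4, .flags = 0, .value = 7 };
	struct record new_rec = { .len = 6, .flags = 0, .value = 7,
				  .extra = 9 };

	parse(&old_rec); /* prints: value=7 */
	parse(&new_rec); /* prints: value=7 extra=9 */
	return 0;
}

This is what allows later regulatory.db revisions to append new per-rule fields without bumping FWDB_VERSION or breaking existing parsers.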
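The reworked ieee80211_id_in_list() in the util.c hunk treats an ID list as a mix of plain element IDs and two-byte (WLAN_EID_EXTENSION, extended ID) pairs. A standalone restatement with a tiny test harness, constants redefined locally for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WLAN_EID_EXTENSION 255

/* ids[] may contain plain element IDs, or WLAN_EID_EXTENSION
 * immediately followed by an extended element ID. */
static bool id_in_list(const uint8_t *ids, int n_ids, uint8_t id,
		       bool id_ext)
{
	int i = 0;

	/* a trailing WLAN_EID_EXTENSION would have no extended ID */
	if (ids[n_ids - 1] == WLAN_EID_EXTENSION)
		return false;

	while (i < n_ids) {
		if (ids[i] == WLAN_EID_EXTENSION) {
			if (id_ext && ids[i + 1] == id)
				return true;
			i += 2; /* skip the (escape, ext-id) pair */
			continue;
		}
		if (ids[i] == id && !id_ext)
			return true;
		i++;
	}
	return false;
}

int main(void)
{
	/* 1 and 50 are plain IDs; (255, 35) marks extended ID 35 */
	const uint8_t ids[] = { 1, WLAN_EID_EXTENSION, 35, 50 };

	printf("%d\n", id_in_list(ids, 4, 50, false)); /* 1: plain hit */
	printf("%d\n", id_in_list(ids, 4, 35, true));  /* 1: extended hit */
	printf("%d\n", id_in_list(ids, 4, 35, false)); /* 0: 35 only ext */
	return 0;
}

ieee80211_ie_split_ric() relies on the same convention, which is why it probes ies[pos + ext] with ext == 2 whenever the element at pos is an extension.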
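Finally, the x25 and xfrm hunks above all apply the same kernel timer API migration: setup_timer(timer, fn, (unsigned long)obj) becomes timer_setup(timer, fn, 0), and the callback recovers its object with from_timer() instead of casting an unsigned long. A schematic before/after sketch, kernel context only, with struct foo made up for illustration:

#include <linux/timer.h>

/* Hypothetical object embedding a timer. */
struct foo {
	struct timer_list timer;
	int pending;
};

/* Before: the object pointer is smuggled through an unsigned long. */
static void foo_timeout_old(unsigned long data)
{
	struct foo *f = (struct foo *)data;

	f->pending = 0;
}
/* setup_timer(&f->timer, foo_timeout_old, (unsigned long)f); */

/* After: the callback receives the timer_list itself and reaches the
 * enclosing object via from_timer(), a container_of() wrapper. */
static void foo_timeout(struct timer_list *t)
{
	struct foo *f = from_timer(f, t, timer);

	f->pending = 0;
}
/* timer_setup(&f->timer, foo_timeout, 0); */

Where only the callback pointer is swapped on an already initialized timer, as in the x25 sk_timer hunks, the diff casts through TIMER_FUNC_TYPE as a transitional shim until the owning structure is fully converted.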