diff options
Diffstat (limited to 'net')
32 files changed, 1917 insertions, 395 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 7850412f52b7..e47600b4e2e3 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -327,10 +327,6 @@ static void vlan_sync_address(struct net_device *dev, static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { - u32 old_features = vlandev->features; - - vlandev->features &= ~dev->vlan_features; - vlandev->features |= dev->features & dev->vlan_features; vlandev->gso_max_size = dev->gso_max_size; if (dev->features & NETIF_F_HW_VLAN_TX) @@ -341,8 +337,8 @@ static void vlan_transfer_features(struct net_device *dev, #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif - if (old_features != vlandev->features) - netdev_features_change(vlandev); + + netdev_update_features(vlandev); } static void __vlan_device_event(struct net_device *dev, unsigned long event) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e34ea9e5e28b..b84a46b30c0c 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -704,8 +704,8 @@ static int vlan_dev_init(struct net_device *dev) (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); - dev->features |= real_dev->features & real_dev->vlan_features; - dev->features |= NETIF_F_LLTX; + dev->hw_features = real_dev->vlan_features & NETIF_F_ALL_TX_OFFLOADS; + dev->features |= real_dev->vlan_features | NETIF_F_LLTX; dev->gso_max_size = real_dev->gso_max_size; /* ipv6 shared card related stuff */ @@ -759,6 +759,17 @@ static void vlan_dev_uninit(struct net_device *dev) } } +static u32 vlan_dev_fix_features(struct net_device *dev, u32 features) +{ + struct net_device *real_dev = vlan_dev_info(dev)->real_dev; + + features &= (real_dev->features | NETIF_F_LLTX); + if (dev_ethtool_get_rx_csum(real_dev)) + features |= NETIF_F_RXCSUM; + + return features; +} + static int vlan_ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) { @@ -774,18 +785,6 @@ static void vlan_ethtool_get_drvinfo(struct net_device *dev, strcpy(info->fw_version, "N/A"); } -static u32 vlan_ethtool_get_rx_csum(struct net_device *dev) -{ - const struct vlan_dev_info *vlan = vlan_dev_info(dev); - return dev_ethtool_get_rx_csum(vlan->real_dev); -} - -static u32 vlan_ethtool_get_flags(struct net_device *dev) -{ - const struct vlan_dev_info *vlan = vlan_dev_info(dev); - return dev_ethtool_get_flags(vlan->real_dev); -} - static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { @@ -823,32 +822,10 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st return stats; } -static int vlan_ethtool_set_tso(struct net_device *dev, u32 data) -{ - if (data) { - struct net_device *real_dev = vlan_dev_info(dev)->real_dev; - - /* Underlying device must support TSO for VLAN-tagged packets - * and must have TSO enabled now. - */ - if (!(real_dev->vlan_features & NETIF_F_TSO)) - return -EOPNOTSUPP; - if (!(real_dev->features & NETIF_F_TSO)) - return -EINVAL; - dev->features |= NETIF_F_TSO; - } else { - dev->features &= ~NETIF_F_TSO; - } - return 0; -} - static const struct ethtool_ops vlan_ethtool_ops = { .get_settings = vlan_ethtool_get_settings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, - .get_rx_csum = vlan_ethtool_get_rx_csum, - .get_flags = vlan_ethtool_get_flags, - .set_tso = vlan_ethtool_set_tso, }; static const struct net_device_ops vlan_netdev_ops = { @@ -874,6 +851,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, #endif + .ndo_fix_features = vlan_dev_fix_features, }; void vlan_setup(struct net_device *dev) diff --git a/net/bridge/br.c b/net/bridge/br.c index 84bbb82599b2..f20c4fd915a8 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -104,3 +104,4 @@ module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); +MODULE_ALIAS_RTNL_LINK("bridge"); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 21e5901186ea..45cfd54b06d3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -74,6 +74,17 @@ out: return NETDEV_TX_OK; } +static int br_dev_init(struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + + br->stats = alloc_percpu(struct br_cpu_netstats); + if (!br->stats) + return -ENOMEM; + + return 0; +} + static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -334,6 +345,7 @@ static const struct ethtool_ops br_ethtool_ops = { static const struct net_device_ops br_netdev_ops = { .ndo_open = br_dev_open, .ndo_stop = br_dev_stop, + .ndo_init = br_dev_init, .ndo_start_xmit = br_dev_xmit, .ndo_get_stats64 = br_get_stats64, .ndo_set_mac_address = br_set_mac_address, @@ -357,18 +369,47 @@ static void br_dev_free(struct net_device *dev) free_netdev(dev); } +static struct device_type br_type = { + .name = "bridge", +}; + void br_dev_setup(struct net_device *dev) { + struct net_bridge *br = netdev_priv(dev); + random_ether_addr(dev->dev_addr); ether_setup(dev); dev->netdev_ops = &br_netdev_ops; dev->destructor = br_dev_free; SET_ETHTOOL_OPS(dev, &br_ethtool_ops); + SET_NETDEV_DEVTYPE(dev, &br_type); dev->tx_queue_len = 0; dev->priv_flags = IFF_EBRIDGE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX; + + br->dev = dev; + spin_lock_init(&br->lock); + INIT_LIST_HEAD(&br->port_list); + spin_lock_init(&br->hash_lock); + + br->bridge_id.prio[0] = 0x80; + br->bridge_id.prio[1] = 0x00; + + memcpy(br->group_addr, br_group_address, ETH_ALEN); + + br->feature_mask = dev->features; + br->stp_enabled = BR_NO_STP; + br->designated_root = br->bridge_id; + br->bridge_max_age = br->max_age = 20 * HZ; + br->bridge_hello_time = br->hello_time = 2 * HZ; + br->bridge_forward_delay = br->forward_delay = 15 * HZ; + br->ageing_time = 300 * HZ; + + br_netfilter_rtable_init(br); + br_stp_timer_init(br); + br_multicast_init(br); } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index cc4d3c5ab1c6..e0dfbc151dd7 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -28,6 +28,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly; static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr); +static void fdb_notify(const struct net_bridge_fdb_entry *, int); static u32 fdb_salt __read_mostly; @@ -62,7 +63,7 @@ static inline int has_expired(const struct net_bridge *br, const struct net_bridge_fdb_entry *fdb) { return !fdb->is_static && - time_before_eq(fdb->ageing_timer + hold_time(br), jiffies); + time_before_eq(fdb->updated + hold_time(br), jiffies); } static inline int br_mac_hash(const unsigned char *mac) @@ -81,6 +82,7 @@ static void fdb_rcu_free(struct rcu_head *head) static inline void fdb_delete(struct net_bridge_fdb_entry *f) { + fdb_notify(f, RTM_DELNEIGH); hlist_del_rcu(&f->hlist); call_rcu(&f->rcu, fdb_rcu_free); } @@ -140,7 +142,7 @@ void br_fdb_cleanup(unsigned long _data) unsigned long this_timer; if (f->is_static) continue; - this_timer = f->ageing_timer + delay; + this_timer = f->updated + delay; if (time_before_eq(this_timer, jiffies)) fdb_delete(f); else if (time_before(this_timer, next_timer)) @@ -293,7 +295,7 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, fe->is_local = f->is_local; if (!f->is_static) - fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->ageing_timer); + fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->updated); ++fe; ++num; } @@ -305,8 +307,21 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, return num; } -static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, - const unsigned char *addr) +static struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry(fdb, h, head, hlist) { + if (!compare_ether_addr(fdb->addr.addr, addr)) + return fdb; + } + return NULL; +} + +static struct net_bridge_fdb_entry *fdb_find_rcu(struct hlist_head *head, + const unsigned char *addr) { struct hlist_node *h; struct net_bridge_fdb_entry *fdb; @@ -320,8 +335,7 @@ static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, struct net_bridge_port *source, - const unsigned char *addr, - int is_local) + const unsigned char *addr) { struct net_bridge_fdb_entry *fdb; @@ -329,11 +343,11 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, if (fdb) { memcpy(fdb->addr.addr, addr, ETH_ALEN); fdb->dst = source; - fdb->is_local = is_local; - fdb->is_static = is_local; - fdb->ageing_timer = jiffies; - + fdb->is_local = 0; + fdb->is_static = 0; + fdb->updated = fdb->used = jiffies; hlist_add_head_rcu(&fdb->hlist, head); + fdb_notify(fdb, RTM_NEWNEIGH); } return fdb; } @@ -360,12 +374,15 @@ static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, fdb_delete(fdb); } - if (!fdb_create(head, source, addr, 1)) + fdb = fdb_create(head, source, addr); + if (!fdb) return -ENOMEM; + fdb->is_local = fdb->is_static = 1; return 0; } +/* Add entry for local address of interface */ int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr) { @@ -392,7 +409,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, source->state == BR_STATE_FORWARDING)) return; - fdb = fdb_find(head, addr); + fdb = fdb_find_rcu(head, addr); if (likely(fdb)) { /* attempt to update an entry for a local interface */ if (unlikely(fdb->is_local)) { @@ -403,15 +420,277 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } else { /* fastpath: update of existing entry */ fdb->dst = source; - fdb->ageing_timer = jiffies; + fdb->updated = jiffies; } } else { spin_lock(&br->hash_lock); - if (!fdb_find(head, addr)) - fdb_create(head, source, addr, 0); + if (likely(!fdb_find(head, addr))) + fdb_create(head, source, addr); + /* else we lose race and someone else inserts * it first, don't bother updating */ spin_unlock(&br->hash_lock); } } + +static int fdb_to_nud(const struct net_bridge_fdb_entry *fdb) +{ + if (fdb->is_local) + return NUD_PERMANENT; + else if (fdb->is_static) + return NUD_NOARP; + else if (has_expired(fdb->dst->br, fdb)) + return NUD_STALE; + else + return NUD_REACHABLE; +} + +static int fdb_fill_info(struct sk_buff *skb, + const struct net_bridge_fdb_entry *fdb, + u32 pid, u32 seq, int type, unsigned int flags) +{ + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + ndm->ndm_ifindex = fdb->dst->dev->ifindex; + ndm->ndm_state = fdb_to_nud(fdb); + + NLA_PUT(skb, NDA_LLADDR, ETH_ALEN, &fdb->addr); + + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_confirmed = 0; + ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_refcnt = 0; + NLA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t fdb_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(struct nda_cacheinfo)); +} + +static void fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) +{ + struct net *net = dev_net(fdb->dst->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = fdb_fill_info(skb, fdb, 0, 0, type, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + +/* Dump information about entries, in response to GETNEIGH */ +int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx = 0; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + struct net_bridge *br = netdev_priv(dev); + int i; + + if (!(dev->priv_flags & IFF_EBRIDGE)) + continue; + + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h; + struct net_bridge_fdb_entry *f; + + hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + if (idx < cb->args[0]) + goto skip; + + if (fdb_fill_info(skb, f, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI) < 0) + break; +skip: + ++idx; + } + } + } + rcu_read_unlock(); + + cb->args[0] = idx; + + return skb->len; +} + +/* Create new static fdb entry */ +static int fdb_add_entry(struct net_bridge_port *source, const __u8 *addr, + __u16 state) +{ + struct net_bridge *br = source->br; + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr); + if (fdb) + return -EEXIST; + + fdb = fdb_create(head, source, addr); + if (!fdb) + return -ENOMEM; + + if (state & NUD_PERMANENT) + fdb->is_local = fdb->is_static = 1; + else if (state & NUD_NOARP) + fdb->is_static = 1; + return 0; +} + +/* Add new permanent fdb entry with RTM_NEWNEIGH */ +int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX+1]; + struct net_device *dev; + struct net_bridge_port *p; + const __u8 *addr; + int err; + + ASSERT_RTNL(); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + if (err < 0) + return err; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + pr_info("bridge: RTM_NEWNEIGH with invalid ifindex\n"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + pr_info("bridge: RTM_NEWNEIGH with unknown ifindex\n"); + return -ENODEV; + } + + if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { + pr_info("bridge: RTM_NEWNEIGH with invalid address\n"); + return -EINVAL; + } + + addr = nla_data(tb[NDA_LLADDR]); + if (!is_valid_ether_addr(addr)) { + pr_info("bridge: RTM_NEWNEIGH with invalid ether address\n"); + return -EINVAL; + } + + p = br_port_get_rtnl(dev); + if (p == NULL) { + pr_info("bridge: RTM_NEWNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + + spin_lock_bh(&p->br->hash_lock); + err = fdb_add_entry(p, addr, ndm->ndm_state); + spin_unlock_bh(&p->br->hash_lock); + + return err; +} + +static int fdb_delete_by_addr(struct net_bridge_port *p, const u8 *addr) +{ + struct net_bridge *br = p->br; + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + fdb = fdb_find(head, addr); + if (!fdb) + return -ENOENT; + + fdb_delete(fdb); + return 0; +} + +/* Remove neighbor entry with RTM_DELNEIGH */ +int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ndmsg *ndm; + struct net_bridge_port *p; + struct nlattr *llattr; + const __u8 *addr; + struct net_device *dev; + int err; + + ASSERT_RTNL(); + if (nlmsg_len(nlh) < sizeof(*ndm)) + return -EINVAL; + + ndm = nlmsg_data(nlh); + if (ndm->ndm_ifindex == 0) { + pr_info("bridge: RTM_DELNEIGH with invalid ifindex\n"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (dev == NULL) { + pr_info("bridge: RTM_DELNEIGH with unknown ifindex\n"); + return -ENODEV; + } + + llattr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_LLADDR); + if (llattr == NULL || nla_len(llattr) != ETH_ALEN) { + pr_info("bridge: RTM_DELNEIGH with invalid address\n"); + return -EINVAL; + } + + addr = nla_data(llattr); + + p = br_port_get_rtnl(dev); + if (p == NULL) { + pr_info("bridge: RTM_DELNEIGH %s not a bridge port\n", + dev->name); + return -EINVAL; + } + + spin_lock_bh(&p->br->hash_lock); + err = fdb_delete_by_addr(p, addr); + spin_unlock_bh(&p->br->hash_lock); + + return err; +} diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 718b60366dfe..7f5379c593d9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -175,56 +175,6 @@ static void del_br(struct net_bridge *br, struct list_head *head) unregister_netdevice_queue(br->dev, head); } -static struct net_device *new_bridge_dev(struct net *net, const char *name) -{ - struct net_bridge *br; - struct net_device *dev; - - dev = alloc_netdev(sizeof(struct net_bridge), name, - br_dev_setup); - - if (!dev) - return NULL; - dev_net_set(dev, net); - - br = netdev_priv(dev); - br->dev = dev; - - br->stats = alloc_percpu(struct br_cpu_netstats); - if (!br->stats) { - free_netdev(dev); - return NULL; - } - - spin_lock_init(&br->lock); - INIT_LIST_HEAD(&br->port_list); - spin_lock_init(&br->hash_lock); - - br->bridge_id.prio[0] = 0x80; - br->bridge_id.prio[1] = 0x00; - - memcpy(br->group_addr, br_group_address, ETH_ALEN); - - br->feature_mask = dev->features; - br->stp_enabled = BR_NO_STP; - br->designated_root = br->bridge_id; - br->root_path_cost = 0; - br->root_port = 0; - br->bridge_max_age = br->max_age = 20 * HZ; - br->bridge_hello_time = br->hello_time = 2 * HZ; - br->bridge_forward_delay = br->forward_delay = 15 * HZ; - br->topology_change = 0; - br->topology_change_detected = 0; - br->ageing_time = 300 * HZ; - - br_netfilter_rtable_init(br); - - br_stp_timer_init(br); - br_multicast_init(br); - - return dev; -} - /* find an available port number */ static int find_portno(struct net_bridge *br) { @@ -277,42 +227,19 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, return p; } -static struct device_type br_type = { - .name = "bridge", -}; - int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; - int ret; - dev = new_bridge_dev(net, name); + dev = alloc_netdev(sizeof(struct net_bridge), name, + br_dev_setup); + if (!dev) return -ENOMEM; - rtnl_lock(); - if (strchr(dev->name, '%')) { - ret = dev_alloc_name(dev, dev->name); - if (ret < 0) - goto out_free; - } - - SET_NETDEV_DEVTYPE(dev, &br_type); - - ret = register_netdevice(dev); - if (ret) - goto out_free; - - ret = br_sysfs_addbr(dev); - if (ret) - unregister_netdevice(dev); - out: - rtnl_unlock(); - return ret; + dev_net_set(dev, net); -out_free: - free_netdev(dev); - goto out; + return register_netdev(dev); } int br_del_bridge(struct net *net, const char *name) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index e2160792e1bc..785932d7ad32 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -98,9 +98,10 @@ int br_handle_frame_finish(struct sk_buff *skb) } if (skb) { - if (dst) + if (dst) { + dst->used = jiffies; br_forward(dst->dst, skb, skb2); - else + } else br_flood_forward(br, skb, skb2); } diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 3d9fca0e3370..7222fe1d5460 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -181,40 +181,19 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!capable(CAP_NET_ADMIN)) return -EPERM; - spin_lock_bh(&br->lock); - br->bridge_forward_delay = clock_t_to_jiffies(args[1]); - if (br_is_root_bridge(br)) - br->forward_delay = br->bridge_forward_delay; - spin_unlock_bh(&br->lock); - return 0; + return br_set_forward_delay(br, args[1]); case BRCTL_SET_BRIDGE_HELLO_TIME: - { - unsigned long t = clock_t_to_jiffies(args[1]); if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (t < HZ) - return -EINVAL; - - spin_lock_bh(&br->lock); - br->bridge_hello_time = t; - if (br_is_root_bridge(br)) - br->hello_time = br->bridge_hello_time; - spin_unlock_bh(&br->lock); - return 0; - } + return br_set_hello_time(br, args[1]); case BRCTL_SET_BRIDGE_MAX_AGE: if (!capable(CAP_NET_ADMIN)) return -EPERM; - spin_lock_bh(&br->lock); - br->bridge_max_age = clock_t_to_jiffies(args[1]); - if (br_is_root_bridge(br)) - br->max_age = br->bridge_max_age; - spin_unlock_bh(&br->lock); - return 0; + return br_set_max_age(br, args[1]); case BRCTL_SET_AGEING_TIME: if (!capable(CAP_NET_ADMIN)) @@ -275,19 +254,16 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) case BRCTL_SET_PORT_PRIORITY: { struct net_bridge_port *p; - int ret = 0; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (args[2] >= (1<<(16-BR_PORT_BITS))) - return -ERANGE; - spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else - br_stp_set_port_priority(p, args[2]); + ret = br_stp_set_port_priority(p, args[2]); spin_unlock_bh(&br->lock); return ret; } @@ -295,15 +271,17 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) case BRCTL_SET_PATH_COST: { struct net_bridge_port *p; - int ret = 0; + int ret; if (!capable(CAP_NET_ADMIN)) return -EPERM; + spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else - br_stp_set_path_cost(p, args[2]); + ret = br_stp_set_path_cost(p, args[2]); + spin_unlock_bh(&br->lock); return ret; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index f8bf4c7f842c..134a2ff6b98b 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -12,9 +12,11 @@ #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/etherdevice.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> + #include "br_private.h" static inline size_t br_nlmsg_size(void) @@ -188,20 +190,61 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return 0; } +static int br_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (tb[IFLA_ADDRESS]) { + if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) + return -EINVAL; + if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) + return -EADDRNOTAVAIL; + } + + return 0; +} + +static struct rtnl_link_ops br_link_ops __read_mostly = { + .kind = "bridge", + .priv_size = sizeof(struct net_bridge), + .setup = br_dev_setup, + .validate = br_validate, +}; int __init br_netlink_init(void) { - if (__rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo)) - return -ENOBUFS; + int err; - /* Only the first call to __rtnl_register can fail */ - __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL); + err = rtnl_link_register(&br_link_ops); + if (err < 0) + goto err1; + + err = __rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, br_dump_ifinfo); + if (err) + goto err2; + err = __rtnl_register(PF_BRIDGE, RTM_SETLINK, br_rtm_setlink, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, br_fdb_add, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_DELNEIGH, br_fdb_delete, NULL); + if (err) + goto err3; + err = __rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, br_fdb_dump); + if (err) + goto err3; return 0; + +err3: + rtnl_unregister_all(PF_BRIDGE); +err2: + rtnl_link_unregister(&br_link_ops); +err1: + return err; } void __exit br_netlink_fini(void) { + rtnl_link_unregister(&br_link_ops); rtnl_unregister_all(PF_BRIDGE); } - diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 7d337c9b6082..7a03bb975375 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -36,6 +36,12 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v struct net_bridge *br; int err; + /* register of bridge completed, add sysfs entries */ + if ((dev->priv_flags && IFF_EBRIDGE) && event == NETDEV_REGISTER) { + br_sysfs_addbr(dev); + return NOTIFY_DONE; + } + /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 387013d33745..e2a40343aa09 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -64,7 +64,8 @@ struct net_bridge_fdb_entry struct net_bridge_port *dst; struct rcu_head rcu; - unsigned long ageing_timer; + unsigned long updated; + unsigned long used; mac_addr addr; unsigned char is_local; unsigned char is_static; @@ -353,6 +354,9 @@ extern int br_fdb_insert(struct net_bridge *br, extern void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr); +extern int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb); +extern int br_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg); +extern int br_fdb_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg); /* br_forward.c */ extern void br_deliver(const struct net_bridge_port *to, @@ -491,6 +495,11 @@ extern struct net_bridge_port *br_get_port(struct net_bridge *br, extern void br_init_port(struct net_bridge_port *p); extern void br_become_designated_port(struct net_bridge_port *p); +extern int br_set_forward_delay(struct net_bridge *br, unsigned long x); +extern int br_set_hello_time(struct net_bridge *br, unsigned long x); +extern int br_set_max_age(struct net_bridge *br, unsigned long x); + + /* br_stp_if.c */ extern void br_stp_enable_bridge(struct net_bridge *br); extern void br_stp_disable_bridge(struct net_bridge *br); @@ -501,10 +510,10 @@ extern bool br_stp_recalculate_bridge_id(struct net_bridge *br); extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); -extern void br_stp_set_port_priority(struct net_bridge_port *p, - u8 newprio); -extern void br_stp_set_path_cost(struct net_bridge_port *p, - u32 path_cost); +extern int br_stp_set_port_priority(struct net_bridge_port *p, + unsigned long newprio); +extern int br_stp_set_path_cost(struct net_bridge_port *p, + unsigned long path_cost); extern ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id); /* br_stp_bpdu.c */ diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h index 8b650f7fbfa0..642ef47a867e 100644 --- a/net/bridge/br_private_stp.h +++ b/net/bridge/br_private_stp.h @@ -16,6 +16,19 @@ #define BPDU_TYPE_CONFIG 0 #define BPDU_TYPE_TCN 0x80 +/* IEEE 802.1D-1998 timer values */ +#define BR_MIN_HELLO_TIME (1*HZ) +#define BR_MAX_HELLO_TIME (10*HZ) + +#define BR_MIN_FORWARD_DELAY (2*HZ) +#define BR_MAX_FORWARD_DELAY (30*HZ) + +#define BR_MIN_MAX_AGE (6*HZ) +#define BR_MAX_MAX_AGE (40*HZ) + +#define BR_MIN_PATH_COST 1 +#define BR_MAX_PATH_COST 65535 + struct br_config_bpdu { unsigned topology_change:1; diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 7370d14f634d..bb4383e84de9 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -484,3 +484,51 @@ void br_received_tcn_bpdu(struct net_bridge_port *p) br_topology_change_acknowledge(p); } } + +/* Change bridge STP parameter */ +int br_set_hello_time(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_hello_time = t; + if (br_is_root_bridge(br)) + br->hello_time = br->bridge_hello_time; + spin_unlock_bh(&br->lock); + return 0; +} + +int br_set_max_age(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_max_age = t; + if (br_is_root_bridge(br)) + br->max_age = br->bridge_max_age; + spin_unlock_bh(&br->lock); + return 0; + +} + +int br_set_forward_delay(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + + if (br->stp_enabled != BR_NO_STP && + (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) + return -ERANGE; + + spin_lock_bh(&br->lock); + br->bridge_forward_delay = t; + if (br_is_root_bridge(br)) + br->forward_delay = br->bridge_forward_delay; + spin_unlock_bh(&br->lock); + return 0; +} diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 9b61d09de9b9..6f615b8192f4 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -20,7 +20,7 @@ /* Port id is composed of priority and port number. - * NB: least significant bits of priority are dropped to + * NB: some bits of priority are dropped to * make room for more ports. */ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) @@ -29,6 +29,8 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) | (port_no & ((1<<BR_PORT_BITS)-1)); } +#define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS) + /* called under bridge lock */ void br_init_port(struct net_bridge_port *p) { @@ -255,10 +257,14 @@ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) } /* called under bridge lock */ -void br_stp_set_port_priority(struct net_bridge_port *p, u8 newprio) +int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio) { - port_id new_port_id = br_make_port_id(newprio, p->port_no); + port_id new_port_id; + + if (newprio > BR_MAX_PORT_PRIORITY) + return -ERANGE; + new_port_id = br_make_port_id(newprio, p->port_no); if (br_is_designated_port(p)) p->designated_port = new_port_id; @@ -269,14 +275,21 @@ void br_stp_set_port_priority(struct net_bridge_port *p, u8 newprio) br_become_designated_port(p); br_port_state_selection(p->br); } + + return 0; } /* called under bridge lock */ -void br_stp_set_path_cost(struct net_bridge_port *p, u32 path_cost) +int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) { + if (path_cost < BR_MIN_PATH_COST || + path_cost > BR_MAX_PATH_COST) + return -ERANGE; + p->path_cost = path_cost; br_configuration_update(p->br); br_port_state_selection(p->br); + return 0; } ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 5c1e5559ebba..68b893ea8c3a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -43,9 +43,7 @@ static ssize_t store_bridge_parm(struct device *d, if (endp == buf) return -EINVAL; - spin_lock_bh(&br->lock); err = (*set)(br, val); - spin_unlock_bh(&br->lock); return err ? err : len; } @@ -57,20 +55,11 @@ static ssize_t show_forward_delay(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } -static int set_forward_delay(struct net_bridge *br, unsigned long val) -{ - unsigned long delay = clock_t_to_jiffies(val); - br->forward_delay = delay; - if (br_is_root_bridge(br)) - br->bridge_forward_delay = delay; - return 0; -} - static ssize_t store_forward_delay(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, set_forward_delay); + return store_bridge_parm(d, buf, len, br_set_forward_delay); } static DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR, show_forward_delay, store_forward_delay); @@ -82,24 +71,11 @@ static ssize_t show_hello_time(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->hello_time)); } -static int set_hello_time(struct net_bridge *br, unsigned long val) -{ - unsigned long t = clock_t_to_jiffies(val); - - if (t < HZ) - return -EINVAL; - - br->hello_time = t; - if (br_is_root_bridge(br)) - br->bridge_hello_time = t; - return 0; -} - static ssize_t store_hello_time(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, set_hello_time); + return store_bridge_parm(d, buf, len, br_set_hello_time); } static DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time, store_hello_time); @@ -111,19 +87,10 @@ static ssize_t show_max_age(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->max_age)); } -static int set_max_age(struct net_bridge *br, unsigned long val) -{ - unsigned long t = clock_t_to_jiffies(val); - br->max_age = t; - if (br_is_root_bridge(br)) - br->bridge_max_age = t; - return 0; -} - static ssize_t store_max_age(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - return store_bridge_parm(d, buf, len, set_max_age); + return store_bridge_parm(d, buf, len, br_set_max_age); } static DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index fd5799c9bc8d..6229b62749e8 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -23,7 +23,7 @@ struct brport_attribute { struct attribute attr; ssize_t (*show)(struct net_bridge_port *, char *); - ssize_t (*store)(struct net_bridge_port *, unsigned long); + int (*store)(struct net_bridge_port *, unsigned long); }; #define BRPORT_ATTR(_name,_mode,_show,_store) \ @@ -38,27 +38,17 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->path_cost); } -static ssize_t store_path_cost(struct net_bridge_port *p, unsigned long v) -{ - br_stp_set_path_cost(p, v); - return 0; -} + static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, - show_path_cost, store_path_cost); + show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->priority); } -static ssize_t store_priority(struct net_bridge_port *p, unsigned long v) -{ - if (v >= (1<<(16-BR_PORT_BITS))) - return -ERANGE; - br_stp_set_port_priority(p, v); - return 0; -} + static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, - show_priority, store_priority); + show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { @@ -136,7 +126,7 @@ static ssize_t show_hold_timer(struct net_bridge_port *p, } static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); -static ssize_t store_flush(struct net_bridge_port *p, unsigned long v) +static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0); // Don't delete local entry return 0; @@ -148,7 +138,7 @@ static ssize_t show_hairpin_mode(struct net_bridge_port *p, char *buf) int hairpin_mode = (p->flags & BR_HAIRPIN_MODE) ? 1 : 0; return sprintf(buf, "%d\n", hairpin_mode); } -static ssize_t store_hairpin_mode(struct net_bridge_port *p, unsigned long v) +static int store_hairpin_mode(struct net_bridge_port *p, unsigned long v) { if (v) p->flags |= BR_HAIRPIN_MODE; @@ -165,7 +155,7 @@ static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) return sprintf(buf, "%d\n", p->multicast_router); } -static ssize_t store_multicast_router(struct net_bridge_port *p, +static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { return br_multicast_set_port_router(p, v); diff --git a/net/can/af_can.c b/net/can/af_can.c index 733d66f1b05a..a8dcaa49675a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -85,7 +85,7 @@ static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; -static DEFINE_SPINLOCK(proto_tab_lock); +static DEFINE_MUTEX(proto_tab_lock); struct timer_list can_stattimer; /* timer for statistics update */ struct s_stats can_stats; /* packet statistics */ @@ -115,6 +115,19 @@ static void can_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); } +static struct can_proto *can_try_module_get(int protocol) +{ + struct can_proto *cp; + + rcu_read_lock(); + cp = rcu_dereference(proto_tab[protocol]); + if (cp && !try_module_get(cp->prot->owner)) + cp = NULL; + rcu_read_unlock(); + + return cp; +} + static int can_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -130,9 +143,12 @@ static int can_create(struct net *net, struct socket *sock, int protocol, if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; + cp = can_try_module_get(protocol); + #ifdef CONFIG_MODULES - /* try to load protocol module kernel is modular */ - if (!proto_tab[protocol]) { + if (!cp) { + /* try to load protocol module if kernel is modular */ + err = request_module("can-proto-%d", protocol); /* @@ -143,22 +159,18 @@ static int can_create(struct net *net, struct socket *sock, int protocol, if (err && printk_ratelimit()) printk(KERN_ERR "can: request_module " "(can-proto-%d) failed.\n", protocol); + + cp = can_try_module_get(protocol); } #endif - spin_lock(&proto_tab_lock); - cp = proto_tab[protocol]; - if (cp && !try_module_get(cp->prot->owner)) - cp = NULL; - spin_unlock(&proto_tab_lock); - /* check for available protocol and correct usage */ if (!cp) return -EPROTONOSUPPORT; if (cp->type != sock->type) { - err = -EPROTONOSUPPORT; + err = -EPROTOTYPE; goto errout; } @@ -694,15 +706,16 @@ int can_proto_register(struct can_proto *cp) if (err < 0) return err; - spin_lock(&proto_tab_lock); + mutex_lock(&proto_tab_lock); + if (proto_tab[proto]) { printk(KERN_ERR "can: protocol %d already registered\n", proto); err = -EBUSY; } else - proto_tab[proto] = cp; + rcu_assign_pointer(proto_tab[proto], cp); - spin_unlock(&proto_tab_lock); + mutex_unlock(&proto_tab_lock); if (err < 0) proto_unregister(cp->prot); @@ -719,13 +732,12 @@ void can_proto_unregister(struct can_proto *cp) { int proto = cp->protocol; - spin_lock(&proto_tab_lock); - if (!proto_tab[proto]) { - printk(KERN_ERR "BUG: can: protocol %d is not registered\n", - proto); - } - proto_tab[proto] = NULL; - spin_unlock(&proto_tab_lock); + mutex_lock(&proto_tab_lock); + BUG_ON(proto_tab[proto] != cp); + rcu_assign_pointer(proto_tab[proto], NULL); + mutex_unlock(&proto_tab_lock); + + synchronize_rcu(); proto_unregister(cp->prot); } diff --git a/net/core/dev.c b/net/core/dev.c index 956d3b006e8b..95897ff3a76f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5236,7 +5236,7 @@ u32 netdev_fix_features(struct net_device *dev, u32 features) } EXPORT_SYMBOL(netdev_fix_features); -void netdev_update_features(struct net_device *dev) +int __netdev_update_features(struct net_device *dev) { u32 features; int err = 0; @@ -5250,7 +5250,7 @@ void netdev_update_features(struct net_device *dev) features = netdev_fix_features(dev, features); if (dev->features == features) - return; + return 0; netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n", dev->features, features); @@ -5258,12 +5258,23 @@ void netdev_update_features(struct net_device *dev) if (dev->netdev_ops->ndo_set_features) err = dev->netdev_ops->ndo_set_features(dev, features); - if (!err) - dev->features = features; - else if (err < 0) + if (unlikely(err < 0)) { netdev_err(dev, "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n", err, features, dev->features); + return -1; + } + + if (!err) + dev->features = features; + + return 1; +} + +void netdev_update_features(struct net_device *dev) +{ + if (__netdev_update_features(dev)) + netdev_features_change(dev); } EXPORT_SYMBOL(netdev_update_features); @@ -5414,6 +5425,14 @@ int register_netdevice(struct net_device *dev) dev->features &= ~NETIF_F_GSO; } + /* Turn on no cache copy if HW is doing checksum */ + dev->hw_features |= NETIF_F_NOCACHE_COPY; + if ((dev->features & NETIF_F_ALL_CSUM) && + !(dev->features & NETIF_F_NO_CSUM)) { + dev->wanted_features |= NETIF_F_NOCACHE_COPY; + dev->features |= NETIF_F_NOCACHE_COPY; + } + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, * vlan_dev_init() will do the dev->features check, so these features * are enabled only if supported by underlying device. @@ -5430,7 +5449,7 @@ int register_netdevice(struct net_device *dev) goto err_uninit; dev->reg_state = NETREG_REGISTERED; - netdev_update_features(dev); + __netdev_update_features(dev); /* * Default initial state at registry is that the @@ -6171,6 +6190,10 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask) } } + /* If device can't no cache copy, don't do for all */ + if (!(one & NETIF_F_NOCACHE_COPY)) + all &= ~NETIF_F_NOCACHE_COPY; + one |= NETIF_F_ALL_CSUM; one |= all & NETIF_F_ONE_FOR_ALL; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 74ead9eca126..704e176ad3a9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -21,6 +21,8 @@ #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/rtnetlink.h> +#include <linux/sched.h> /* * Some useful ethtool_ops methods that're device independent. @@ -317,7 +319,7 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr) dev->wanted_features &= ~features[0].valid; dev->wanted_features |= features[0].valid & features[0].requested; - netdev_update_features(dev); + __netdev_update_features(dev); if ((dev->wanted_features ^ dev->features) & features[0].valid) ret |= ETHTOOL_F_WISH; @@ -359,7 +361,7 @@ static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GS /* NETIF_F_NTUPLE */ "rx-ntuple-filter", /* NETIF_F_RXHASH */ "rx-hashing", /* NETIF_F_RXCSUM */ "rx-checksum", - "", + /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy" "", }; @@ -499,7 +501,7 @@ static int ethtool_set_one_feature(struct net_device *dev, else dev->wanted_features &= ~mask; - netdev_update_features(dev); + __netdev_update_features(dev); return 0; } @@ -551,7 +553,7 @@ int __ethtool_set_flags(struct net_device *dev, u32 data) dev->wanted_features = (dev->wanted_features & ~changed) | data; - netdev_update_features(dev); + __netdev_update_features(dev); return 0; } @@ -908,6 +910,9 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, struct ethtool_rx_ntuple_flow_spec_container *fsc = NULL; int ret; + if (!ops->set_rx_ntuple) + return -EOPNOTSUPP; + if (!(dev->features & NETIF_F_NTUPLE)) return -EINVAL; @@ -1618,14 +1623,63 @@ out: static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; + static bool busy; + int rc; - if (!dev->ethtool_ops->phys_id) + if (!dev->ethtool_ops->set_phys_id && !dev->ethtool_ops->phys_id) return -EOPNOTSUPP; + if (busy) + return -EBUSY; + if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; - return dev->ethtool_ops->phys_id(dev, id.data); + if (!dev->ethtool_ops->set_phys_id) + /* Do it the old way */ + return dev->ethtool_ops->phys_id(dev, id.data); + + rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); + if (rc && rc != -EINVAL) + return rc; + + /* Drop the RTNL lock while waiting, but prevent reentry or + * removal of the device. + */ + busy = true; + dev_hold(dev); + rtnl_unlock(); + + if (rc == 0) { + /* Driver will handle this itself */ + schedule_timeout_interruptible( + id.data ? id.data : MAX_SCHEDULE_TIMEOUT); + } else { + /* Driver expects to be called periodically */ + do { + rtnl_lock(); + rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_ON); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(HZ / 2); + + rtnl_lock(); + rc = dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_OFF); + rtnl_unlock(); + if (rc) + break; + schedule_timeout_interruptible(HZ / 2); + } while (!signal_pending(current) && + (id.data == 0 || --id.data != 0)); + } + + rtnl_lock(); + dev_put(dev); + busy = false; + + (void)dev->ethtool_ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); + return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 451088330bbb..22524716fe70 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -44,6 +44,7 @@ #include <net/arp.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> +#include <net/xfrm.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -188,9 +189,9 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - check, that packet arrived from expected physical interface. * called with rcu_read_lock() */ -int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, - u32 *itag, u32 mark) +int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, + int oif, struct net_device *dev, __be32 *spec_dst, + u32 *itag) { struct in_device *in_dev; struct flowi4 fl4; @@ -202,7 +203,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, fl4.flowi4_oif = 0; fl4.flowi4_iif = oif; - fl4.flowi4_mark = mark; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -212,10 +212,12 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; - rpf = IN_DEV_RPFILTER(in_dev); + + /* Ignore rp_filter for packets protected by IPsec. */ + rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); - if (mark && !IN_DEV_SRC_VMARK(in_dev)) - fl4.flowi4_mark = 0; + fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; } if (in_dev == NULL) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e9013d6c1f51..bde80c450b52 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -126,7 +126,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct rt_trie_node *child[0]; + struct rt_trie_node __rcu *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,7 +151,7 @@ struct trie_stat { }; struct trie { - struct rt_trie_node *trie; + struct rt_trie_node __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif @@ -177,16 +177,29 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct rt_trie_node *node) +/* + * caller must hold RTNL + */ +static inline struct tnode *node_parent(const struct rt_trie_node *node) { - return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); + + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) { - struct tnode *ret = node_parent(node); + unsigned long parent; + + parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || + lockdep_rtnl_is_held()); - return rcu_dereference_rtnl(ret); + return (struct tnode *)(parent & ~NODE_TYPE_MASK); } /* Same as rcu_assign_pointer @@ -198,18 +211,24 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) +/* + * caller must hold RTNL + */ +static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); - return tn->child[i]; + return rtnl_dereference(tn->child[i]); } -static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +/* + * caller must hold RCU read lock or RTNL + */ +static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { - struct rt_trie_node *ret = tnode_get_child(tn, i); + BUG_ON(i >= 1U << tn->bits); - return rcu_dereference_rtnl(ret); + return rcu_dereference_rtnl(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -487,7 +506,7 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct rt_trie_node *chi = tn->child[i]; + struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); int isfull; BUG_ON(i >= 1<<tn->bits); @@ -665,7 +684,7 @@ one_child: for (i = 0; i < tnode_child_length(tn); i++) { struct rt_trie_node *n; - n = tn->child[i]; + n = rtnl_dereference(tn->child[i]); if (!n) continue; @@ -679,6 +698,20 @@ one_child: return (struct rt_trie_node *) tn; } + +static void tnode_clean_free(struct tnode *tn) +{ + int i; + struct tnode *tofree; + + for (i = 0; i < tnode_child_length(tn); i++) { + tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + if (tofree) + tnode_free(tofree); + } + tnode_free(tn); +} + static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; @@ -755,8 +788,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode = (struct tnode *) node; if (inode->bits == 1) { - put_child(t, tn, 2*i, inode->child[0]); - put_child(t, tn, 2*i+1, inode->child[1]); + put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); + put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); tnode_free_safe(inode); continue; @@ -797,8 +830,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) size = tnode_child_length(left); for (j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); + put_child(t, left, j, rtnl_dereference(inode->child[j])); + put_child(t, right, j, rtnl_dereference(inode->child[j + size])); } put_child(t, tn, 2*i, resize(t, left)); put_child(t, tn, 2*i+1, resize(t, right)); @@ -808,18 +841,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } static struct tnode *halve(struct trie *t, struct tnode *tn) @@ -890,18 +913,8 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) tnode_free_safe(oldtnode); return tn; nomem: - { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - return ERR_PTR(-ENOMEM); - } + tnode_clean_free(tn); + return ERR_PTR(-ENOMEM); } /* readside must use rcu_read_lock currently dump routines @@ -1033,7 +1046,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) t_key cindex; pos = 0; - n = t->trie; + n = rtnl_dereference(t->trie); /* If we point to NULL, stop. Either the tree is empty and we should * just put a new leaf in if, or we have reached an empty child slot, @@ -1756,7 +1769,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) continue; if (IS_LEAF(c)) { - prefetch(p->child[idx]); + prefetch(rcu_dereference_rtnl(p->child[idx])); return (struct leaf *) c; } @@ -2272,7 +2285,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); - while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) { + while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6c0b7f4a3d7d..f784608a4c45 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -356,20 +356,14 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; - struct flowi4 fl4 = { - .flowi4_oif = sk->sk_bound_dev_if, - .flowi4_mark = sk->sk_mark, - .daddr = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .saddr = ireq->loc_addr, - .flowi4_tos = RT_CONN_FLAGS(sk), - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = inet_sk_flowi_flags(sk), - .fl4_sport = inet_sk(sk)->inet_sport, - .fl4_dport = ireq->rmt_port, - }; struct net *net = sock_net(sk); + struct flowi4 fl4; + flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, + sk->sk_protocol, inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, ireq->rmt_port, inet_sk(sk)->inet_sport); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 459c011b1d4a..bdad3d60aa82 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1474,16 +1474,14 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } { - struct flowi4 fl4 = { - .flowi4_oif = arg->bound_dev_if, - .daddr = daddr, - .saddr = rt->rt_spec_dst, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl4_sport = tcp_hdr(skb)->dest, - .fl4_dport = tcp_hdr(skb)->source, - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = ip_reply_arg_flowi_flags(arg), - }; + struct flowi4 fl4; + + flowi4_init_output(&fl4, arg->bound_dev_if, 0, + RT_TOS(ip_hdr(skb)->tos), + RT_SCOPE_UNIVERSE, sk->sk_protocol, + ip_reply_arg_flowi_flags(arg), + daddr, rt->rt_spec_dst, + tcp_hdr(skb)->source, tcp_hdr(skb)->dest); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index bceaec42c37d..2b50cc2da90a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -548,17 +548,13 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } { - struct flowi4 fl4 = { - .flowi4_oif = ipc.oif, - .flowi4_mark = sk->sk_mark, - .daddr = daddr, - .saddr = saddr, - .flowi4_tos = tos, - .flowi4_proto = (inet->hdrincl ? - IPPROTO_RAW : - sk->sk_protocol), - .flowi4_flags = FLOWI_FLAG_CAN_SLEEP, - }; + struct flowi4 fl4; + + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, + inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); + if (!inet->hdrincl) { err = raw_probe_proto_opt(&fl4, msg); if (err) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c1acf69858fd..0e7430c327a7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1871,8 +1871,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, 0); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto e_err; } @@ -1981,8 +1981,8 @@ static int __mkroute_input(struct sk_buff *skb, } - err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag, skb->mark); + err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2150,9 +2150,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; if (res.type == RTN_LOCAL) { - err = fib_validate_source(saddr, daddr, tos, + err = fib_validate_source(skb, saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); + dev, &spec_dst, &itag); if (err < 0) goto martian_source_keep_err; if (err) @@ -2176,8 +2176,8 @@ brd_input: if (ipv4_is_zeronet(saddr)) spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { - err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag, skb->mark); + err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst, + &itag); if (err < 0) goto martian_source_keep_err; if (err) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8b44c6d2a79b..71e029691908 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -345,17 +345,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi4 fl4 = { - .flowi4_mark = sk->sk_mark, - .daddr = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .saddr = ireq->loc_addr, - .flowi4_tos = RT_CONN_FLAGS(sk), - .flowi4_proto = IPPROTO_TCP, - .flowi4_flags = inet_sk_flowi_flags(sk), - .fl4_sport = th->dest, - .fl4_dport = th->source, - }; + struct flowi4 fl4; + + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); security_req_classify_flow(req, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(sock_net(sk), &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b22d45010545..054a59d21eb0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -999,7 +999,8 @@ new_segment: /* We have some space in skb head. Superb! */ if (copy > skb_tailroom(skb)) copy = skb_tailroom(skb); - if ((err = skb_add_data(skb, from, copy)) != 0) + err = skb_add_data_nocache(sk, skb, from, copy); + if (err) goto do_fault; } else { int merge = 0; @@ -1042,8 +1043,8 @@ new_segment: /* Time to copy data. We are close to * the end! */ - err = skb_copy_to_page(sk, from, skb, page, - off, copy); + err = skb_copy_to_page_nocache(sk, from, skb, + page, off, copy); if (err) { /* If this page was new, give it to the * socket so it does not get leaked. diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f87a8eb76f3b..a15c8fb653af 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -909,20 +909,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { - struct flowi4 fl4 = { - .flowi4_oif = ipc.oif, - .flowi4_mark = sk->sk_mark, - .daddr = faddr, - .saddr = saddr, - .flowi4_tos = tos, - .flowi4_proto = sk->sk_protocol, - .flowi4_flags = (inet_sk_flowi_flags(sk) | - FLOWI_FLAG_CAN_SLEEP), - .fl4_sport = inet->inet_sport, - .fl4_dport = dport, - }; + struct flowi4 fl4; struct net *net = sock_net(sk); + flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP, + faddr, saddr, dport, inet->inet_sport); + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a7a5583d4f68..aeaa2110b699 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -239,6 +239,17 @@ config NET_SCH_CHOKE To compile this code as a module, choose M here: the module will be called sch_choke. +config NET_SCH_QFQ + tristate "Quick Fair Queueing scheduler (QFQ)" + help + Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ) + packet scheduling algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_qfq. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT diff --git a/net/sched/Makefile b/net/sched/Makefile index 2e77b8dba22e..dc5889c0a15a 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o +obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c new file mode 100644 index 000000000000..103343408593 --- /dev/null +++ b/net/sched/sch_qfq.c @@ -0,0 +1,1137 @@ +/* + * net/sched/sch_qfq.c Quick Fair Queueing Scheduler. + * + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/pkt_sched.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + + +/* Quick Fair Queueing + =================== + + Sources: + + Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient + Packet Scheduling with Tight Bandwidth Distribution Guarantees." + + See also: + http://retis.sssup.it/~fabio/linux/qfq/ + */ + +/* + + Virtual time computations. + + S, F and V are all computed in fixed point arithmetic with + FRAC_BITS decimal bits. + + QFQ_MAX_INDEX is the maximum index allowed for a group. We need + one bit per index. + QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. + + The layout of the bits is as below: + + [ MTU_SHIFT ][ FRAC_BITS ] + [ MAX_INDEX ][ MIN_SLOT_SHIFT ] + ^.__grp->index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + + The max group index corresponds to Lmax/w_min, where + Lmax=1<<MTU_SHIFT, w_min = 1 . + From this, and knowing how many groups (MAX_INDEX) we want, + we can derive the shift corresponding to each group. + + Because we often need to compute + F = S + len/w_i and V = V + len/wsum + instead of storing w_i store the value + inv_w = (1<<FRAC_BITS)/w_i + so we can do F = S + len * inv_w * wsum. + We use W_TOT in the formulas so we can easily move between + static and adaptive weight sum. + + The per-scheduler-instance data contain all the data structures + for the scheduler: bitmaps and bucket lists. + + */ + +/* + * Maximum number of consecutive slots occupied by backlogged classes + * inside a group. + */ +#define QFQ_MAX_SLOTS 32 + +/* + * Shifts used for class<->group mapping. We allow class weights that are + * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the + * group with the smallest index that can support the L_i / r_i configured + * for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + */ +#define QFQ_MAX_INDEX 19 +#define QFQ_MAX_WSHIFT 16 + +#define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) +#define QFQ_MAX_WSUM (2*QFQ_MAX_WEIGHT) + +#define FRAC_BITS 30 /* fixed point arithmetic */ +#define ONE_FP (1UL << FRAC_BITS) +#define IWSUM (ONE_FP/QFQ_MAX_WSUM) + +#define QFQ_MTU_SHIFT 11 +#define QFQ_MIN_SLOT_SHIFT (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX) + +/* + * Possible group states. These values are used as indexes for the bitmaps + * array of struct qfq_queue. + */ +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; + +struct qfq_group; + +struct qfq_class { + struct Qdisc_class_common common; + + unsigned int refcnt; + unsigned int filter_cnt; + + struct gnet_stats_basic_packed bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + struct Qdisc *qdisc; + + struct hlist_node next; /* Link for the slot list. */ + u64 S, F; /* flow timestamps (exact) */ + + /* group we belong to. In principle we would need the index, + * which is log_2(lmax/weight), but we never reference it + * directly, only the group. + */ + struct qfq_group *grp; + + /* these are copied from the flowset. */ + u32 inv_w; /* ONE_FP/weight */ + u32 lmax; /* Max packet size for this flow. */ +}; + +struct qfq_group { + u64 S, F; /* group timestamps (approx). */ + unsigned int slot_shift; /* Slot shift. */ + unsigned int index; /* Group index. */ + unsigned int front; /* Index of the front slot. */ + unsigned long full_slots; /* non-empty slots */ + + /* Array of RR lists of active classes. */ + struct hlist_head slots[QFQ_MAX_SLOTS]; +}; + +struct qfq_sched { + struct tcf_proto *filter_list; + struct Qdisc_class_hash clhash; + + u64 V; /* Precise virtual time. */ + u32 wsum; /* weight sum */ + + unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ + struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ +}; + +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct Qdisc_class_common *clc; + + clc = qdisc_class_find(&q->clhash, classid); + if (clc == NULL) + return NULL; + return container_of(clc, struct qfq_class, common); +} + +static void qfq_purge_queue(struct qfq_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + qdisc_tree_decrease_qlen(cl->qdisc, len); +} + +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { + [TCA_QFQ_WEIGHT] = { .type = NLA_U32 }, + [TCA_QFQ_LMAX] = { .type = NLA_U32 }, +}; + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. + */ +static int qfq_calc_index(u32 inv_w, unsigned int maxlen) +{ + u64 slot_size = (u64)maxlen * inv_w; + unsigned long size_map; + int index = 0; + + size_map = slot_size >> QFQ_MIN_SLOT_SHIFT; + if (!size_map) + goto out; + + index = __fls(size_map) + 1; /* basically a log_2 */ + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; +out: + pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", + (unsigned long) ONE_FP/inv_w, maxlen, index); + + return index; +} + +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct nlattr **tca, unsigned long *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)*arg; + struct nlattr *tb[TCA_QFQ_MAX + 1]; + u32 weight, lmax, inv_w; + int i, err; + + if (tca[TCA_OPTIONS] == NULL) { + pr_notice("qfq: no options\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); + if (err < 0) + return err; + + if (tb[TCA_QFQ_WEIGHT]) { + weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); + if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) { + pr_notice("qfq: invalid weight %u\n", weight); + return -EINVAL; + } + } else + weight = 1; + + inv_w = ONE_FP / weight; + weight = ONE_FP / inv_w; + if (q->wsum + weight > QFQ_MAX_WSUM) { + pr_notice("qfq: total weight out of range (%u + %u)\n", + weight, q->wsum); + return -EINVAL; + } + + if (tb[TCA_QFQ_LMAX]) { + lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); + if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) { + pr_notice("qfq: invalid max length %u\n", lmax); + return -EINVAL; + } + } else + lmax = 1UL << QFQ_MTU_SHIFT; + + if (cl != NULL) { + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) + return err; + } + + sch_tree_lock(sch); + if (tb[TCA_QFQ_WEIGHT]) { + q->wsum = weight - ONE_FP / cl->inv_w; + cl->inv_w = inv_w; + } + sch_tree_unlock(sch); + + return 0; + } + + cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + + cl->refcnt = 1; + cl->common.classid = classid; + cl->lmax = lmax; + cl->inv_w = inv_w; + i = qfq_calc_index(cl->inv_w, cl->lmax); + + cl->grp = &q->groups[i]; + q->wsum += weight; + + cl->qdisc = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, classid); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, &cl->rate_est, + qdisc_root_sleeping_lock(sch), + tca[TCA_RATE]); + if (err) { + qdisc_destroy(cl->qdisc); + kfree(cl); + return err; + } + } + + sch_tree_lock(sch); + qdisc_class_hash_insert(&q->clhash, &cl->common); + sch_tree_unlock(sch); + + qdisc_class_hash_grow(sch, &q->clhash); + + *arg = (unsigned long)cl; + return 0; +} + +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl->inv_w) { + q->wsum -= ONE_FP / cl->inv_w; + cl->inv_w = 0; + } + + gen_kill_estimator(&cl->bstats, &cl->rate_est); + qdisc_destroy(cl->qdisc); + kfree(cl); +} + +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->filter_cnt > 0) + return -EBUSY; + + sch_tree_lock(sch); + + qfq_purge_queue(cl); + qdisc_class_hash_remove(&q->clhash, &cl->common); + + BUG_ON(--cl->refcnt == 0); + /* + * This shouldn't happen: we "hold" one cops->get() when called + * from tc_ctl_tclass; the destroy method is done from cops->put(). + */ + + sch_tree_unlock(sch); + return 0; +} + +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void qfq_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (--cl->refcnt == 0) + qfq_destroy_class(sch, cl); +} + +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl) +{ + struct qfq_sched *q = qdisc_priv(sch); + + if (cl) + return NULL; + + return &q->filter_list; +} + +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct qfq_class *cl = qfq_find_class(sch, classid); + + if (cl != NULL) + cl->filter_cnt++; + + return (unsigned long)cl; +} + +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + cl->filter_cnt--; +} + +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, + struct Qdisc *new, struct Qdisc **old) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, cl->common.classid); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + qfq_purge_queue(cl); + *old = cl->qdisc; + cl->qdisc = new; + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + + return cl->qdisc; +} + +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct nlattr *nest; + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->common.classid; + tcm->tcm_info = cl->qdisc->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (nest == NULL) + goto nla_put_failure; + NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w); + NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax); + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct qfq_class *cl = (struct qfq_class *)arg; + struct tc_qfq_stats xstats; + + memset(&xstats, 0, sizeof(xstats)); + cl->qdisc->qstats.qlen = cl->qdisc->q.qlen; + + xstats.weight = ONE_FP/cl->inv_w; + xstats.lmax = cl->lmax; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *n; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct tcf_result res; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { + pr_debug("qfq_classify: found %d\n", skb->priority); + cl = qfq_find_class(sch, skb->priority); + if (cl != NULL) + return cl; + } + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tc_classify(skb, q->filter_list, &res); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NULL; + } +#endif + cl = (struct qfq_class *)res.class; + if (cl == NULL) + cl = qfq_find_class(sch, res.classid); + return cl; + } + + return NULL; +} + +/* Generic comparison function, handling wraparound. */ +static inline int qfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline u64 qfq_round_down(u64 ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = __ffs(bitmap); + return &q->groups[index]; +} +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, + int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_F)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... + } + * + */ +static void qfq_make_eligible(struct qfq_sched *q, u64 old_V) +{ + unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, + u64 roundedS) +{ + u64 slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + hlist_add_head(&cl->next, &grp->slots[i]); + __set_bit(slot, &grp->full_slots); +} + +/* Maybe introduce hlist_first_entry?? */ +static struct qfq_class *qfq_slot_head(struct qfq_group *grp) +{ + return hlist_entry(grp->slots[grp->front].first, + struct qfq_class, next); +} + +/* + * remove the entry from the slot + */ +static void qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class *cl = qfq_slot_head(grp); + + BUG_ON(!cl); + hlist_del(&cl->next); + if (hlist_empty(&grp->slots[grp->front])) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static struct qfq_class *qfq_slot_scan(struct qfq_group *grp) +{ + unsigned int i; + + pr_debug("qfq slot_scan: grp %u full %#lx\n", + grp->index, grp->full_slots); + + if (grp->full_slots == 0) + return NULL; + + i = __ffs(grp->full_slots); /* zero based */ + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return qfq_slot_head(grp); +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + +static void qfq_update_eligible(struct qfq_sched *q, u64 old_V) +{ + struct qfq_group *grp; + unsigned long ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* What is length of next packet in queue (0 if queue is empty) */ +static unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = sch->ops->peek(sch); + return skb ? qdisc_pkt_len(skb) : 0; +} + +/* + * Updates the class, returns true if also the group needs to be updated. + */ +static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl) +{ + unsigned int len = qdisc_peek_len(cl->qdisc); + + cl->S = cl->F; + if (!len) + qfq_front_slot_remove(grp); /* queue is empty */ + else { + u64 roundedS; + + cl->F = cl->S + (u64)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return false; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + + return true; +} + +static struct sk_buff *qfq_dequeue(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + struct sk_buff *skb; + unsigned int len; + u64 old_V; + + if (!q->bitmaps[ER]) + return NULL; + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = qfq_slot_head(grp); + skb = qdisc_dequeue_peeked(cl->qdisc); + if (!skb) { + WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n"); + return NULL; + } + + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + old_V = q->V; + len = qdisc_pkt_len(skb); + q->V += (u64)len * IWSUM; + pr_debug("qfq dequeue: len %u F %lld now %lld\n", + len, (unsigned long long) cl->F, (unsigned long long) q->V); + + if (qfq_update_class(grp, cl)) { + u64 old_F = grp->F; + + cl = qfq_slot_scan(grp); + if (!cl) + __clear_bit(grp->index, &q->bitmaps[ER]); + else { + u64 roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + + return skb; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. + */ +static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) +{ + unsigned long mask; + uint32_t limit, roundedF; + int slot_shift = cl->grp->slot_shift; + + roundedF = qfq_round_down(cl->F, slot_shift); + limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); + + if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { + /* timestamp was stale */ + mask = mask_from(q->bitmaps[ER], cl->grp->index); + if (mask) { + struct qfq_group *next = qfq_ffs(q, mask); + if (qfq_gt(roundedF, next->F)) { + cl->S = next->F; + return; + } + } + cl->S = q->V; + } else /* timestamp is not stale */ + cl->S = cl->F; +} + +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + int err; + u64 roundedS; + int s; + + cl = qfq_classify(skb, sch, &err); + if (cl == NULL) { + if (err & __NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); + + err = qdisc_enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + pr_debug("qfq_enqueue: enqueue failed %d\n", err); + if (net_xmit_drop_count(err)) { + cl->qstats.drops++; + sch->qstats.drops++; + } + return err; + } + + bstats_update(&cl->bstats, skb); + ++sch->q.qlen; + + /* If the new skb is not the head of queue, then done here. */ + if (cl->qdisc->q.qlen != 1) + return err; + + /* If reach this point, queue q was idle */ + grp = cl->grp; + qfq_update_start(q, cl); + + /* compute new finish time and rounded start. */ + cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + + /* + * insert cl in the correct bucket. + * If cl->S >= grp->S we don't need to adjust the + * bucket list and simply go to the insertion phase. + * Otherwise grp->S is decreasing, we must make room + * in the bucket list, and also recompute the group state. + * Finally, if there were no flows in this group and nobody + * was in ER make sure to adjust V. + */ + if (grp->full_slots) { + if (!qfq_gt(grp->S, cl->S)) + goto skip_update; + + /* create a slot for this cl->S */ + qfq_slot_rotate(grp, roundedS); + /* group was surely ineligible, remove */ + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[IB]); + } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) + q->V = roundedS; + + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + + pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", + s, q->bitmaps[s], + (unsigned long long) cl->S, + (unsigned long long) cl->F, + (unsigned long long) q->V); + +skip_update: + qfq_slot_insert(grp, cl, roundedS); + + return err; +} + + +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + unsigned int i, offset; + u64 roundedS; + + roundedS = qfq_round_down(cl->S, grp->slot_shift); + offset = (roundedS - grp->S) >> grp->slot_shift; + i = (grp->front + offset) % QFQ_MAX_SLOTS; + + hlist_del(&cl->next); + if (hlist_empty(&grp->slots[i])) + __clear_bit(offset, &grp->full_slots); +} + +/* + * called to forcibly destroy a queue. + * If the queue is not in the front bucket, or if it has + * other queues in the front bucket, we can simply remove + * the queue with no other side effects. + * Otherwise we must propagate the event up. + */ +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) +{ + struct qfq_group *grp = cl->grp; + unsigned long mask; + u64 roundedS; + int s; + + cl->F = cl->S; + qfq_slot_remove(q, grp, cl); + + if (!grp->full_slots) { + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + + if (test_bit(grp->index, &q->bitmaps[ER]) && + !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { + mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); + if (mask) + mask = ~((1UL << __fls(mask)) - 1); + else + mask = ~0UL; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); + } + __clear_bit(grp->index, &q->bitmaps[ER]); + } else if (hlist_empty(&grp->slots[grp->front])) { + cl = qfq_slot_scan(grp); + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (grp->S != roundedS) { + __clear_bit(grp->index, &q->bitmaps[ER]); + __clear_bit(grp->index, &q->bitmaps[IR]); + __clear_bit(grp->index, &q->bitmaps[EB]); + __clear_bit(grp->index, &q->bitmaps[IB]); + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + } + + qfq_update_eligible(q, q->V); +} + +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl = (struct qfq_class *)arg; + + if (cl->qdisc->q.qlen == 0) + qfq_deactivate_class(q, cl); +} + +static unsigned int qfq_drop(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + unsigned int i, j, len; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + struct qfq_class *cl; + struct hlist_node *n; + + hlist_for_each_entry(cl, n, &grp->slots[j], next) { + + if (!cl->qdisc->ops->drop) + continue; + + len = cl->qdisc->ops->drop(cl->qdisc); + if (len > 0) { + sch->q.qlen--; + if (!cl->qdisc->q.qlen) + qfq_deactivate_class(q, cl); + + return len; + } + } + } + } + + return 0; +} + +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + int i, j, err; + + err = qdisc_class_hash_init(&q->clhash); + if (err < 0) + return err; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + grp->index = i; + grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS + - (QFQ_MAX_INDEX - i); + for (j = 0; j < QFQ_MAX_SLOTS; j++) + INIT_HLIST_HEAD(&grp->slots[j]); + } + + return 0; +} + +static void qfq_reset_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_group *grp; + struct qfq_class *cl; + struct hlist_node *n, *tmp; + unsigned int i, j; + + for (i = 0; i <= QFQ_MAX_INDEX; i++) { + grp = &q->groups[i]; + for (j = 0; j < QFQ_MAX_SLOTS; j++) { + hlist_for_each_entry_safe(cl, n, tmp, + &grp->slots[j], next) { + qfq_deactivate_class(q, cl); + } + } + } + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) + qdisc_reset(cl->qdisc); + } + sch->q.qlen = 0; +} + +static void qfq_destroy_qdisc(struct Qdisc *sch) +{ + struct qfq_sched *q = qdisc_priv(sch); + struct qfq_class *cl; + struct hlist_node *n, *next; + unsigned int i; + + tcf_destroy_chain(&q->filter_list); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i], + common.hnode) { + qfq_destroy_class(sch, cl); + } + } + qdisc_class_hash_destroy(&q->clhash); +} + +static const struct Qdisc_class_ops qfq_class_ops = { + .change = qfq_change_class, + .delete = qfq_delete_class, + .get = qfq_get_class, + .put = qfq_put_class, + .tcf_chain = qfq_tcf_chain, + .bind_tcf = qfq_bind_tcf, + .unbind_tcf = qfq_unbind_tcf, + .graft = qfq_graft_class, + .leaf = qfq_class_leaf, + .qlen_notify = qfq_qlen_notify, + .dump = qfq_dump_class, + .dump_stats = qfq_dump_class_stats, + .walk = qfq_walk, +}; + +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { + .cl_ops = &qfq_class_ops, + .id = "qfq", + .priv_size = sizeof(struct qfq_sched), + .enqueue = qfq_enqueue, + .dequeue = qfq_dequeue, + .peek = qdisc_peek_dequeued, + .drop = qfq_drop, + .init = qfq_init_qdisc, + .reset = qfq_reset_qdisc, + .destroy = qfq_destroy_qdisc, + .owner = THIS_MODULE, +}; + +static int __init qfq_init(void) +{ + return register_qdisc(&qfq_qdisc_ops); +} + +static void __exit qfq_exit(void) +{ + unregister_qdisc(&qfq_qdisc_ops); +} + +module_init(qfq_init); +module_exit(qfq_exit); +MODULE_LICENSE("GPL"); diff --git a/net/socket.c b/net/socket.c index 310d16b1b3c9..d25f5a9d6fa2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2643,13 +2643,13 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) return -EFAULT; if (convert_in) { - /* We expect there to be holes between fs.m_u and + /* We expect there to be holes between fs.m_ext and * fs.ring_cookie and at the end of fs, but nowhere else. */ - BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_u) + - sizeof(compat_rxnfc->fs.m_u) != - offsetof(struct ethtool_rxnfc, fs.m_u) + - sizeof(rxnfc->fs.m_u)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + + sizeof(compat_rxnfc->fs.m_ext) != + offsetof(struct ethtool_rxnfc, fs.m_ext) + + sizeof(rxnfc->fs.m_ext)); BUILD_BUG_ON( offsetof(struct compat_ethtool_rxnfc, fs.location) - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != @@ -2657,7 +2657,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) offsetof(struct ethtool_rxnfc, fs.ring_cookie)); if (copy_in_user(rxnfc, compat_rxnfc, - (void *)(&rxnfc->fs.m_u + 1) - + (void *)(&rxnfc->fs.m_ext + 1) - (void *)rxnfc) || copy_in_user(&rxnfc->fs.ring_cookie, &compat_rxnfc->fs.ring_cookie, @@ -2674,7 +2674,7 @@ static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) if (convert_out) { if (copy_in_user(compat_rxnfc, rxnfc, - (const void *)(&rxnfc->fs.m_u + 1) - + (const void *)(&rxnfc->fs.m_ext + 1) - (const void *)rxnfc) || copy_in_user(&compat_rxnfc->fs.ring_cookie, &rxnfc->fs.ring_cookie, |