diff options
Diffstat (limited to 'net')
32 files changed, 765 insertions, 510 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 61fc573f1142..b3d17d1c49c3 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -98,14 +98,14 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); + + netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); - netdev_upper_dev_unlink(real_dev, dev); - if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); @@ -169,13 +169,13 @@ int register_vlan_dev(struct net_device *dev) if (err < 0) goto out_uninit_mvrp; - err = netdev_upper_dev_link(real_dev, dev); - if (err) - goto out_uninit_mvrp; - err = register_netdevice(dev); if (err < 0) - goto out_upper_dev_unlink; + goto out_uninit_mvrp; + + err = netdev_upper_dev_link(real_dev, dev); + if (err) + goto out_unregister_netdev; /* Account for reference in struct vlan_dev_priv */ dev_hold(real_dev); @@ -191,8 +191,8 @@ int register_vlan_dev(struct net_device *dev) return 0; -out_upper_dev_unlink: - netdev_upper_dev_unlink(real_dev, dev); +out_unregister_netdev: + unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); diff --git a/net/core/dev.c b/net/core/dev.c index 65f829cfd928..c25db20a4246 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4373,42 +4373,40 @@ struct netdev_adjacent { /* upper master flag, there can only be one master device per list */ bool master; - /* indicates that this dev is our first-level lower/upper device */ - bool neighbour; - /* counter for the number of times this device was added to us */ u16 ref_nr; + /* private field for the users */ + void *private; + struct list_head list; struct rcu_head rcu; }; -static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, - struct net_device *adj_dev, - bool upper) +static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) { struct netdev_adjacent *adj; - struct list_head *dev_list; - - dev_list = upper ? &dev->upper_dev_list : &dev->lower_dev_list; - list_for_each_entry(adj, dev_list, list) { + list_for_each_entry_rcu(adj, adj_list, list) { if (adj->dev == adj_dev) return adj; } return NULL; } -static inline struct netdev_adjacent *__netdev_find_upper(struct net_device *dev, - struct net_device *udev) +static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, + struct net_device *adj_dev, + struct list_head *adj_list) { - return __netdev_find_adj(dev, udev, true); -} + struct netdev_adjacent *adj; -static inline struct netdev_adjacent *__netdev_find_lower(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_find_adj(dev, ldev, false); + list_for_each_entry(adj, adj_list, list) { + if (adj->dev == adj_dev) + return adj; + } + return NULL; } /** @@ -4425,7 +4423,7 @@ bool netdev_has_upper_dev(struct net_device *dev, { ASSERT_RTNL(); - return __netdev_find_upper(dev, upper_dev); + return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_upper_dev); @@ -4440,7 +4438,7 @@ bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); - return !list_empty(&dev->upper_dev_list); + return !list_empty(&dev->all_adj_list.upper); } EXPORT_SYMBOL(netdev_has_any_upper_dev); @@ -4457,10 +4455,10 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) ASSERT_RTNL(); - if (list_empty(&dev->upper_dev_list)) + if (list_empty(&dev->adj_list.upper)) return NULL; - upper = list_first_entry(&dev->upper_dev_list, + upper = list_first_entry(&dev->adj_list.upper, struct netdev_adjacent, list); if (likely(upper->master)) return upper->dev; @@ -4468,15 +4466,26 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev) } EXPORT_SYMBOL(netdev_master_upper_dev_get); -/* netdev_upper_get_next_dev_rcu - Get the next dev from upper list +void *netdev_adjacent_get_private(struct list_head *adj_list) +{ + struct netdev_adjacent *adj; + + adj = list_entry(adj_list, struct netdev_adjacent, list); + + return adj->private; +} +EXPORT_SYMBOL(netdev_adjacent_get_private); + +/** + * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list * @dev: device * @iter: list_head ** of the current position * * Gets the next device from the dev's upper list, starting from iter * position. The caller must hold RCU read lock. */ -struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *upper; @@ -4484,14 +4493,71 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); - if (&upper->list == &dev->upper_dev_list) + if (&upper->list == &dev->all_adj_list.upper) return NULL; *iter = &upper->list; return upper->dev; } -EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); +EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); + +/** + * netdev_lower_get_next_private - Get the next ->private from the + * lower neighbour list + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold either hold the + * RTNL lock or its own locking that guarantees that the neighbour lower + * list will remain unchainged. + */ +void *netdev_lower_get_next_private(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + lower = list_entry(*iter, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = lower->list.next; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private); + +/** + * netdev_lower_get_next_private_rcu - Get the next ->private from the + * lower neighbour list, RCU + * variant + * @dev: device + * @iter: list_head ** of the current position + * + * Gets the next netdev_adjacent->private from the dev's lower neighbour + * list, starting from iter position. The caller must hold RCU read lock. + */ +void *netdev_lower_get_next_private_rcu(struct net_device *dev, + struct list_head **iter) +{ + struct netdev_adjacent *lower; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); + + if (&lower->list == &dev->adj_list.lower) + return NULL; + + if (iter) + *iter = &lower->list; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); /** * netdev_master_upper_dev_get_rcu - Get master upper device @@ -4504,7 +4570,7 @@ struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) { struct netdev_adjacent *upper; - upper = list_first_or_null_rcu(&dev->upper_dev_list, + upper = list_first_or_null_rcu(&dev->adj_list.upper, struct netdev_adjacent, list); if (upper && likely(upper->master)) return upper->dev; @@ -4514,15 +4580,16 @@ EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); static int __netdev_adjacent_dev_insert(struct net_device *dev, struct net_device *adj_dev, - bool neighbour, bool master, - bool upper) + struct list_head *dev_list, + void *private, bool master) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; + int ret; - adj = __netdev_find_adj(dev, adj_dev, upper); + adj = __netdev_find_adj(dev, adj_dev, dev_list); if (adj) { - BUG_ON(neighbour); adj->ref_nr++; return 0; } @@ -4533,124 +4600,178 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev, adj->dev = adj_dev; adj->master = master; - adj->neighbour = neighbour; adj->ref_nr = 1; - + adj->private = private; dev_hold(adj_dev); - pr_debug("dev_hold for %s, because of %s link added from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); - if (!upper) { - list_add_tail_rcu(&adj->list, &dev->lower_dev_list); - return 0; + pr_debug("dev_hold for %s, because of link added from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); + + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), linkname); + if (ret) + goto free_adj; } - /* Ensure that master upper link is always the first item in list. */ - if (master) - list_add_rcu(&adj->list, &dev->upper_dev_list); - else - list_add_tail_rcu(&adj->list, &dev->upper_dev_list); + /* Ensure that master link is always the first item in list. */ + if (master) { + ret = sysfs_create_link(&(dev->dev.kobj), + &(adj_dev->dev.kobj), "master"); + if (ret) + goto remove_symlinks; + + list_add_rcu(&adj->list, dev_list); + } else { + list_add_tail_rcu(&adj->list, dev_list); + } return 0; -} -static inline int __netdev_upper_dev_insert(struct net_device *dev, - struct net_device *udev, - bool master, bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, udev, neighbour, master, - true); -} +remove_symlinks: + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } -static inline int __netdev_lower_dev_insert(struct net_device *dev, - struct net_device *ldev, - bool neighbour) -{ - return __netdev_adjacent_dev_insert(dev, ldev, neighbour, false, - false); +free_adj: + kfree(adj); + + return ret; } void __netdev_adjacent_dev_remove(struct net_device *dev, - struct net_device *adj_dev, bool upper) + struct net_device *adj_dev, + struct list_head *dev_list) { struct netdev_adjacent *adj; + char linkname[IFNAMSIZ+7]; - if (upper) - adj = __netdev_find_upper(dev, adj_dev); - else - adj = __netdev_find_lower(dev, adj_dev); + adj = __netdev_find_adj(dev, adj_dev, dev_list); - if (!adj) + if (!adj) { + pr_err("tried to remove device %s from %s\n", + dev->name, adj_dev->name); BUG(); + } if (adj->ref_nr > 1) { + pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, + adj->ref_nr-1); adj->ref_nr--; return; } + if (adj->master) + sysfs_remove_link(&(dev->dev.kobj), "master"); + + if (dev_list == &dev->adj_list.lower) { + sprintf(linkname, "lower_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } else if (dev_list == &dev->adj_list.upper) { + sprintf(linkname, "upper_%s", adj_dev->name); + sysfs_remove_link(&(dev->dev.kobj), linkname); + } + list_del_rcu(&adj->list); - pr_debug("dev_put for %s, because of %s link removed from %s to %s\n", - adj_dev->name, upper ? "upper" : "lower", dev->name, - adj_dev->name); + pr_debug("dev_put for %s, because link removed from %s to %s\n", + adj_dev->name, dev->name, adj_dev->name); dev_put(adj_dev); kfree_rcu(adj, rcu); } -static inline void __netdev_upper_dev_remove(struct net_device *dev, - struct net_device *udev) -{ - return __netdev_adjacent_dev_remove(dev, udev, true); -} - -static inline void __netdev_lower_dev_remove(struct net_device *dev, - struct net_device *ldev) -{ - return __netdev_adjacent_dev_remove(dev, ldev, false); -} - -int __netdev_adjacent_dev_insert_link(struct net_device *dev, - struct net_device *upper_dev, - bool master, bool neighbour) +int __netdev_adjacent_dev_link_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list, + void *private, bool master) { int ret; - ret = __netdev_upper_dev_insert(dev, upper_dev, master, neighbour); + ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, + master); if (ret) return ret; - ret = __netdev_lower_dev_insert(upper_dev, dev, neighbour); + ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, + false); if (ret) { - __netdev_upper_dev_remove(dev, upper_dev); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); return ret; } return 0; } -static inline int __netdev_adjacent_dev_link(struct net_device *dev, - struct net_device *udev) +int __netdev_adjacent_dev_link(struct net_device *dev, + struct net_device *upper_dev) { - return __netdev_adjacent_dev_insert_link(dev, udev, false, false); + return __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower, + NULL, false); } -static inline int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, - struct net_device *udev, - bool master) +void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, + struct net_device *upper_dev, + struct list_head *up_list, + struct list_head *down_list) { - return __netdev_adjacent_dev_insert_link(dev, udev, master, true); + __netdev_adjacent_dev_remove(dev, upper_dev, up_list); + __netdev_adjacent_dev_remove(upper_dev, dev, down_list); } void __netdev_adjacent_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { - __netdev_upper_dev_remove(dev, upper_dev); - __netdev_lower_dev_remove(upper_dev, dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->all_adj_list.upper, + &upper_dev->all_adj_list.lower); } +int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, + struct net_device *upper_dev, + void *private, bool master) +{ + int ret = __netdev_adjacent_dev_link(dev, upper_dev); + + if (ret) + return ret; + + ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower, + private, master); + if (ret) { + __netdev_adjacent_dev_unlink(dev, upper_dev); + return ret; + } + + return 0; +} + +void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, + struct net_device *upper_dev) +{ + __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_lists(dev, upper_dev, + &dev->adj_list.upper, + &upper_dev->adj_list.lower); +} static int __netdev_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, bool master) + struct net_device *upper_dev, bool master, + void *private) { struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -4661,26 +4782,29 @@ static int __netdev_upper_dev_link(struct net_device *dev, return -EBUSY; /* To prevent loops, check if dev is not upper device to upper_dev. */ - if (__netdev_find_upper(upper_dev, dev)) + if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) return -EBUSY; - if (__netdev_find_upper(dev, upper_dev)) + if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) return -EEXIST; if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; - ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, master); + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, + master); if (ret) return ret; /* Now that we linked these devs, make all the upper_dev's - * upper_dev_list visible to every dev's lower_dev_list and vice + * all_adj_list.upper visible to every dev's all_adj_list.lower an * versa, and don't forget the devices itself. All of these * links are non-neighbours. */ - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { + pr_debug("Interlinking %s with %s, non-neighbour\n", + i->dev->name, j->dev->name); ret = __netdev_adjacent_dev_link(i->dev, j->dev); if (ret) goto rollback_mesh; @@ -4688,14 +4812,18 @@ static int __netdev_upper_dev_link(struct net_device *dev, } /* add dev to every upper_dev's upper device */ - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { + pr_debug("linking %s's upper device %s with %s\n", + upper_dev->name, i->dev->name, dev->name); ret = __netdev_adjacent_dev_link(dev, i->dev); if (ret) goto rollback_upper_mesh; } /* add upper_dev to every dev's lower device */ - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + pr_debug("linking %s's lower device %s with %s\n", dev->name, + i->dev->name, upper_dev->name); ret = __netdev_adjacent_dev_link(i->dev, upper_dev); if (ret) goto rollback_lower_mesh; @@ -4706,7 +4834,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, rollback_lower_mesh: to_i = i; - list_for_each_entry(i, &dev->lower_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(i->dev, upper_dev); @@ -4716,7 +4844,7 @@ rollback_lower_mesh: rollback_upper_mesh: to_i = i; - list_for_each_entry(i, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { if (i == to_i) break; __netdev_adjacent_dev_unlink(dev, i->dev); @@ -4727,8 +4855,8 @@ rollback_upper_mesh: rollback_mesh: to_i = i; to_j = j; - list_for_each_entry(i, &dev->lower_dev_list, list) { - list_for_each_entry(j, &upper_dev->upper_dev_list, list) { + list_for_each_entry(i, &dev->all_adj_list.lower, list) { + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { if (i == to_i && j == to_j) break; __netdev_adjacent_dev_unlink(i->dev, j->dev); @@ -4737,7 +4865,7 @@ rollback_mesh: break; } - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); return ret; } @@ -4755,7 +4883,7 @@ rollback_mesh: int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, false); + return __netdev_upper_dev_link(dev, upper_dev, false, NULL); } EXPORT_SYMBOL(netdev_upper_dev_link); @@ -4773,10 +4901,18 @@ EXPORT_SYMBOL(netdev_upper_dev_link); int netdev_master_upper_dev_link(struct net_device *dev, struct net_device *upper_dev) { - return __netdev_upper_dev_link(dev, upper_dev, true); + return __netdev_upper_dev_link(dev, upper_dev, true, NULL); } EXPORT_SYMBOL(netdev_master_upper_dev_link); +int netdev_master_upper_dev_link_private(struct net_device *dev, + struct net_device *upper_dev, + void *private) +{ + return __netdev_upper_dev_link(dev, upper_dev, true, private); +} +EXPORT_SYMBOL(netdev_master_upper_dev_link_private); + /** * netdev_upper_dev_unlink - Removes a link to upper device * @dev: device @@ -4791,29 +4927,59 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct netdev_adjacent *i, *j; ASSERT_RTNL(); - __netdev_adjacent_dev_unlink(dev, upper_dev); + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower * devices from all upper_dev's upper devices and vice * versa, to maintain the graph relationship. */ - list_for_each_entry(i, &dev->lower_dev_list, list) - list_for_each_entry(j, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) + list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(i->dev, j->dev); /* remove also the devices itself from lower/upper device * list */ - list_for_each_entry(i, &dev->lower_dev_list, list) + list_for_each_entry(i, &dev->all_adj_list.lower, list) __netdev_adjacent_dev_unlink(i->dev, upper_dev); - list_for_each_entry(i, &upper_dev->upper_dev_list, list) + list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); } EXPORT_SYMBOL(netdev_upper_dev_unlink); +void *netdev_lower_dev_get_private_rcu(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu); + +void *netdev_lower_dev_get_private(struct net_device *dev, + struct net_device *lower_dev) +{ + struct netdev_adjacent *lower; + + if (!lower_dev) + return NULL; + lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); + if (!lower) + return NULL; + + return lower->private; +} +EXPORT_SYMBOL(netdev_lower_dev_get_private); + static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -4822,7 +4988,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) ops->ndo_change_rx_flags(dev, flags); } -static int __dev_set_promiscuity(struct net_device *dev, int inc) +static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags; kuid_t uid; @@ -4865,6 +5031,8 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) dev_change_rx_flags(dev, IFF_PROMISC); } + if (notify) + __dev_notify_flags(dev, old_flags, IFF_PROMISC); return 0; } @@ -4884,7 +5052,7 @@ int dev_set_promiscuity(struct net_device *dev, int inc) unsigned int old_flags = dev->flags; int err; - err = __dev_set_promiscuity(dev, inc); + err = __dev_set_promiscuity(dev, inc, true); if (err < 0) return err; if (dev->flags != old_flags) @@ -4893,22 +5061,9 @@ int dev_set_promiscuity(struct net_device *dev, int inc) } EXPORT_SYMBOL(dev_set_promiscuity); -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) +static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) { - unsigned int old_flags = dev->flags; + unsigned int old_flags = dev->flags, old_gflags = dev->gflags; ASSERT_RTNL(); @@ -4931,9 +5086,30 @@ int dev_set_allmulti(struct net_device *dev, int inc) if (dev->flags ^ old_flags) { dev_change_rx_flags(dev, IFF_ALLMULTI); dev_set_rx_mode(dev); + if (notify) + __dev_notify_flags(dev, old_flags, + dev->gflags ^ old_gflags); } return 0; } + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * Return 0 if successful or a negative errno code on error. + */ + +int dev_set_allmulti(struct net_device *dev, int inc) +{ + return __dev_set_allmulti(dev, inc, true); +} EXPORT_SYMBOL(dev_set_allmulti); /* @@ -4958,10 +5134,10 @@ void __dev_set_rx_mode(struct net_device *dev) * therefore calling __dev_set_promiscuity here is safe. */ if (!netdev_uc_empty(dev) && !dev->uc_promisc) { - __dev_set_promiscuity(dev, 1); + __dev_set_promiscuity(dev, 1, false); dev->uc_promisc = true; } else if (netdev_uc_empty(dev) && dev->uc_promisc) { - __dev_set_promiscuity(dev, -1); + __dev_set_promiscuity(dev, -1, false); dev->uc_promisc = false; } } @@ -5050,9 +5226,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; + unsigned int old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; - dev_set_promiscuity(dev, inc); + + if (__dev_set_promiscuity(dev, inc, false) >= 0) + if (dev->flags != old_flags) + dev_set_rx_mode(dev); } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI @@ -5063,16 +5243,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); + __dev_set_allmulti(dev, inc, false); } return ret; } -void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges) { unsigned int changes = dev->flags ^ old_flags; + if (gchanges) + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges); + if (changes & IFF_UP) { if (dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_UP, dev); @@ -5101,17 +5285,14 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags) int dev_change_flags(struct net_device *dev, unsigned int flags) { int ret; - unsigned int changes, old_flags = dev->flags; + unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; ret = __dev_change_flags(dev, flags); if (ret < 0) return ret; - changes = old_flags ^ dev->flags; - if (changes) - rtmsg_ifinfo(RTM_NEWLINK, dev, changes); - - __dev_notify_flags(dev, old_flags); + changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); + __dev_notify_flags(dev, old_flags, changes); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -6077,8 +6258,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); INIT_LIST_HEAD(&dev->link_watch_list); - INIT_LIST_HEAD(&dev->upper_dev_list); - INIT_LIST_HEAD(&dev->lower_dev_list); + INIT_LIST_HEAD(&dev->adj_list.upper); + INIT_LIST_HEAD(&dev->adj_list.lower); + INIT_LIST_HEAD(&dev->all_adj_list.upper); + INIT_LIST_HEAD(&dev->all_adj_list.lower); dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6072610a8672..ca15f32821fb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -867,7 +867,7 @@ static void neigh_invalidate(struct neighbour *neigh) static void neigh_probe(struct neighbour *neigh) __releases(neigh->lock) { - struct sk_buff *skb = skb_peek(&neigh->arp_queue); + struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); /* keep skb alive even if arp_queue overflows */ if (skb) skb = skb_copy(skb, GFP_ATOMIC); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2a0e21de3060..4aedf03da052 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1647,9 +1647,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); - __dev_notify_flags(dev, old_flags); + __dev_notify_flags(dev, old_flags, ~0U); return 0; } EXPORT_SYMBOL(rtnl_configure_link); diff --git a/net/core/sock.c b/net/core/sock.c index 5b6beba494a3..2bd9b3faa0d0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -914,6 +914,13 @@ set_rcvbuf: } break; #endif + + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + default: ret = -ENOPROTOOPT; break; @@ -1177,6 +1184,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; #endif + case SO_MAX_PACING_RATE: + v.val = sk->sk_max_pacing_rate; + break; + default: return -ENOPROTOOPT; } @@ -2319,6 +2330,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_ll_usec = sysctl_net_busy_read; #endif + sk->sk_max_pacing_rate = ~0U; /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index be1f64d35358..8f032bae60ad 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -58,7 +58,7 @@ #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> __setup("ether=", netdev_boot_setup); @@ -133,7 +133,7 @@ int eth_rebuild_header(struct sk_buff *skb) return arp_find(eth->h_dest, skb); #endif default: - printk(KERN_DEBUG + netdev_dbg(dev, "%s: unable to resolve type %X addresses.\n", dev->name, ntohs(eth->h_proto)); @@ -169,20 +169,9 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) else skb->pkt_type = PACKET_MULTICAST; } - - /* - * This ALLMULTI check should be redundant by 1.4 - * so don't forget to remove it. - * - * Seems, you forgot to remove it. All silly devices - * seems to set IFF_PROMISC. - */ - - else if (1 /*dev->flags&IFF_PROMISC */ ) { - if (unlikely(!ether_addr_equal_64bits(eth->h_dest, - dev->dev_addr))) - skb->pkt_type = PACKET_OTHERHOST; - } + else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, + dev->dev_addr))) + skb->pkt_type = PACKET_OTHERHOST; /* * Some variants of DSA tagging don't have an ethertype field @@ -190,12 +179,13 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ - if (netdev_uses_dsa_tags(dev)) + if (unlikely(netdev_uses_dsa_tags(dev))) return htons(ETH_P_DSA); - if (netdev_uses_trailer_tags(dev)) + + if (unlikely(netdev_uses_trailer_tags(dev))) return htons(ETH_P_TRAILER); - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) + if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; /* @@ -204,7 +194,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. */ - if (skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF) + if (unlikely(skb->len >= 2 && *(unsigned short *)(skb->data) == 0xFFFF)) return htons(ETH_P_802_3); /* diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5f7d11a45871..5c0e8bc6e5ba 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -353,6 +353,9 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) saddr = fib_compute_spec_dst(skb); ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; + if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; if (ipc.opt->opt.srr) @@ -608,6 +611,8 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.addr = iph->saddr; ipc.opt = &icmp_param->replyopts.opt; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, type, code, icmp_param); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6acb541c9091..7ac7aa11130e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,27 +29,19 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -/* - * This struct holds the first and last local port number. - */ -struct local_ports sysctl_local_ports __read_mostly = { - .lock = __SEQLOCK_UNLOCKED(sysctl_local_ports.lock), - .range = { 32768, 61000 }, -}; - unsigned long *sysctl_local_reserved_ports; EXPORT_SYMBOL(sysctl_local_reserved_ports); -void inet_get_local_port_range(int *low, int *high) +void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); - *low = sysctl_local_ports.range[0]; - *high = sysctl_local_ports.range[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + *low = net->ipv4.sysctl_local_ports.range[0]; + *high = net->ipv4.sysctl_local_ports.range[1]; + } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); } EXPORT_SYMBOL(inet_get_local_port_range); @@ -116,7 +108,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) int remaining, rover, low, high; again: - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; smallest_rover = rover = net_random() % remaining + low; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 7bd8983dbfcf..2779037bd113 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -494,7 +494,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 offset = hint + port_offset; struct inet_timewait_sock *tw = NULL; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; local_bh_disable(); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a04d872c54f9..7d8357bb2ba6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1060,6 +1060,9 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, rt->dst.dev->mtu : dst_mtu(&rt->dst); cork->dst = &rt->dst; cork->length = 0; + cork->ttl = ipc->ttl; + cork->tos = ipc->tos; + cork->priority = ipc->priority; cork->tx_flags = ipc->tx_flags; return 0; @@ -1311,7 +1314,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk, if (cork->flags & IPCORK_OPT) opt = cork->opt; - if (rt->rt_type == RTN_MULTICAST) + if (cork->ttl != 0) + ttl = cork->ttl; + else if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; else ttl = ip_select_ttl(inet, &rt->dst); @@ -1319,7 +1324,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; - iph->tos = inet->tos; + iph->tos = (cork->tos != -1) ? cork->tos : inet->tos; iph->frag_off = df; iph->ttl = ttl; iph->protocol = sk->sk_protocol; @@ -1331,7 +1336,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt, 0); } - skb->priority = sk->sk_priority; + skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority; skb->mark = sk->sk_mark; /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec @@ -1481,6 +1486,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, ipc.addr = daddr; ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; if (replyopts.opt.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index d9c4f113d709..56e34457ac07 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -189,7 +189,7 @@ EXPORT_SYMBOL(ip_cmsg_recv); int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) { - int err; + int err, val; struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -215,6 +215,24 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc) ipc->addr = info->ipi_spec_dst.s_addr; break; } + case IP_TTL: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 1 || val > 255) + return -EINVAL; + ipc->ttl = val; + break; + case IP_TOS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) + return -EINVAL; + val = *(int *)CMSG_DATA(cmsg); + if (val < 0 || val > 255) + return -EINVAL; + ipc->tos = val; + ipc->priority = rt_tos2priority(ipc->tos); + break; + default: return -EINVAL; } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index e805e7b3030e..91f69bc883fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -49,70 +49,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly; static int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); -static int vti_err(struct sk_buff *skb, u32 info) -{ - - /* All the routers (except for Linux) return only - * 8 bytes of packet payload. It means, that precise relaying of - * ICMP in the real Internet is absolutely infeasible. - */ - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn = net_generic(net, vti_net_id); - struct iphdr *iph = (struct iphdr *)skb->data; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct ip_tunnel *t; - int err; - - switch (type) { - default: - case ICMP_PARAMETERPROB: - return 0; - - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_SR_FAILED: - case ICMP_PORT_UNREACH: - /* Impossible event. */ - return 0; - default: - /* All others are translated to HOST_UNREACH. */ - break; - } - break; - case ICMP_TIME_EXCEEDED: - if (code != ICMP_EXC_TTL) - return 0; - break; - } - - err = -ENOENT; - - t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->daddr, iph->saddr, 0); - if (t == NULL) - goto out; - - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - t->parms.link, 0, IPPROTO_IPIP, 0); - err = 0; - goto out; - } - - err = 0; - if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - goto out; - - if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) - t->err_count++; - else - t->err_count = 1; - t->err_time = jiffies; -out: - return err; -} - /* We dont digest the packet therefore let the packet pass */ static int vti_rcv(struct sk_buff *skb) { @@ -296,9 +232,8 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; } -static struct xfrm_tunnel vti_handler __read_mostly = { +static struct xfrm_tunnel_notifier vti_handler __read_mostly = { .handler = vti_rcv, - .err_handler = vti_err, .priority = 1, }; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d7d9882d4cae..a62610443152 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -237,11 +237,11 @@ static void inet_get_ping_group_range_net(struct net *net, kgid_t *low, unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); } @@ -713,6 +713,8 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; sock_tx_timestamp(sk, &ipc.tx_flags); @@ -744,7 +746,7 @@ int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EINVAL; faddr = ipc.opt->opt.faddr; } - tos = RT_TOS(inet->tos); + tos = get_rttos(&ipc, inet); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 193db03540ad..b2fa14c1a6f1 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -519,6 +519,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { @@ -558,7 +560,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr = ipc.opt->opt.faddr; } } - tos = RT_CONN_FLAGS(sk); + tos = get_rtconn_flags(&ipc, sk); if (msg->msg_flags & MSG_DONTROUTE) tos |= RTO_ONLINK; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 14a15c49129d..15e024105f91 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -89,8 +89,7 @@ __u32 cookie_init_timestamp(struct request_sock *req) static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, - __be16 dport, __u32 sseq, __u32 count, - __u32 data) + __be16 dport, __u32 sseq, __u32 data) { /* * Compute the secure sequence number. @@ -102,7 +101,7 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * As an extra hack, we add a small "data" value that encodes the * MSS into the second hash value. */ - + u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -114,22 +113,21 @@ static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, * If the syncookie is bad, the data returned will be out of * range. This must be checked by the caller. * - * The count value used to generate the cookie must be within - * "maxdiff" if the current (passed-in) "count". The return value - * is (__u32)-1 if this test fails. + * The count value used to generate the cookie must be less than + * MAX_SYNCOOKIE_AGE minutes in the past. + * The return value (__u32)-1 if this test fails. */ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, __u32 sseq, - __u32 count, __u32 maxdiff) + __be16 sport, __be16 dport, __u32 sseq) { - __u32 diff; + u32 diff, count = tcp_cookie_time(); /* Strip away the layers from the cookie */ cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); - if (diff >= maxdiff) + if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - @@ -138,22 +136,22 @@ static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, } /* - * MSS Values are taken from the 2009 paper - * 'Measuring TCP Maximum Segment Size' by S. Alcock and R. Nelson: - * - values 1440 to 1460 accounted for 80% of observed mss values - * - values outside the 536-1460 range are rare (<0.2%). + * MSS Values are chosen based on the 2011 paper + * 'An Analysis of TCP Maximum Segement Sizes' by S. Alcock and R. Nelson. + * Values .. + * .. lower than 536 are rare (< 0.2%) + * .. between 537 and 1299 account for less than < 1.5% of observed values + * .. in the 1300-1349 range account for about 15 to 20% of observed mss values + * .. exceeding 1460 are very rare (< 0.04%) * - * Table must be sorted. + * 1460 is the single most frequently announced mss value (30 to 46% depending + * on monitor location). Table must be sorted. */ static __u16 const msstab[] = { - 64, - 512, 536, - 1024, - 1440, + 1300, + 1440, /* 1440, 1452: PPPoE */ 1460, - 4312, - 8960, }; /* @@ -173,7 +171,7 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), - jiffies / (HZ * 60), mssind); + mssind); } EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); @@ -189,13 +187,6 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) } /* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. - */ -#define COUNTER_TRIES 4 -/* * Check if a ack sequence number is a valid syncookie. * Return the decoded mss if it is, or 0 if not. */ @@ -204,9 +195,7 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, { __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr, - th->source, th->dest, seq, - jiffies / (HZ * 60), - COUNTER_TRIES); + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 540279f4c531..c08f096d46b5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -43,12 +43,12 @@ static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; /* Update system visible IP port range */ -static void set_local_port_range(int range[2]) +static void set_local_port_range(struct net *net, int range[2]) { - write_seqlock(&sysctl_local_ports.lock); - sysctl_local_ports.range[0] = range[0]; - sysctl_local_ports.range[1] = range[1]; - write_sequnlock(&sysctl_local_ports.lock); + write_seqlock(&net->ipv4.sysctl_local_ports.lock); + net->ipv4.sysctl_local_ports.range[0] = range[0]; + net->ipv4.sysctl_local_ports.range[1] = range[1]; + write_sequnlock(&net->ipv4.sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -56,6 +56,8 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = + container_of(table->data, struct net, ipv4.sysctl_local_ports.range); int ret; int range[2]; struct ctl_table tmp = { @@ -66,14 +68,15 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, .extra2 = &ip_local_port_range_max, }; - inet_get_local_port_range(range, range + 1); + inet_get_local_port_range(net, &range[0], &range[1]); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) ret = -EINVAL; else - set_local_port_range(range); + set_local_port_range(net, range); } return ret; @@ -83,23 +86,27 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { kgid_t *data = table->data; + struct net *net = + container_of(table->data, struct net, ipv4.sysctl_ping_group_range); unsigned int seq; do { - seq = read_seqbegin(&sysctl_local_ports.lock); + seq = read_seqbegin(&net->ipv4.sysctl_local_ports.lock); *low = data[0]; *high = data[1]; - } while (read_seqretry(&sysctl_local_ports.lock, seq)); + } while (read_seqretry(&net->ipv4.sysctl_local_ports.lock, seq)); } /* Update system visible IP port range */ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high) { kgid_t *data = table->data; - write_seqlock(&sysctl_local_ports.lock); + struct net *net = + container_of(table->data, struct net, ipv4.sysctl_ping_group_range); + write_seqlock(&net->ipv4.sysctl_local_ports.lock); data[0] = low; data[1] = high; - write_sequnlock(&sysctl_local_ports.lock); + write_sequnlock(&net->ipv4.sysctl_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -475,13 +482,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_local_port_range", - .data = &sysctl_local_ports.range, - .maxlen = sizeof(sysctl_local_ports.range), - .mode = 0644, - .proc_handler = ipv4_local_port_range, - }, - { .procname = "ip_local_reserved_ports", .data = NULL, /* initialized in sysctl_ipv4_init */ .maxlen = 65536, @@ -854,6 +854,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_local_port_range", + .maxlen = sizeof(init_net.ipv4.sysctl_local_ports.range), + .data = &init_net.ipv4.sysctl_local_ports.range, + .mode = 0644, + .proc_handler = ipv4_local_port_range, + }, + { .procname = "tcp_mem", .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), .mode = 0644, @@ -888,6 +895,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) &net->ipv4.sysctl_ping_group_range; table[7].data = &net->ipv4.sysctl_tcp_ecn; + table[8].data = + &net->ipv4.sysctl_local_ports.range; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) @@ -901,6 +910,13 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1); net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0); + /* + * Set defaults for local port range + */ + seqlock_init(&net->ipv4.sysctl_local_ports.lock); + net->ipv4.sysctl_local_ports.range[0] = 32768; + net->ipv4.sysctl_local_ports.range[1] = 61000; + tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 25a89eaa669d..66aa816ad30b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -355,6 +355,12 @@ static void tcp_fixup_rcvbuf(struct sock *sk) rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) * tcp_default_init_rwnd(mss); + /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency + * Allow enough cushion so that sender is not limited by our window + */ + if (sysctl_tcp_moderate_rcvbuf) + rcvmem <<= 2; + if (sk->sk_rcvbuf < rcvmem) sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]); } @@ -373,6 +379,8 @@ void tcp_init_buffer_space(struct sock *sk) tcp_fixup_sndbuf(sk); tp->rcvq_space.space = tp->rcv_wnd; + tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -512,48 +520,62 @@ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time; - int space; - - if (tp->rcvq_space.time == 0) - goto new_measure; + int copied; time = tcp_time_stamp - tp->rcvq_space.time; if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) return; - space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + /* Number of bytes copied to user in last RTT */ + copied = tp->copied_seq - tp->rcvq_space.seq; + if (copied <= tp->rcvq_space.space) + goto new_measure; + + /* A bit of theory : + * copied = bytes received in previous RTT, our base window + * To cope with packet losses, we need a 2x factor + * To cope with slow start, and sender growing its cwin by 100 % + * every RTT, we need a 4x factor, because the ACK we are sending + * now is for the next RTT, not the current one : + * <prev RTT . ><current RTT .. ><next RTT .... > + */ + + if (sysctl_tcp_moderate_rcvbuf && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { + int rcvwin, rcvmem, rcvbuf; - space = max(tp->rcvq_space.space, space); + /* minimal window to cope with packet losses, assuming + * steady state. Add some cushion because of small variations. + */ + rcvwin = (copied << 1) + 16 * tp->advmss; - if (tp->rcvq_space.space != space) { - int rcvmem; + /* If rate increased by 25%, + * assume slow start, rcvwin = 3 * copied + * If rate increased by 50%, + * assume sender can use 2x growth, rcvwin = 4 * copied + */ + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) { + if (copied >= + tp->rcvq_space.space + (tp->rcvq_space.space >> 1)) + rcvwin <<= 1; + else + rcvwin += (rcvwin >> 1); + } - tp->rcvq_space.space = space; + rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; - if (sysctl_tcp_moderate_rcvbuf && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int new_clamp = space; + rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]); + if (rcvbuf > sk->sk_rcvbuf) { + sk->sk_rcvbuf = rcvbuf; - /* Receive space grows, normalize in order to - * take into account packet headers and sk_buff - * structure overhead. - */ - space /= tp->advmss; - if (!space) - space = 1; - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(rcvmem) < tp->advmss) - rcvmem += 128; - space *= rcvmem; - space = min(space, sysctl_tcp_rmem[2]); - if (space > sk->sk_rcvbuf) { - sk->sk_rcvbuf = space; - - /* Make the window clamp follow along. */ - tp->window_clamp = new_clamp; - } + /* Make the window clamp follow along. */ + tp->window_clamp = rcvwin; } } + tp->rcvq_space.space = copied; new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -713,7 +735,7 @@ static void tcp_update_pacing_rate(struct sock *sk) if (tp->srtt > 8 + 2) do_div(rate, tp->srtt); - sk->sk_pacing_rate = min_t(u64, rate, ~0U); + sk->sk_pacing_rate = min_t(u64, rate, sk->sk_max_pacing_rate); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -5674,8 +5696,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_congestion_control(sk); tcp_mtup_init(sk); - tcp_init_buffer_space(sk); tp->copied_seq = tp->rcv_nxt; + tcp_init_buffer_space(sk); } smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0ca44df51ee9..c41833e9c083 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -219,7 +219,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned short first, last; DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rand = net_random(); @@ -855,6 +855,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; + ipc.ttl = 0; + ipc.tos = -1; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; @@ -938,7 +940,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, faddr = ipc.opt->opt.faddr; connected = 0; } - tos = RT_TOS(inet->tos); + tos = get_rttos(&ipc, inet); if (sock_flag(sk, SOCK_LOCALROUTE) || (msg->msg_flags & MSG_DONTROUTE) || (ipc.opt && ipc.opt->opt.is_strictroute)) { diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index b5663c37f089..31b18152528f 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -16,13 +16,13 @@ #include <net/xfrm.h> /* Informational hook. The decap is still done here. */ -static struct xfrm_tunnel __rcu *rcv_notify_handlers __read_mostly; +static struct xfrm_tunnel_notifier __rcu *rcv_notify_handlers __read_mostly; static DEFINE_MUTEX(xfrm4_mode_tunnel_input_mutex); -int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel *handler) +int xfrm4_mode_tunnel_input_register(struct xfrm_tunnel_notifier *handler) { - struct xfrm_tunnel __rcu **pprev; - struct xfrm_tunnel *t; + struct xfrm_tunnel_notifier __rcu **pprev; + struct xfrm_tunnel_notifier *t; int ret = -EEXIST; int priority = handler->priority; @@ -50,10 +50,10 @@ err: } EXPORT_SYMBOL_GPL(xfrm4_mode_tunnel_input_register); -int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel *handler) +int xfrm4_mode_tunnel_input_deregister(struct xfrm_tunnel_notifier *handler) { - struct xfrm_tunnel __rcu **pprev; - struct xfrm_tunnel *t; + struct xfrm_tunnel_notifier __rcu **pprev; + struct xfrm_tunnel_notifier *t; int ret = -ENOENT; mutex_lock(&xfrm4_mode_tunnel_input_mutex); @@ -134,7 +134,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - struct xfrm_tunnel *handler; + struct xfrm_tunnel_notifier *handler; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7c96100b021e..4966b124dc2e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1028,52 +1028,4 @@ out_unregister_tcp_proto: } module_init(inet6_init); -static void __exit inet6_exit(void) -{ - if (disable_ipv6_mod) - return; - - /* First of all disallow new sockets creation. */ - sock_unregister(PF_INET6); - /* Disallow any further netlink messages */ - rtnl_unregister_all(PF_INET6); - - udpv6_exit(); - udplitev6_exit(); - tcpv6_exit(); - - /* Cleanup code parts. */ - ipv6_packet_cleanup(); - ipv6_frag_exit(); - ipv6_exthdrs_exit(); - addrconf_cleanup(); - ip6_flowlabel_cleanup(); - ndisc_late_cleanup(); - ip6_route_cleanup(); -#ifdef CONFIG_PROC_FS - - /* Cleanup code parts. */ - if6_proc_exit(); - ipv6_misc_proc_exit(); - udplite6_proc_exit(); - raw6_proc_exit(); -#endif - ipv6_netfilter_fini(); - ipv6_stub = NULL; - igmp6_cleanup(); - ndisc_cleanup(); - ip6_mr_cleanup(); - icmpv6_cleanup(); - rawv6_exit(); - - unregister_pernet_subsys(&inet6_net_ops); - proto_unregister(&rawv6_prot); - proto_unregister(&udplitev6_prot); - proto_unregister(&udpv6_prot); - proto_unregister(&tcpv6_prot); - - rcu_barrier(); /* Wait for completion of call_rcu()'s */ -} -module_exit(inet6_exit); - MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5bec666aba61..5550a8113a6d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1529,25 +1529,6 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, fib6_walk(&c.w); } -void fib6_clean_all_ro(struct net *net, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) -{ - struct fib6_table *table; - struct hlist_head *head; - unsigned int h; - - rcu_read_lock(); - for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, head, tb6_hlist) { - read_lock_bh(&table->tb6_lock); - fib6_clean_tree(net, &table->tb6_root, - func, prune, arg); - read_unlock_bh(&table->tb6_lock); - } - } - rcu_read_unlock(); -} void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), int prune, void *arg) { @@ -1782,3 +1763,189 @@ void fib6_gc_cleanup(void) unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } + +#ifdef CONFIG_PROC_FS + +struct ipv6_route_iter { + struct seq_net_private p; + struct fib6_walker_t w; + loff_t skip; + struct fib6_table *tbl; + __u32 sernum; +}; + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct rt6_info *rt = v; + struct ipv6_route_iter *iter = seq->private; + + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(seq, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); +#else + seq_puts(seq, "00000000000000000000000000000000 00 "); +#endif + if (rt->rt6i_flags & RTF_GATEWAY) + seq_printf(seq, "%pi6", &rt->rt6i_gateway); + else + seq_puts(seq, "00000000000000000000000000000000"); + + seq_printf(seq, " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), + rt->dst.__use, rt->rt6i_flags, + rt->dst.dev ? rt->dst.dev->name : ""); + iter->w.leaf = NULL; + return 0; +} + +static int ipv6_route_yield(struct fib6_walker_t *w) +{ + struct ipv6_route_iter *iter = w->args; + + if (!iter->skip) + return 1; + + do { + iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->skip--; + if (!iter->skip && iter->w.leaf) + return 1; + } while (iter->w.leaf); + + return 0; +} + +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +{ + memset(&iter->w, 0, sizeof(iter->w)); + iter->w.func = ipv6_route_yield; + iter->w.root = &iter->tbl->tb6_root; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + iter->w.args = iter; + iter->sernum = iter->w.root->fn_sernum; + INIT_LIST_HEAD(&iter->w.lh); + fib6_walker_link(&iter->w); +} + +static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, + struct net *net) +{ + unsigned int h; + struct hlist_node *node; + + if (tbl) { + h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; + node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist)); + } else { + h = 0; + node = NULL; + } + + while (!node && h < FIB6_TABLE_HASHSZ) { + node = rcu_dereference_bh( + hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); + } + return hlist_entry_safe(node, struct fib6_table, tb6_hlist); +} + +static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) +{ + if (iter->sernum != iter->w.root->fn_sernum) { + iter->sernum = iter->w.root->fn_sernum; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + WARN_ON(iter->w.skip); + iter->w.skip = iter->w.count; + } +} + +static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int r; + struct rt6_info *n; + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + if (!v) + goto iter_table; + + n = ((struct rt6_info *)v)->dst.rt6_next; + if (n) { + ++*pos; + return n; + } + +iter_table: + ipv6_route_check_sernum(iter); + read_lock(&iter->tbl->tb6_lock); + r = fib6_walk_continue(&iter->w); + read_unlock(&iter->tbl->tb6_lock); + if (r > 0) { + if (v) + ++*pos; + return iter->w.leaf; + } else if (r < 0) { + fib6_walker_unlink(&iter->w); + return NULL; + } + fib6_walker_unlink(&iter->w); + + iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); + if (!iter->tbl) + return NULL; + + ipv6_route_seq_setup_walk(iter); + goto iter_table; +} + +static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU_BH) +{ + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + rcu_read_lock_bh(); + iter->tbl = ipv6_route_seq_next_table(NULL, net); + iter->skip = *pos; + + if (iter->tbl) { + ipv6_route_seq_setup_walk(iter); + return ipv6_route_seq_next(seq, NULL, pos); + } else { + return NULL; + } +} + +static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) +{ + struct fib6_walker_t *w = &iter->w; + return w->node && !(w->state == FWS_U && w->node == w->root); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) + __releases(RCU_BH) +{ + struct ipv6_route_iter *iter = seq->private; + + if (ipv6_route_iter_active(iter)) + fib6_walker_unlink(&iter->w); + + rcu_read_unlock_bh(); +} + +static const struct seq_operations ipv6_route_seq_ops = { + .start = ipv6_route_seq_start, + .next = ipv6_route_seq_next, + .stop = ipv6_route_seq_stop, + .show = ipv6_route_seq_show +}; + +int ipv6_route_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &ipv6_route_seq_ops, + sizeof(struct ipv6_route_iter)); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c979dd96d82a..c3130ffc3bca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1137,7 +1137,6 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; - fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); @@ -1236,7 +1235,6 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; - fl6.flowi6_flags = 0; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); @@ -1258,7 +1256,6 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = oif; fl6.flowi6_mark = mark; - fl6.flowi6_flags = 0; fl6.daddr = msg->dest; fl6.saddr = iph->daddr; @@ -2800,56 +2797,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, #ifdef CONFIG_PROC_FS -struct rt6_proc_arg -{ - char *buffer; - int offset; - int length; - int skip; - int len; -}; - -static int rt6_info_route(struct rt6_info *rt, void *p_arg) -{ - struct seq_file *m = p_arg; - - seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen); - -#ifdef CONFIG_IPV6_SUBTREES - seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen); -#else - seq_puts(m, "00000000000000000000000000000000 00 "); -#endif - if (rt->rt6i_flags & RTF_GATEWAY) { - seq_printf(m, "%pi6", &rt->rt6i_gateway); - } else { - seq_puts(m, "00000000000000000000000000000000"); - } - seq_printf(m, " %08x %08x %08x %08x %8s\n", - rt->rt6i_metric, atomic_read(&rt->dst.__refcnt), - rt->dst.__use, rt->rt6i_flags, - rt->dst.dev ? rt->dst.dev->name : ""); - return 0; -} - -static int ipv6_route_show(struct seq_file *m, void *v) -{ - struct net *net = (struct net *)m->private; - fib6_clean_all_ro(net, rt6_info_route, 0, m); - return 0; -} - -static int ipv6_route_open(struct inode *inode, struct file *file) -{ - return single_open_net(inode, file, ipv6_route_show); -} - static const struct file_operations ipv6_route_proc_fops = { .owner = THIS_MODULE, .open = ipv6_route_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release_net, + .release = seq_release_net, }; static int rt6_stats_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index bf63ac8a49b9..d703218a653b 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -24,26 +24,21 @@ #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -/* Table must be sorted. */ +/* RFC 2460, Section 8.3: + * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] + * + * Due to IPV6_MIN_MTU=1280 the lowest possible MSS is 1220, which allows + * using higher values than ipv4 tcp syncookies. + * The other values are chosen based on ethernet (1500 and 9k MTU), plus + * one that accounts for common encap (PPPoe) overhead. Table must be sorted. + */ static __u16 const msstab[] = { - 64, - 512, - 536, - 1280 - 60, + 1280 - 60, /* IPV6_MIN_MTU - 60 */ 1480 - 60, 1500 - 60, - 4460 - 60, 9000 - 60, }; -/* - * This (misnamed) value is the age of syncookie which is permitted. - * Its ideal value should be dependent on TCP_TIMEOUT_INIT and - * sysctl_tcp_retries1. It's a rather complicated formula (exponential - * backoff) to compute at runtime so it's currently hardcoded here. - */ -#define COUNTER_TRIES 4 - static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) @@ -86,8 +81,9 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, __be16 dport, __u32 sseq, - __u32 count, __u32 data) + __u32 data) { + u32 count = tcp_cookie_time(); return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + (count << COOKIEBITS) + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) @@ -96,15 +92,14 @@ static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, static __u32 check_tcp_syn_cookie(__u32 cookie, const struct in6_addr *saddr, const struct in6_addr *daddr, __be16 sport, - __be16 dport, __u32 sseq, __u32 count, - __u32 maxdiff) + __be16 dport, __u32 sseq) { - __u32 diff; + __u32 diff, count = tcp_cookie_time(); cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); - if (diff >= maxdiff) + if (diff >= MAX_SYNCOOKIE_AGE) return (__u32)-1; return (cookie - @@ -125,8 +120,7 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, *mssp = msstab[mssind]; return secure_tcp_syn_cookie(&iph->saddr, &iph->daddr, th->source, - th->dest, ntohl(th->seq), - jiffies / (HZ * 60), mssind); + th->dest, ntohl(th->seq), mssind); } EXPORT_SYMBOL_GPL(__cookie_v6_init_sequence); @@ -146,8 +140,7 @@ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, { __u32 seq = ntohl(th->seq) - 1; __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr, - th->source, th->dest, seq, - jiffies / (HZ * 60), COUNTER_TRIES); + th->source, th->dest, seq); return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index a481c03e2861..56e22b74cf96 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -173,7 +173,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) skb->local_df = 1; - inet_get_local_port_range(&port_min, &port_max); + inet_get_local_port_range(net, &port_min, &port_max); src_port = vxlan_src_port(port_min, port_max, skb); err = vxlan_xmit_skb(vxlan_port->vs, rt, skb, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 189e3c5b3d09..272d8e924cf6 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -231,14 +231,14 @@ override: } if (R_tab) { police->rate_present = true; - psched_ratecfg_precompute(&police->rate, &R_tab->rate); + psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0); qdisc_put_rtab(R_tab); } else { police->rate_present = false; } if (P_tab) { police->peak_present = true; - psched_ratecfg_precompute(&police->peak, &P_tab->rate); + psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0); qdisc_put_rtab(P_tab); } else { police->peak_present = false; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d76a35d0dc85..636d9131d870 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -137,7 +137,7 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct nlattr **tb, struct nlattr *est) { - int err = -EINVAL; + int err; struct tcf_exts e; struct tcf_ematch_tree t; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 7c3de6ffa516..e5cef9567225 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -793,8 +793,10 @@ static int em_meta_change(struct tcf_proto *tp, void *data, int len, goto errout; meta = kzalloc(sizeof(*meta), GFP_KERNEL); - if (meta == NULL) + if (meta == NULL) { + err = -ENOMEM; goto errout; + } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a74e278654aa..e7121d29c4bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -910,11 +910,12 @@ void dev_shutdown(struct net_device *dev) } void psched_ratecfg_precompute(struct psched_ratecfg *r, - const struct tc_ratespec *conf) + const struct tc_ratespec *conf, + u64 rate64) { memset(r, 0, sizeof(*r)); r->overhead = conf->overhead; - r->rate_bytes_ps = conf->rate; + r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); r->mult = 1; /* diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 863846cc5513..0e1e38b40025 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -997,6 +997,8 @@ static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { [TCA_HTB_CTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, [TCA_HTB_DIRECT_QLEN] = { .type = NLA_U32 }, + [TCA_HTB_RATE64] = { .type = NLA_U64 }, + [TCA_HTB_CEIL64] = { .type = NLA_U64 }, }; static void htb_work_func(struct work_struct *work) @@ -1114,6 +1116,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, opt.level = cl->level; if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; + if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + goto nla_put_failure; + if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && + nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + goto nla_put_failure; nla_nest_end(skb, nest); spin_unlock_bh(root_lock); @@ -1332,6 +1340,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct qdisc_rate_table *rtab = NULL, *ctab = NULL; struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; + u64 rate64, ceil64; /* extract all subattrs from opt attr */ if (!opt) @@ -1491,8 +1500,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->prio = TC_HTB_NUMPRIO - 1; } - psched_ratecfg_precompute(&cl->rate, &hopt->rate); - psched_ratecfg_precompute(&cl->ceil, &hopt->ceil); + rate64 = tb[TCA_HTB_RATE64] ? nla_get_u64(tb[TCA_HTB_RATE64]) : 0; + + ceil64 = tb[TCA_HTB_CEIL64] ? nla_get_u64(tb[TCA_HTB_CEIL64]) : 0; + + psched_ratecfg_precompute(&cl->rate, &hopt->rate, rate64); + psched_ratecfg_precompute(&cl->ceil, &hopt->ceil, ceil64); cl->buffer = PSCHED_TICKS2NS(hopt->buffer); cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 1aaf1b6e51a2..b0571224f3c9 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -341,9 +341,9 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->tokens = q->buffer; q->ptokens = q->mtu; - psched_ratecfg_precompute(&q->rate, &rtab->rate); + psched_ratecfg_precompute(&q->rate, &rtab->rate, 0); if (ptab) { - psched_ratecfg_precompute(&q->peak, &ptab->rate); + psched_ratecfg_precompute(&q->peak, &ptab->rate, 0); q->peak_present = true; } else { q->peak_present = false; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 911b71b26b0e..72046b9729a8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5890,7 +5890,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) int low, high, remaining, index; unsigned int rover; - inet_get_local_port_range(&low, &high); + inet_get_local_port_range(sock_net(sk), &low, &high); remaining = (high - low) + 1; rover = net_random() % remaining + low; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b9c3f9e943a9..d6e7f98fbfbf 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -468,7 +468,7 @@ expired: } err = __xfrm_state_delete(x); - if (!err && x->id.spi) + if (!err) km_state_expired(x, 1, 0); xfrm_audit_state_delete(x, err ? 0 : 1, |