summary refs log tree commit diff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/dev.c 124
-rw-r--r-- net/core/dev_ioctl.c 7
-rw-r--r-- net/core/devlink.c 106
-rw-r--r-- net/core/dst_cache.c 4
-rw-r--r-- net/core/ethtool.c 83
-rw-r--r-- net/core/fib_notifier.c 12
-rw-r--r-- net/core/fib_rules.c 110
-rw-r--r-- net/core/filter.c 820
-rw-r--r-- net/core/flow_dissector.c 16
-rw-r--r-- net/core/gen_estimator.c 1
-rw-r--r-- net/core/net-procfs.c 6
-rw-r--r-- net/core/net-sysfs.c 12
-rw-r--r-- net/core/net_namespace.c 123
-rw-r--r-- net/core/pktgen.c 15
-rw-r--r-- net/core/rtnetlink.c 19
-rw-r--r-- net/core/skbuff.c 98
-rw-r--r-- net/core/sock.c 111
-rw-r--r-- net/core/sock_diag.c 12
-rw-r--r-- net/core/sysctl_net_core.c 13
-rw-r--r-- net/core/utils.c 23
20 files changed, 1432 insertions, 283 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b9a840..9b04a9fd1dfd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1571,6 +1571,27 @@ static void dev_disable_gro_hw(struct net_device *dev)
netdev_WARN(dev, "failed to disable GRO_HW!\n");
}
+const char *netdev_cmd_to_name(enum netdev_cmd cmd)
+{
+#define N(val) \
+ case NETDEV_##val: \
+ return "NETDEV_" __stringify(val);
+ switch (cmd) {
+ N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
+ N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
+ N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
+ N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
+ N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
+ N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
+ N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+ N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
+ N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ };
+#undef N
+ return "UNKNOWN_NETDEV_EVENT";
+}
+EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
+
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
struct net_device *dev)
{
@@ -1604,6 +1625,8 @@ int register_netdevice_notifier(struct notifier_block *nb)
struct net *net;
int err;
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
@@ -1626,6 +1649,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
unlock:
rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
return err;
rollback:
@@ -1670,6 +1694,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
struct net *net;
int err;
+ /* Close race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
rtnl_lock();
err = raw_notifier_chain_unregister(&netdev_chain, nb);
if (err)
@@ -1687,6 +1713,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
}
unlock:
rtnl_unlock();
+ up_write(&pernet_ops_rwsem);
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
@@ -2378,12 +2405,15 @@ EXPORT_SYMBOL(netdev_set_num_tc);
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
- * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
+ bool disabling;
int rc;
+ disabling = txq < dev->real_num_tx_queues;
+
if (txq < 1 || txq > dev->num_tx_queues)
return -EINVAL;
@@ -2399,15 +2429,19 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
- if (txq < dev->real_num_tx_queues) {
+ dev->real_num_tx_queues = txq;
+
+ if (disabling) {
+ synchronize_net();
qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, txq);
#endif
}
+ } else {
+ dev->real_num_tx_queues = txq;
}
- dev->real_num_tx_queues = txq;
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
@@ -2728,7 +2762,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
return 0;
- eth = (struct ethhdr *)skb_mac_header(skb);
+ eth = (struct ethhdr *)skb->data;
type = eth->h_proto;
}
@@ -3271,15 +3305,23 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
- struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+ const struct netprio_map *map;
+ const struct sock *sk;
+ unsigned int prioidx;
- if (!skb->priority && skb->sk && map) {
- unsigned int prioidx =
- sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
+ if (skb->priority)
+ return;
+ map = rcu_dereference_bh(skb->dev->priomap);
+ if (!map)
+ return;
+ sk = skb_to_full_sk(skb);
+ if (!sk)
+ return;
- if (prioidx < map->priomap_len)
- skb->priority = map->priomap[prioidx];
- }
+ prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
+
+ if (prioidx < map->priomap_len)
+ skb->priority = map->priomap[prioidx];
}
#else
#define skb_update_prio(skb)
@@ -4344,6 +4386,9 @@ int netdev_rx_handler_register(struct net_device *dev,
if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
+ if (dev->priv_flags & IFF_NO_RX_HANDLER)
+ return -EINVAL;
+
/* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
@@ -6389,6 +6434,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
.linking = true,
.upper_info = upper_info,
};
+ struct net_device *master_dev;
int ret = 0;
ASSERT_RTNL();
@@ -6400,11 +6446,14 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
- if (netdev_has_upper_dev(dev, upper_dev))
- return -EEXIST;
-
- if (master && netdev_master_upper_dev_get(dev))
- return -EBUSY;
+ if (!master) {
+ if (netdev_has_upper_dev(dev, upper_dev))
+ return -EEXIST;
+ } else {
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ return master_dev == upper_dev ? -EEXIST : -EBUSY;
+ }
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
&changeupper_info.info);
@@ -7535,6 +7584,19 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ /* LRO/HW-GRO features cannot be combined with RX-FCS */
+ if (features & NETIF_F_RXFCS) {
+ if (features & NETIF_F_LRO) {
+ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
return features;
}
@@ -7606,6 +7668,24 @@ sync_lower:
}
}
+ if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_ctag_filter_info(dev);
+ } else {
+ vlan_drop_rx_ctag_filter_info(dev);
+ }
+ }
+
+ if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
+ if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
+ dev->features = features;
+ err |= vlan_get_rx_stag_filter_info(dev);
+ } else {
+ vlan_drop_rx_stag_filter_info(dev);
+ }
+ }
+
dev->features = features;
}
@@ -7991,7 +8071,8 @@ int register_netdev(struct net_device *dev)
{
int err;
- rtnl_lock();
+ if (rtnl_lock_killable())
+ return -EINTR;
err = register_netdevice(dev);
rtnl_unlock();
return err;
@@ -8041,7 +8122,6 @@ static void netdev_wait_allrefs(struct net_device *dev)
rcu_barrier();
rtnl_lock();
- call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
&dev->state)) {
/* We must not have linkwatch events
@@ -8113,10 +8193,6 @@ void netdev_run_todo(void)
= list_first_entry(&list, struct net_device, todo_list);
list_del(&dev->todo_list);
- rtnl_lock();
- call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
- __rtnl_unlock();
-
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
pr_err("network todo '%s' but state %d\n",
dev->name, dev->reg_state);
@@ -8134,8 +8210,9 @@ void netdev_run_todo(void)
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+#if IS_ENABLED(CONFIG_DECNET)
WARN_ON(dev->dn_ptr);
-
+#endif
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
@@ -8557,7 +8634,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
- call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
new_nsid = peernet2id_alloc(dev_net(dev), net);
/* If there is an ifindex conflict assign a new one */
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 0ab1af04296c..a04e1e88bf3a 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -402,8 +402,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
if (colon)
*colon = 0;
- dev_load(net, ifr->ifr_name);
-
/*
* See which interface the caller is talking about.
*/
@@ -423,6 +421,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGIFMAP:
case SIOCGIFINDEX:
case SIOCGIFTXQLEN:
+ dev_load(net, ifr->ifr_name);
rcu_read_lock();
ret = dev_ifsioc_locked(net, ifr, cmd);
rcu_read_unlock();
@@ -431,6 +430,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
return ret;
case SIOCETHTOOL:
+ dev_load(net, ifr->ifr_name);
rtnl_lock();
ret = dev_ethtool(net, ifr);
rtnl_unlock();
@@ -447,6 +447,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGMIIPHY:
case SIOCGMIIREG:
case SIOCSIFNAME:
+ dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
rtnl_lock();
@@ -494,6 +495,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
/* fall through */
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
+ dev_load(net, ifr->ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, ifr, cmd);
rtnl_unlock();
@@ -518,6 +520,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
cmd == SIOCGHWTSTAMP ||
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
+ dev_load(net, ifr->ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, ifr, cmd);
rtnl_unlock();
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 18d385ed8237..9236e421bd62 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1695,10 +1695,11 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
goto nla_put_failure;
if (table->resource_valid) {
- nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
- table->resource_id, DEVLINK_ATTR_PAD);
- nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
- table->resource_units, DEVLINK_ATTR_PAD);
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+ table->resource_id, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+ table->resource_units, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
}
if (devlink_dpipe_matches_put(table, skb))
goto nla_put_failure;
@@ -1797,7 +1798,7 @@ send_done:
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
- goto err_skb_send_alloc;
+ return err;
goto send_done;
}
@@ -1806,7 +1807,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
-err_skb_send_alloc:
genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
@@ -2072,7 +2072,7 @@ static int devlink_dpipe_entries_fill(struct genl_info *info,
table->counters_enabled,
&dump_ctx);
if (err)
- goto err_entries_dump;
+ return err;
send_done:
nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
@@ -2080,16 +2080,10 @@ send_done:
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
if (err)
- goto err_skb_send_alloc;
+ return err;
goto send_done;
}
return genlmsg_reply(dump_ctx.skb, info);
-
-err_entries_dump:
-err_skb_send_alloc:
- genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr);
- nlmsg_free(dump_ctx.skb);
- return err;
}
static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
@@ -2228,7 +2222,7 @@ send_done:
if (!nlh) {
err = devlink_dpipe_send_and_alloc_skb(&skb, info);
if (err)
- goto err_skb_send_alloc;
+ return err;
goto send_done;
}
return genlmsg_reply(skb, info);
@@ -2236,7 +2230,6 @@ send_done:
nla_put_failure:
err = -EMSGSIZE;
err_table_put:
-err_skb_send_alloc:
genlmsg_cancel(skb, hdr);
nlmsg_free(skb);
return err;
@@ -2332,12 +2325,38 @@ devlink_resource_validate_children(struct devlink_resource *resource)
list_for_each_entry(child_resource, &resource->resource_list, list)
parts_size += child_resource->size_new;
- if (parts_size > resource->size)
+ if (parts_size > resource->size_new)
size_valid = false;
out:
resource->size_valid = size_valid;
}
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+ struct netlink_ext_ack *extack)
+{
+ u64 reminder;
+ int err = 0;
+
+ if (size > resource->size_params.size_max) {
+ NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
+ err = -EINVAL;
+ }
+
+ if (size < resource->size_params.size_min) {
+ NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
+ err = -EINVAL;
+ }
+
+ div64_u64_rem(size, resource->size_params.size_granularity, &reminder);
+ if (reminder) {
+ NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
struct genl_info *info)
{
@@ -2356,12 +2375,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
if (!resource)
return -EINVAL;
- if (!resource->resource_ops->size_validate)
- return -EINVAL;
-
size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
- err = resource->resource_ops->size_validate(devlink, size,
- info->extack);
+ err = devlink_resource_validate_size(resource, size, info->extack);
if (err)
return err;
@@ -2372,20 +2387,22 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
return 0;
}
-static void
+static int
devlink_resource_size_params_put(struct devlink_resource *resource,
struct sk_buff *skb)
{
struct devlink_resource_size_params *size_params;
- size_params = resource->size_params;
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
- size_params->size_granularity, DEVLINK_ATTR_PAD);
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
- size_params->size_max, DEVLINK_ATTR_PAD);
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
- size_params->size_min, DEVLINK_ATTR_PAD);
- nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit);
+ size_params = &resource->size_params;
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+ size_params->size_granularity, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+ size_params->size_max, DEVLINK_ATTR_PAD) ||
+ nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+ size_params->size_min, DEVLINK_ATTR_PAD) ||
+ nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
+ return -EMSGSIZE;
+ return 0;
}
static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
@@ -2409,10 +2426,12 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
resource->size_new, DEVLINK_ATTR_PAD);
if (resource->resource_ops && resource->resource_ops->occ_get)
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
- resource->resource_ops->occ_get(devlink),
- DEVLINK_ATTR_PAD);
- devlink_resource_size_params_put(resource, skb);
+ if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+ resource->resource_ops->occ_get(devlink),
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (devlink_resource_size_params_put(resource, skb))
+ goto nla_put_failure;
if (list_empty(&resource->resource_list))
goto out;
@@ -2717,22 +2736,22 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
.doit = devlink_nl_cmd_dpipe_table_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
.doit = devlink_nl_cmd_dpipe_entries_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
.doit = devlink_nl_cmd_dpipe_headers_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
@@ -2752,8 +2771,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_RESOURCE_DUMP,
.doit = devlink_nl_cmd_resource_dump,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RELOAD,
@@ -3147,17 +3166,19 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
*/
int devlink_resource_register(struct devlink *devlink,
const char *resource_name,
- bool top_hierarchy,
u64 resource_size,
u64 resource_id,
u64 parent_resource_id,
- struct devlink_resource_size_params *size_params,
+ const struct devlink_resource_size_params *size_params,
const struct devlink_resource_ops *resource_ops)
{
struct devlink_resource *resource;
struct list_head *resource_list;
+ bool top_hierarchy;
int err = 0;
+ top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
mutex_lock(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (resource) {
@@ -3194,7 +3215,8 @@ int devlink_resource_register(struct devlink *devlink,
resource->id = resource_id;
resource->resource_ops = resource_ops;
resource->size_valid = true;
- resource->size_params = size_params;
+ memcpy(&resource->size_params, size_params,
+ sizeof(resource->size_params));
INIT_LIST_HEAD(&resource->resource_list);
list_add_tail(&resource->list, resource_list);
out:
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..64cef977484a 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
#if IS_ENABLED(CONFIG_IPV6)
void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
- const struct in6_addr *addr)
+ const struct in6_addr *saddr)
{
struct dst_cache_pcpu *idst;
@@ -117,7 +117,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
rt6_get_cookie((struct rt6_info *)dst));
- idst->in6_saddr = *addr;
+ idst->in6_saddr = *saddr;
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 494e6a5d7306..03416e6dd5d7 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -22,6 +22,7 @@
#include <linux/bitops.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/sfp.h>
#include <linux/slab.h>
#include <linux/rtnetlink.h>
#include <linux/sched/signal.h>
@@ -107,6 +108,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
[NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
+ [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
};
static const char
@@ -121,6 +123,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
[ETHTOOL_ID_UNSPEC] = "Unspec",
[ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
[ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+ [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
};
static const char
@@ -1022,6 +1025,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
+ /* If FLOW_RSS was requested then user-space must be using the
+ * new definition, as FLOW_RSS is newer.
+ */
+ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+ info_size = sizeof(info);
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+ }
+
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
if (info.rule_cnt > 0) {
if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
@@ -1251,9 +1263,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
user_key_size = rxfh.key_size;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
- rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->get_rxfh_context)
+ return -EOPNOTSUPP;
rxfh.indir_size = dev_indir_size;
rxfh.key_size = dev_key_size;
@@ -1276,7 +1290,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (user_key_size)
hkey = rss_config + indir_bytes;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (rxfh.rss_context)
+ ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
+ &dev_hfunc,
+ rxfh.rss_context);
+ else
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
if (ret)
goto out;
@@ -1306,6 +1325,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
u8 *hkey = NULL;
u8 *rss_config;
u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+ bool delete = false;
if (!ops->get_rxnfc || !ops->set_rxfh)
return -EOPNOTSUPP;
@@ -1319,9 +1339,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
return -EFAULT;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
- rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->set_rxfh_context)
+ return -EOPNOTSUPP;
/* If either indir, hash key or function is valid, proceed further.
* Must request at least one change: indir size, hash key or function.
@@ -1346,7 +1368,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (ret)
goto out;
- /* rxfh.indir_size == 0 means reset the indir table to default.
+ /* rxfh.indir_size == 0 means reset the indir table to default (master
+ * context) or delete the context (other RSS contexts).
* rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
*/
if (rxfh.indir_size &&
@@ -1359,9 +1382,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (ret)
goto out;
} else if (rxfh.indir_size == 0) {
- indir = (u32 *)rss_config;
- for (i = 0; i < dev_indir_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ if (rxfh.rss_context == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ delete = true;
+ }
}
if (rxfh.key_size) {
@@ -1374,15 +1401,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
}
- ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (rxfh.rss_context)
+ ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
+ &rxfh.rss_context, delete);
+ else
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
if (ret)
goto out;
- /* indicate whether rxfh was set to default */
- if (rxfh.indir_size == 0)
- dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
- else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
- dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
+ &rxfh.rss_context, sizeof(rxfh.rss_context)))
+ ret = -EFAULT;
+
+ if (!rxfh.rss_context) {
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ }
out:
kfree(rss_config);
@@ -2210,6 +2247,9 @@ static int __ethtool_get_module_info(struct net_device *dev,
const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ if (dev->sfp_bus)
+ return sfp_get_module_info(dev->sfp_bus, modinfo);
+
if (phydev && phydev->drv && phydev->drv->module_info)
return phydev->drv->module_info(phydev, modinfo);
@@ -2244,6 +2284,9 @@ static int __ethtool_get_module_eeprom(struct net_device *dev,
const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ if (dev->sfp_bus)
+ return sfp_get_module_eeprom(dev->sfp_bus, ee, data);
+
if (phydev && phydev->drv && phydev->drv->module_eeprom)
return phydev->drv->module_eeprom(phydev, ee, data);
@@ -2277,6 +2320,11 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
tuna->type_id != ETHTOOL_TUNABLE_U32)
return -EINVAL;
break;
+ case ETHTOOL_PFC_PREVENTION_TOUT:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -2520,11 +2568,14 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM };
+ int rc;
if (!dev->ethtool_ops->get_fecparam)
return -EOPNOTSUPP;
- dev->ethtool_ops->get_fecparam(dev, &fecparam);
+ rc = dev->ethtool_ops->get_fecparam(dev, &fecparam);
+ if (rc)
+ return rc;
if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
return -EFAULT;
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 0c048bdeb016..13a40b831d6d 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -13,16 +13,22 @@ int call_fib_notifier(struct notifier_block *nb, struct net *net,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ int err;
+
info->net = net;
- return nb->notifier_call(nb, event_type, info);
+ err = nb->notifier_call(nb, event_type, info);
+ return notifier_to_errno(err);
}
EXPORT_SYMBOL(call_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ int err;
+
info->net = net;
- return atomic_notifier_call_chain(&fib_chain, event_type, info);
+ err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+ return notifier_to_errno(err);
}
EXPORT_SYMBOL(call_fib_notifiers);
@@ -33,6 +39,7 @@ static unsigned int fib_seq_sum(void)
struct net *net;
rtnl_lock();
+ down_read(&net_rwsem);
for_each_net(net) {
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
@@ -43,6 +50,7 @@ static unsigned int fib_seq_sum(void)
}
rcu_read_unlock();
}
+ up_read(&net_rwsem);
rtnl_unlock();
return fib_seq;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98e1066c3d55..33958f84c173 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule)
if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
!uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
return false;
+ if (fib_rule_port_range_set(&rule->sport_range))
+ return false;
+ if (fib_rule_port_range_set(&rule->dport_range))
+ return false;
return true;
}
EXPORT_SYMBOL_GPL(fib_rule_matchall);
@@ -51,6 +55,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->pref = pref;
r->table = table;
r->flags = flags;
+ r->proto = RTPROT_KERNEL;
r->fr_net = ops->fro_net;
r->uid_range = fib_kuid_range_unset;
@@ -220,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
}
+static int nla_get_port_range(struct nlattr *pattr,
+ struct fib_rule_port_range *port_range)
+{
+ const struct fib_rule_port_range *pr = nla_data(pattr);
+
+ if (!fib_rule_port_range_valid(pr))
+ return -EINVAL;
+
+ port_range->start = pr->start;
+ port_range->end = pr->end;
+
+ return 0;
+}
+
+static int nla_put_port_range(struct sk_buff *skb, int attrtype,
+ struct fib_rule_port_range *range)
+{
+ return nla_put(skb, attrtype, sizeof(*range), range);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -424,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
!uid_eq(r->uid_range.end, rule->uid_range.end))
continue;
+ if (r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -469,6 +505,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
: fib_default_rule_pref(ops);
+ rule->proto = tb[FRA_PROTOCOL] ?
+ nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
@@ -565,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
rule->uid_range = fib_kuid_range_unset;
}
+ if (tb[FRA_IP_PROTO])
+ rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &rule->sport_range);
+ if (err)
+ goto errout_free;
+ }
+
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &rule->dport_range);
+ if (err)
+ goto errout_free;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -575,6 +631,11 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout_free;
+ err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops,
+ extack);
+ if (err < 0)
+ goto errout_free;
+
list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
@@ -611,7 +672,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rule->tun_id)
ip_tunnel_need_metadata();
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack);
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
@@ -630,6 +690,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rule_port_range sprange = {0, 0};
+ struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *r;
struct nlattr *tb[FRA_MAX+1];
@@ -663,7 +725,25 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
range = fib_kuid_range_unset;
}
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &sprange);
+ if (err)
+ goto errout;
+ }
+
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &dprange);
+ if (err)
+ goto errout;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
+ if (tb[FRA_PROTOCOL] &&
+ (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
+ continue;
+
if (frh->action && (frh->action != rule->action))
continue;
@@ -704,6 +784,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
!uid_eq(rule->uid_range.end, range.end)))
continue;
+ if (tb[FRA_IP_PROTO] &&
+ (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
+ continue;
+
+ if (fib_rule_port_range_set(&sprange) &&
+ !fib_rule_port_range_compare(&rule->sport_range, &sprange))
+ continue;
+
+ if (fib_rule_port_range_set(&dprange) &&
+ !fib_rule_port_range_compare(&rule->dport_range, &dprange))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -781,7 +873,11 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
+ nla_total_size_64bit(8) /* FRA_TUN_ID */
- + nla_total_size(sizeof(struct fib_kuid_range));
+ + nla_total_size(sizeof(struct fib_kuid_range))
+ + nla_total_size(1) /* FRA_PROTOCOL */
+ + nla_total_size(1) /* FRA_IP_PROTO */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -812,6 +908,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh->action = rule->action;
frh->flags = rule->flags;
+ if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
+ goto nla_put_failure;
+
if (rule->action == FR_ACT_GOTO &&
rcu_access_pointer(rule->ctarget) == NULL)
frh->flags |= FIB_RULE_UNRESOLVED;
@@ -843,7 +942,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->l3mdev &&
nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
(uid_range_set(&rule->uid_range) &&
- nla_put_uid_range(skb, &rule->uid_range)))
+ nla_put_uid_range(skb, &rule->uid_range)) ||
+ (fib_rule_port_range_set(&rule->sport_range) &&
+ nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (fib_rule_port_range_set(&rule->dport_range) &&
+ nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 08ab4c65a998..d31aff93270d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -33,6 +33,7 @@
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
+#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
@@ -1855,7 +1856,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
/* If user passes invalid input drop the packet. */
- if (unlikely(flags))
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
return SK_DROP;
tcb->bpf.key = key;
@@ -1890,6 +1891,202 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
+/* Record a redirect target (map + key) and the redirect flags on the message.
+ * Returns SK_DROP when any flag other than BPF_F_INGRESS is set, SK_PASS
+ * otherwise.
+ */
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	const u64 allowed = BPF_F_INGRESS;
+
+	/* Unknown flag bits are invalid user input: drop the packet. */
+	if (unlikely((flags & allowed) != flags))
+		return SK_DROP;
+
+	/* Only stash the target here; do_msg_redirect_map() consumes it. */
+	msg->map = map;
+	msg->key = key;
+	msg->flags = flags;
+
+	return SK_PASS;
+}
+
+/* Resolve the redirect target previously stored on @msg.
+ * Returns the result of the map lookup, or NULL when no map was recorded.
+ * The stored map/key pair is cleared once it has been looked up.
+ */
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+	struct sock *target;
+
+	if (!msg->map)
+		return NULL;
+
+	target = __sock_map_lookup_elem(msg->map, msg->key);
+
+	/* One-shot: forget the target after it has been consumed. */
+	msg->key = 0;
+	msg->map = NULL;
+
+	return target;
+}
+
+/* Helper descriptor for bpf_msg_redirect_map(): takes the program context,
+ * a constant map pointer, and two unconstrained scalars (key, flags), and
+ * returns an integer. Usable from non-GPL programs.
+ */
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* Store the byte count supplied by the BPF program in msg->apply_bytes.
+ * No validation is performed on @bytes here; always returns 0.
+ */
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->apply_bytes = bytes;
+ return 0;
+}
+
+/* Helper descriptor for bpf_msg_apply_bytes(): context pointer plus one
+ * unconstrained scalar (the byte count); returns an integer.
+ */
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+ .func = bpf_msg_apply_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+/* Store the byte count supplied by the BPF program in msg->cork_bytes.
+ * No validation is performed on @bytes here; always returns 0.
+ */
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->cork_bytes = bytes;
+ return 0;
+}
+
+/* Helper descriptor for bpf_msg_cork_bytes(): context pointer plus one
+ * unconstrained scalar (the byte count); returns an integer.
+ */
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+ .func = bpf_msg_cork_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+/* bpf_msg_pull_data() - make message bytes [start, end) directly accessible.
+ *
+ * On success msg->data points at byte @start of the message and
+ * msg->data_end at byte @end. If the whole range already lives inside one
+ * scatterlist element that is not flagged in msg->sg_copy[], that element is
+ * used in place. Otherwise every element touched by the range is copied into
+ * a freshly allocated page, which replaces those elements in the ring; the
+ * entries that follow are shifted down to close the gap.
+ *
+ * Returns 0 on success, -EINVAL when @flags is non-zero, the range is empty
+ * or inverted, or the range does not fit in the message, and -ENOMEM when
+ * the linear buffer cannot be allocated.
+ */
+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
+	int bytes = end - start, bytes_sg_total;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	struct page *page;
+
+	if (unlikely(flags || end <= start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element. While walking,
+	 * 'offset' is the number of message bytes that precede sg[i] and
+	 * 'len' is the length of sg[i]; each length is accounted exactly
+	 * once, after the element has been ruled out.
+	 */
+	i = msg->sg_start;
+	do {
+		offset += len;
+		len = sg[i].length;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	/* Ring exhausted: offset + len is now the total message length. */
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	first_sg = i;
+	/* start may point into the middle of the element, so the headroom
+	 * in front of it has to be covered as well.
+	 */
+	bytes_sg_total = start - offset + bytes;
+	if (!msg->sg_copy[i] && bytes_sg_total <= len)
+		goto out;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes_sg_total <= copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(bytes_sg_total > copy))
+		return -EINVAL;
+
+	/* The buffer may span several pages and is released via put_page()
+	 * later on, so it has to be a compound page.
+	 */
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+			   get_order(copy));
+	if (unlikely(!page))
+		return -ENOMEM;
+	p = page_address(page);
+
+	/* 'poffset' is the write cursor inside the new page; 'offset' must
+	 * keep its meaning (bytes in front of first_sg) for the final
+	 * msg->data computation.
+	 */
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + poffset;
+
+		memcpy(to, from, len);
+		poffset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 * The merged span may wrap around the end of the ring.
+	 */
+	shift = last_sg > first_sg ?
+		last_sg - first_sg - 1 :
+		MAX_SKB_FRAGS - first_sg + last_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	if (i == MAX_SKB_FRAGS)
+		i = 0;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	/* Always address the element that holds 'start', not whatever the
+	 * loop index happens to be after the ring repair.
+	 */
+	msg->data = sg_virt(&sg[first_sg]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+/* Helper descriptor for bpf_msg_pull_data(): context pointer followed by
+ * three unconstrained scalars (start, end, flags); returns an integer.
+ */
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+ .func = bpf_msg_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -2087,6 +2284,10 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
+ /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+ if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+ return -ENOTSUPP;
+
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -2096,19 +2297,21 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
return ret;
if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
/* SKB_GSO_TCPV4 needs to be changed into
* SKB_GSO_TCPV6.
*/
- if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
- skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ if (shinfo->gso_type & SKB_GSO_TCPV4) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV4;
+ shinfo->gso_type |= SKB_GSO_TCPV6;
}
/* Due to IPv6 header, MSS needs to be downgraded. */
- skb_shinfo(skb)->gso_size -= len_diff;
+ skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IPV6);
@@ -2123,6 +2326,10 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
+ /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+ if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+ return -ENOTSUPP;
+
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -2132,19 +2339,21 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
return ret;
if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
/* SKB_GSO_TCPV6 needs to be changed into
* SKB_GSO_TCPV4.
*/
- if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
- skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ if (shinfo->gso_type & SKB_GSO_TCPV6) {
+ shinfo->gso_type &= ~SKB_GSO_TCPV6;
+ shinfo->gso_type |= SKB_GSO_TCPV4;
}
/* Due to IPv4 header, MSS can be upgraded. */
- skb_shinfo(skb)->gso_size += len_diff;
+ skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
}
skb->protocol = htons(ETH_P_IP);
@@ -2243,6 +2452,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret;
+ /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+ if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+ return -ENOTSUPP;
+
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
@@ -2252,11 +2465,13 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
return ret;
if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
/* Due to header grow, MSS needs to be downgraded. */
- skb_shinfo(skb)->gso_size -= len_diff;
+ skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
}
return 0;
@@ -2267,6 +2482,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret;
+ /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
+ if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
+ return -ENOTSUPP;
+
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -2276,11 +2495,13 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
return ret;
if (skb_is_gso(skb)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
/* Due to header shrink, MSS can be upgraded. */
- skb_shinfo(skb)->gso_size += len_diff;
+ skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
+ shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_segs = 0;
}
return 0;
@@ -2831,7 +3052,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta)
+ func == bpf_xdp_adjust_meta ||
+ func == bpf_msg_pull_data)
return true;
return false;
@@ -2991,7 +3213,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
struct ip_tunnel_info *info;
if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
- BPF_F_DONT_FRAGMENT)))
+ BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
@@ -3025,6 +3247,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
if (flags & BPF_F_ZERO_CSUM_TX)
info->key.tun_flags &= ~TUNNEL_CSUM;
+ if (flags & BPF_F_SEQ_NUMBER)
+ info->key.tun_flags |= TUNNEL_SEQ;
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -3239,6 +3463,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
ret = -EINVAL;
}
#ifdef CONFIG_INET
+ } else if (level == SOL_IP) {
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ val = *((int *)optval);
+ /* Only some options are supported */
+ switch (optname) {
+ case IP_TOS:
+ if (val < -1 || val > 0xff) {
+ ret = -EINVAL;
+ } else {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (val == -1)
+ val = 0;
+ inet->tos = val;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ }
#if IS_ENABLED(CONFIG_IPV6)
} else if (level == SOL_IPV6) {
if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
@@ -3338,6 +3583,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
} else {
goto err_clear;
}
+ } else if (level == SOL_IP) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ goto err_clear;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case IP_TOS:
+ *((int *)optval) = (int)inet->tos;
+ break;
+ default:
+ goto err_clear;
+ }
#if IS_ENABLED(CONFIG_IPV6)
} else if (level == SOL_IPV6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -3381,17 +3640,13 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
- if (!sk_fullsock(sk))
+ if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
-#ifdef CONFIG_INET
if (val)
tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
-#else
- return -EINVAL;
-#endif
}
static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
@@ -3402,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
.arg2_type = ARG_ANYTHING,
};
+/* Indirection table through which this file reaches IPv6 code; bpf_bind()
+ * calls ->inet6_bind through it. It is defined here without an initializer
+ * and exported (GPL) — presumably so the IPv6 code can install its
+ * implementation; confirm against the ipv6 side.
+ */
+const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
+
+/* bpf_bind() - bind the socket of a sock_addr context to an IP address.
+ *
+ * @addr/@addr_len describe a sockaddr_in or sockaddr_in6 supplied by the BPF
+ * program. The port must be zero: only the IP part may be bound.
+ *
+ * Returns the result of __inet_bind() / ->inet6_bind() on success paths,
+ * -EINVAL when the address is too short for its family or carries a
+ * non-zero port, and -EAFNOSUPPORT for any other family (or when
+ * CONFIG_INET is disabled).
+ */
+BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
+	   int, addr_len)
+{
+#ifdef CONFIG_INET
+	struct sock *sk = ctx->sk;
+	int err;
+
+	/* Binding to port can be expensive so it's prohibited in the helper.
+	 * Only binding to IP is supported.
+	 */
+	err = -EINVAL;
+	/* The program-supplied buffer is only known to be addr_len bytes
+	 * long; make sure it covers sa_family before dispatching on it.
+	 */
+	if (addr_len < offsetofend(struct sockaddr, sa_family))
+		return err;
+	if (addr->sa_family == AF_INET) {
+		if (addr_len < sizeof(struct sockaddr_in))
+			return err;
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			return err;
+		return __inet_bind(sk, addr, addr_len, true, false);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (addr->sa_family == AF_INET6) {
+		if (addr_len < SIN6_LEN_RFC2133)
+			return err;
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			return err;
+		/* ipv6_bpf_stub cannot be NULL, since it's called from
+		 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
+		 */
+		return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+#endif /* CONFIG_IPV6 */
+	}
+#endif /* CONFIG_INET */
+
+	return -EAFNOSUPPORT;
+}
+
+/* Helper descriptor for bpf_bind(): context pointer, a pointer to memory
+ * holding the address, and the constant size of that memory; returns an
+ * integer.
+ */
+static const struct bpf_func_proto bpf_bind_proto = {
+ .func = bpf_bind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -3431,7 +3732,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-sock_filter_func_proto(enum bpf_func_id func_id)
+sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
/* inet and inet6 sockets are created in a process
@@ -3445,7 +3746,29 @@ sock_filter_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ /* inet and inet6 sockets are created in a process
+ * context so there is always a valid uid/gid
+ */
+ case BPF_FUNC_get_current_uid_gid:
+ return &bpf_get_current_uid_gid_proto;
+ case BPF_FUNC_bind:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ return &bpf_bind_proto;
+ default:
+ return NULL;
+ }
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
+static const struct bpf_func_proto *
+sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
@@ -3460,7 +3783,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-tc_cls_act_func_proto(enum bpf_func_id func_id)
+tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_skb_store_bytes:
@@ -3527,7 +3850,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-xdp_func_proto(enum bpf_func_id func_id)
+xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_perf_event_output:
@@ -3550,7 +3873,7 @@ xdp_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id)
+lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_skb_load_bytes:
@@ -3577,7 +3900,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
- sock_ops_func_proto(enum bpf_func_id func_id)
+sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_setsockopt:
@@ -3593,7 +3916,25 @@ static const struct bpf_func_proto *
}
}
-static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
+/* Map a helper id to its descriptor for sk_msg programs. The four msg_*
+ * helpers are handled here; every other id is delegated to
+ * bpf_base_func_proto(). @prog is not consulted.
+ */
+static const struct bpf_func_proto *
+sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	const struct bpf_func_proto *proto;
+
+	switch (func_id) {
+	case BPF_FUNC_msg_redirect_map:
+		proto = &bpf_msg_redirect_map_proto;
+		break;
+	case BPF_FUNC_msg_apply_bytes:
+		proto = &bpf_msg_apply_bytes_proto;
+		break;
+	case BPF_FUNC_msg_cork_bytes:
+		proto = &bpf_msg_cork_bytes_proto;
+		break;
+	case BPF_FUNC_msg_pull_data:
+		proto = &bpf_msg_pull_data_proto;
+		break;
+	default:
+		proto = bpf_base_func_proto(func_id);
+		break;
+	}
+
+	return proto;
+}
+
+static const struct bpf_func_proto *
+sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_skb_store_bytes:
@@ -3618,7 +3959,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
}
static const struct bpf_func_proto *
-lwt_xmit_func_proto(enum bpf_func_id func_id)
+lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_skb_get_tunnel_key:
@@ -3648,11 +3989,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
default:
- return lwt_inout_func_proto(func_id);
+ return lwt_inout_func_proto(func_id, prog);
}
}
static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
@@ -3696,6 +4038,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
static bool sk_filter_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
switch (off) {
@@ -3716,11 +4059,12 @@ static bool sk_filter_is_valid_access(int off, int size,
}
}
- return bpf_skb_is_valid_access(off, size, type, info);
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
}
static bool lwt_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
switch (off) {
@@ -3750,32 +4094,83 @@ static bool lwt_is_valid_access(int off, int size,
break;
}
- return bpf_skb_is_valid_access(off, size, type, info);
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
}
-static bool sock_filter_is_valid_access(int off, int size,
- enum bpf_access_type type,
- struct bpf_insn_access_aux *info)
+
+/* Attach type specific accesses */
+static bool __sock_filter_check_attach_type(int off,
+ enum bpf_access_type access_type,
+ enum bpf_attach_type attach_type)
{
- if (type == BPF_WRITE) {
- switch (off) {
- case offsetof(struct bpf_sock, bound_dev_if):
- case offsetof(struct bpf_sock, mark):
- case offsetof(struct bpf_sock, priority):
- break;
+ switch (off) {
+ case offsetof(struct bpf_sock, bound_dev_if):
+ case offsetof(struct bpf_sock, mark):
+ case offsetof(struct bpf_sock, priority):
+ switch (attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ goto full_access;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_ip4):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ switch (attach_type) {
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
+ default:
+ return false;
+ }
+ case bpf_ctx_range(struct bpf_sock, src_port):
+ switch (attach_type) {
+ case BPF_CGROUP_INET4_POST_BIND:
+ case BPF_CGROUP_INET6_POST_BIND:
+ goto read_only;
default:
return false;
}
}
+read_only:
+ return access_type == BPF_READ;
+full_access:
+ return true;
+}
- if (off < 0 || off + size > sizeof(struct bpf_sock))
+/* Size check for an access to struct bpf_sock at @off.
+ * src_ip4 and src_ip6[0..3] record a 4-byte field size in @info and accept
+ * whatever bpf_ctx_narrow_access_ok() accepts; every other offset must be
+ * accessed with exactly 4 bytes.
+ */
+static bool __sock_filter_check_size(int off, int size,
+				     struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+	bool ok;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sock, src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+		bpf_ctx_record_field_size(info, size_default);
+		ok = bpf_ctx_narrow_access_ok(off, size, size_default);
+		break;
+	default:
+		ok = (size == size_default);
+		break;
+	}
+
+	return ok;
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
- /* The verifier guarantees that size > 0. */
if (off % size != 0)
return false;
- if (size != sizeof(__u32))
+ if (!__sock_filter_check_attach_type(off, type,
+ prog->expected_attach_type))
+ return false;
+ if (!__sock_filter_check_size(off, size, info))
return false;
-
return true;
}
@@ -3826,6 +4221,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE) {
@@ -3855,7 +4251,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return false;
}
- return bpf_skb_is_valid_access(off, size, type, info);
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
}
static bool __is_valid_xdp_access(int off, int size)
@@ -3872,6 +4268,7 @@ static bool __is_valid_xdp_access(int off, int size)
static bool xdp_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (type == BPF_WRITE)
@@ -3902,8 +4299,74 @@ void bpf_warn_invalid_xdp_action(u32 act)
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+/* Decide whether a sock_addr program may access struct bpf_sock_addr at
+ * [off, off + size) with the given access @type. The decision depends on
+ * prog->expected_attach_type for the address fields. Returns true when the
+ * access is allowed.
+ */
+static bool sock_addr_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sock_addr))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ /* Disallow access to IPv6 fields from IPv4 context and vice
+ * versa: user_ip4 is only reachable from the INET4 bind/connect
+ * attach types, user_ip6[] only from the INET6 ones.
+ */
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ break;
+ default:
+ return false;
+ }
+ break;
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET6_CONNECT:
+ break;
+ default:
+ return false;
+ }
+ break;
+ }
+
+ /* Second pass: per-field size and read/write rules. */
+ switch (off) {
+ case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ /* Only narrow read access allowed for now; writes must use
+ * the full 4-byte size.
+ */
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+ return false;
+ } else {
+ if (size != size_default)
+ return false;
+ }
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
+ /* Readable and writable, but only as a full 4-byte access. */
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ /* Every remaining field is read-only, full 4-byte access. */
+ if (type == BPF_READ) {
+ if (size != size_default)
+ return false;
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
static bool sock_ops_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
@@ -3950,6 +4413,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
static bool sk_skb_is_valid_access(int off, int size,
enum bpf_access_type type,
+ const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
switch (off) {
@@ -3979,7 +4443,34 @@ static bool sk_skb_is_valid_access(int off, int size,
break;
}
- return bpf_skb_is_valid_access(off, size, type, info);
+ return bpf_skb_is_valid_access(off, size, type, prog, info);
+}
+
+/* Decide whether an sk_msg program may access struct sk_msg_md at
+ * [off, off + size). Writes are never allowed; reads must be aligned,
+ * in-bounds 8-byte accesses. data / data_end are tagged as packet
+ * pointers in @info.
+ */
+static bool sk_msg_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
+				   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	/* Register type is recorded before the range checks below. */
+	if (off == offsetof(struct sk_msg_md, data))
+		info->reg_type = PTR_TO_PACKET;
+	else if (off == offsetof(struct sk_msg_md, data_end))
+		info->reg_type = PTR_TO_PACKET_END;
+
+	return off >= 0 && off < sizeof(struct sk_msg_md) &&
+	       off % size == 0 &&
+	       size == sizeof(__u64);
}
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
@@ -4287,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+ int off;
switch (si->off) {
case offsetof(struct bpf_sock, bound_dev_if):
@@ -4342,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
break;
+
+ case offsetof(struct bpf_sock, src_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_rcv_saddr,
+ FIELD_SIZEOF(struct sock_common,
+ skc_rcv_saddr),
+ target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ off = si->off;
+ off -= offsetof(struct bpf_sock, src_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(
+ struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0],
+ FIELD_SIZEOF(struct sock_common,
+ skc_v6_rcv_saddr.s6_addr32[0]),
+ target_size) + off);
+#else
+ (void)off;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+
+ case offsetof(struct bpf_sock, src_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_num),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_num,
+ FIELD_SIZEOF(struct sock_common,
+ skc_num),
+ target_size));
+ break;
}
return insn - insn_buf;
@@ -4417,6 +4946,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
+ * context Structure, F is Field in context structure that contains a pointer
+ * to Nested Structure of type NS that has the field NF.
+ *
+ * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make
+ * sure that SIZE is not greater than actual size of S.F.NF.
+ *
+ * If offset OFF is provided, the load happens from that offset relative to
+ * offset of NF.
+ *
+ * Two instructions are appended through the caller's 'insn' cursor: the
+ * first loads the pointer S.F from si->src_reg into si->dst_reg, the second
+ * loads NF (+ OFF) through that pointer, again into si->dst_reg. The macros
+ * therefore expect 'insn', 'si' and 'target_size' to be in scope at the
+ * expansion site.
+ */
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \
+ do { \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
+ si->src_reg, offsetof(S, F)); \
+ *insn++ = BPF_LDX_MEM( \
+ SIZE, si->dst_reg, si->dst_reg, \
+ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ target_size) \
+ + OFF); \
+ } while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
+ BPF_FIELD_SIZEOF(NS, NF), 0)
+
+/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
+ * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
+ *
+ * It doesn't support SIZE argument though since narrow stores are not
+ * supported for now.
+ *
+ * In addition it uses Temporary Field TF (member of struct S) as the 3rd
+ * "register" since two registers available in convert_ctx_access are not
+ * enough: we can override neither SRC, since it contains value to store, nor
+ * DST since it contains pointer to context that may be used by later
+ * instructions. But we need a temporary place to save pointer to nested
+ * structure whose field we want to store to.
+ *
+ * The scratch register starts at BPF_REG_9 and is stepped down (at most
+ * twice) until it aliases neither si->src_reg nor si->dst_reg. Four
+ * instructions are emitted: spill the scratch register to S.TF, load the
+ * pointer S.F into it, store si->src_reg to NF (+ OFF) through it, and
+ * finally restore the scratch register from S.TF.
+ *
+ * TF is only used as a member designator inside offsetof(S, TF), so passing
+ * a member that happens to share its name with the local 'tmp_reg' variable
+ * below does not clash with it.
+ */
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \
+ do { \
+ int tmp_reg = BPF_REG_9; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
+ --tmp_reg; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \
+ offsetof(S, TF)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
+ si->dst_reg, offsetof(S, F)); \
+ *insn++ = BPF_STX_MEM( \
+ BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \
+ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ target_size) \
+ + OFF); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
+ offsetof(S, TF)); \
+ } while (0)
+
+/* Dispatch on the access 'type' in scope at the expansion site: BPF_WRITE
+ * expands to SOCK_ADDR_STORE_NESTED_FIELD_OFF() (SIZE is ignored on that
+ * path), anything else to SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() (TF is
+ * ignored on that path).
+ */
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
+ TF) \
+ do { \
+ if (type == BPF_WRITE) { \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \
+ TF); \
+ } else { \
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
+ S, NS, F, NF, SIZE, OFF); \
+ } \
+ } while (0)
+
+/* Convenience form: full-width access to NF at offset 0. */
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
+ S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
+
+/* Rewrite one access to struct bpf_sock_addr (identified by si->off) into
+ * instructions operating on struct bpf_sock_addr_kern and the objects its
+ * 'uaddr' and 'sk' members point to. Instructions are written to @insn_buf;
+ * the return value is the number of instructions emitted (0 for offsets not
+ * handled here).
+ */
+static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+ int off;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sock_addr, user_family):
+ /* Load-only: uaddr->sa_family. */
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sockaddr, uaddr, sa_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_ip4):
+ /* Load or store uaddr as sockaddr_in; the load size follows the
+ * original instruction.
+ */
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
+ sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+ /* 'off' is the byte offset of the access within user_ip6[];
+ * it is applied relative to sin6_addr.s6_addr32[0].
+ */
+ off = si->off;
+ off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
+ tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, user_port):
+ /* To get port we need to know sa_family first and then treat
+ * sockaddr as either sockaddr_in or sockaddr_in6.
+ * Though we can simplify since port field has same offset and
+ * size in both structures.
+ * Here we check this invariant and use just one of the
+ * structures if it's true.
+ */
+ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
+ offsetof(struct sockaddr_in6, sin6_port));
+ BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
+ FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sockaddr_in6, uaddr,
+ sin6_port, tmp_reg);
+ break;
+
+ case offsetof(struct bpf_sock_addr, family):
+ /* Load-only: sk->sk_family. */
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_family);
+ break;
+
+ case offsetof(struct bpf_sock_addr, type):
+ /* Load the 32-bit word at sk->__sk_flags_offset, then isolate
+ * the type bits with mask and shift.
+ */
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sock, sk,
+ __sk_flags_offset, BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ break;
+
+ case offsetof(struct bpf_sock_addr, protocol):
+ /* Same word as 'type', but isolating the protocol bits. */
+ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sock, sk,
+ __sk_flags_offset, BPF_W, 0);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+ SK_FL_PROTO_SHIFT);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -4780,6 +5455,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+/* Rewrite a load from struct sk_msg_md (the layout named by si->off) into a
+ * load from the matching field of struct sk_msg_buff.
+ *
+ * One BPF_LDX_MEM instruction is written to insn_buf for the data and
+ * data_end offsets; any other offset emits nothing. The return value is the
+ * number of instructions written. type, prog and target_size are not used
+ * by this function.
+ */
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_msg_md, data):
+ /* Load width follows the size of sk_msg_buff::data. */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data));
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ /* Load width follows the size of sk_msg_buff::data_end. */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data_end));
+ break;
+ }
+
+ /* Number of instructions emitted (0 when no case matched). */
+ return insn - insn_buf;
+}
+
const struct bpf_verifier_ops sk_filter_verifier_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -4851,6 +5549,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = {
const struct bpf_prog_ops cg_sock_prog_ops = {
};
+/* Verifier ops table wiring in the sock_addr_* callbacks. */
+const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
+ .get_func_proto = sock_addr_func_proto,
+ .is_valid_access = sock_addr_is_valid_access,
+ .convert_ctx_access = sock_addr_convert_ctx_access,
+};
+
+/* Intentionally empty: no bpf_prog_ops callbacks are set here. */
+const struct bpf_prog_ops cg_sock_addr_prog_ops = {
+};
+
const struct bpf_verifier_ops sock_ops_verifier_ops = {
.get_func_proto = sock_ops_func_proto,
.is_valid_access = sock_ops_is_valid_access,
@@ -4870,6 +5577,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = {
const struct bpf_prog_ops sk_skb_prog_ops = {
};
+/* Verifier ops table wiring in the sk_msg_* callbacks. */
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+ .get_func_proto = sk_msg_func_proto,
+ .is_valid_access = sk_msg_is_valid_access,
+ .convert_ctx_access = sk_msg_convert_ctx_access,
+};
+
+/* Intentionally empty: no bpf_prog_ops callbacks are set here. */
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 559db9ea8d86..d29f09bc5ff9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
}
EXPORT_SYMBOL(__get_hash_from_flowi6);
-__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
-{
- memset(keys, 0, sizeof(*keys));
-
- keys->addrs.v4addrs.src = fl4->saddr;
- keys->addrs.v4addrs.dst = fl4->daddr;
- keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
- keys->ports.src = fl4->fl4_sport;
- keys->ports.dst = fl4->fl4_dport;
- keys->keyid.keyid = fl4->fl4_gre_key;
- keys->basic.ip_proto = fl4->flowi4_proto;
-
- return flow_hash_from_keys(keys);
-}
-EXPORT_SYMBOL(__get_hash_from_flowi4);
-
static const struct flow_dissector_key flow_keys_dissector_keys[] = {
{
.key_id = FLOW_DISSECTOR_KEY_CONTROL,
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 0a3f88f08727..98fd12721221 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -66,6 +66,7 @@ struct net_rate_estimator {
static void est_fetch_counters(struct net_rate_estimator *e,
struct gnet_stats_basic_packed *b)
{
+ memset(b, 0, sizeof(*b));
if (e->stats_lock)
spin_lock(e->stats_lock);
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index e010bb800d7b..9737302907b1 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -315,12 +315,12 @@ static int __net_init dev_proc_net_init(struct net *net)
{
int rc = -ENOMEM;
- if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+ if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops))
goto out;
- if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+ if (!proc_create("softnet_stat", 0444, net->proc_net,
&softnet_seq_fops))
goto out_dev;
- if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+ if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops))
goto out_softnet;
if (wext_proc_init(net))
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 60a5ad2c33ee..c476f0794132 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -431,7 +431,7 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
return netdev_store(dev, attr, buf, len, change_group);
}
NETDEVICE_SHOW(group, fmt_dec);
-static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+static DEVICE_ATTR(netdev_group, 0644, group_show, group_store);
static int change_proto_down(struct net_device *dev, unsigned long proto_down)
{
@@ -854,10 +854,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
}
static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
- = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+ = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
- = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ = __ATTR(rps_flow_cnt, 0644,
show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
#endif /* CONFIG_RPS */
@@ -1154,7 +1154,7 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue,
}
static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
- = __ATTR(hold_time, S_IRUGO | S_IWUSR,
+ = __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
static ssize_t bql_show_inflight(struct netdev_queue *queue,
@@ -1166,7 +1166,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue,
}
static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
- __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
+ __ATTR(inflight, 0444, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
@@ -1182,7 +1182,7 @@ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
} \
\
static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
- = __ATTR(NAME, S_IRUGO | S_IWUSR, \
+ = __ATTR(NAME, 0644, \
bql_show_ ## NAME, bql_set_ ## NAME)
BQL_ATTR(limit, limit);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3cad5f51afd3..a11e03f920d3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -29,11 +29,14 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
+/* Protects net_namespace_list. Nests inside rtnl_lock() */
+DECLARE_RWSEM(net_rwsem);
+EXPORT_SYMBOL_GPL(net_rwsem);
+
struct net init_net = {
.count = REFCOUNT_INIT(1),
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
@@ -41,6 +44,14 @@ struct net init_net = {
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+/*
+ * pernet_ops_rwsem protects pernet_list, net_generic_ids,
+ * init_net_initialized and the first_device pointer.
+ * This is an internal net namespace object. Please don't use it
+ * outside of the net namespace code.
+ */
+DECLARE_RWSEM(pernet_ops_rwsem);
+EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -65,11 +76,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
- BUG_ON(!mutex_is_locked(&net_mutex));
BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
- lockdep_is_held(&net_mutex));
+ lockdep_is_held(&pernet_ops_rwsem));
if (old_ng->s.len > id) {
old_ng->ptr[id] = data;
return 0;
@@ -286,7 +296,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
*/
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
- /* Must be called with net_mutex held */
+ /* Must be called with pernet_ops_rwsem held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
@@ -297,12 +307,16 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->user_ns = user_ns;
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
if (error < 0)
goto out_undo;
}
+ down_write(&net_rwsem);
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ up_write(&net_rwsem);
out:
return error;
@@ -354,7 +368,7 @@ static void dec_net_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
}
-static struct kmem_cache *net_cachep;
+static struct kmem_cache *net_cachep __ro_after_init;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
@@ -408,32 +422,27 @@ struct net *copy_net_ns(unsigned long flags,
net = net_alloc();
if (!net) {
- dec_net_namespaces(ucounts);
- return ERR_PTR(-ENOMEM);
+ rv = -ENOMEM;
+ goto dec_ucounts;
}
-
+ refcount_set(&net->passive, 1);
+ net->ucounts = ucounts;
get_user_ns(user_ns);
- rv = mutex_lock_killable(&net_mutex);
- if (rv < 0) {
- net_free(net);
- dec_net_namespaces(ucounts);
- put_user_ns(user_ns);
- return ERR_PTR(rv);
- }
+ rv = down_read_killable(&pernet_ops_rwsem);
+ if (rv < 0)
+ goto put_userns;
- net->ucounts = ucounts;
rv = setup_net(net, user_ns);
- if (rv == 0) {
- rtnl_lock();
- list_add_tail_rcu(&net->list, &net_namespace_list);
- rtnl_unlock();
- }
- mutex_unlock(&net_mutex);
+
+ up_read(&pernet_ops_rwsem);
+
if (rv < 0) {
- dec_net_namespaces(ucounts);
+put_userns:
put_user_ns(user_ns);
net_drop_ns(net);
+dec_ucounts:
+ dec_net_namespaces(ucounts);
return ERR_PTR(rv);
}
return net;
@@ -446,7 +455,7 @@ static void unhash_nsid(struct net *net, struct net *last)
* and this work is the only process, that may delete
* a net from net_namespace_list. So, when the below
* is executing, the list may only grow. Thus, we do not
- * use for_each_net_rcu() or rtnl_lock().
+ * use for_each_net_rcu() or net_rwsem.
*/
for_each_net(tmp) {
int id;
@@ -466,26 +475,23 @@ static void unhash_nsid(struct net *net, struct net *last)
spin_unlock_bh(&net->nsid_lock);
}
-static DEFINE_SPINLOCK(cleanup_list_lock);
-static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
+static LLIST_HEAD(cleanup_list);
static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp, *last;
- struct list_head net_kill_list;
+ struct llist_node *net_kill_list;
LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
- spin_lock_irq(&cleanup_list_lock);
- list_replace_init(&cleanup_list, &net_kill_list);
- spin_unlock_irq(&cleanup_list_lock);
+ net_kill_list = llist_del_all(&cleanup_list);
- mutex_lock(&net_mutex);
+ down_read(&pernet_ops_rwsem);
/* Don't let anyone else find us. */
- rtnl_lock();
- list_for_each_entry(net, &net_kill_list, cleanup_list)
+ down_write(&net_rwsem);
+ llist_for_each_entry(net, net_kill_list, cleanup_list)
list_del_rcu(&net->list);
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
@@ -498,9 +504,9 @@ static void cleanup_net(struct work_struct *work)
* useless anyway, as netns_ids are destroyed there.
*/
last = list_last_entry(&net_namespace_list, struct net, list);
- rtnl_unlock();
+ up_write(&net_rwsem);
- list_for_each_entry(net, &net_kill_list, cleanup_list) {
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
unhash_nsid(net, last);
list_add_tail(&net->exit_list, &net_exit_list);
}
@@ -520,7 +526,7 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
- mutex_unlock(&net_mutex);
+ up_read(&pernet_ops_rwsem);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
@@ -547,8 +553,8 @@ static void cleanup_net(struct work_struct *work)
*/
void net_ns_barrier(void)
{
- mutex_lock(&net_mutex);
- mutex_unlock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
+ up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL(net_ns_barrier);
@@ -557,13 +563,8 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
/* Cleanup the network namespace in process context */
- unsigned long flags;
-
- spin_lock_irqsave(&cleanup_list_lock, flags);
- list_add(&net->cleanup_list, &cleanup_list);
- spin_unlock_irqrestore(&cleanup_list_lock, flags);
-
- queue_work(netns_wq, &net_cleanup_work);
+ if (llist_add(&net->cleanup_list, &cleanup_list))
+ queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
@@ -861,7 +862,7 @@ static int __init net_ns_init(void)
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
- SLAB_PANIC, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Create workqueue for cleanup */
netns_wq = create_singlethread_workqueue("netns");
@@ -875,17 +876,12 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
- mutex_lock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
-
- rtnl_lock();
- list_add_tail_rcu(&init_net.list, &net_namespace_list);
- rtnl_unlock();
-
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
register_pernet_subsys(&net_ns_ops);
@@ -909,6 +905,9 @@ static int __register_pernet_operations(struct list_head *list,
list_add_tail(&ops->list, list);
if (ops->init || (ops->id && ops->size)) {
+ /* We held write locked pernet_ops_rwsem, and parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
for_each_net(net) {
error = ops_init(ops, net);
if (error)
@@ -932,6 +931,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
LIST_HEAD(net_exit_list);
list_del(&ops->list);
+ /* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
ops_exit_list(ops, &net_exit_list);
@@ -996,7 +996,6 @@ again:
static void unregister_pernet_operations(struct pernet_operations *ops)
{
-
__unregister_pernet_operations(ops);
rcu_barrier();
if (ops->id)
@@ -1025,9 +1024,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
error = register_pernet_operations(first_device, ops);
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1043,9 +1042,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
*/
void unregister_pernet_subsys(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
@@ -1071,11 +1070,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
int register_pernet_device(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1091,11 +1090,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&pernet_ops_rwsem);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b8ab5c829511..7e4ede34cc52 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -906,13 +906,14 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
if (debug) {
- size_t copy = min_t(size_t, count, 1023);
- char tb[copy + 1];
- if (copy_from_user(tb, user_buffer, copy))
- return -EFAULT;
- tb[copy] = 0;
- pr_debug("%s,%lu buffer -:%s:-\n",
- name, (unsigned long)count, tb);
+ size_t copy = min_t(size_t, count + 1, 1024);
+ char *tp = strndup_user(user_buffer, copy);
+
+ if (IS_ERR(tp))
+ return PTR_ERR(tp);
+
+ pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp);
+ kfree(tp);
}
if (!strcmp(name, "min_pkt_size")) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bc290413a49d..45936922d7e2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -75,6 +75,12 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+/* Take rtnl_mutex with mutex_lock_killable() and pass its return value back
+ * to the caller unchanged.
+ */
+int rtnl_lock_killable(void)
+{
+ return mutex_lock_killable(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock_killable);
+
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
{
@@ -406,7 +412,9 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
* __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
*
- * The caller must hold the rtnl_mutex.
+ * The caller must hold the rtnl_mutex and guarantee net_namespace_list
+ * integrity (hold pernet_ops_rwsem for writing to close the race
+ * with setup_net() and cleanup_net()).
*/
void __rtnl_link_unregister(struct rtnl_link_ops *ops)
{
@@ -432,6 +440,9 @@ static void rtnl_lock_unregistering_all(void)
for (;;) {
unregistering = false;
rtnl_lock();
+ /* We held write locked pernet_ops_rwsem, and parallel
+ * setup_net() and cleanup_net() are not possible.
+ */
for_each_net(net) {
if (net->dev_unreg_count > 0) {
unregistering = true;
@@ -453,12 +464,12 @@ static void rtnl_lock_unregistering_all(void)
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
- /* Close the race with cleanup_net() */
- mutex_lock(&net_mutex);
+ /* Close the race with setup_net() and cleanup_net() */
+ down_write(&pernet_ops_rwsem);
rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
- mutex_unlock(&net_mutex);
+ up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..1bca1e0fc8f7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -77,8 +77,8 @@
#include <linux/capability.h>
#include <linux/user_namespace.h>
-struct kmem_cache *skbuff_head_cache __read_mostly;
-static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __ro_after_init;
+static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
-static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
unsigned long max_pg, num_pg, new_pg, old_pg;
struct user_struct *user;
@@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
return 0;
}
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
-static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
if (mmp->user) {
atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
free_uid(mmp->user);
}
}
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
@@ -3458,6 +3460,19 @@ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);
+/* Describe the linear (head) area of @frag_skb as a page fragment: the head
+ * page backing skb->head, the offset of skb->data inside that page, and
+ * skb_headlen() bytes of length.
+ */
+static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
+{
+	struct page *head_page = virt_to_head_page(frag_skb->head);
+	unsigned char *page_base = (unsigned char *)page_address(head_page);
+	skb_frag_t desc;
+
+	desc.page.p = head_page;
+	desc.page_offset = frag_skb->data - page_base;
+	desc.size = skb_headlen(frag_skb);
+
+	return desc;
+}
+
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -3662,15 +3677,19 @@ normal:
while (pos < offset + len) {
if (i >= nfrags) {
- BUG_ON(skb_headlen(list_skb));
-
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
+ if (!skb_headlen(list_skb)) {
+ BUG_ON(!nfrags);
+ } else {
+ BUG_ON(!list_skb->head_frag);
- BUG_ON(!nfrags);
-
+ /* to make room for head_frag. */
+ i--;
+ frag--;
+ }
if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
skb_zerocopy_clone(nskb, frag_skb,
GFP_ATOMIC))
@@ -3687,7 +3706,7 @@ normal:
goto err;
}
- *nskb_frag = *frag;
+ *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
__skb_frag_ref(nskb_frag);
size = skb_frag_size(nskb_frag);
@@ -4179,7 +4198,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ sk->sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4891,7 +4910,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet);
*
* The MAC/L2 or network (IP, IPv6) headers are not accounted for.
*/
-unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
{
const struct skb_shared_info *shinfo = skb_shinfo(skb);
unsigned int thlen = 0;
@@ -4904,7 +4923,7 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
thlen += inner_tcp_hdrlen(skb);
} else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
thlen = tcp_hdrlen(skb);
- } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) {
+ } else if (unlikely(skb_is_gso_sctp(skb))) {
thlen = sizeof(struct sctphdr);
}
/* UFO sets gso_size to the size of the fragmentation
@@ -4913,7 +4932,40 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
*/
return thlen + shinfo->gso_size;
}
-EXPORT_SYMBOL_GPL(skb_gso_transport_seglen);
+
+/**
+ * skb_gso_network_seglen - per-segment length of a gso packet from L3 upwards
+ *
+ * @skb: GSO skb
+ *
+ * Adds the distance between the network header and the transport header to
+ * skb_gso_transport_seglen(), so each segment is measured including its
+ * Layer3 (IP, IPv6) and L4 (TCP/UDP) headers.
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+	const unsigned int l3_hdr_len = skb_transport_header(skb) -
+					skb_network_header(skb);
+	const unsigned int l4_seglen = skb_gso_transport_seglen(skb);
+
+	return l3_hdr_len + l4_seglen;
+}
+
+/**
+ * skb_gso_mac_seglen - per-segment length of a gso packet from L2 upwards
+ *
+ * @skb: GSO skb
+ *
+ * Adds the distance between the MAC header and the transport header to
+ * skb_gso_transport_seglen(), so each segment is measured including its
+ * MAC/L2, Layer3 (IP, IPv6) and L4 (TCP/UDP) headers.
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+	const unsigned int l2_l3_hdr_len = skb_transport_header(skb) -
+					   skb_mac_header(skb);
+	const unsigned int l4_seglen = skb_gso_transport_seglen(skb);
+
+	return l2_l3_hdr_len + l4_seglen;
+}
/**
* skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
@@ -4955,19 +5007,20 @@ static inline bool skb_gso_size_check(const struct sk_buff *skb,
}
/**
- * skb_gso_validate_mtu - Return in case such skb fits a given MTU
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
*
* @skb: GSO skb
* @mtu: MTU to validate against
*
- * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU
- * once split.
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
*/
-bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu)
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
{
return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
}
-EXPORT_SYMBOL_GPL(skb_gso_validate_mtu);
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
/**
* skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
@@ -4986,13 +5039,18 @@ EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
+ int mac_len;
+
if (skb_cow(skb, skb_headroom(skb)) < 0) {
kfree_skb(skb);
return NULL;
}
- memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
- 2 * ETH_ALEN);
+ mac_len = skb->data - skb_mac_header(skb);
+ if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) {
+ memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
+ mac_len - VLAN_HLEN - ETH_TLEN);
+ }
skb->mac_header += VLAN_HLEN;
return skb;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index c501499a04fe..6444525f610c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1049,16 +1049,18 @@ set_rcvbuf:
break;
case SO_ZEROCOPY:
- if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ ret = -ENOTSUPP;
+ } else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
- else if (sk->sk_protocol != IPPROTO_TCP)
- ret = -ENOTSUPP;
- else if (sk->sk_state != TCP_CLOSE)
- ret = -EBUSY;
- else if (val < 0 || val > 1)
- ret = -EINVAL;
- else
- sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
+ if (!ret) {
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
break;
default:
@@ -1274,7 +1276,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
{
char address[128];
- if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+ lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+ if (lv < 0)
return -ENOTCONN;
if (lv < len)
return -EINVAL;
@@ -1773,7 +1776,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features;
+ sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
sk->sk_route_caps &= ~sk->sk_route_nocaps;
@@ -2234,6 +2237,67 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
+/**
+ * sk_alloc_sg - extend a scatterlist with memory from the socket page frag
+ * @sk: socket whose page frag (sk_page_frag()) supplies the pages and whose
+ *	send memory is scheduled and charged
+ * @len: total number of bytes the scatterlist should describe on return
+ * @sg: base of the scatterlist array; indices wrap at MAX_SKB_FRAGS
+ * @sg_start: index of the first in-use entry; reaching it means the ring
+ *	is full
+ * @sg_curr_index: in/out, index of the next free entry
+ * @sg_curr_size: in/out, number of bytes already described
+ * @first_coalesce: lowest entry index that may be extended in place
+ *
+ * Only the missing @len - *@sg_curr_size bytes are allocated. A chunk that is
+ * contiguous with the previously filled entry (same page, adjacent offset)
+ * is merged into that entry; otherwise a new entry is set up and a page
+ * reference is taken for it.
+ *
+ * *@sg_curr_size and *@sg_curr_index are written back on every exit path,
+ * including the error paths.
+ *
+ * Return: 0 on success, -ENOMEM if the page frag cannot be refilled or send
+ * memory cannot be scheduled, -ENOSPC if the ring wraps onto @sg_start.
+ */
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
+		int first_coalesce)
+{
+	int sg_curr = *sg_curr_index, use = 0, rc = 0;
+	unsigned int size = *sg_curr_size;
+	struct page_frag *pfrag;
+	struct scatterlist *sge;
+
+	/* Only the part not yet covered by the scatterlist is needed. */
+	len -= size;
+	pfrag = sk_page_frag(sk);
+
+	while (len > 0) {
+		unsigned int orig_offset;
+
+		if (!sk_page_frag_refill(sk, pfrag)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		use = min_t(int, len, pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, use)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sk_mem_charge(sk, use);
+		size += use;
+		orig_offset = pfrag->offset;
+		pfrag->offset += use;
+
+		/* The coalescing candidate is the previously filled entry,
+		 * not the base of the array: test and extend sge, never sg.
+		 */
+		sge = sg + sg_curr - 1;
+		if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
+		    sge->offset + sge->length == orig_offset) {
+			sge->length += use;
+		} else {
+			sge = sg + sg_curr;
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			sg_curr++;
+
+			/* The array is used as a ring. */
+			if (sg_curr == MAX_SKB_FRAGS)
+				sg_curr = 0;
+
+			/* Caught up with the first in-use entry: ring full. */
+			if (sg_curr == sg_start) {
+				rc = -ENOSPC;
+				break;
+			}
+		}
+
+		len -= use;
+	}
+out:
+	*sg_curr_size = size;
+	*sg_curr_index = sg_curr;
+	return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -2497,7 +2561,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
EXPORT_SYMBOL(sock_no_accept);
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
- int *len, int peer)
+ int peer)
{
return -EOPNOTSUPP;
}
@@ -3261,6 +3325,27 @@ void proto_unregister(struct proto *prot)
}
EXPORT_SYMBOL(proto_unregister);
+/**
+ * sock_load_diag_module - request the sock_diag module for a family/protocol
+ * @family: address family the diag handler is wanted for
+ * @protocol: protocol number, or 0 to request the per-family module
+ *
+ * With @protocol == 0 the per-family module is requested, but only if
+ * sock_is_registered() reports the family as registered. Otherwise the
+ * per-protocol module is requested; for AF_INET (when CONFIG_INET is set)
+ * this is skipped if no handler is present in inet_protos[] for @protocol.
+ *
+ * Return: -ENOENT when the family or protocol is not registered, otherwise
+ * the return value of request_module().
+ */
+int sock_load_diag_module(int family, int protocol)
+{
+	if (!protocol) {
+		if (!sock_is_registered(family))
+			return -ENOENT;
+
+		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+				      NETLINK_SOCK_DIAG, family);
+	}
+
+#ifdef CONFIG_INET
+	/* @protocol is a plain int: only index inet_protos[] when it is within
+	 * the table (MAX_INET_PROTOS entries). Out-of-range values skip the
+	 * registration pre-check and fall through to request_module().
+	 */
+	if (family == AF_INET &&
+	    protocol > 0 && protocol < MAX_INET_PROTOS &&
+	    !rcu_access_pointer(inet_protos[protocol]))
+		return -ENOENT;
+#endif
+
+	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
+			      NETLINK_SOCK_DIAG, family, protocol);
+}
+EXPORT_SYMBOL(sock_load_diag_module);
+
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(proto_list_mutex)
@@ -3369,7 +3454,7 @@ static const struct file_operations proto_seq_fops = {
static __net_init int proto_init_net(struct net *net)
{
- if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
+ if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
return -ENOMEM;
return 0;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 146b50e30659..c37b5be7c5e4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -220,8 +220,7 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
if (sock_diag_handlers[req->sdiag_family] == NULL)
- request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
- NETLINK_SOCK_DIAG, req->sdiag_family);
+ sock_load_diag_module(req->sdiag_family, 0);
mutex_lock(&sock_diag_table_mutex);
hndl = sock_diag_handlers[req->sdiag_family];
@@ -247,8 +246,7 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
case TCPDIAG_GETSOCK:
case DCCPDIAG_GETSOCK:
if (inet_rcv_compat == NULL)
- request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
- NETLINK_SOCK_DIAG, AF_INET);
+ sock_load_diag_module(AF_INET, 0);
mutex_lock(&sock_diag_table_mutex);
if (inet_rcv_compat != NULL)
@@ -281,14 +279,12 @@ static int sock_diag_bind(struct net *net, int group)
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
if (!sock_diag_handlers[AF_INET])
- request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
- NETLINK_SOCK_DIAG, AF_INET);
+ sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
if (!sock_diag_handlers[AF_INET6])
- request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
- NETLINK_SOCK_DIAG, AF_INET6);
+ sock_load_diag_module(AF_INET6, 0);
break;
}
return 0;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f2d0462611c3..b1a2c5e38530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -15,7 +15,6 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/kmemleak.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -32,6 +31,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +515,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "fb_tunnels_only_for_init_net",
+ .data = &sysctl_fb_tunnels_only_for_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
diff --git a/net/core/utils.c b/net/core/utils.c
index 93066bd0305a..d47863b07a60 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -403,6 +403,29 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
+/* Report whether @addr holds the wildcard ("any") address of its family.
+ *
+ * AF_INET6 addresses are compared against IN6ADDR_ANY_INIT and AF_INET
+ * addresses against INADDR_ANY. Any other family logs a warning and is
+ * reported as not-any.
+ */
+bool inet_addr_is_any(struct sockaddr *addr)
+{
+	switch (addr->sa_family) {
+	case AF_INET6: {
+		const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+		const struct sockaddr_in6 wildcard =
+			{ .sin6_addr = IN6ADDR_ANY_INIT };
+
+		return memcmp(sin6->sin6_addr.s6_addr,
+			      wildcard.sin6_addr.s6_addr, 16) == 0;
+	}
+	case AF_INET: {
+		const struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+
+		return sin->sin_addr.s_addr == htonl(INADDR_ANY);
+	}
+	default:
+		pr_warn("unexpected address family %u\n", addr->sa_family);
+		return false;
+	}
+}
+EXPORT_SYMBOL(inet_addr_is_any);
+
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, bool pseudohdr)
{