diff options
Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/dev.c | 244 | ||||
-rw-r--r-- | net/core/dev_addr_lists.c | 3 | ||||
-rw-r--r-- | net/core/ethtool.c | 2 | ||||
-rw-r--r-- | net/core/filter.c | 139 | ||||
-rw-r--r-- | net/core/flow.c | 4 | ||||
-rw-r--r-- | net/core/neighbour.c | 20 | ||||
-rw-r--r-- | net/core/net-sysfs.c | 37 | ||||
-rw-r--r-- | net/core/netpoll.c | 6 | ||||
-rw-r--r-- | net/core/netprio_cgroup.c | 262 | ||||
-rw-r--r-- | net/core/pktgen.c | 47 | ||||
-rw-r--r-- | net/core/rtnetlink.c | 233 | ||||
-rw-r--r-- | net/core/scm.c | 6 | ||||
-rw-r--r-- | net/core/skbuff.c | 40 | ||||
-rw-r--r-- | net/core/sock.c | 84 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 5 |
15 files changed, 844 insertions, 288 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index 09cb3f6dc40c..d0cbc93fcf32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -176,8 +176,10 @@ #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) static DEFINE_SPINLOCK(ptype_lock); +static DEFINE_SPINLOCK(offload_lock); static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ +static struct list_head offload_base __read_mostly; /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -201,6 +203,8 @@ static struct list_head ptype_all __read_mostly; /* Taps */ DEFINE_RWLOCK(dev_base_lock); EXPORT_SYMBOL(dev_base_lock); +DEFINE_SEQLOCK(devnet_rename_seq); + static inline void dev_base_seq_inc(struct net *net) { while (++net->dev_base_seq == 0); @@ -470,6 +474,82 @@ void dev_remove_pack(struct packet_type *pt) } EXPORT_SYMBOL(dev_remove_pack); + +/** + * dev_add_offload - register offload handlers + * @po: protocol offload declaration + * + * Add protocol offload handlers to the networking stack. The passed + * &proto_offload is linked into kernel lists and may not be freed until + * it has been removed from the kernel lists. + * + * This call does not sleep therefore it can not + * guarantee all CPU's that are in middle of receiving packets + * will see the new offload handlers (until the next received packet). + */ +void dev_add_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + + spin_lock(&offload_lock); + list_add_rcu(&po->list, head); + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(dev_add_offload); + +/** + * __dev_remove_offload - remove offload handler + * @po: packet offload declaration + * + * Remove a protocol offload handler that was previously added to the + * kernel offload handlers by dev_add_offload(). The passed &offload_type + * is removed from the kernel lists and can be freed or reused once this + * function returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_offload(struct packet_offload *po) +{ + struct list_head *head = &offload_base; + struct packet_offload *po1; + + spin_lock(&offload_lock); + + list_for_each_entry(po1, head, list) { + if (po == po1) { + list_del_rcu(&po->list); + goto out; + } + } + + pr_warn("dev_remove_offload: %p not found\n", po); +out: + spin_unlock(&offload_lock); +} +EXPORT_SYMBOL(__dev_remove_offload); + +/** + * dev_remove_offload - remove packet offload handler + * @po: packet offload declaration + * + * Remove a packet offload handler that was previously added to the kernel + * offload handlers by dev_add_offload(). The passed &offload_type is + * removed from the kernel lists and can be freed or reused once this + * function returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. + */ +void dev_remove_offload(struct packet_offload *po) +{ + __dev_remove_offload(po); + + synchronize_net(); +} +EXPORT_SYMBOL(dev_remove_offload); + /****************************************************************************** Device Boot-time Settings Routines @@ -1013,22 +1093,31 @@ int dev_change_name(struct net_device *dev, const char *newname) if (dev->flags & IFF_UP) return -EBUSY; - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) + write_seqlock(&devnet_rename_seq); + + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { + write_sequnlock(&devnet_rename_seq); return 0; + } memcpy(oldname, dev->name, IFNAMSIZ); err = dev_get_valid_name(net, dev, newname); - if (err < 0) + if (err < 0) { + write_sequnlock(&devnet_rename_seq); return err; + } rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock(&devnet_rename_seq); return ret; } + write_sequnlock(&devnet_rename_seq); + write_lock_bh(&dev_base_lock); hlist_del_rcu(&dev->name_hlist); write_unlock_bh(&dev_base_lock); @@ -1046,6 +1135,7 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; + write_seqlock(&devnet_rename_seq); memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; } else { @@ -1075,10 +1165,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return -EINVAL; if (!len) { - if (dev->ifalias) { - kfree(dev->ifalias); - dev->ifalias = NULL; - } + kfree(dev->ifalias); + dev->ifalias = NULL; return 0; } @@ -1666,7 +1754,7 @@ static inline int deliver_skb(struct sk_buff *skb, static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { - if (ptype->af_packet_priv == NULL) + if (!ptype->af_packet_priv || !skb->sk) return false; if (ptype->id_match) @@ -1994,7 +2082,7 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_type *ptype; + struct packet_offload *ptype; __be16 type = skb->protocol; int vlan_depth = ETH_HLEN; int err; @@ -2023,18 +2111,17 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, } rcu_read_lock(); - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && !ptype->dev && ptype->gso_segment) { + list_for_each_entry_rcu(ptype, &offload_base, list) { + if (ptype->type == type && ptype->callbacks.gso_segment) { if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { - err = ptype->gso_send_check(skb); + err = ptype->callbacks.gso_send_check(skb); segs = ERR_PTR(err); if (err || skb_gso_ok(skb, features)) break; __skb_push(skb, (skb->data - skb_network_header(skb))); } - segs = ptype->gso_segment(skb, features); + segs = ptype->callbacks.gso_segment(skb, features); break; } } @@ -2237,6 +2324,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->vlan_tci = 0; } + /* If encapsulation offload request, verify we are testing + * hardware encapsulation features instead of standard + * features for the netdev + */ + if (skb->encapsulation) + features &= dev->hw_enc_features; + if (netif_needs_gso(skb, features)) { if (unlikely(dev_gso_segment(skb, features))) goto out_kfree_skb; @@ -2252,8 +2346,12 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * checksumming here. */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); + if (skb->encapsulation) + skb_set_inner_transport_header(skb, + skb_checksum_start_offset(skb)); + else + skb_set_transport_header(skb, + skb_checksum_start_offset(skb)); if (!(features & NETIF_F_ALL_CSUM) && skb_checksum_help(skb)) goto out_kfree_skb; @@ -2818,8 +2916,10 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (unlikely(tcpu != next_cpu) && (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || ((int)(per_cpu(softnet_data, tcpu).input_queue_head - - rflow->last_qtail)) >= 0)) + rflow->last_qtail)) >= 0)) { + tcpu = next_cpu; rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + } if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { *rflowp = rflow; @@ -3444,11 +3544,13 @@ static void flush_backlog(void *arg) static int napi_gro_complete(struct sk_buff *skb) { - struct packet_type *ptype; + struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + struct list_head *head = &offload_base; int err = -ENOENT; + BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); + if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; @@ -3456,10 +3558,10 @@ static int napi_gro_complete(struct sk_buff *skb) rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || ptype->dev || !ptype->gro_complete) + if (ptype->type != type || !ptype->callbacks.gro_complete) continue; - err = ptype->gro_complete(skb); + err = ptype->callbacks.gro_complete(skb); break; } rcu_read_unlock(); @@ -3503,12 +3605,34 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; + + for (p = napi->gro_list; p; p = p->next) { + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); + NAPI_GRO_CB(p)->same_flow = !diffs; + NAPI_GRO_CB(p)->flush = 0; + } +} + +static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; - struct packet_type *ptype; + struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + struct list_head *head = &offload_base; int same_flow; int mac_len; enum gro_result ret; @@ -3519,9 +3643,11 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; + gro_list_prepare(napi, skb); + rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || ptype->dev || !ptype->gro_receive) + if (ptype->type != type || !ptype->callbacks.gro_receive) continue; skb_set_network_header(skb, skb_gro_offset(skb)); @@ -3531,7 +3657,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) NAPI_GRO_CB(skb)->flush = 0; NAPI_GRO_CB(skb)->free = 0; - pp = ptype->gro_receive(&napi->gro_list, skb); + pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); break; } rcu_read_unlock(); @@ -3594,34 +3720,9 @@ normal: ret = GRO_NORMAL; goto pull; } -EXPORT_SYMBOL(dev_gro_receive); -static inline gro_result_t -__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) -{ - struct sk_buff *p; - unsigned int maclen = skb->dev->hard_header_len; - for (p = napi->gro_list; p; p = p->next) { - unsigned long diffs; - - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= p->vlan_tci ^ skb->vlan_tci; - if (maclen == ETH_HLEN) - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); - else if (!diffs) - diffs = memcmp(skb_mac_header(p), - skb_gro_mac_header(skb), - maclen); - NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; - } - - return dev_gro_receive(napi, skb); -} - -gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { case GRO_NORMAL: @@ -3647,7 +3748,6 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) return ret; } -EXPORT_SYMBOL(napi_skb_finish); static void skb_gro_reset_offset(struct sk_buff *skb) { @@ -3670,7 +3770,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); - return napi_skb_finish(__napi_gro_receive(napi, skb), skb); + return napi_skb_finish(dev_gro_receive(napi, skb), skb); } EXPORT_SYMBOL(napi_gro_receive); @@ -3699,7 +3799,7 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, +static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { @@ -3724,7 +3824,6 @@ gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, return ret; } -EXPORT_SYMBOL(napi_frags_finish); static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { @@ -3769,7 +3868,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) if (!skb) return GRO_DROP; - return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); + return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); } EXPORT_SYMBOL(napi_gro_frags); @@ -4071,6 +4170,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) { struct net_device *dev; struct ifreq ifr; + unsigned seq; /* * Fetch the caller's info block. @@ -4079,6 +4179,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; +retry: + seq = read_seqbegin(&devnet_rename_seq); rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex); if (!dev) { @@ -4088,6 +4190,8 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) strcpy(ifr.ifr_name, dev->name); rcu_read_unlock(); + if (read_seqretry(&devnet_rename_seq, seq)) + goto retry; if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) return -EFAULT; @@ -4880,7 +4984,7 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) else dev->mtu = new_mtu; - if (!err && dev->flags & IFF_UP) + if (!err) call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); return err; } @@ -5200,7 +5304,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; dev_load(net, ifr.ifr_name); rtnl_lock(); @@ -5221,16 +5325,25 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) * - require strict serialization. * - do not return a value */ + case SIOCSIFMAP: + case SIOCSIFTXQLEN: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* fall through */ + /* + * These ioctl calls: + * - require local superuser power. + * - require strict serialization. + * - do not return a value + */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: - case SIOCSIFMAP: case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: @@ -5239,7 +5352,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCBRADDIF: case SIOCBRDELIF: case SIOCSHWTSTAMP: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* fall through */ case SIOCBONDSLAVEINFOQUERY: @@ -6264,7 +6377,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; /* Ensure the device has been registrered */ - err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) goto out; @@ -6319,6 +6431,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_uc_flush(dev); dev_mc_flush(dev); + /* Send a netdev-removed uevent to the old namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); + /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -6330,6 +6445,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev->iflink = dev->ifindex; } + /* Send a netdev-add uevent to the new namespace */ + kobject_uevent(&dev->dev.kobj, KOBJ_ADD); + /* Fixup kobjects */ err = device_rename(&dev->dev, dev->name); WARN_ON(err); @@ -6662,6 +6780,8 @@ static int __init net_dev_init(void) for (i = 0; i < PTYPE_HASH_SIZE; i++) INIT_LIST_HEAD(&ptype_base[i]); + INIT_LIST_HEAD(&offload_base); + if (register_pernet_subsys(&netdev_net_ops)) goto out; diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 87cc17db2d56..b079c7bbc157 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -319,7 +319,8 @@ int dev_addr_del(struct net_device *dev, const unsigned char *addr, */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); - if (ha->addr == dev->dev_addr && ha->refcount == 1) + if (!memcmp(ha->addr, addr, dev->addr_len) && + ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 4d64cc2e3fa9..a8705432e4b1 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1460,7 +1460,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GEEE: break; default: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } diff --git a/net/core/filter.c b/net/core/filter.c index 3d92ebb7fbcf..c23543cba132 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -39,6 +39,7 @@ #include <linux/reciprocal_div.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> +#include <linux/if_vlan.h> /* No hurry in this branch * @@ -341,6 +342,12 @@ load_b: case BPF_S_ANC_CPU: A = raw_smp_processor_id(); continue; + case BPF_S_ANC_VLAN_TAG: + A = vlan_tx_tag_get(skb); + continue; + case BPF_S_ANC_VLAN_TAG_PRESENT: + A = !!vlan_tx_tag_present(skb); + continue; case BPF_S_ANC_NLATTR: { struct nlattr *nla; @@ -600,6 +607,8 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) ANCILLARY(RXHASH); ANCILLARY(CPU); ANCILLARY(ALU_XOR_X); + ANCILLARY(VLAN_TAG); + ANCILLARY(VLAN_TAG_PRESENT); } } ftest->code = code; @@ -751,3 +760,133 @@ int sk_detach_filter(struct sock *sk) return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); + +static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) +{ + static const u16 decodes[] = { + [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, + [BPF_S_ALU_ADD_X] = BPF_ALU|BPF_ADD|BPF_X, + [BPF_S_ALU_SUB_K] = BPF_ALU|BPF_SUB|BPF_K, + [BPF_S_ALU_SUB_X] = BPF_ALU|BPF_SUB|BPF_X, + [BPF_S_ALU_MUL_K] = BPF_ALU|BPF_MUL|BPF_K, + [BPF_S_ALU_MUL_X] = BPF_ALU|BPF_MUL|BPF_X, + [BPF_S_ALU_DIV_X] = BPF_ALU|BPF_DIV|BPF_X, + [BPF_S_ALU_MOD_K] = BPF_ALU|BPF_MOD|BPF_K, + [BPF_S_ALU_MOD_X] = BPF_ALU|BPF_MOD|BPF_X, + [BPF_S_ALU_AND_K] = BPF_ALU|BPF_AND|BPF_K, + [BPF_S_ALU_AND_X] = BPF_ALU|BPF_AND|BPF_X, + [BPF_S_ALU_OR_K] = BPF_ALU|BPF_OR|BPF_K, + [BPF_S_ALU_OR_X] = BPF_ALU|BPF_OR|BPF_X, + [BPF_S_ALU_XOR_K] = BPF_ALU|BPF_XOR|BPF_K, + [BPF_S_ALU_XOR_X] = BPF_ALU|BPF_XOR|BPF_X, + [BPF_S_ALU_LSH_K] = BPF_ALU|BPF_LSH|BPF_K, + [BPF_S_ALU_LSH_X] = BPF_ALU|BPF_LSH|BPF_X, + [BPF_S_ALU_RSH_K] = BPF_ALU|BPF_RSH|BPF_K, + [BPF_S_ALU_RSH_X] = BPF_ALU|BPF_RSH|BPF_X, + [BPF_S_ALU_NEG] = BPF_ALU|BPF_NEG, + [BPF_S_LD_W_ABS] = BPF_LD|BPF_W|BPF_ABS, + [BPF_S_LD_H_ABS] = BPF_LD|BPF_H|BPF_ABS, + [BPF_S_LD_B_ABS] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_PROTOCOL] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_PKTTYPE] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_IFINDEX] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_NLATTR] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_NLATTR_NEST] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_MARK] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_QUEUE] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_HATYPE] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_RXHASH] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_CPU] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_ALU_XOR_X] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, + [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, + [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, + [BPF_S_LD_B_IND] = BPF_LD|BPF_B|BPF_IND, + [BPF_S_LD_IMM] = BPF_LD|BPF_IMM, + [BPF_S_LDX_W_LEN] = BPF_LDX|BPF_W|BPF_LEN, + [BPF_S_LDX_B_MSH] = BPF_LDX|BPF_B|BPF_MSH, + [BPF_S_LDX_IMM] = BPF_LDX|BPF_IMM, + [BPF_S_MISC_TAX] = BPF_MISC|BPF_TAX, + [BPF_S_MISC_TXA] = BPF_MISC|BPF_TXA, + [BPF_S_RET_K] = BPF_RET|BPF_K, + [BPF_S_RET_A] = BPF_RET|BPF_A, + [BPF_S_ALU_DIV_K] = BPF_ALU|BPF_DIV|BPF_K, + [BPF_S_LD_MEM] = BPF_LD|BPF_MEM, + [BPF_S_LDX_MEM] = BPF_LDX|BPF_MEM, + [BPF_S_ST] = BPF_ST, + [BPF_S_STX] = BPF_STX, + [BPF_S_JMP_JA] = BPF_JMP|BPF_JA, + [BPF_S_JMP_JEQ_K] = BPF_JMP|BPF_JEQ|BPF_K, + [BPF_S_JMP_JEQ_X] = BPF_JMP|BPF_JEQ|BPF_X, + [BPF_S_JMP_JGE_K] = BPF_JMP|BPF_JGE|BPF_K, + [BPF_S_JMP_JGE_X] = BPF_JMP|BPF_JGE|BPF_X, + [BPF_S_JMP_JGT_K] = BPF_JMP|BPF_JGT|BPF_K, + [BPF_S_JMP_JGT_X] = BPF_JMP|BPF_JGT|BPF_X, + [BPF_S_JMP_JSET_K] = BPF_JMP|BPF_JSET|BPF_K, + [BPF_S_JMP_JSET_X] = BPF_JMP|BPF_JSET|BPF_X, + }; + u16 code; + + code = filt->code; + + to->code = decodes[code]; + to->jt = filt->jt; + to->jf = filt->jf; + + if (code == BPF_S_ALU_DIV_K) { + /* + * When loaded this rule user gave us X, which was + * translated into R = r(X). Now we calculate the + * RR = r(R) and report it back. If next time this + * value is loaded and RRR = r(RR) is calculated + * then the R == RRR will be true. + * + * One exception. X == 1 translates into R == 0 and + * we can't calculate RR out of it with r(). + */ + + if (filt->k == 0) + to->k = 1; + else + to->k = reciprocal_value(filt->k); + + BUG_ON(reciprocal_value(to->k) != filt->k); + } else + to->k = filt->k; +} + +int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) +{ + struct sk_filter *filter; + int i, ret; + + lock_sock(sk); + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + ret = 0; + if (!filter) + goto out; + ret = filter->len; + if (!len) + goto out; + ret = -EINVAL; + if (len < filter->len) + goto out; + + ret = -EFAULT; + for (i = 0; i < filter->len; i++) { + struct sock_filter fb; + + sk_decode_filter(&filter->insns[i], &fb); + if (copy_to_user(&ubuf[i], &fb, sizeof(fb))) + goto out; + } + + ret = filter->len; +out: + release_sock(sk); + return ret; +} diff --git a/net/core/flow.c b/net/core/flow.c index e318c7e98042..b0901ee5a002 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -327,11 +327,9 @@ static void flow_cache_flush_tasklet(unsigned long data) static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; - int cpu; struct tasklet_struct *tasklet; - cpu = smp_processor_id(); - tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet = this_cpu_ptr(&info->cache->percpu->flush_tasklet); tasklet->data = (unsigned long)info; tasklet_schedule(tasklet); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 22571488730a..c815f285e5ab 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1787,8 +1787,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes) || /* approximative value for deprecated QUEUE_LEN (in packets) */ nla_put_u32(skb, NDTPA_QUEUE_LEN, - DIV_ROUND_UP(parms->queue_len_bytes, - SKB_TRUESIZE(ETH_FRAME_LEN))) || + parms->queue_len_bytes / SKB_TRUESIZE(ETH_FRAME_LEN)) || nla_put_u32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen) || nla_put_u32(skb, NDTPA_APP_PROBES, parms->app_probes) || nla_put_u32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes) || @@ -2770,6 +2769,8 @@ EXPORT_SYMBOL(neigh_app_ns); #endif /* CONFIG_ARPD */ #ifdef CONFIG_SYSCTL +static int zero; +static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2777,9 +2778,13 @@ static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, int size, ret; ctl_table tmp = *ctl; + tmp.extra1 = &zero; + tmp.extra2 = &unres_qlen_max; tmp.data = &size; - size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); - ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + + size = *(int *)ctl->data / SKB_TRUESIZE(ETH_FRAME_LEN); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); return ret; @@ -2865,7 +2870,8 @@ static struct neigh_sysctl_table { .procname = "unres_qlen_bytes", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .extra1 = &zero, + .proc_handler = proc_dointvec_minmax, }, [NEIGH_VAR_PROXY_QLEN] = { .procname = "proxy_qlen", @@ -2987,6 +2993,10 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; } + /* Don't export sysctls to unprivileged users */ + if (neigh_parms_net(p)->user_ns != &init_user_ns) + t->neigh_vars[0].procname = NULL; + snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s", p_name, dev_name_source); t->sysctl_header = diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bcf02f608cbf..334efd5d67a9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -18,11 +18,9 @@ #include <net/sock.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> -#include <linux/wireless.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> -#include <net/wext.h> #include "net-sysfs.h" @@ -73,11 +71,12 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { - struct net_device *net = to_net_dev(dev); + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); unsigned long new; int ret = -EINVAL; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); @@ -87,8 +86,8 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (!rtnl_trylock()) return restart_syscall(); - if (dev_isalive(net)) { - if ((ret = (*set)(net, new)) == 0) + if (dev_isalive(netdev)) { + if ((ret = (*set)(netdev, new)) == 0) ret = len; } rtnl_unlock(); @@ -264,6 +263,9 @@ static ssize_t store_tx_queue_len(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + return netdev_store(dev, attr, buf, len, change_tx_queue_len); } @@ -271,10 +273,11 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); size_t count = len; ssize_t ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* ignore trailing newline */ @@ -429,6 +432,17 @@ static struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; + +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) +static struct attribute *wireless_attrs[] = { + NULL +}; + +static struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; +#endif #endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS @@ -1409,6 +1423,15 @@ int netdev_register_kobject(struct net_device *net) groups++; *groups++ = &netstat_group; + +#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) + if (net->ieee80211_ptr) + *groups++ = &wireless_group; +#if IS_ENABLED(CONFIG_WIRELESS_EXT) + else if (net->wireless_handlers) + *groups++ = &wireless_group; +#endif +#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 77a0388fc3be..3151acf5ec13 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -674,7 +674,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, '@')) == NULL) goto parse_failed; *delim = 0; - np->local_port = simple_strtol(cur, NULL, 10); + if (kstrtou16(cur, 10, &np->local_port)) + goto parse_failed; cur = delim; } cur++; @@ -705,7 +706,8 @@ int netpoll_parse_options(struct netpoll *np, char *opt) *delim = 0; if (*cur == ' ' || *cur == '\t') np_info(np, "warning: whitespace is not allowed\n"); - np->remote_port = simple_strtol(cur, NULL, 10); + if (kstrtou16(cur, 10, &np->remote_port)) + goto parse_failed; cur = delim; } cur++; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 79285a36035f..5e67defe2cb0 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -27,11 +27,7 @@ #include <linux/fdtable.h> -#define PRIOIDX_SZ 128 - -static unsigned long prioidx_map[PRIOIDX_SZ]; -static DEFINE_SPINLOCK(prioidx_map_lock); -static atomic_t max_prioidx = ATOMIC_INIT(0); +#define PRIOMAP_MIN_SZ 128 static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) { @@ -39,136 +35,157 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgr struct cgroup_netprio_state, css); } -static int get_prioidx(u32 *prio) -{ - unsigned long flags; - u32 prioidx; - - spin_lock_irqsave(&prioidx_map_lock, flags); - prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); - if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { - spin_unlock_irqrestore(&prioidx_map_lock, flags); - return -ENOSPC; - } - set_bit(prioidx, prioidx_map); - if (atomic_read(&max_prioidx) < prioidx) - atomic_set(&max_prioidx, prioidx); - spin_unlock_irqrestore(&prioidx_map_lock, flags); - *prio = prioidx; - return 0; -} - -static void put_prioidx(u32 idx) +/* + * Extend @dev->priomap so that it's large enough to accomodate + * @target_idx. @dev->priomap.priomap_len > @target_idx after successful + * return. Must be called under rtnl lock. + */ +static int extend_netdev_table(struct net_device *dev, u32 target_idx) { - unsigned long flags; - - spin_lock_irqsave(&prioidx_map_lock, flags); - clear_bit(idx, prioidx_map); - spin_unlock_irqrestore(&prioidx_map_lock, flags); -} + struct netprio_map *old, *new; + size_t new_sz, new_len; -static int extend_netdev_table(struct net_device *dev, u32 new_len) -{ - size_t new_size = sizeof(struct netprio_map) + - ((sizeof(u32) * new_len)); - struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL); - struct netprio_map *old_priomap; + /* is the existing priomap large enough? */ + old = rtnl_dereference(dev->priomap); + if (old && old->priomap_len > target_idx) + return 0; - old_priomap = rtnl_dereference(dev->priomap); + /* + * Determine the new size. Let's keep it power-of-two. We start + * from PRIOMAP_MIN_SZ and double it until it's large enough to + * accommodate @target_idx. + */ + new_sz = PRIOMAP_MIN_SZ; + while (true) { + new_len = (new_sz - offsetof(struct netprio_map, priomap)) / + sizeof(new->priomap[0]); + if (new_len > target_idx) + break; + new_sz *= 2; + /* overflowed? */ + if (WARN_ON(new_sz < PRIOMAP_MIN_SZ)) + return -ENOSPC; + } - if (!new_priomap) { + /* allocate & copy */ + new = kzalloc(new_sz, GFP_KERNEL); + if (!new) { pr_warn("Unable to alloc new priomap!\n"); return -ENOMEM; } - if (old_priomap) - memcpy(new_priomap->priomap, old_priomap->priomap, - old_priomap->priomap_len * - sizeof(old_priomap->priomap[0])); + if (old) + memcpy(new->priomap, old->priomap, + old->priomap_len * sizeof(old->priomap[0])); - new_priomap->priomap_len = new_len; + new->priomap_len = new_len; - rcu_assign_pointer(dev->priomap, new_priomap); - if (old_priomap) - kfree_rcu(old_priomap, rcu); + /* install the new priomap */ + rcu_assign_pointer(dev->priomap, new); + if (old) + kfree_rcu(old, rcu); return 0; } -static int write_update_netdev_table(struct net_device *dev) +/** + * netprio_prio - return the effective netprio of a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * + * Should be called under RCU read or rtnl lock. + */ +static u32 netprio_prio(struct cgroup *cgrp, struct net_device *dev) +{ + struct netprio_map *map = rcu_dereference_rtnl(dev->priomap); + + if (map && cgrp->id < map->priomap_len) + return map->priomap[cgrp->id]; + return 0; +} + +/** + * netprio_set_prio - set netprio on a cgroup-net_device pair + * @cgrp: cgroup part of the target pair + * @dev: net_device part of the target pair + * @prio: prio to set + * + * Set netprio to @prio on @cgrp-@dev pair. Should be called under rtnl + * lock and may fail under memory pressure for non-zero @prio. + */ +static int netprio_set_prio(struct cgroup *cgrp, struct net_device *dev, + u32 prio) { - int ret = 0; - u32 max_len; struct netprio_map *map; + int ret; - max_len = atomic_read(&max_prioidx) + 1; + /* avoid extending priomap for zero writes */ map = rtnl_dereference(dev->priomap); - if (!map || map->priomap_len < max_len) - ret = extend_netdev_table(dev, max_len); + if (!prio && (!map || map->priomap_len <= cgrp->id)) + return 0; - return ret; + ret = extend_netdev_table(dev, cgrp->id); + if (ret) + return ret; + + map = rtnl_dereference(dev->priomap); + map->priomap[cgrp->id] = prio; + return 0; } -static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; - int ret = -EINVAL; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); - if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx) - goto out; - - ret = get_prioidx(&cs->prioidx); - if (ret < 0) { - pr_warn("No space in priority index array\n"); - goto out; - } - return &cs->css; -out: - kfree(cs); - return ERR_PTR(ret); } -static void cgrp_destroy(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup *cgrp) { - struct cgroup_netprio_state *cs; + struct cgroup *parent = cgrp->parent; struct net_device *dev; - struct netprio_map *map; + int ret = 0; + + if (!parent) + return 0; - cs = cgrp_netprio_state(cgrp); rtnl_lock(); + /* + * Inherit prios from the parent. As all prios are set during + * onlining, there is no need to clear them on offline. + */ for_each_netdev(&init_net, dev) { - map = rtnl_dereference(dev->priomap); - if (map && cs->prioidx < map->priomap_len) - map->priomap[cs->prioidx] = 0; + u32 prio = netprio_prio(parent, dev); + + ret = netprio_set_prio(cgrp, dev, prio); + if (ret) + break; } rtnl_unlock(); - put_prioidx(cs->prioidx); - kfree(cs); + return ret; +} + +static void cgrp_css_free(struct cgroup *cgrp) +{ + kfree(cgrp_netprio_state(cgrp)); } static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) { - return (u64)cgrp_netprio_state(cgrp)->prioidx; + return cgrp->id; } static int read_priomap(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { struct net_device *dev; - u32 prioidx = cgrp_netprio_state(cont)->prioidx; - u32 priority; - struct netprio_map *map; rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - map = rcu_dereference(dev->priomap); - priority = (map && prioidx < map->priomap_len) ? map->priomap[prioidx] : 0; - cb->fill(cb, dev->name, priority); - } + for_each_netdev_rcu(&init_net, dev) + cb->fill(cb, dev->name, netprio_prio(cont, dev)); rcu_read_unlock(); return 0; } @@ -176,66 +193,24 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft, static int write_priomap(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { - char *devname = kstrdup(buffer, GFP_KERNEL); - int ret = -EINVAL; - u32 prioidx = cgrp_netprio_state(cgrp)->prioidx; - unsigned long priority; - char *priostr; + char devname[IFNAMSIZ + 1]; struct net_device *dev; - struct netprio_map *map; - - if (!devname) - return -ENOMEM; - - /* - * Minimally sized valid priomap string - */ - if (strlen(devname) < 3) - goto out_free_devname; - - priostr = strstr(devname, " "); - if (!priostr) - goto out_free_devname; - - /* - *Separate the devname from the associated priority - *and advance the priostr pointer to the priority value - */ - *priostr = '\0'; - priostr++; - - /* - * If the priostr points to NULL, we're at the end of the passed - * in string, and its not a valid write - */ - if (*priostr == '\0') - goto out_free_devname; - - ret = kstrtoul(priostr, 10, &priority); - if (ret < 0) - goto out_free_devname; + u32 prio; + int ret; - ret = -ENODEV; + if (sscanf(buffer, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2) + return -EINVAL; dev = dev_get_by_name(&init_net, devname); if (!dev) - goto out_free_devname; + return -ENODEV; rtnl_lock(); - ret = write_update_netdev_table(dev); - if (ret < 0) - goto out_put_dev; - map = rtnl_dereference(dev->priomap); - if (map) - map->priomap[prioidx] = priority; + ret = netprio_set_prio(cgrp, dev, prio); -out_put_dev: rtnl_unlock(); dev_put(dev); - -out_free_devname: - kfree(devname); return ret; } @@ -248,7 +223,7 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *p; void *v; @@ -276,22 +251,13 @@ static struct cftype ss_files[] = { struct cgroup_subsys net_prio_subsys = { .name = "net_prio", - .create = cgrp_create, - .destroy = cgrp_destroy, + .css_alloc = cgrp_css_alloc, + .css_online = cgrp_css_online, + .css_free = cgrp_css_free, .attach = net_prio_attach, .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, .module = THIS_MODULE, - - /* - * net_prio has artificial limit on the number of cgroups and - * disallows nesting making it impossible to co-mount it with other - * hierarchical subsystems. Remove the artificially low PRIOIDX_SZ - * limit and properly nest configuration such that children follow - * their parents' configurations by default and are allowed to - * override and remove the following. - */ - .broken_hierarchy = true, }; static int netprio_device_event(struct notifier_block *unused, diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d1dc14c2aac4..b29dacf900f9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -419,20 +419,6 @@ struct pktgen_thread { #define REMOVE 1 #define FIND 0 -static inline ktime_t ktime_now(void) -{ - struct timespec ts; - ktime_get_ts(&ts); - - return timespec_to_ktime(ts); -} - -/* This works even if 32 bit because of careful byte order choice */ -static inline int ktime_lt(const ktime_t cmp1, const ktime_t cmp2) -{ - return cmp1.tv64 < cmp2.tv64; -} - static const char version[] = "Packet Generator for packet performance testing. " "Version: " VERSION "\n"; @@ -675,7 +661,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, "\n"); /* not really stopped, more like last-running-at */ - stopped = pkt_dev->running ? ktime_now() : pkt_dev->stopped_at; + stopped = pkt_dev->running ? ktime_get() : pkt_dev->stopped_at; idle = pkt_dev->idle_acc; do_div(idle, NSEC_PER_USEC); @@ -2141,12 +2127,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) return; } - start_time = ktime_now(); + start_time = ktime_get(); if (remaining < 100000) { /* for small delays (<100us), just loop until limit is reached */ do { - end_time = ktime_now(); - } while (ktime_lt(end_time, spin_until)); + end_time = ktime_get(); + } while (ktime_compare(end_time, spin_until) < 0); } else { /* see do_nanosleep */ hrtimer_init_sleeper(&t, current); @@ -2162,7 +2148,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_cancel(&t.timer); } while (t.task && pkt_dev->running && !signal_pending(current)); __set_current_state(TASK_RUNNING); - end_time = ktime_now(); + end_time = ktime_get(); } pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); @@ -2427,11 +2413,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) } } else { /* IPV6 * */ - if (pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && - pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && - pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && - pkt_dev->min_in6_daddr.s6_addr32[3] == 0) ; - else { + if (!ipv6_addr_any(&pkt_dev->min_in6_daddr)) { int i; /* Only random destinations yet */ @@ -2916,8 +2898,7 @@ static void pktgen_run(struct pktgen_thread *t) pktgen_clear_counters(pkt_dev); pkt_dev->running = 1; /* Cranke yeself! */ pkt_dev->skb = NULL; - pkt_dev->started_at = - pkt_dev->next_tx = ktime_now(); + pkt_dev->started_at = pkt_dev->next_tx = ktime_get(); set_pkt_overhead(pkt_dev); @@ -3076,7 +3057,7 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev) kfree_skb(pkt_dev->skb); pkt_dev->skb = NULL; - pkt_dev->stopped_at = ktime_now(); + pkt_dev->stopped_at = ktime_get(); pkt_dev->running = 0; show_results(pkt_dev, nr_frags); @@ -3095,7 +3076,7 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t) continue; if (best == NULL) best = pkt_dev; - else if (ktime_lt(pkt_dev->next_tx, best->next_tx)) + else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0) best = pkt_dev; } if_unlock(t); @@ -3180,14 +3161,14 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { - ktime_t idle_start = ktime_now(); + ktime_t idle_start = ktime_get(); schedule(); - pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) { - ktime_t idle_start = ktime_now(); + ktime_t idle_start = ktime_get(); while (atomic_read(&(pkt_dev->skb->users)) != 1) { if (signal_pending(current)) @@ -3198,7 +3179,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) else cpu_relax(); } - pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_now(), idle_start)); + pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } static void pktgen_xmit(struct pktgen_dev *pkt_dev) @@ -3220,7 +3201,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) * "never transmit" */ if (unlikely(pkt_dev->delay == ULLONG_MAX)) { - pkt_dev->next_tx = ktime_add_ns(ktime_now(), ULONG_MAX); + pkt_dev->next_tx = ktime_add_ns(ktime_get(), ULONG_MAX); return; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 76d4c2c3c89b..1868625af25e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -128,7 +128,7 @@ static rtnl_doit_func rtnl_get_doit(int protocol, int msgindex) if (tab == NULL || tab[msgindex].doit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; - return tab ? tab[msgindex].doit : NULL; + return tab[msgindex].doit; } static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) @@ -143,7 +143,7 @@ static rtnl_dumpit_func rtnl_get_dumpit(int protocol, int msgindex) if (tab == NULL || tab[msgindex].dumpit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; - return tab ? tab[msgindex].dumpit : NULL; + return tab[msgindex].dumpit; } static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) @@ -158,7 +158,7 @@ static rtnl_calcit_func rtnl_get_calcit(int protocol, int msgindex) if (tab == NULL || tab[msgindex].calcit == NULL) tab = rtnl_msg_handlers[PF_UNSPEC]; - return tab ? tab[msgindex].calcit : NULL; + return tab[msgindex].calcit; } /** @@ -1316,6 +1316,10 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, err = PTR_ERR(net); goto errout; } + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + err = -EPERM; + goto errout; + } err = dev_change_net_namespace(dev, net, ifname); put_net(net); if (err) @@ -1638,7 +1642,7 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } EXPORT_SYMBOL(rtnl_configure_link); -struct net_device *rtnl_create_link(struct net *src_net, struct net *net, +struct net_device *rtnl_create_link(struct net *net, char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { int err; @@ -1836,7 +1840,7 @@ replay: if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - dev = rtnl_create_link(net, dest_net, ifname, ops, tb); + dev = rtnl_create_link(dest_net, ifname, ops, tb); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -2057,6 +2061,9 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) u8 *addr; int err; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) return err; @@ -2123,6 +2130,9 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) int err = -EINVAL; __u8 *addr; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (nlmsg_len(nlh) < sizeof(*ndm)) return -EINVAL; @@ -2192,7 +2202,8 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, - portid, seq, 0, NTF_SELF); + portid, seq, + RTM_NEWNEIGH, NTF_SELF); if (err < 0) return err; skip: @@ -2252,6 +2263,211 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, + struct net_device *dev, u16 mode) +{ + struct nlmsghdr *nlh; + struct ifinfomsg *ifm; + struct nlattr *br_afspec; + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + + nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI); + if (nlh == NULL) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifi_family = AF_BRIDGE; + ifm->__ifi_pad = 0; + ifm->ifi_type = dev->type; + ifm->ifi_index = dev->ifindex; + ifm->ifi_flags = dev_get_flags(dev); + ifm->ifi_change = 0; + + + if (nla_put_string(skb, IFLA_IFNAME, dev->name) || + nla_put_u32(skb, IFLA_MTU, dev->mtu) || + nla_put_u8(skb, IFLA_OPERSTATE, operstate) || + (dev->master && + nla_put_u32(skb, IFLA_MASTER, dev->master->ifindex)) || + (dev->addr_len && + nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || + (dev->ifindex != dev->iflink && + nla_put_u32(skb, IFLA_LINK, dev->iflink))) + goto nla_put_failure; + + br_afspec = nla_nest_start(skb, IFLA_AF_SPEC); + if (!br_afspec) + goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF) || + nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + nla_nest_end(skb, br_afspec); + + return nlmsg_end(skb, nlh); +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} +EXPORT_SYMBOL(ndo_dflt_bridge_getlink); + +static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int idx = 0; + u32 portid = NETLINK_CB(cb->skb).portid; + u32 seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + const struct net_device_ops *ops = dev->netdev_ops; + struct net_device *master = dev->master; + + if (master && master->netdev_ops->ndo_bridge_getlink) { + if (idx >= cb->args[0] && + master->netdev_ops->ndo_bridge_getlink( + skb, portid, seq, dev) < 0) + break; + idx++; + } + + if (ops->ndo_bridge_getlink) { + if (idx >= cb->args[0] && + ops->ndo_bridge_getlink(skb, portid, seq, dev) < 0) + break; + idx++; + } + } + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +static inline size_t bridge_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ +} + +static int rtnl_bridge_notify(struct net_device *dev, u16 flags) +{ + struct net *net = dev_net(dev); + struct net_device *master = dev->master; + struct sk_buff *skb; + int err = -EOPNOTSUPP; + + skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); + if (!skb) { + err = -ENOMEM; + goto errout; + } + + if ((!flags || (flags & BRIDGE_FLAGS_MASTER)) && + master && master->netdev_ops->ndo_bridge_getlink) { + err = master->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + if (err < 0) + goto errout; + } + + if ((flags & BRIDGE_FLAGS_SELF) && + dev->netdev_ops->ndo_bridge_getlink) { + err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev); + if (err < 0) + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); + return 0; +errout: + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + rtnl_set_sk_err(net, RTNLGRP_LINK, err); + return err; +} + +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + void *arg) +{ + struct net *net = sock_net(skb->sk); + struct ifinfomsg *ifm; + struct net_device *dev; + struct nlattr *br_spec, *attr = NULL; + int rem, err = -EOPNOTSUPP; + u16 oflags, flags = 0; + bool have_flags = false; + + if (nlmsg_len(nlh) < sizeof(*ifm)) + return -EINVAL; + + ifm = nlmsg_data(nlh); + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + dev = __dev_get_by_index(net, ifm->ifi_index); + if (!dev) { + pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + return -ENODEV; + } + + br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); + if (br_spec) { + nla_for_each_nested(attr, br_spec, rem) { + if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { + have_flags = true; + flags = nla_get_u16(attr); + break; + } + } + } + + oflags = flags; + + if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { + if (!dev->master || + !dev->master->netdev_ops->ndo_bridge_setlink) { + err = -EOPNOTSUPP; + goto out; + } + + err = dev->master->netdev_ops->ndo_bridge_setlink(dev, nlh); + if (err) + goto out; + + flags &= ~BRIDGE_FLAGS_MASTER; + } + + if ((flags & BRIDGE_FLAGS_SELF)) { + if (!dev->netdev_ops->ndo_bridge_setlink) + err = -EOPNOTSUPP; + else + err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + + if (!err) + flags &= ~BRIDGE_FLAGS_SELF; + } + + if (have_flags) + memcpy(nla_data(attr), &flags, sizeof(flags)); + /* Generate event to notify upper layer of bridge change */ + if (!err) + err = rtnl_bridge_notify(dev, oflags); +out: + return err; +} + /* Protected by RTNL sempahore. */ static struct rtattr **rta_buf; static int rtattr_max; @@ -2282,7 +2498,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sz_idx = type>>2; kind = type&3; - if (kind != 2 && !capable(CAP_NET_ADMIN)) + if (kind != 2 && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { @@ -2433,5 +2649,8 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, NULL); + + rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); + rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); } diff --git a/net/core/scm.c b/net/core/scm.c index ab570841a532..57fb1ee6649f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -51,11 +51,11 @@ static __inline__ int scm_check_creds(struct ucred *creds) if (!uid_valid(uid) || !gid_valid(gid)) return -EINVAL; - if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && + if ((creds->pid == task_tgid_vnr(current) || nsown_capable(CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || - uid_eq(uid, cred->suid)) || capable(CAP_SETUID)) && + uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || - gid_eq(gid, cred->sgid)) || capable(CAP_SETGID))) { + gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) { return 0; } return -EPERM; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4007c1437fda..3ab989b0de42 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -519,7 +519,7 @@ static void skb_release_data(struct sk_buff *skb) uarg = skb_shinfo(skb)->destructor_arg; if (uarg->callback) - uarg->callback(uarg); + uarg->callback(uarg, true); } if (skb_has_frag_list(skb)) @@ -635,6 +635,26 @@ void kfree_skb(struct sk_buff *skb) EXPORT_SYMBOL(kfree_skb); /** + * skb_tx_error - report an sk_buff xmit error + * @skb: buffer that triggered an error + * + * Report xmit error if a device callback is tracking this skb. + * skb must be freed afterwards. + */ +void skb_tx_error(struct sk_buff *skb) +{ + if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + struct ubuf_info *uarg; + + uarg = skb_shinfo(skb)->destructor_arg; + if (uarg->callback) + uarg->callback(uarg, false); + skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + } +} +EXPORT_SYMBOL(skb_tx_error); + +/** * consume_skb - free an skbuff * @skb: buffer to free * @@ -662,11 +682,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; + new->inner_transport_header = old->inner_transport_header; + new->inner_network_header = old->inner_transport_header; skb_dst_copy(new, old); new->rxhash = old->rxhash; new->ooo_okay = old->ooo_okay; new->l4_rxhash = old->l4_rxhash; new->no_fcs = old->no_fcs; + new->encapsulation = old->encapsulation; #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif @@ -797,7 +820,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg); + uarg->callback(uarg, false); /* skb frags point to kernel buffers */ for (i = num_frags - 1; i >= 0; i--) { @@ -872,6 +895,8 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->network_header += offset; if (skb_mac_header_was_set(new)) new->mac_header += offset; + new->inner_transport_header += offset; + new->inner_network_header += offset; #endif skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; @@ -1069,6 +1094,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->network_header += off; if (skb_mac_header_was_set(skb)) skb->mac_header += off; + skb->inner_transport_header += off; + skb->inner_network_header += off; /* Only adjust this if it actually is csum_start rather than csum */ if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += nhead; @@ -1168,6 +1195,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, n->network_header += off; if (skb_mac_header_was_set(skb)) n->mac_header += off; + n->inner_transport_header += off; + n->inner_network_header += off; #endif return n; @@ -2999,12 +3028,11 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) memcpy(skb_mac_header(nskb), skb_mac_header(p), p->data - skb_mac_header(p)); - *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; skb_shinfo(nskb)->gso_size = pinfo->gso_size; pinfo->gso_size = 0; skb_header_release(p); - nskb->prev = p; + NAPI_GRO_CB(nskb)->last = p; nskb->data_len += p->len; nskb->truesize += p->truesize; @@ -3030,8 +3058,8 @@ merge: __skb_pull(skb, offset); - p->prev->next = skb; - p->prev = skb; + NAPI_GRO_CB(p)->last->next = skb; + NAPI_GRO_CB(p)->last = skb; skb_header_release(skb); done: diff --git a/net/core/sock.c b/net/core/sock.c index 8a146cfcc366..a692ef49c9bb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) +static int sock_setbindtodevice(struct sock *sk, char __user *optval, + int optlen) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -515,7 +516,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen) /* Sorry... */ ret = -EPERM; - if (!capable(CAP_NET_RAW)) + if (!ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; @@ -562,6 +563,59 @@ out: return ret; } +static int sock_getbindtodevice(struct sock *sk, char __user *optval, + int __user *optlen, int len) +{ + int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + struct net *net = sock_net(sk); + struct net_device *dev; + char devname[IFNAMSIZ]; + unsigned seq; + + if (sk->sk_bound_dev_if == 0) { + len = 0; + goto zero; + } + + ret = -EINVAL; + if (len < IFNAMSIZ) + goto out; + +retry: + seq = read_seqbegin(&devnet_rename_seq); + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + ret = -ENODEV; + if (!dev) { + rcu_read_unlock(); + goto out; + } + + strcpy(devname, dev->name); + rcu_read_unlock(); + if (read_seqretry(&devnet_rename_seq, seq)) + goto retry; + + len = strlen(devname) + 1; + + ret = -EFAULT; + if (copy_to_user(optval, devname, len)) + goto out; + +zero: + ret = -EFAULT; + if (put_user(len, optlen)) + goto out; + + ret = 0; + +out: +#endif + + return ret; +} + static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) { if (valbool) @@ -589,7 +643,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, */ if (optname == SO_BINDTODEVICE) - return sock_bindtodevice(sk, optval, optlen); + return sock_setbindtodevice(sk, optval, optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -696,7 +750,8 @@ set_rcvbuf: break; case SO_PRIORITY: - if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + if ((val >= 0 && val <= 6) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -813,7 +868,7 @@ set_rcvbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) ret = -EPERM; else sk->sk_mark = val; @@ -1074,6 +1129,17 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_NOFCS: v.val = sock_flag(sk, SOCK_NOFCS); break; + + case SO_BINDTODEVICE: + return sock_getbindtodevice(sk, optval, optlen, len); + + case SO_GET_FILTER: + len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + if (len < 0) + return len; + + goto lenout; + default: return -ENOPROTOOPT; } @@ -1214,13 +1280,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) #ifdef CONFIG_CGROUPS #if IS_ENABLED(CONFIG_NET_CLS_CGROUP) -void sock_update_classid(struct sock *sk) +void sock_update_classid(struct sock *sk, struct task_struct *task) { u32 classid; - rcu_read_lock(); /* doing current task, which cannot vanish. */ - classid = task_cls_classid(current); - rcu_read_unlock(); + classid = task_cls_classid(task); if (classid != sk->sk_classid) sk->sk_classid = classid; } @@ -1263,7 +1327,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sock_net_set(sk, get_net(net)); atomic_set(&sk->sk_wmem_alloc, 1); - sock_update_classid(sk); + sock_update_classid(sk, current); sock_update_netprioidx(sk, current); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a7c36845b123..d1b08045a9df 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -216,6 +216,11 @@ static __net_init int sysctl_core_net_init(struct net *net) goto err_dup; tbl[0].data = &net->core.sysctl_somaxconn; + + /* Don't export any sysctls to unprivileged users */ + if (net->user_ns != &init_user_ns) { + tbl[0].procname = NULL; + } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); |