diff options
Diffstat (limited to 'net')
94 files changed, 4133 insertions, 753 deletions
diff --git a/net/Kconfig b/net/Kconfig index 2760825e53fa..10640d5f8bee 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -360,6 +360,7 @@ source "net/can/Kconfig" source "net/irda/Kconfig" source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" +source "net/kcm/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index a5d04098dfce..81d14119eab5 100644 --- a/net/Makefile +++ b/net/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_SUNRPC) += sunrpc/ obj-$(CONFIG_AF_RXRPC) += rxrpc/ +obj-$(CONFIG_AF_KCM) += kcm/ obj-$(CONFIG_ATM) += atm/ obj-$(CONFIG_L2TP) += l2tp/ obj-$(CONFIG_DECNET) += decnet/ diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index b563a3f5f2a8..2fa3be965101 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -228,8 +228,23 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) } #endif +static bool ax25_validate_header(const char *header, unsigned int len) +{ + ax25_digi digi; + + if (!len) + return false; + + if (header[0]) + return true; + + return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL, + NULL); +} + const struct header_ops ax25_header_ops = { .create = ax25_hard_header, + .validate = ax25_validate_header, }; EXPORT_SYMBOL(ax25_header_ops); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82e3e9705017..dcea4f4c62b3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { + int err; + if (idx < cb->args[0]) goto skip; @@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb, if (!filter_dev && f->dst) goto skip; - if (fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI) < 0) + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) { + cb->args[1] = err; break; + } skip: ++idx; } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 7ddbe7ec81d6..44114a94c576 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -37,6 +37,7 @@ #include <net/addrconf.h> #include <net/route.h> #include <net/netfilter/br_netfilter.h> +#include <net/netns/generic.h> #include <asm/uaccess.h> #include "br_private.h" @@ -44,6 +45,12 @@ #include <linux/sysctl.h> #endif +static int brnf_net_id __read_mostly; + +struct brnf_net { + bool enabled; +}; + #ifdef CONFIG_SYSCTL static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; @@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { }, }; +static int brnf_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct brnf_net *brnet; + struct net *net; + int ret; + + if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE)) + return NOTIFY_DONE; + + ASSERT_RTNL(); + + net = dev_net(dev); + brnet = net_generic(net, brnf_net_id); + if (brnet->enabled) + return NOTIFY_OK; + + ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret) + return NOTIFY_BAD; + + brnet->enabled = true; + return NOTIFY_OK; +} + +static void __net_exit brnf_exit_net(struct net *net) +{ + struct brnf_net *brnet = net_generic(net, brnf_net_id); + + if (!brnet->enabled) + return; + + nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops)); + brnet->enabled = false; +} + +static struct pernet_operations brnf_net_ops __read_mostly = { + .exit = brnf_exit_net, + .id = &brnf_net_id, + .size = sizeof(struct brnf_net), +}; + +static struct notifier_block brnf_notifier __read_mostly = { + .notifier_call = brnf_device_event, +}; + #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, @@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void) { int ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + ret = register_pernet_subsys(&brnf_net_ops); if (ret < 0) return ret; + ret = register_netdevice_notifier(&brnf_notifier); + if (ret < 0) { + unregister_pernet_subsys(&brnf_net_ops); + return ret; + } + #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); return -ENOMEM; } #endif @@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void) static void __exit br_netfilter_fini(void) { RCU_INIT_POINTER(nf_br_ops, NULL); - nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + unregister_netdevice_notifier(&brnf_notifier); + unregister_pernet_subsys(&brnf_net_ops); #ifdef CONFIG_SYSCTL unregister_net_sysctl_table(brnf_sysctl_header); #endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index c22816a0b1b1..e23449094188 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -562,6 +562,14 @@ int br_set_max_age(struct net_bridge *br, unsigned long val) } +/* Set time interval that dynamic forwarding entries live + * For pure software bridge, allow values outside the 802.1 + * standard specification for special cases: + * 0 - entry never ages (all permanant) + * 1 - entry disappears (no persistance) + * + * Offloaded switch entries maybe more restrictive + */ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) { struct switchdev_attr attr = { @@ -573,9 +581,6 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time) unsigned long t = clock_t_to_jiffies(ageing_time); int err; - if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME) - return -ERANGE; - err = switchdev_port_attr_set(br->dev, &attr); if (err) return err; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9cfedf565f5b..9382619a405b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1197,6 +1197,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, return new_piece; } +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { BUG_ON(!msg); @@ -2335,9 +2342,9 @@ static int read_partial_message(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; - return 0; + return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); @@ -2360,10 +2367,10 @@ static int read_partial_message(struct ceph_connection *con) /* skip this message */ dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); + sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; - return 0; + return 1; } BUG_ON(!con->in_msg); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3534e12683d3..5bc053778fed 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2853,8 +2853,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (!req) { - pr_warn("%s osd%d tid %llu unknown, skipping\n", - __func__, osd->o_osd, tid); + dout("%s osd%d tid %llu unknown, skipping\n", __func__, + osd->o_osd, tid); m = NULL; *skip = 1; goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5e2a3b5e5196..6fc3893a6170 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM))) + if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; /* bpf verifier guarantees that: @@ -1384,11 +1384,13 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (flags & BPF_F_RECOMPUTE_CSUM) skb_postpush_rcsum(skb, ptr, len); + if (flags & BPF_F_INVALIDATE_HASH) + skb_clear_hash(skb); return 0; } -const struct bpf_func_proto bpf_skb_store_bytes_proto = { +static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1419,7 +1421,7 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_load_bytes_proto = { +static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1447,6 +1449,12 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EFAULT; switch (flags & BPF_F_HDR_FIELD_MASK) { + case 0: + if (unlikely(from != 0)) + return -EINVAL; + + csum_replace_by_diff(ptr, to); + break; case 2: csum_replace2(ptr, from, to); break; @@ -1464,7 +1472,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l3_csum_replace_proto = { +static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1523,7 +1531,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return 0; } -const struct bpf_func_proto bpf_l4_csum_replace_proto = { +static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1562,7 +1570,7 @@ static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed) return csum_partial(sp->diff, diff_size, seed); } -const struct bpf_func_proto bpf_csum_diff_proto = { +static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1600,7 +1608,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) return dev_queue_xmit(skb2); } -const struct bpf_func_proto bpf_clone_redirect_proto = { +static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1652,7 +1660,7 @@ int skb_do_redirect(struct sk_buff *skb) return dev_queue_xmit(skb); } -const struct bpf_func_proto bpf_redirect_proto = { +static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1762,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EPROTO; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): + goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) return -EINVAL; +set_compat: to = (struct bpf_tunnel_key *)compat; break; default: @@ -1779,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - if (flags & BPF_F_TUNINFO_IPV6) + if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); - else + to->tunnel_label = be32_to_cpu(info->key.label); + } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy((void *)(long) r2, to, size); @@ -1791,7 +1804,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return 0; } -const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1801,6 +1814,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; +static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *to = (u8 *) (long) r2; + const struct ip_tunnel_info *info = skb_tunnel_info(skb); + + if (unlikely(!info || + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) + return -ENOENT; + if (unlikely(size < info->options_len)) + return -ENOMEM; + + ip_tunnel_info_opts_get(to, info); + + return info->options_len; +} + +static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { + .func = bpf_skb_get_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + static struct metadata_dst __percpu *md_dst; static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) @@ -1811,10 +1850,12 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | + BPF_F_DONT_FRAGMENT))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { + case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. @@ -1827,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; } } + if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label)) + return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); @@ -1835,7 +1878,10 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; + if (flags & BPF_F_DONT_FRAGMENT) + info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1844,14 +1890,18 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); + info->key.label = cpu_to_be32(from->tunnel_label) & + IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; } -const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { +static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, @@ -1861,17 +1911,58 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .arg4_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +#define BPF_TUNLEN_MAX 255 + +static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + u8 *from = (u8 *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct metadata_dst *md = this_cpu_ptr(md_dst); + + if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) + return -EINVAL; + if (unlikely(size > BPF_TUNLEN_MAX)) + return -ENOMEM; + + ip_tunnel_info_opts_set(info, from, size); + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { + .func = bpf_skb_set_tunnel_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, +}; + +static const struct bpf_func_proto * +bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* race is not possible, since it's called from - * verifier that is holding verifier mutex + BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info, + options_len) != 1); + + /* Race is not possible, since it's called from verifier + * that is holding verifier mutex. */ - md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX, + GFP_KERNEL); if (!md_dst) return NULL; } - return &bpf_skb_set_tunnel_key_proto; + + switch (which) { + case BPF_FUNC_skb_set_tunnel_key: + return &bpf_skb_set_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return &bpf_skb_set_tunnel_opt_proto; + default: + return NULL; + } } static const struct bpf_func_proto * @@ -1925,7 +2016,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: - return bpf_get_skb_set_tunnel_key_proto(); + return bpf_get_skb_set_tunnel_proto(func_id); + case BPF_FUNC_skb_get_tunnel_opt: + return &bpf_skb_get_tunnel_opt_proto; + case BPF_FUNC_skb_set_tunnel_opt: + return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7c7b8739b8b8..a669dea146c6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,25 +19,12 @@ #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> -static bool dissector_uses_key(const struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) -{ - return flow_dissector->used_keys & (1 << key_id); -} - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } -static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id, - void *target_container) -{ - return ((char *) target_container) + flow_dissector->offset[key_id]; -} - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6128aac01b11..d2d9e5ebf58e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2970,6 +2970,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); out: netif_addr_unlock_bh(dev); + cb->args[1] = err; return idx; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -3003,6 +3004,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } + cb->args[1] = 0; for_each_netdev(net, dev) { if (brport_idx && (dev->ifindex != brport_idx)) continue; @@ -3030,12 +3032,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx); } + if (cb->args[1] == -EMSGSIZE) + break; if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx); else idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (cb->args[1] == -EMSGSIZE) + break; cops = NULL; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7af7ec635d90..51d768e7bc90 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1918,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, struct splice_pipe_desc *spd, struct sock *sk) { int seg; + struct sk_buff *iter; /* map the linear part : * If skb->head_frag is set, this 'linear' part is backed by a @@ -1944,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return true; } + skb_walk_frags(skb, iter) { + if (*offset >= iter->len) { + *offset -= iter->len; + continue; + } + /* __skb_splice_bits() only fails if the output has no room + * left, so no point in going over the frag_list for the error + * case. + */ + if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + return true; + } + return false; } @@ -1970,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk, /* * Map data from the skb to a pipe. Should handle both the linear part, - * the fragments, and the frag list. It does NOT handle frag lists within - * the frag list, if such a thing exists. We'd probably need to recurse to - * handle that cleanly. + * the fragments, and the frag list. */ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, @@ -1991,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; - struct sk_buff *frag_iter; int ret = 0; - /* - * __skb_splice_bits() only fails if the output has no room left, - * so no point in going over the frag_list for the error case. - */ - if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) - goto done; - else if (!tlen) - goto done; + __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); - /* - * now see if we have a frag_list to map - */ - skb_walk_frags(skb, frag_iter) { - if (!tlen) - break; - if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) - break; - } - -done: if (spd.nr_pages) ret = splice_cb(sk, pipe, &spd); @@ -3023,6 +3016,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** + * skb_push_rcsum - push skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_push on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_push unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. + */ +static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len) +{ + skb_push(skb, len); + skb_postpush_rcsum(skb, skb->data, len); + return skb->data; +} + +/** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled @@ -4167,9 +4178,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, if (!pskb_may_pull(skb_chk, offset)) goto err; - __skb_pull(skb_chk, offset); + skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); - __skb_push(skb_chk, offset); + skb_push_rcsum(skb_chk, offset); if (ret) goto err; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index fa4daba8db55..d8fb47fcad05 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -935,6 +935,14 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst) { int i; + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; @@ -988,14 +996,6 @@ static int dsa_suspend(struct device *d) struct dsa_switch_tree *dst = platform_get_drvdata(pdev); int i, ret = 0; - dst->master_netdev->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); - for (i = 0; i < dst->pd->nr_chips; i++) { struct dsa_switch *ds = dst->ds[i]; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c102eb5ac55c..c34c7544d1db 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!in_dev) - goto out; + goto out_free_skb; arp = arp_hdr(skb); @@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) - goto out; + goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: @@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) - goto out; + goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) - goto out; + goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) - goto out; + goto out_free_skb; break; } @@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) - goto out; + goto out_free_skb; /* * Extract fields @@ -733,7 +733,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) - goto out; + goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), @@ -741,7 +741,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) - goto out; + goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address @@ -778,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); - goto out; + goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && @@ -803,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } } - goto out; + goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || @@ -826,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) in_dev->arp_parms, skb); goto out_free_dst; } - goto out; + goto out_consume_skb; } } } @@ -876,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } -out: +out_consume_skb: consume_skb(skb); + out_free_dst: dst_release(reply_dst); - return 0; + return NET_RX_SUCCESS; + +out_free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) @@ -924,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, consumeskb: consume_skb(skb); - return 0; + return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: - return 0; + return NET_RX_DROP; } /* diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 88dab0c1670c..780484243e14 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -319,8 +319,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, skb_gro_pull(skb, hdrlen); - flush = 0; - for (p = *head; p; p = p->next) { const struct guehdr *guehdr2; @@ -352,6 +350,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 47f4c544c916..540866dbd27d 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -175,8 +175,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - flush = 0; - for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; @@ -213,6 +211,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, greh, grehlen); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2aea9f1a2a31..9b4ca87f70ba 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -350,9 +350,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) skb_dst_set(skb, &rt->dst); skb->dev = dev; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 202437d6087b..31936d387cfd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -527,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -540,13 +541,14 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = !skb->mark ? dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr) : - NULL; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); if (!rt) { rt = gre_get_rt(skb, dev, &fl, key); if (IS_ERR(rt)) goto err_free_skb; - if (!skb->mark) + if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl.saddr); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f734c42acdaf..124bf0a66328 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,13 +1233,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { + if (skb->ip_summed != CHECKSUM_PARTIAL) + return -EOPNOTSUPP; + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } + cork->length += size; while (size > 0) { if (skb_is_gso(skb)) { diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index dff8a05739a2..6aad0192443d 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -607,6 +607,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -706,7 +708,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b488cac9c5ca..bf081927e06b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1780,9 +1780,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, - const struct xt_table *table, - const struct arpt_replace *repl) +static void __arpt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct arpt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int arpt_register_table(struct net *net, + const struct xt_table *table, + const struct arpt_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -1791,10 +1811,8 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1809,30 +1827,28 @@ struct xt_table *arpt_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __arpt_unregister_table(new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void arpt_unregister_table(struct xt_table *table) +void arpt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct arpt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __arpt_unregister_table(table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 1897ee160920..dd8c80dc32a2 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) +static int __net_init arptable_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, + .table_init = arptable_filter_table_init, }; /* The work comes in here from netfilter.c */ @@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_net_init(struct net *net) +static int __net_init arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; - + int err; + + if (net->ipv4.arptable_filter) + return 0; + repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, repl); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, + &net->ipv4.arptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); + return err; } static void __net_exit arptable_filter_net_exit(struct net *net) { - arpt_unregister_table(net->ipv4.arptable_filter); + if (!net->ipv4.arptable_filter) + return; + arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); + net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { - .init = arptable_filter_net_init, .exit = arptable_filter_net_exit, }; @@ -62,26 +71,23 @@ static int __init arptable_filter_init(void) { int ret; + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) + return PTR_ERR(arpfilter_ops); + ret = register_pernet_subsys(&arptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(arpfilter_ops); return ret; - - arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) { - ret = PTR_ERR(arpfilter_ops); - goto cleanup_table; } - return ret; -cleanup_table: - unregister_pernet_subsys(&arptable_filter_net_ops); return ret; } static void __exit arptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } module_init(arptable_filter_init); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6ba1..e53f8d6f326d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2062,9 +2062,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, - const struct xt_table *table, - const struct ipt_replace *repl) +static void __ipt_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ipt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ipt_register_table(struct net *net, const struct xt_table *table, + const struct ipt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2073,10 +2091,8 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2091,30 +2107,27 @@ struct xt_table *ipt_register_table(struct net *net, goto out_free; } - return new_table; + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ipt_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ipt_unregister_table(struct net *net, struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ipt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 397ef2dd133e..7667f223d7f8 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -30,6 +31,7 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, + .table_init = iptable_filter_table_init, }; static unsigned int @@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = true; +static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_net_init(struct net *net) +static int __net_init iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; + int err; + + if (net->ipv4.iptable_filter) + return 0; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, repl); + err = ipt_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv4.iptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); + return err; +} + +static int __net_init iptable_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return iptable_filter_table_init(net); + + return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_filter); + if (!net->ipv4.iptable_filter) + return; + ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { @@ -82,24 +99,21 @@ static int __init iptable_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - unregister_pernet_subsys(&iptable_filter_net_ops); - } + kfree(filter_ops); return ret; } static void __exit iptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); + kfree(filter_ops); } module_init(iptable_filter_init); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index ba5d392a13c4..57fc97cdac70 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init iptable_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, + .table_init = iptable_mangle_table_init, }; static unsigned int @@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv, } static struct nf_hook_ops *mangle_ops __read_mostly; - -static int __net_init iptable_mangle_net_init(struct net *net) +static int __net_init iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_mangle) + return 0; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, repl); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv4.iptable_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); + return ret; } static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_mangle); + if (!net->ipv4.iptable_mangle) + return; + ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { - .init = iptable_mangle_net_init, .exit = iptable_mangle_net_exit, }; @@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + return ret; + } + ret = register_pernet_subsys(&iptable_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; + } - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); + ret = iptable_mangle_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } return ret; @@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } module_init(iptable_mangle_init); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ae2cd2752046..138a24bc76ad 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -18,6 +18,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init iptable_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, + .table_init = iptable_nat_table_init, }; static unsigned int iptable_nat_do_chain(void *priv, @@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, }; -static int __net_init iptable_nat_net_init(struct net *net) +static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.nat_table) + return 0; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, + nf_nat_ipv4_ops, &net->ipv4.nat_table); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.nat_table); + return ret; } static void __net_exit iptable_nat_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.nat_table); + if (!net->ipv4.nat_table) + return; + ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); + net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { - .init = iptable_nat_net_init, .exit = iptable_nat_net_exit, }; static int __init iptable_nat_init(void) { - int err; + int ret = register_pernet_subsys(&iptable_nat_net_ops); - err = register_pernet_subsys(&iptable_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&iptable_nat_net_ops); -err1: - return err; + ret = iptable_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&iptable_nat_net_ops); + return ret; } static void __exit iptable_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); unregister_pernet_subsys(&iptable_nat_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1ba02811acb0..2642ecd2645c 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -10,12 +10,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, + .table_init = iptable_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_net_init(struct net *net) +static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_raw) + return 0; repl = ipt_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, repl); + ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv4.iptable_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); + return ret; } static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_raw); + if (!net->ipv4.iptable_raw) + return; + ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { - .init = iptable_raw_net_init, .exit = iptable_raw_net_exit, }; @@ -61,15 +70,20 @@ static int __init iptable_raw_init(void) { int ret; + rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&iptable_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; + } - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); + ret = iptable_raw_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } return ret; @@ -77,8 +91,8 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c2e23d5e9cd4..ff226596e4b5 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init iptable_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, + .table_init = iptable_security_table_init, }; static unsigned int @@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_net_init(struct net *net) +static int __net_init iptable_security_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_security) + return 0; repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, repl); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv4.iptable_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); + return ret; } static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_security); + if (!net->ipv4.iptable_security) + return; + + ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { - .init = iptable_security_net_init, .exit = iptable_security_net_exit, }; @@ -78,27 +88,29 @@ static int __init iptable_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; + ret = iptable_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); + } -cleanup_table: - unregister_pernet_subsys(&iptable_security_net_ops); return ret; } static void __exit iptable_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a04dee536b8e..d88da36b383c 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, err = ip_defrag(net, skb, user); local_bh_enable(); - if (!err) { - ip_send_check(ip_hdr(skb)); + if (!err) skb->ignore_df = 1; - } return err; } diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index b72ffc58e255..51ced81b616c 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f9faadb42485..a265f00b9df9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -556,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c26241f3057b..7b7eec439906 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -551,7 +551,7 @@ reset: */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt /= 8 * USEC_PER_MSEC; + crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fadd8b978951..ae90e4b34bd3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -452,7 +452,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 0; + newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -812,6 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0ec08814f37d..96599d1a1318 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb uh->source = src_port; uh->len = htons(skb->len); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 5c5d23e59da5..9508a20fbf61 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, *fragoff = _frag_off; return hp->nexthdr; } - return -ENOENT; + if (!found) + return -ENOENT; + if (fragoff) + *fragoff = _frag_off; + break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0c7e276c230e..ea071fad67a0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -55,8 +55,6 @@ struct fib6_cleaner { void *arg; }; -static DEFINE_RWLOCK(fib6_walker_lock); - #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else @@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock); static void fib6_prune_clones(struct net *net, struct fib6_node *fn); static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker *w); +static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* @@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w); static void fib6_gc_timer_cb(unsigned long arg); -static LIST_HEAD(fib6_walkers); -#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) +#define FOR_WALKERS(net, w) \ + list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) -static void fib6_walker_link(struct fib6_walker *w) +static void fib6_walker_link(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); - list_add(&w->lh, &fib6_walkers); - write_unlock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); + list_add(&w->lh, &net->ipv6.fib6_walkers); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } -static void fib6_walker_unlink(struct fib6_walker *w) +static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); - write_unlock_bh(&fib6_walker_lock); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) @@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w) static void fib6_dump_end(struct netlink_callback *cb) { + struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); @@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; @@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->skip = 0; read_lock_bh(&table->tb6_lock); - res = fib6_walk(w); + res = fib6_walk(net, w); read_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; @@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, res = fib6_walk_continue(w); read_unlock_bh(&table->tb6_lock); if (res <= 0) { - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); cb->args[4] = 0; } } @@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } #endif - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (!child) { if (w->root == fn) { w->root = w->node = NULL; @@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); node_free(fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) @@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, } /* Adjust walkers */ - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rt->dst.rt6_next; @@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, w->state = FWS_U; } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); rt->dst.rt6_next = NULL; @@ -1588,17 +1588,17 @@ skip: } } -static int fib6_walk(struct fib6_walker *w) +static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; - fib6_walker_link(w); + fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); return res; } @@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.arg = arg; c.net = net; - fib6_walk(&c.w); + fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, @@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static struct fib6_gc_args +struct fib6_gc_args { int timeout; int more; -} gc_args; +}; static int fib6_age(struct rt6_info *rt, void *arg) { + struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg) RT6_TRACE("expiring %p\n", rt); return -1; } - gc_args.more++; + gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { if (atomic_read(&rt->dst.__refcnt) == 0 && - time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if (rt->rt6i_flags & RTF_GATEWAY) { @@ -1769,21 +1770,20 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } } - gc_args.more++; + gc_args->more++; } return 0; } -static DEFINE_SPINLOCK(fib6_gc_lock); - void fib6_run_gc(unsigned long expires, struct net *net, bool force) { + struct fib6_gc_args gc_args; unsigned long now; if (force) { - spin_lock_bh(&fib6_gc_lock); - } else if (!spin_trylock_bh(&fib6_gc_lock)) { + spin_lock_bh(&net->ipv6.fib6_gc_lock); + } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } @@ -1792,7 +1792,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) gc_args.more = icmp6_dst_gc(); - fib6_clean_all(net, fib6_age, NULL); + fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -1802,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); - spin_unlock_bh(&fib6_gc_lock); + spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(unsigned long arg) @@ -1814,6 +1814,9 @@ static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + spin_lock_init(&net->ipv6.fib6_gc_lock); + rwlock_init(&net->ipv6.fib6_walker_lock); + INIT_LIST_HEAD(&net->ipv6.fib6_walkers); setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); @@ -1974,7 +1977,8 @@ static int ipv6_route_yield(struct fib6_walker *w) return 0; } -static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, + struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; @@ -1984,7 +1988,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter) iter->w.args = iter; iter->sernum = iter->w.root->fn_sernum; INIT_LIST_HEAD(&iter->w.lh); - fib6_walker_link(&iter->w); + fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, @@ -2045,16 +2049,16 @@ iter_table: ++*pos; return iter->w.leaf; } else if (r < 0) { - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); return NULL; } - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); goto iter_table; } @@ -2069,7 +2073,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { - ipv6_route_seq_setup_walk(iter); + ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, pos); } else { return NULL; @@ -2085,10 +2089,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) static void ipv6_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { + struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) - fib6_walker_unlink(&iter->w); + fib6_walker_unlink(net, &iter->w); rcu_read_unlock_bh(); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f7c9560b75fa..4e636e60a360 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) __u32 mtu; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3f3aabd2f07b..eb2ac4bb09ce 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1089,6 +1089,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + tproto = ACCESS_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 14dacf1df529..a7520528ecd2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be16 src_port, - __be16 dst_port, bool nocheck) + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ee56d0a8699..d64ee7e83664 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) return NULL; skb->priority = TC_PRIO_CONTROL; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 99425cf2819b..84f9baf7aee8 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -2071,9 +2071,28 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ip6t_register_table(struct net *net, - const struct xt_table *table, - const struct ip6t_replace *repl) +static void __ip6t_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ip6t_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ip6t_register_table(struct net *net, const struct xt_table *table, + const struct ip6t_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2082,10 +2101,8 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2099,30 +2116,28 @@ struct xt_table *ip6t_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ip6t_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ip6t_unregister_table(struct net *net, struct xt_table *table) +void ip6t_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ip6t_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 8b277b983ca5..1343077dde93 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, + .table_init = ip6table_filter_table_init, }; /* The work comes in here from netfilter.c. */ @@ -44,9 +47,13 @@ static struct nf_hook_ops *filter_ops __read_mostly; static bool forward = true; module_param(forward, bool, 0000); -static int __net_init ip6table_filter_net_init(struct net *net) +static int __net_init ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; + int err; + + if (net->ipv6.ip6table_filter) + return 0; repl = ip6t_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -55,15 +62,26 @@ static int __net_init ip6table_filter_net_init(struct net *net) ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv6.ip6table_filter = - ip6t_register_table(net, &packet_filter, repl); + err = ip6t_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv6.ip6table_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter); + return err; +} + +static int __net_init ip6table_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return ip6table_filter_table_init(net); + + return 0; } static void __net_exit ip6table_filter_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_filter); + if (!net->ipv6.ip6table_filter) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops); + net->ipv6.ip6table_filter = NULL; } static struct pernet_operations ip6table_filter_net_ops = { @@ -75,28 +93,21 @@ static int __init ip6table_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&ip6table_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - goto cleanup_table; - } + kfree(filter_ops); return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_filter_net_ops); - return ret; } static void __exit ip6table_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&ip6table_filter_net_ops); + kfree(filter_ops); } module_init(ip6table_filter_init); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index abe278b07932..cb2b28883252 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init ip6table_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, + .table_init = ip6table_mangle_table_init, }; static unsigned int @@ -88,26 +91,33 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init ip6table_mangle_net_init(struct net *net) +static int __net_init ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_mangle) + return 0; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_mangle = - ip6t_register_table(net, &packet_mangler, repl); + ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv6.ip6table_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle); + return ret; } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_mangle); + if (!net->ipv6.ip6table_mangle) + return; + + ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops); + net->ipv6.ip6table_mangle = NULL; } static struct pernet_operations ip6table_mangle_net_ops = { - .init = ip6table_mangle_net_init, .exit = ip6table_mangle_net_exit, }; @@ -115,28 +125,28 @@ static int __init ip6table_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); + if (IS_ERR(mangle_ops)) + return PTR_ERR(mangle_ops); + ret = register_pernet_subsys(&ip6table_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; - - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_mangle_net_ops); + ret = ip6table_mangle_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); + } return ret; } static void __exit ip6table_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&ip6table_mangle_net_ops); + kfree(mangle_ops); } module_init(ip6table_mangle_init); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index de2a10a565f5..7d2bd940291f 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -20,6 +20,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init ip6table_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv6_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV6, + .table_init = ip6table_nat_table_init, }; static unsigned int ip6table_nat_do_chain(void *priv, @@ -97,50 +100,50 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = { }, }; -static int __net_init ip6table_nat_net_init(struct net *net) +static int __net_init ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_nat) + return 0; repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl); + ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, + nf_nat_ipv6_ops, &net->ipv6.ip6table_nat); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat); + return ret; } static void __net_exit ip6table_nat_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_nat); + if (!net->ipv6.ip6table_nat) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops); + net->ipv6.ip6table_nat = NULL; } static struct pernet_operations ip6table_nat_net_ops = { - .init = ip6table_nat_net_init, .exit = ip6table_nat_net_exit, }; static int __init ip6table_nat_init(void) { - int err; + int ret = register_pernet_subsys(&ip6table_nat_net_ops); - err = register_pernet_subsys(&ip6table_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&ip6table_nat_net_ops); -err1: - return err; + ret = ip6table_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&ip6table_nat_net_ops); + return ret; } static void __exit ip6table_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); unregister_pernet_subsys(&ip6table_nat_net_ops); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 9021963565c3..d4bc56443dc1 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -9,12 +9,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init ip6table_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, + .table_init = ip6table_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -27,26 +30,32 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init ip6table_raw_net_init(struct net *net) +static int __net_init ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_raw) + return 0; repl = ip6t_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_raw = - ip6t_register_table(net, &packet_raw, repl); + ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv6.ip6table_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw); + return ret; } static void __net_exit ip6table_raw_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_raw); + if (!net->ipv6.ip6table_raw) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops); + net->ipv6.ip6table_raw = NULL; } static struct pernet_operations ip6table_raw_net_ops = { - .init = ip6table_raw_net_init, .exit = ip6table_raw_net_exit, }; @@ -54,28 +63,29 @@ static int __init ip6table_raw_init(void) { int ret; + /* Register hooks */ + rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&ip6table_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; - - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); - goto cleanup_table; } - return ret; - - cleanup_table: - unregister_pernet_subsys(&ip6table_raw_net_ops); + ret = ip6table_raw_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); + } return ret; } static void __exit ip6table_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&ip6table_raw_net_ops); + kfree(rawtable_ops); } module_init(ip6table_raw_init); diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 0d856fedfeb0..cf26ccb04056 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init ip6table_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, + .table_init = ip6table_security_table_init, }; static unsigned int @@ -44,26 +47,32 @@ ip6table_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init ip6table_security_net_init(struct net *net) +static int __net_init ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; + int ret; + + if (net->ipv6.ip6table_security) + return 0; repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv6.ip6table_security = - ip6t_register_table(net, &security_table, repl); + ret = ip6t_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv6.ip6table_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security); + return ret; } static void __net_exit ip6table_security_net_exit(struct net *net) { - ip6t_unregister_table(net, net->ipv6.ip6table_security); + if (!net->ipv6.ip6table_security) + return; + ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops); + net->ipv6.ip6table_security = NULL; } static struct pernet_operations ip6table_security_net_ops = { - .init = ip6table_security_net_init, .exit = ip6table_security_net_exit, }; @@ -71,27 +80,28 @@ static int __init ip6table_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&ip6table_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; - -cleanup_table: - unregister_pernet_subsys(&ip6table_security_net_ops); + ret = ip6table_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); + } return ret; } static void __exit ip6table_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&ip6table_security_net_ops); + kfree(sectbl_ops); } module_init(ip6table_security_init); diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c index cd1ac1637a05..9597ffb74077 100644 --- a/net/ipv6/netfilter/nft_masq_ipv6.c +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 0711f8fe4d44..fd25e447a5fa 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -922,11 +922,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig new file mode 100644 index 000000000000..5db94d940ecc --- /dev/null +++ b/net/kcm/Kconfig @@ -0,0 +1,10 @@ + +config AF_KCM + tristate "KCM sockets" + depends on INET + select BPF_SYSCALL + ---help--- + KCM (Kernel Connection Multiplexor) sockets provide a method + for multiplexing messages of a message based application + protocol over kernel connectons (e.g. TCP connections). + diff --git a/net/kcm/Makefile b/net/kcm/Makefile new file mode 100644 index 000000000000..71256133e677 --- /dev/null +++ b/net/kcm/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_AF_KCM) += kcm.o + +kcm-y := kcmsock.o kcmproc.o diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c new file mode 100644 index 000000000000..738008726cc6 --- /dev/null +++ b/net/kcm/kcmproc.c @@ -0,0 +1,426 @@ +#include <linux/in.h> +#include <linux/inet.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/proc_fs.h> +#include <linux/rculist.h> +#include <linux/seq_file.h> +#include <linux/socket.h> +#include <net/inet_sock.h> +#include <net/kcm.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/tcp.h> + +#ifdef CONFIG_PROC_FS +struct kcm_seq_muxinfo { + char *name; + const struct file_operations *seq_fops; + const struct seq_operations seq_ops; +}; + +static struct kcm_mux *kcm_get_first(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + return list_first_or_null_rcu(&knet->mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + + return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, + struct kcm_mux, kcm_mux_list); +} + +static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct kcm_mux *m; + + list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { + if (!pos) + return m; + --pos; + } + return NULL; +} + +static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *p; + + if (v == SEQ_START_TOKEN) + p = kcm_get_first(seq); + else + p = kcm_get_next(v); + ++*pos; + return p; +} + +static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + + if (!*pos) + return SEQ_START_TOKEN; + else + return kcm_get_idx(seq, *pos - 1); +} + +static void kcm_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) +{ + rcu_read_unlock(); +} + +struct kcm_proc_mux_state { + struct seq_net_private p; + int idx; +}; + +static int kcm_seq_open(struct inode *inode, struct file *file) +{ + struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode); + int err; + + err = seq_open_net(inode, file, &muxinfo->seq_ops, + sizeof(struct kcm_proc_mux_state)); + if (err < 0) + return err; + return err; +} + +static void kcm_format_mux_header(struct seq_file *seq) +{ + struct net *net = seq_file_net(seq); + struct kcm_net *knet = net_generic(net, kcm_net_id); + + seq_printf(seq, + "*** KCM statistics (%d MUX) ****\n", + knet->count); + + seq_printf(seq, + "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", + "Object", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Recv-Q", + "Rmem", + "Send-Q", + "Smem", + "Status"); + + /* XXX: pdsts header stuff here */ + seq_puts(seq, "\n"); +} + +static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", + kcm->index, + kcm->stats.rx_msgs, + kcm->stats.rx_bytes, + kcm->stats.tx_msgs, + kcm->stats.tx_bytes, + kcm->sk.sk_receive_queue.qlen, + sk_rmem_alloc_get(&kcm->sk), + kcm->sk.sk_write_queue.qlen, + "-"); + + if (kcm->tx_psock) + seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); + + if (kcm->tx_wait) + seq_puts(seq, "TxWait "); + + if (kcm->tx_wait_more) + seq_puts(seq, "WMore "); + + if (kcm->rx_wait) + seq_puts(seq, "RxWait "); + + seq_puts(seq, "\n"); +} + +static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, + int i, int *len) +{ + seq_printf(seq, + " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", + psock->index, + psock->stats.rx_msgs, + psock->stats.rx_bytes, + psock->stats.tx_msgs, + psock->stats.tx_bytes, + psock->sk->sk_receive_queue.qlen, + atomic_read(&psock->sk->sk_rmem_alloc), + psock->sk->sk_write_queue.qlen, + atomic_read(&psock->sk->sk_wmem_alloc)); + + if (psock->done) + seq_puts(seq, "Done "); + + if (psock->tx_stopped) + seq_puts(seq, "TxStop "); + + if (psock->rx_stopped) + seq_puts(seq, "RxStop "); + + if (psock->tx_kcm) + seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); + + if (psock->ready_rx_msg) + seq_puts(seq, "RdyRx "); + + seq_puts(seq, "\n"); +} + +static void +kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) +{ + int i, len; + struct kcm_sock *kcm; + struct kcm_psock *psock; + + /* mux information */ + seq_printf(seq, + "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", + "mux", "", + mux->stats.rx_msgs, + mux->stats.rx_bytes, + mux->stats.tx_msgs, + mux->stats.tx_bytes, + "-", "-", "-", "-"); + + seq_printf(seq, "KCMs: %d, Psocks %d\n", + mux->kcm_socks_cnt, mux->psocks_cnt); + + /* kcm sock information */ + i = 0; + spin_lock_bh(&mux->lock); + list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { + kcm_format_sock(kcm, seq, i, &len); + i++; + } + i = 0; + list_for_each_entry(psock, &mux->psocks, psock_list) { + kcm_format_psock(psock, seq, i, &len); + i++; + } + spin_unlock_bh(&mux->lock); +} + +static int kcm_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_proc_mux_state *mux_state; + + mux_state = seq->private; + if (v == SEQ_START_TOKEN) { + mux_state->idx = 0; + kcm_format_mux_header(seq); + } else { + kcm_format_mux(v, mux_state->idx, seq); + mux_state->idx++; + } + return 0; +} + +static const struct file_operations kcm_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_seq_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static struct kcm_seq_muxinfo kcm_seq_muxinfo = { + .name = "kcm", + .seq_fops = &kcm_seq_fops, + .seq_ops = { + .show = kcm_seq_show, + .start = kcm_seq_start, + .next = kcm_seq_next, + .stop = kcm_seq_stop, + } +}; + +static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net, + muxinfo->seq_fops, muxinfo); + if (!p) + rc = -ENOMEM; + return rc; +} +EXPORT_SYMBOL(kcm_proc_register); + +static void kcm_proc_unregister(struct net *net, + struct kcm_seq_muxinfo *muxinfo) +{ + remove_proc_entry(muxinfo->name, net->proc_net); +} +EXPORT_SYMBOL(kcm_proc_unregister); + +static int kcm_stats_seq_show(struct seq_file *seq, void *v) +{ + struct kcm_psock_stats psock_stats; + struct kcm_mux_stats mux_stats; + struct kcm_mux *mux; + struct kcm_psock *psock; + struct net *net = seq->private; + struct kcm_net *knet = net_generic(net, kcm_net_id); + + memset(&mux_stats, 0, sizeof(mux_stats)); + memset(&psock_stats, 0, sizeof(psock_stats)); + + mutex_lock(&knet->mutex); + + aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); + aggregate_psock_stats(&knet->aggregate_psock_stats, + &psock_stats); + + list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) { + spin_lock_bh(&mux->lock); + aggregate_mux_stats(&mux->stats, &mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &psock_stats); + list_for_each_entry(psock, &mux->psocks, psock_list) + aggregate_psock_stats(&psock->stats, &psock_stats); + spin_unlock_bh(&mux->lock); + } + + mutex_unlock(&knet->mutex); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", + "MUX", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "TX-Retries", + "Attach", + "Unattach", + "UnattchRsvd", + "RX-RdyDrops"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", + "", + mux_stats.rx_msgs, + mux_stats.rx_bytes, + mux_stats.tx_msgs, + mux_stats.tx_bytes, + mux_stats.tx_retries, + mux_stats.psock_attach, + mux_stats.psock_unattach_rsvd, + mux_stats.psock_unattach, + mux_stats.rx_ready_drops); + + seq_printf(seq, + "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", + "Psock", + "RX-Msgs", + "RX-Bytes", + "TX-Msgs", + "TX-Bytes", + "Reserved", + "Unreserved", + "RX-Aborts", + "RX-MemFail", + "RX-NeedMor", + "RX-BadLen", + "RX-TooBig", + "RX-Timeout", + "TX-Aborts"); + + seq_printf(seq, + "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", + "", + psock_stats.rx_msgs, + psock_stats.rx_bytes, + psock_stats.tx_msgs, + psock_stats.tx_bytes, + psock_stats.reserved, + psock_stats.unreserved, + psock_stats.rx_aborts, + psock_stats.rx_mem_fail, + psock_stats.rx_need_more_hdr, + psock_stats.rx_bad_hdr_len, + psock_stats.rx_msg_too_big, + psock_stats.rx_msg_timeouts, + psock_stats.tx_aborts); + + return 0; +} + +static int kcm_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, kcm_stats_seq_show); +} + +static const struct file_operations kcm_stats_seq_fops = { + .owner = THIS_MODULE, + .open = kcm_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release_net, +}; + +static int kcm_proc_init_net(struct net *net) +{ + int err; + + if (!proc_create("kcm_stats", S_IRUGO, net->proc_net, + &kcm_stats_seq_fops)) { + err = -ENOMEM; + goto out_kcm_stats; + } + + err = kcm_proc_register(net, &kcm_seq_muxinfo); + if (err) + goto out_kcm; + + return 0; + +out_kcm: + remove_proc_entry("kcm_stats", net->proc_net); +out_kcm_stats: + return err; +} + +static void kcm_proc_exit_net(struct net *net) +{ + kcm_proc_unregister(net, &kcm_seq_muxinfo); + remove_proc_entry("kcm_stats", net->proc_net); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_proc_init_net, + .exit = kcm_proc_exit_net, +}; + +int __init kcm_proc_init(void) +{ + return register_pernet_subsys(&kcm_net_ops); +} + +void __exit kcm_proc_exit(void) +{ + unregister_pernet_subsys(&kcm_net_ops); +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c new file mode 100644 index 000000000000..40662d73204f --- /dev/null +++ b/net/kcm/kcmsock.c @@ -0,0 +1,2409 @@ +#include <linux/bpf.h> +#include <linux/errno.h> +#include <linux/errqueue.h> +#include <linux/file.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/poll.h> +#include <linux/rculist.h> +#include <linux/skbuff.h> +#include <linux/socket.h> +#include <linux/uaccess.h> +#include <linux/workqueue.h> +#include <net/kcm.h> +#include <net/netns/generic.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <uapi/linux/kcm.h> + +unsigned int kcm_net_id; + +static struct kmem_cache *kcm_psockp __read_mostly; +static struct kmem_cache *kcm_muxp __read_mostly; +static struct workqueue_struct *kcm_wq; + +static inline struct kcm_sock *kcm_sk(const struct sock *sk) +{ + return (struct kcm_sock *)sk; +} + +static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb) +{ + return (struct kcm_tx_msg *)skb->cb; +} + +static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb) +{ + return (struct kcm_rx_msg *)((void *)skb->cb + + offsetof(struct qdisc_skb_cb, data)); +} + +static void report_csk_error(struct sock *csk, int err) +{ + csk->sk_err = EPIPE; + csk->sk_error_report(csk); +} + +/* Callback lock held */ +static void kcm_abort_rx_psock(struct kcm_psock *psock, int err, + struct sk_buff *skb) +{ + struct sock *csk = psock->sk; + + /* Unrecoverable error in receive */ + + del_timer(&psock->rx_msg_timer); + + if (psock->rx_stopped) + return; + + psock->rx_stopped = 1; + KCM_STATS_INCR(psock->stats.rx_aborts); + + /* Report an error on the lower socket */ + report_csk_error(csk, err); +} + +static void kcm_abort_tx_psock(struct kcm_psock *psock, int err, + bool wakeup_kcm) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Unrecoverable error in transmit */ + + spin_lock_bh(&mux->lock); + + if (psock->tx_stopped) { + spin_unlock_bh(&mux->lock); + return; + } + + psock->tx_stopped = 1; + KCM_STATS_INCR(psock->stats.tx_aborts); + + if (!psock->tx_kcm) { + /* Take off psocks_avail list */ + list_del(&psock->psock_avail_list); + } else if (wakeup_kcm) { + /* In this case psock is being aborted while outside of + * write_msgs and psock is reserved. Schedule tx_work + * to handle the failure there. Need to commit tx_stopped + * before queuing work. + */ + smp_mb(); + + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + } + + spin_unlock_bh(&mux->lock); + + /* Report error on lower socket */ + report_csk_error(csk, err); +} + +/* RX mux lock held. */ +static void kcm_update_rx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.rx_bytes, + psock->stats.rx_bytes - psock->saved_rx_bytes); + mux->stats.rx_msgs += + psock->stats.rx_msgs - psock->saved_rx_msgs; + psock->saved_rx_msgs = psock->stats.rx_msgs; + psock->saved_rx_bytes = psock->stats.rx_bytes; +} + +static void kcm_update_tx_mux_stats(struct kcm_mux *mux, + struct kcm_psock *psock) +{ + KCM_STATS_ADD(mux->stats.tx_bytes, + psock->stats.tx_bytes - psock->saved_tx_bytes); + mux->stats.tx_msgs += + psock->stats.tx_msgs - psock->saved_tx_msgs; + psock->saved_tx_msgs = psock->stats.tx_msgs; + psock->saved_tx_bytes = psock->stats.tx_bytes; +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); + +/* KCM is ready to receive messages on its queue-- either the KCM is new or + * has become unblocked after being blocked on full socket buffer. Queue any + * pending ready messages on a psock. RX mux lock held. + */ +static void kcm_rcv_ready(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct sk_buff *skb; + + if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled)) + return; + + while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) { + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Assuming buffer limit has been reached */ + skb_queue_head(&mux->rx_hold_queue, skb); + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + } + + while (!list_empty(&mux->psocks_ready)) { + psock = list_first_entry(&mux->psocks_ready, struct kcm_psock, + psock_ready_list); + + if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) { + /* Assuming buffer limit has been reached */ + WARN_ON(!sk_rmem_alloc_get(&kcm->sk)); + return; + } + + /* Consumed the ready message on the psock. Schedule rx_work to + * get more messages. + */ + list_del(&psock->psock_ready_list); + psock->ready_rx_msg = NULL; + + /* Commit clearing of ready_rx_msg for queuing work */ + smp_mb(); + + queue_work(kcm_wq, &psock->rx_work); + } + + /* Buffer limit is okay now, add to ready list */ + list_add_tail(&kcm->wait_rx_list, + &kcm->mux->kcm_rx_waiters); + kcm->rx_wait = true; +} + +static void kcm_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct kcm_mux *mux = kcm->mux; + unsigned int len = skb->truesize; + + sk_mem_uncharge(sk, len); + atomic_sub(len, &sk->sk_rmem_alloc); + + /* For reading rx_wait and rx_psock without holding lock */ + smp_mb__after_atomic(); + + if (!kcm->rx_wait && !kcm->rx_psock && + sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); + } +} + +static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff_head *list = &sk->sk_receive_queue; + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + return -ENOMEM; + + if (!sk_rmem_schedule(sk, skb, skb->truesize)) + return -ENOBUFS; + + skb->dev = NULL; + + skb_orphan(skb); + skb->sk = sk; + skb->destructor = kcm_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); + + skb_queue_tail(list, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + + return 0; +} + +/* Requeue received messages for a kcm socket to other kcm sockets. This is + * called with a kcm socket is receive disabled. + * RX mux lock held. + */ +static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head) +{ + struct sk_buff *skb; + struct kcm_sock *kcm; + + while ((skb = __skb_dequeue(head))) { + /* Reset destructor to avoid calling kcm_rcv_ready */ + skb->destructor = sock_rfree; + skb_orphan(skb); +try_again: + if (list_empty(&mux->kcm_rx_waiters)) { + skb_queue_tail(&mux->rx_hold_queue, skb); + continue; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + + if (kcm_queue_rcv_skb(&kcm->sk, skb)) { + /* Should mean socket buffer full */ + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + /* Commit rx_wait to read in kcm_free */ + smp_wmb(); + + goto try_again; + } + } +} + +/* Lower sock lock held */ +static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, + struct sk_buff *head) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + WARN_ON(psock->ready_rx_msg); + + if (psock->rx_kcm) + return psock->rx_kcm; + + spin_lock_bh(&mux->rx_lock); + + if (psock->rx_kcm) { + spin_unlock_bh(&mux->rx_lock); + return psock->rx_kcm; + } + + kcm_update_rx_mux_stats(mux, psock); + + if (list_empty(&mux->kcm_rx_waiters)) { + psock->ready_rx_msg = head; + list_add_tail(&psock->psock_ready_list, + &mux->psocks_ready); + spin_unlock_bh(&mux->rx_lock); + return NULL; + } + + kcm = list_first_entry(&mux->kcm_rx_waiters, + struct kcm_sock, wait_rx_list); + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + + psock->rx_kcm = kcm; + kcm->rx_psock = psock; + + spin_unlock_bh(&mux->rx_lock); + + return kcm; +} + +static void kcm_done(struct kcm_sock *kcm); + +static void kcm_done_work(struct work_struct *w) +{ + kcm_done(container_of(w, struct kcm_sock, done_work)); +} + +/* Lower sock held */ +static void unreserve_rx_kcm(struct kcm_psock *psock, + bool rcv_ready) +{ + struct kcm_sock *kcm = psock->rx_kcm; + struct kcm_mux *mux = psock->mux; + + if (!kcm) + return; + + spin_lock_bh(&mux->rx_lock); + + psock->rx_kcm = NULL; + kcm->rx_psock = NULL; + + /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with + * kcm_rfree + */ + smp_mb(); + + if (unlikely(kcm->done)) { + spin_unlock_bh(&mux->rx_lock); + + /* Need to run kcm_done in a task since we need to qcquire + * callback locks which may already be held here. + */ + INIT_WORK(&kcm->done_work, kcm_done_work); + schedule_work(&kcm->done_work); + return; + } + + if (unlikely(kcm->rx_disabled)) { + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) { + /* Check for degenerative race with rx_wait that all + * data was dequeued (accounted for in kcm_rfree). + */ + kcm_rcv_ready(kcm); + } + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_start_rx_timer(struct kcm_psock *psock) +{ + if (psock->sk->sk_rcvtimeo) + mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo); +} + +/* Macro to invoke filter function. */ +#define KCM_RUN_FILTER(prog, ctx) \ + (*prog->bpf_func)(ctx, prog->insnsi) + +/* Lower socket lock held */ +static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, + unsigned int orig_offset, size_t orig_len) +{ + struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data; + struct kcm_rx_msg *rxm; + struct kcm_sock *kcm; + struct sk_buff *head, *skb; + size_t eaten = 0, cand_len; + ssize_t extra; + int err; + bool cloned_orig = false; + + if (psock->ready_rx_msg) + return 0; + + head = psock->rx_skb_head; + if (head) { + /* Message already in progress */ + + rxm = kcm_rx_msg(head); + if (unlikely(rxm->early_eaten)) { + /* Already some number of bytes on the receive sock + * data saved in rx_skb_head, just indicate they + * are consumed. + */ + eaten = orig_len <= rxm->early_eaten ? + orig_len : rxm->early_eaten; + rxm->early_eaten -= eaten; + + return eaten; + } + + if (unlikely(orig_offset)) { + /* Getting data with a non-zero offset when a message is + * in progress is not expected. If it does happen, we + * need to clone and pull since we can't deal with + * offsets in the skbs for a message expect in the head. + */ + orig_skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!orig_skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + if (!pskb_pull(orig_skb, orig_offset)) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + kfree_skb(orig_skb); + desc->error = -ENOMEM; + return 0; + } + cloned_orig = true; + orig_offset = 0; + } + + if (!psock->rx_skb_nextp) { + /* We are going to append to the frags_list of head. + * Need to unshare the frag_list. + */ + err = skb_unclone(head, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + return 0; + } + + if (unlikely(skb_shinfo(head)->frag_list)) { + /* We can't append to an sk_buff that already + * has a frag_list. We create a new head, point + * the frag_list of that to the old head, and + * then are able to use the old head->next for + * appending to the message. + */ + if (WARN_ON(head->next)) { + desc->error = -EINVAL; + return 0; + } + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + return 0; + } + skb->len = head->len; + skb->data_len = head->len; + skb->truesize = head->truesize; + *kcm_rx_msg(skb) = *kcm_rx_msg(head); + psock->rx_skb_nextp = &head->next; + skb_shinfo(skb)->frag_list = head; + psock->rx_skb_head = skb; + head = skb; + } else { + psock->rx_skb_nextp = + &skb_shinfo(head)->frag_list; + } + } + } + + while (eaten < orig_len) { + /* Always clone since we will consume something */ + skb = skb_clone(orig_skb, GFP_ATOMIC); + if (!skb) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = -ENOMEM; + break; + } + + cand_len = orig_len - eaten; + + head = psock->rx_skb_head; + if (!head) { + head = skb; + psock->rx_skb_head = head; + /* Will set rx_skb_nextp on next packet if needed */ + psock->rx_skb_nextp = NULL; + rxm = kcm_rx_msg(head); + memset(rxm, 0, sizeof(*rxm)); + rxm->offset = orig_offset + eaten; + } else { + /* Unclone since we may be appending to an skb that we + * already share a frag_list with. + */ + err = skb_unclone(skb, GFP_ATOMIC); + if (err) { + KCM_STATS_INCR(psock->stats.rx_mem_fail); + desc->error = err; + break; + } + + rxm = kcm_rx_msg(head); + *psock->rx_skb_nextp = skb; + psock->rx_skb_nextp = &skb->next; + head->data_len += skb->len; + head->len += skb->len; + head->truesize += skb->truesize; + } + + if (!rxm->full_len) { + ssize_t len; + + len = KCM_RUN_FILTER(psock->bpf_prog, head); + + if (!len) { + /* Need more header to determine length */ + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + rxm->accum_len += cand_len; + eaten += cand_len; + KCM_STATS_INCR(psock->stats.rx_need_more_hdr); + WARN_ON(eaten != orig_len); + break; + } else if (len > psock->sk->sk_rcvbuf) { + /* Message length exceeds maximum allowed */ + KCM_STATS_INCR(psock->stats.rx_msg_too_big); + desc->error = -EMSGSIZE; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EMSGSIZE, head); + break; + } else if (len <= (ssize_t)head->len - + skb->len - rxm->offset) { + /* Length must be into new skb (and also + * greater than zero) + */ + KCM_STATS_INCR(psock->stats.rx_bad_hdr_len); + desc->error = -EPROTO; + psock->rx_skb_head = NULL; + kcm_abort_rx_psock(psock, EPROTO, head); + break; + } + + rxm->full_len = len; + } + + extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len; + + if (extra < 0) { + /* Message not complete yet. */ + if (rxm->full_len - rxm->accum_len > + tcp_inq(psock->sk)) { + /* Don't have the whole messages in the socket + * buffer. Set psock->rx_need_bytes to wait for + * the rest of the message. Also, set "early + * eaten" since we've already buffered the skb + * but don't consume yet per tcp_read_sock. + */ + + if (!rxm->accum_len) { + /* Start RX timer for new message */ + kcm_start_rx_timer(psock); + } + + psock->rx_need_bytes = rxm->full_len - + rxm->accum_len; + rxm->accum_len += cand_len; + rxm->early_eaten = cand_len; + KCM_STATS_ADD(psock->stats.rx_bytes, cand_len); + desc->count = 0; /* Stop reading socket */ + break; + } + rxm->accum_len += cand_len; + eaten += cand_len; + WARN_ON(eaten != orig_len); + break; + } + + /* Positive extra indicates ore bytes than needed for the + * message + */ + + WARN_ON(extra > cand_len); + + eaten += (cand_len - extra); + + /* Hurray, we have a new message! */ + del_timer(&psock->rx_msg_timer); + psock->rx_skb_head = NULL; + KCM_STATS_INCR(psock->stats.rx_msgs); + +try_queue: + kcm = reserve_rx_kcm(psock, head); + if (!kcm) { + /* Unable to reserve a KCM, message is held in psock. */ + break; + } + + if (kcm_queue_rcv_skb(&kcm->sk, head)) { + /* Should mean socket buffer full */ + unreserve_rx_kcm(psock, false); + goto try_queue; + } + } + + if (cloned_orig) + kfree_skb(orig_skb); + + KCM_STATS_ADD(psock->stats.rx_bytes, eaten); + + return eaten; +} + +/* Called with lock held on lower socket */ +static int psock_tcp_read_sock(struct kcm_psock *psock) +{ + read_descriptor_t desc; + + desc.arg.data = psock; + desc.error = 0; + desc.count = 1; /* give more than one skb per call */ + + /* sk should be locked here, so okay to do tcp_read_sock */ + tcp_read_sock(psock->sk, &desc, kcm_tcp_recv); + + unreserve_rx_kcm(psock, true); + + return desc.error; +} + +/* Lower sock lock held */ +static void psock_tcp_data_ready(struct sock *sk) +{ + struct kcm_psock *psock; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock || psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + if (psock->rx_need_bytes) { + if (tcp_inq(sk) >= psock->rx_need_bytes) + psock->rx_need_bytes = 0; + else + goto out; + } + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void do_psock_rx_work(struct kcm_psock *psock) +{ + read_descriptor_t rd_desc; + struct sock *csk = psock->sk; + + /* We need the read lock to synchronize with psock_tcp_data_ready. We + * need the socket lock for calling tcp_read_sock. + */ + lock_sock(csk); + read_lock_bh(&csk->sk_callback_lock); + + if (unlikely(csk->sk_user_data != psock)) + goto out; + + if (unlikely(psock->rx_stopped)) + goto out; + + if (psock->ready_rx_msg) + goto out; + + rd_desc.arg.data = psock; + + if (psock_tcp_read_sock(psock) == -ENOMEM) + queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0); + +out: + read_unlock_bh(&csk->sk_callback_lock); + release_sock(csk); +} + +static void psock_rx_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, rx_work)); +} + +static void psock_rx_delayed_work(struct work_struct *w) +{ + do_psock_rx_work(container_of(w, struct kcm_psock, + rx_delayed_work.work)); +} + +static void psock_tcp_state_change(struct sock *sk) +{ + /* TCP only does a POLLIN for a half close. Do a POLLHUP here + * since application will normally not poll with POLLIN + * on the TCP sockets. + */ + + report_csk_error(sk, EPIPE); +} + +static void psock_tcp_write_space(struct sock *sk) +{ + struct kcm_psock *psock; + struct kcm_mux *mux; + struct kcm_sock *kcm; + + read_lock_bh(&sk->sk_callback_lock); + + psock = (struct kcm_psock *)sk->sk_user_data; + if (unlikely(!psock)) + goto out; + + mux = psock->mux; + + spin_lock_bh(&mux->lock); + + /* Check if the socket is reserved so someone is waiting for sending. */ + kcm = psock->tx_kcm; + if (kcm) + queue_work(kcm_wq, &kcm->tx_work); + + spin_unlock_bh(&mux->lock); +out: + read_unlock_bh(&sk->sk_callback_lock); +} + +static void unreserve_psock(struct kcm_sock *kcm); + +/* kcm sock is locked. */ +static struct kcm_psock *reserve_psock(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + + psock = kcm->tx_psock; + + smp_rmb(); /* Must read tx_psock before tx_wait */ + + if (psock) { + WARN_ON(kcm->tx_wait); + if (unlikely(psock->tx_stopped)) + unreserve_psock(kcm); + else + return kcm->tx_psock; + } + + spin_lock_bh(&mux->lock); + + /* Check again under lock to see if psock was reserved for this + * psock via psock_unreserve. + */ + psock = kcm->tx_psock; + if (unlikely(psock)) { + WARN_ON(kcm->tx_wait); + spin_unlock_bh(&mux->lock); + return kcm->tx_psock; + } + + if (!list_empty(&mux->psocks_avail)) { + psock = list_first_entry(&mux->psocks_avail, + struct kcm_psock, + psock_avail_list); + list_del(&psock->psock_avail_list); + if (kcm->tx_wait) { + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + kcm->tx_psock = psock; + psock->tx_kcm = kcm; + KCM_STATS_INCR(psock->stats.reserved); + } else if (!kcm->tx_wait) { + list_add_tail(&kcm->wait_psock_list, + &mux->kcm_tx_waiters); + kcm->tx_wait = true; + } + + spin_unlock_bh(&mux->lock); + + return psock; +} + +/* mux lock held */ +static void psock_now_avail(struct kcm_psock *psock) +{ + struct kcm_mux *mux = psock->mux; + struct kcm_sock *kcm; + + if (list_empty(&mux->kcm_tx_waiters)) { + list_add_tail(&psock->psock_avail_list, + &mux->psocks_avail); + } else { + kcm = list_first_entry(&mux->kcm_tx_waiters, + struct kcm_sock, + wait_psock_list); + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + psock->tx_kcm = kcm; + + /* Commit before changing tx_psock since that is read in + * reserve_psock before queuing work. + */ + smp_mb(); + + kcm->tx_psock = psock; + KCM_STATS_INCR(psock->stats.reserved); + queue_work(kcm_wq, &kcm->tx_work); + } +} + +/* kcm sock is locked. */ +static void unreserve_psock(struct kcm_sock *kcm) +{ + struct kcm_psock *psock; + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + + psock = kcm->tx_psock; + + if (WARN_ON(!psock)) { + spin_unlock_bh(&mux->lock); + return; + } + + smp_rmb(); /* Read tx_psock before tx_wait */ + + kcm_update_tx_mux_stats(mux, psock); + + WARN_ON(kcm->tx_wait); + + kcm->tx_psock = NULL; + psock->tx_kcm = NULL; + KCM_STATS_INCR(psock->stats.unreserved); + + if (unlikely(psock->tx_stopped)) { + if (psock->done) { + /* Deferred free */ + list_del(&psock->psock_list); + mux->psocks_cnt--; + sock_put(psock->sk); + fput(psock->sk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } + + /* Don't put back on available list */ + + spin_unlock_bh(&mux->lock); + + return; + } + + psock_now_avail(psock); + + spin_unlock_bh(&mux->lock); +} + +static void kcm_report_tx_retry(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + spin_lock_bh(&mux->lock); + KCM_STATS_INCR(mux->stats.tx_retries); + spin_unlock_bh(&mux->lock); +} + +/* Write any messages ready on the kcm socket. Called with kcm sock lock + * held. Return bytes actually sent or error. + */ +static int kcm_write_msgs(struct kcm_sock *kcm) +{ + struct sock *sk = &kcm->sk; + struct kcm_psock *psock; + struct sk_buff *skb, *head; + struct kcm_tx_msg *txm; + unsigned short fragidx, frag_offset; + unsigned int sent, total_sent = 0; + int ret = 0; + + kcm->tx_wait_more = false; + psock = kcm->tx_psock; + if (unlikely(psock && psock->tx_stopped)) { + /* A reserved psock was aborted asynchronously. Unreserve + * it and we'll retry the message. + */ + unreserve_psock(kcm); + kcm_report_tx_retry(kcm); + if (skb_queue_empty(&sk->sk_write_queue)) + return 0; + + kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0; + + } else if (skb_queue_empty(&sk->sk_write_queue)) { + return 0; + } + + head = skb_peek(&sk->sk_write_queue); + txm = kcm_tx_msg(head); + + if (txm->sent) { + /* Send of first skbuff in queue already in progress */ + if (WARN_ON(!psock)) { + ret = -EINVAL; + goto out; + } + sent = txm->sent; + frag_offset = txm->frag_offset; + fragidx = txm->fragidx; + skb = txm->frag_skb; + + goto do_frag; + } + +try_again: + psock = reserve_psock(kcm); + if (!psock) + goto out; + + do { + skb = head; + txm = kcm_tx_msg(head); + sent = 0; + +do_frag_list: + if (WARN_ON(!skb_shinfo(skb)->nr_frags)) { + ret = -EINVAL; + goto out; + } + + for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; + fragidx++) { + skb_frag_t *frag; + + frag_offset = 0; +do_frag: + frag = &skb_shinfo(skb)->frags[fragidx]; + if (WARN_ON(!frag->size)) { + ret = -EINVAL; + goto out; + } + + ret = kernel_sendpage(psock->sk->sk_socket, + frag->page.p, + frag->page_offset + frag_offset, + frag->size - frag_offset, + MSG_DONTWAIT); + if (ret <= 0) { + if (ret == -EAGAIN) { + /* Save state to try again when there's + * write space on the socket + */ + txm->sent = sent; + txm->frag_offset = frag_offset; + txm->fragidx = fragidx; + txm->frag_skb = skb; + + ret = 0; + goto out; + } + + /* Hard failure in sending message, abort this + * psock since it has lost framing + * synchonization and retry sending the + * message from the beginning. + */ + kcm_abort_tx_psock(psock, ret ? -ret : EPIPE, + true); + unreserve_psock(kcm); + + txm->sent = 0; + kcm_report_tx_retry(kcm); + ret = 0; + + goto try_again; + } + + sent += ret; + frag_offset += ret; + KCM_STATS_ADD(psock->stats.tx_bytes, ret); + if (frag_offset < frag->size) { + /* Not finished with this frag */ + goto do_frag; + } + } + + if (skb == head) { + if (skb_has_frag_list(skb)) { + skb = skb_shinfo(skb)->frag_list; + goto do_frag_list; + } + } else if (skb->next) { + skb = skb->next; + goto do_frag_list; + } + + /* Successfully sent the whole packet, account for it. */ + skb_dequeue(&sk->sk_write_queue); + kfree_skb(head); + sk->sk_wmem_queued -= sent; + total_sent += sent; + KCM_STATS_INCR(psock->stats.tx_msgs); + } while ((head = skb_peek(&sk->sk_write_queue))); +out: + if (!head) { + /* Done with all queued messages. */ + WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); + unreserve_psock(kcm); + } + + /* Check if write space is available */ + sk->sk_write_space(sk); + + return total_sent ? : ret; +} + +static void kcm_tx_work(struct work_struct *w) +{ + struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work); + struct sock *sk = &kcm->sk; + int err; + + lock_sock(sk); + + /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx + * aborts + */ + err = kcm_write_msgs(kcm); + if (err < 0) { + /* Hard failure in write, report error on KCM socket */ + pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err); + report_csk_error(&kcm->sk, -err); + goto out; + } + + /* Primarily for SOCK_SEQPACKET sockets */ + if (likely(sk->sk_socket) && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_space(sk); + } + +out: + release_sock(sk); +} + +static void kcm_push(struct kcm_sock *kcm) +{ + if (kcm->tx_wait_more) + kcm_write_msgs(kcm); +} + +static ssize_t kcm_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) + +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + bool eor; + int err = 0; + int i; + + if (flags & MSG_SENDPAGE_NOTLAST) + flags |= MSG_MORE; + + /* No MSG_EOR from splice, only look at MSG_MORE */ + eor = !(flags & MSG_MORE); + + lock_sock(sk); + + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + err = -EPIPE; + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + goto coalesced; + } + + if (i >= MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + while (!tskb) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + i = 0; + } + } else { + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + skb = head; + i = 0; + } + + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + +coalesced: + skb->len += size; + skb->data_len += size; + skb->truesize += size; + sk->sk_wmem_queued += size; + sk_mem_charge(sk, size); + + if (head != skb) { + head->len += size; + head->data_len += size; + head->truesize += size; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, size); + + release_sock(sk); + return size; + +out_error: + kcm_push(kcm); + + err = sk_stream_error(sk, flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + struct sk_buff *skb = NULL, *head = NULL; + size_t copy, copied = 0; + long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + int eor = (sock->type == SOCK_DGRAM) ? + !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR); + int err = -EPIPE; + + lock_sock(sk); + + /* Per tcp_sendmsg this should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err) + goto out_error; + + if (kcm->seq_skb) { + /* Previously opened message */ + head = kcm->seq_skb; + skb = kcm_tx_msg(head)->last_skb; + goto start; + } + + /* Call the sk_stream functions to manage the sndbuf mem. */ + if (!sk_stream_memory_free(sk)) { + kcm_push(kcm); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + /* New message, alloc head skb */ + head = alloc_skb(0, sk->sk_allocation); + while (!head) { + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + + head = alloc_skb(0, sk->sk_allocation); + } + + skb = head; + + /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling + * csum_and_copy_from_iter from skb_do_copy_data_nocache. + */ + skb->ip_summed = CHECKSUM_UNNECESSARY; + +start: + while (msg_data_left(msg)) { + bool merge = true; + int i = skb_shinfo(skb)->nr_frags; + struct page_frag *pfrag = sk_page_frag(sk); + + if (!sk_page_frag_refill(sk, pfrag)) + goto wait_for_memory; + + if (!skb_can_coalesce(skb, i, pfrag->page, + pfrag->offset)) { + if (i == MAX_SKB_FRAGS) { + struct sk_buff *tskb; + + tskb = alloc_skb(0, sk->sk_allocation); + if (!tskb) + goto wait_for_memory; + + if (head == skb) + skb_shinfo(head)->frag_list = tskb; + else + skb->next = tskb; + + skb = tskb; + skb->ip_summed = CHECKSUM_UNNECESSARY; + continue; + } + merge = false; + } + + copy = min_t(int, msg_data_left(msg), + pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, copy)) + goto wait_for_memory; + + err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, + pfrag->page, + pfrag->offset, + copy); + if (err) + goto out_error; + + /* Update the skb. */ + if (merge) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + skb_fill_page_desc(skb, i, pfrag->page, + pfrag->offset, copy); + get_page(pfrag->page); + } + + pfrag->offset += copy; + copied += copy; + if (head != skb) { + head->len += copy; + head->data_len += copy; + } + + continue; + +wait_for_memory: + kcm_push(kcm); + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_error; + } + + if (eor) { + bool not_busy = skb_queue_empty(&sk->sk_write_queue); + + /* Message complete, queue it on send buffer */ + __skb_queue_tail(&sk->sk_write_queue, head); + kcm->seq_skb = NULL; + KCM_STATS_INCR(kcm->stats.tx_msgs); + + if (msg->msg_flags & MSG_BATCH) { + kcm->tx_wait_more = true; + } else if (kcm->tx_wait_more || not_busy) { + err = kcm_write_msgs(kcm); + if (err < 0) { + /* We got a hard error in write_msgs but have + * already queued this message. Report an error + * in the socket, but don't affect return value + * from sendmsg + */ + pr_warn("KCM: Hard failure on kcm_write_msgs\n"); + report_csk_error(&kcm->sk, -err); + } + } + } else { + /* Message not complete, save state */ +partial_message: + kcm->seq_skb = head; + kcm_tx_msg(head)->last_skb = skb; + } + + KCM_STATS_ADD(kcm->stats.tx_bytes, copied); + + release_sock(sk); + return copied; + +out_error: + kcm_push(kcm); + + if (copied && sock->type == SOCK_SEQPACKET) { + /* Wrote some bytes before encountering an + * error, return partial success. + */ + goto partial_message; + } + + if (head != kcm->seq_skb) + kfree_skb(head); + + err = sk_stream_error(sk, msg->msg_flags, err); + + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN)) + sk->sk_write_space(sk); + + release_sock(sk); + return err; +} + +static struct sk_buff *kcm_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct sk_buff *skb; + + while (!(skb = skb_peek(&sk->sk_receive_queue))) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + sk_wait_data(sk, &timeo, NULL); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int kcm_recvmsg(struct socket *sock, struct msghdr *msg, + size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + int err = 0; + long timeo; + struct kcm_rx_msg *rxm; + int copied = 0; + struct sk_buff *skb; + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + err = skb_copy_datagram_msg(skb, rxm->offset, msg, len); + if (err < 0) + goto out; + + copied = len; + if (likely(!(flags & MSG_PEEK))) { + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + if (copied < rxm->full_len) { + if (sock->type == SOCK_DGRAM) { + /* Truncated message */ + msg->msg_flags |= MSG_TRUNC; + goto msg_finished; + } + rxm->offset += copied; + rxm->full_len -= copied; + } else { +msg_finished: + /* Finished with message */ + msg->msg_flags |= MSG_EOR; + KCM_STATS_INCR(kcm->stats.rx_msgs); + skb_unlink(skb, &sk->sk_receive_queue); + kfree_skb(skb); + } + } + +out: + release_sock(sk); + + return copied ? : err; +} + +static ssize_t kcm_sock_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + +static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm = kcm_sk(sk); + long timeo; + struct kcm_rx_msg *rxm; + int err = 0; + size_t copied; + struct sk_buff *skb; + + /* Only support splice for SOCKSEQPACKET */ + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + skb = kcm_wait_data(sk, flags, timeo, &err); + if (!skb) + goto err_out; + + /* Okay, have a message on the receive queue */ + + rxm = kcm_rx_msg(skb); + + if (len > rxm->full_len) + len = rxm->full_len; + + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags, + kcm_sock_splice); + if (copied < 0) { + err = copied; + goto err_out; + } + + KCM_STATS_ADD(kcm->stats.rx_bytes, copied); + + rxm->offset += copied; + rxm->full_len -= copied; + + /* We have no way to return MSG_EOR. If all the bytes have been + * read we still leave the message in the receive socket buffer. + * A subsequent recvmsg needs to be done to return MSG_EOR and + * finish reading the message. + */ + + release_sock(sk); + + return copied; + +err_out: + release_sock(sk); + + return err; +} + +/* kcm sock lock held */ +static void kcm_recv_disable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 1; + + /* If a psock is reserved we'll do cleanup in unreserve */ + if (!kcm->rx_psock) { + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + + requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); + } + + spin_unlock_bh(&mux->rx_lock); +} + +/* kcm sock lock held */ +static void kcm_recv_enable(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + + if (!kcm->rx_disabled) + return; + + spin_lock_bh(&mux->rx_lock); + + kcm->rx_disabled = 0; + kcm_rcv_ready(kcm); + + spin_unlock_bh(&mux->rx_lock); +} + +static int kcm_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, valbool; + int err = 0; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EINVAL; + + valbool = val ? 1 : 0; + + switch (optname) { + case KCM_RECV_DISABLE: + lock_sock(&kcm->sk); + if (valbool) + kcm_recv_disable(kcm); + else + kcm_recv_enable(kcm); + release_sock(&kcm->sk); + break; + default: + err = -ENOPROTOOPT; + } + + return err; +} + +static int kcm_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + int val, len; + + if (level != SOL_KCM) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + if (len < 0) + return -EINVAL; + + switch (optname) { + case KCM_RECV_DISABLE: + val = kcm->rx_disabled; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux) +{ + struct kcm_sock *tkcm; + struct list_head *head; + int index = 0; + + /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so + * we set sk_state, otherwise epoll_wait always returns right away with + * POLLHUP + */ + kcm->sk.sk_state = TCP_ESTABLISHED; + + /* Add to mux's kcm sockets list */ + kcm->mux = mux; + spin_lock_bh(&mux->lock); + + head = &mux->kcm_socks; + list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) { + if (tkcm->index != index) + break; + head = &tkcm->kcm_sock_list; + index++; + } + + list_add(&kcm->kcm_sock_list, head); + kcm->index = index; + + mux->kcm_socks_cnt++; + spin_unlock_bh(&mux->lock); + + INIT_WORK(&kcm->tx_work, kcm_tx_work); + + spin_lock_bh(&mux->rx_lock); + kcm_rcv_ready(kcm); + spin_unlock_bh(&mux->rx_lock); +} + +static void kcm_rx_msg_timeout(unsigned long arg) +{ + struct kcm_psock *psock = (struct kcm_psock *)arg; + + /* Message assembly timed out */ + KCM_STATS_INCR(psock->stats.rx_msg_timeouts); + kcm_abort_rx_psock(psock, ETIMEDOUT, NULL); +} + +static int kcm_attach(struct socket *sock, struct socket *csock, + struct bpf_prog *prog) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct sock *csk; + struct kcm_psock *psock = NULL, *tpsock; + struct list_head *head; + int index = 0; + + if (csock->ops->family != PF_INET && + csock->ops->family != PF_INET6) + return -EINVAL; + + csk = csock->sk; + if (!csk) + return -EINVAL; + + /* Only support TCP for now */ + if (csk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); + if (!psock) + return -ENOMEM; + + psock->mux = mux; + psock->sk = csk; + psock->bpf_prog = prog; + + setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout, + (unsigned long)psock); + + INIT_WORK(&psock->rx_work, psock_rx_work); + INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work); + + sock_hold(csk); + + write_lock_bh(&csk->sk_callback_lock); + psock->save_data_ready = csk->sk_data_ready; + psock->save_write_space = csk->sk_write_space; + psock->save_state_change = csk->sk_state_change; + csk->sk_user_data = psock; + csk->sk_data_ready = psock_tcp_data_ready; + csk->sk_write_space = psock_tcp_write_space; + csk->sk_state_change = psock_tcp_state_change; + write_unlock_bh(&csk->sk_callback_lock); + + /* Finished initialization, now add the psock to the MUX. */ + spin_lock_bh(&mux->lock); + head = &mux->psocks; + list_for_each_entry(tpsock, &mux->psocks, psock_list) { + if (tpsock->index != index) + break; + head = &tpsock->psock_list; + index++; + } + + list_add(&psock->psock_list, head); + psock->index = index; + + KCM_STATS_INCR(mux->stats.psock_attach); + mux->psocks_cnt++; + psock_now_avail(psock); + spin_unlock_bh(&mux->lock); + + /* Schedule RX work in case there are already bytes queued */ + queue_work(kcm_wq, &psock->rx_work); + + return 0; +} + +static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info) +{ + struct socket *csock; + struct bpf_prog *prog; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + prog = bpf_prog_get(info->bpf_fd); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out; + } + + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + err = -EINVAL; + goto out; + } + + err = kcm_attach(sock, csock, prog); + if (err) { + bpf_prog_put(prog); + goto out; + } + + /* Keep reference on file also */ + + return 0; +out: + fput(csock->file); + return err; +} + +static void kcm_unattach(struct kcm_psock *psock) +{ + struct sock *csk = psock->sk; + struct kcm_mux *mux = psock->mux; + + /* Stop getting callbacks from TCP socket. After this there should + * be no way to reserve a kcm for this psock. + */ + write_lock_bh(&csk->sk_callback_lock); + csk->sk_user_data = NULL; + csk->sk_data_ready = psock->save_data_ready; + csk->sk_write_space = psock->save_write_space; + csk->sk_state_change = psock->save_state_change; + psock->rx_stopped = 1; + + if (WARN_ON(psock->rx_kcm)) { + write_unlock_bh(&csk->sk_callback_lock); + return; + } + + spin_lock_bh(&mux->rx_lock); + + /* Stop receiver activities. After this point psock should not be + * able to get onto ready list either through callbacks or work. + */ + if (psock->ready_rx_msg) { + list_del(&psock->psock_ready_list); + kfree_skb(psock->ready_rx_msg); + psock->ready_rx_msg = NULL; + KCM_STATS_INCR(mux->stats.rx_ready_drops); + } + + spin_unlock_bh(&mux->rx_lock); + + write_unlock_bh(&csk->sk_callback_lock); + + del_timer_sync(&psock->rx_msg_timer); + cancel_work_sync(&psock->rx_work); + cancel_delayed_work_sync(&psock->rx_delayed_work); + + bpf_prog_put(psock->bpf_prog); + + kfree_skb(psock->rx_skb_head); + psock->rx_skb_head = NULL; + + spin_lock_bh(&mux->lock); + + aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats); + + KCM_STATS_INCR(mux->stats.psock_unattach); + + if (psock->tx_kcm) { + /* psock was reserved. Just mark it finished and we will clean + * up in the kcm paths, we need kcm lock which can not be + * acquired here. + */ + KCM_STATS_INCR(mux->stats.psock_unattach_rsvd); + spin_unlock_bh(&mux->lock); + + /* We are unattaching a socket that is reserved. Abort the + * socket since we may be out of sync in sending on it. We need + * to do this without the mux lock. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + + spin_lock_bh(&mux->lock); + if (!psock->tx_kcm) { + /* psock now unreserved in window mux was unlocked */ + goto no_reserved; + } + psock->done = 1; + + /* Commit done before queuing work to process it */ + smp_mb(); + + /* Queue tx work to make sure psock->done is handled */ + queue_work(kcm_wq, &psock->tx_kcm->tx_work); + spin_unlock_bh(&mux->lock); + } else { +no_reserved: + if (!psock->tx_stopped) + list_del(&psock->psock_avail_list); + list_del(&psock->psock_list); + mux->psocks_cnt--; + spin_unlock_bh(&mux->lock); + + sock_put(csk); + fput(csk->sk_socket->file); + kmem_cache_free(kcm_psockp, psock); + } +} + +static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info) +{ + struct kcm_sock *kcm = kcm_sk(sock->sk); + struct kcm_mux *mux = kcm->mux; + struct kcm_psock *psock; + struct socket *csock; + struct sock *csk; + int err; + + csock = sockfd_lookup(info->fd, &err); + if (!csock) + return -ENOENT; + + csk = csock->sk; + if (!csk) { + err = -EINVAL; + goto out; + } + + err = -ENOENT; + + spin_lock_bh(&mux->lock); + + list_for_each_entry(psock, &mux->psocks, psock_list) { + if (psock->sk != csk) + continue; + + /* Found the matching psock */ + + if (psock->unattaching || WARN_ON(psock->done)) { + err = -EALREADY; + break; + } + + psock->unattaching = 1; + + spin_unlock_bh(&mux->lock); + + kcm_unattach(psock); + + err = 0; + goto out; + } + + spin_unlock_bh(&mux->lock); + +out: + fput(csock->file); + return err; +} + +static struct proto kcm_proto = { + .name = "KCM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct kcm_sock), +}; + +/* Clone a kcm socket. */ +static int kcm_clone(struct socket *osock, struct kcm_clone *info, + struct socket **newsockp) +{ + struct socket *newsock; + struct sock *newsk; + struct file *newfile; + int err, newfd; + + err = -ENFILE; + newsock = sock_alloc(); + if (!newsock) + goto out; + + newsock->type = osock->type; + newsock->ops = osock->ops; + + __module_get(newsock->ops->owner); + + newfd = get_unused_fd_flags(0); + if (unlikely(newfd < 0)) { + err = newfd; + goto out_fd_fail; + } + + newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name); + if (unlikely(IS_ERR(newfile))) { + err = PTR_ERR(newfile); + goto out_sock_alloc_fail; + } + + newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL, + &kcm_proto, true); + if (!newsk) { + err = -ENOMEM; + goto out_sk_alloc_fail; + } + + sock_init_data(newsock, newsk); + init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux); + + fd_install(newfd, newfile); + *newsockp = newsock; + info->fd = newfd; + + return 0; + +out_sk_alloc_fail: + fput(newfile); +out_sock_alloc_fail: + put_unused_fd(newfd); +out_fd_fail: + sock_release(newsock); +out: + return err; +} + +static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int err; + + switch (cmd) { + case SIOCKCMATTACH: { + struct kcm_attach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_attach_ioctl(sock, &info); + + break; + } + case SIOCKCMUNATTACH: { + struct kcm_unattach info; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_unattach_ioctl(sock, &info); + + break; + } + case SIOCKCMCLONE: { + struct kcm_clone info; + struct socket *newsock = NULL; + + if (copy_from_user(&info, (void __user *)arg, sizeof(info))) + err = -EFAULT; + + err = kcm_clone(sock, &info, &newsock); + + if (!err) { + if (copy_to_user((void __user *)arg, &info, + sizeof(info))) { + err = -EFAULT; + sock_release(newsock); + } + } + + break; + } + default: + err = -ENOIOCTLCMD; + break; + } + + return err; +} + +static void free_mux(struct rcu_head *rcu) +{ + struct kcm_mux *mux = container_of(rcu, + struct kcm_mux, rcu); + + kmem_cache_free(kcm_muxp, mux); +} + +static void release_mux(struct kcm_mux *mux) +{ + struct kcm_net *knet = mux->knet; + struct kcm_psock *psock, *tmp_psock; + + /* Release psocks */ + list_for_each_entry_safe(psock, tmp_psock, + &mux->psocks, psock_list) { + if (!WARN_ON(psock->unattaching)) + kcm_unattach(psock); + } + + if (WARN_ON(mux->psocks_cnt)) + return; + + __skb_queue_purge(&mux->rx_hold_queue); + + mutex_lock(&knet->mutex); + aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats); + aggregate_psock_stats(&mux->aggregate_psock_stats, + &knet->aggregate_psock_stats); + list_del_rcu(&mux->kcm_mux_list); + knet->count--; + mutex_unlock(&knet->mutex); + + call_rcu(&mux->rcu, free_mux); +} + +static void kcm_done(struct kcm_sock *kcm) +{ + struct kcm_mux *mux = kcm->mux; + struct sock *sk = &kcm->sk; + int socks_cnt; + + spin_lock_bh(&mux->rx_lock); + if (kcm->rx_psock) { + /* Cleanup in unreserve_rx_kcm */ + WARN_ON(kcm->done); + kcm->rx_disabled = 1; + kcm->done = 1; + spin_unlock_bh(&mux->rx_lock); + return; + } + + if (kcm->rx_wait) { + list_del(&kcm->wait_rx_list); + kcm->rx_wait = false; + } + /* Move any pending receive messages to other kcm sockets */ + requeue_rx_msgs(mux, &sk->sk_receive_queue); + + spin_unlock_bh(&mux->rx_lock); + + if (WARN_ON(sk_rmem_alloc_get(sk))) + return; + + /* Detach from MUX */ + spin_lock_bh(&mux->lock); + + list_del(&kcm->kcm_sock_list); + mux->kcm_socks_cnt--; + socks_cnt = mux->kcm_socks_cnt; + + spin_unlock_bh(&mux->lock); + + if (!socks_cnt) { + /* We are done with the mux now. */ + release_mux(mux); + } + + WARN_ON(kcm->rx_wait); + + sock_put(&kcm->sk); +} + +/* Called by kcm_release to close a KCM socket. + * If this is the last KCM socket on the MUX, destroy the MUX. + */ +static int kcm_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct kcm_sock *kcm; + struct kcm_mux *mux; + struct kcm_psock *psock; + + if (!sk) + return 0; + + kcm = kcm_sk(sk); + mux = kcm->mux; + + sock_orphan(sk); + kfree_skb(kcm->seq_skb); + + lock_sock(sk); + /* Purge queue under lock to avoid race condition with tx_work trying + * to act when queue is nonempty. If tx_work runs after this point + * it will just return. + */ + __skb_queue_purge(&sk->sk_write_queue); + release_sock(sk); + + spin_lock_bh(&mux->lock); + if (kcm->tx_wait) { + /* Take of tx_wait list, after this point there should be no way + * that a psock will be assigned to this kcm. + */ + list_del(&kcm->wait_psock_list); + kcm->tx_wait = false; + } + spin_unlock_bh(&mux->lock); + + /* Cancel work. After this point there should be no outside references + * to the kcm socket. + */ + cancel_work_sync(&kcm->tx_work); + + lock_sock(sk); + psock = kcm->tx_psock; + if (psock) { + /* A psock was reserved, so we need to kill it since it + * may already have some bytes queued from a message. We + * need to do this after removing kcm from tx_wait list. + */ + kcm_abort_tx_psock(psock, EPIPE, false); + unreserve_psock(kcm); + } + release_sock(sk); + + WARN_ON(kcm->tx_wait); + WARN_ON(kcm->tx_psock); + + sock->sk = NULL; + + kcm_done(kcm); + + return 0; +} + +static const struct proto_ops kcm_dgram_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, +}; + +static const struct proto_ops kcm_seqpacket_ops = { + .family = PF_KCM, + .owner = THIS_MODULE, + .release = kcm_release, + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = kcm_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = kcm_setsockopt, + .getsockopt = kcm_getsockopt, + .sendmsg = kcm_sendmsg, + .recvmsg = kcm_recvmsg, + .mmap = sock_no_mmap, + .sendpage = kcm_sendpage, + .splice_read = kcm_splice_read, +}; + +/* Create proto operation for kcm sockets */ +static int kcm_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + struct sock *sk; + struct kcm_mux *mux; + + switch (sock->type) { + case SOCK_DGRAM: + sock->ops = &kcm_dgram_ops; + break; + case SOCK_SEQPACKET: + sock->ops = &kcm_seqpacket_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + if (protocol != KCMPROTO_CONNECTED) + return -EPROTONOSUPPORT; + + sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern); + if (!sk) + return -ENOMEM; + + /* Allocate a kcm mux, shared between KCM sockets */ + mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL); + if (!mux) { + sk_free(sk); + return -ENOMEM; + } + + spin_lock_init(&mux->lock); + spin_lock_init(&mux->rx_lock); + INIT_LIST_HEAD(&mux->kcm_socks); + INIT_LIST_HEAD(&mux->kcm_rx_waiters); + INIT_LIST_HEAD(&mux->kcm_tx_waiters); + + INIT_LIST_HEAD(&mux->psocks); + INIT_LIST_HEAD(&mux->psocks_ready); + INIT_LIST_HEAD(&mux->psocks_avail); + + mux->knet = knet; + + /* Add new MUX to list */ + mutex_lock(&knet->mutex); + list_add_rcu(&mux->kcm_mux_list, &knet->mux_list); + knet->count++; + mutex_unlock(&knet->mutex); + + skb_queue_head_init(&mux->rx_hold_queue); + + /* Init KCM socket */ + sock_init_data(sock, sk); + init_kcm_sock(kcm_sk(sk), mux); + + return 0; +} + +static struct net_proto_family kcm_family_ops = { + .family = PF_KCM, + .create = kcm_create, + .owner = THIS_MODULE, +}; + +static __net_init int kcm_init_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + INIT_LIST_HEAD_RCU(&knet->mux_list); + mutex_init(&knet->mutex); + + return 0; +} + +static __net_exit void kcm_exit_net(struct net *net) +{ + struct kcm_net *knet = net_generic(net, kcm_net_id); + + /* All KCM sockets should be closed at this point, which should mean + * that all multiplexors and psocks have been destroyed. + */ + WARN_ON(!list_empty(&knet->mux_list)); +} + +static struct pernet_operations kcm_net_ops = { + .init = kcm_init_net, + .exit = kcm_exit_net, + .id = &kcm_net_id, + .size = sizeof(struct kcm_net), +}; + +static int __init kcm_init(void) +{ + int err = -ENOMEM; + + kcm_muxp = kmem_cache_create("kcm_mux_cache", + sizeof(struct kcm_mux), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_muxp) + goto fail; + + kcm_psockp = kmem_cache_create("kcm_psock_cache", + sizeof(struct kcm_psock), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (!kcm_psockp) + goto fail; + + kcm_wq = create_singlethread_workqueue("kkcmd"); + if (!kcm_wq) + goto fail; + + err = proto_register(&kcm_proto, 1); + if (err) + goto fail; + + err = sock_register(&kcm_family_ops); + if (err) + goto sock_register_fail; + + err = register_pernet_device(&kcm_net_ops); + if (err) + goto net_ops_fail; + + err = kcm_proc_init(); + if (err) + goto proc_init_fail; + + return 0; + +proc_init_fail: + unregister_pernet_device(&kcm_net_ops); + +net_ops_fail: + sock_unregister(PF_KCM); + +sock_register_fail: + proto_unregister(&kcm_proto); + +fail: + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); + + if (kcm_wq) + destroy_workqueue(kcm_wq); + + return err; +} + +static void __exit kcm_exit(void) +{ + kcm_proc_exit(); + unregister_pernet_device(&kcm_net_ops); + sock_unregister(PF_KCM); + proto_unregister(&kcm_proto); + destroy_workqueue(kcm_wq); + + kmem_cache_destroy(kcm_muxp); + kmem_cache_destroy(kcm_psockp); +} + +module_init(kcm_init); +module_exit(kcm_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_KCM); + diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 1b8a5caa221e..3a8f881b22f1 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -327,7 +327,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } /* prepare A-MPDU MLME for Rx aggregation */ - tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL); + tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); if (!tid_agg_rx) goto end; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 1630975c89f1..804575ff7af5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -92,7 +92,7 @@ struct ieee80211_fragment_entry { u16 extra_len; u16 last_frag; u8 rx_queue; - bool ccmp; /* Whether fragments were encrypted with CCMP */ + bool check_sequential_pn; /* needed for CCMP/GCMP */ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ }; diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 3ece7d1034c8..b54f398cda5d 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma); + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 3928dbd24e25..370d677b547b 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) (max_tp_group != MINSTREL_CCK_GROUP)) return; + max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; + max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_ewma); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; - max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; - max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); @@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) } else { if (mrs->prob_ewma > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma) + if (mrs->prob_ewma > max_gpr_prob) mg->max_group_prob_rate = index; } } @@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (likely(sta->ampdu_mlme.tid_tx[tid])) return; - ieee80211_start_tx_ba_session(pubsta, tid, 5000); + ieee80211_start_tx_ba_session(pubsta, tid, 0); } static void @@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ - if (offset > 0 && + if (offset > 0 || (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; @@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) prob = mi->groups[i].rates[j].prob_ewma; /* convert tp_avg from pkt per second in kbps */ - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024; + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; + tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5690e4c67486..dc27becb9b71 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1777,7 +1777,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, entry->seq = seq; entry->rx_queue = rx_queue; entry->last_frag = frag; - entry->ccmp = 0; + entry->check_sequential_pn = false; entry->extra_len = 0; return entry; @@ -1873,15 +1873,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, &(rx->skb)); if (rx->key && (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) && + rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && ieee80211_has_protected(fc)) { int queue = rx->security_idx; - /* Store CCMP PN so that we can verify that the next - * fragment has a sequential PN value. */ - entry->ccmp = 1; + + /* Store CCMP/GCMP PN so that we can verify that the + * next fragment has a sequential PN value. + */ + entry->check_sequential_pn = true; memcpy(entry->last_pn, rx->key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); + BUILD_BUG_ON(offsetof(struct ieee80211_key, + u.ccmp.rx_pn) != + offsetof(struct ieee80211_key, + u.gcmp.rx_pn)); + BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) != + sizeof(rx->key->u.gcmp.rx_pn[queue])); + BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN != + IEEE80211_GCMP_PN_LEN); } return RX_QUEUED; } @@ -1896,15 +1908,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* Verify that MPDUs within one MSDU have sequential PN values. - * (IEEE 802.11i, 8.3.3.4.5) */ - if (entry->ccmp) { + /* "The receiver shall discard MSDUs and MMPDUs whose constituent + * MPDU PN values are not incrementing in steps of 1." + * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP) + * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP) + */ + if (entry->check_sequential_pn) { int i; u8 pn[IEEE80211_CCMP_PN_LEN], *rpn; int queue; + if (!rx->key || (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256)) + rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256)) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN); for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) { @@ -3473,6 +3491,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; /* ignore action frames to TDLS-peers */ if (ieee80211_is_action(hdr->frame_control) && + !is_broadcast_ether_addr(bssid) && !ether_addr_equal(bssid, hdr->addr1)) return false; } diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 0328f7250693..299edc6add5a 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = { int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - INIT_LIST_HEAD(&ipvs->app_list); - proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops); + proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops); return 0; } void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - unregister_ip_vs_app(ipvs, NULL /* all */); - remove_proc_entry("ip_vs_app", net->proc_net); + remove_proc_entry("ip_vs_app", ipvs->net->proc_net); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e7c1b052c2a3..404b2a4f4b5b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; - pr_info("%s: enter\n", __func__); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services--; @@ -3947,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; int i, idx; /* Initialize rs_table */ @@ -3974,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) spin_lock_init(&ipvs->tot_stats.lock); - proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops); - proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops); - proc_create("ip_vs_stats_percpu", 0, net->proc_net, + proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops); + proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops); + proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net, &ip_vs_stats_percpu_fops); if (ip_vs_control_net_init_sysctl(ipvs)) @@ -3991,13 +3988,11 @@ err: void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { - struct net *net = ipvs->net; - ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); - remove_proc_entry("ip_vs_stats_percpu", net->proc_net); - remove_proc_entry("ip_vs_stats", net->proc_net); - remove_proc_entry("ip_vs", net->proc_net); + remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); + remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); + remove_proc_entry("ip_vs", ipvs->net->proc_net); free_percpu(ipvs->tot_stats.cpustats); } diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 9aea747b43ea..81b5ad6165ac 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -17,7 +17,9 @@ #include <net/netfilter/nft_masq.h> const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { - [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(nft_masq_policy); @@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); struct nft_masq *priv = nft_expr_priv(expr); int err; @@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx, if (err) return err; - if (tb[NFTA_MASQ_FLAGS] == NULL) - return 0; - - priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); - if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + if (tb[NFTA_MASQ_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + + if (tb[NFTA_MASQ_REG_PROTO_MIN]) { + priv->sreg_proto_min = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]); + + err = nft_validate_register_load(priv->sreg_proto_min, plen); + if (err < 0) + return err; + + if (tb[NFTA_MASQ_REG_PROTO_MAX]) { + priv->sreg_proto_max = + nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]); + + err = nft_validate_register_load(priv->sreg_proto_max, + plen); + if (err < 0) + return err; + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } + } return 0; } @@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_masq *priv = nft_expr_priv(expr); - if (priv->flags == 0) - return 0; - - if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + if (priv->flags != 0 && + nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) goto nla_put_failure; + if (priv->sreg_proto_min) { + if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN, + priv->sreg_proto_min) || + nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX, + priv->sreg_proto_max)) + goto nla_put_failure; + } + return 0; nla_put_failure: diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index fe885bf271c5..16c50b0dd426 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -28,6 +28,8 @@ #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */ +static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state); + void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr, *dest = sock_cgroup_classid(&sk->sk_cgrp_data); break; #endif + case NFT_META_PRANDOM: { + struct rnd_state *state = this_cpu_ptr(&nft_prandom_state); + *dest = prandom_u32_state(state); + break; + } default: WARN_ON(1); goto err; @@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_OIFNAME: len = IFNAMSIZ; break; + case NFT_META_PRANDOM: + prandom_init_once(&nft_prandom_state); + len = sizeof(u32); + break; default: return -EOPNOTSUPP; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c8a0b7da5ff4..d0cd2b9bf844 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -694,12 +694,45 @@ EXPORT_SYMBOL(xt_free_table_info); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { - struct xt_table *t; + struct xt_table *t, *found = NULL; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &net->xt.tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; + + if (net == &init_net) + goto out; + + /* Table doesn't exist in this netns, re-try init */ + list_for_each_entry(t, &init_net.xt.tables[af], list) { + if (strcmp(t->name, name)) + continue; + if (!try_module_get(t->me)) + return NULL; + + mutex_unlock(&xt[af].mutex); + if (t->table_init(net) != 0) { + module_put(t->me); + return NULL; + } + + found = t; + + mutex_lock(&xt[af].mutex); + break; + } + + if (!found) + goto out; + + /* and once again: */ + list_for_each_entry(t, &net->xt.tables[af], list) + if (strcmp(t->name, name) == 0) + return t; + + module_put(found->me); + out: mutex_unlock(&xt[af].mutex); return NULL; } @@ -1170,20 +1203,20 @@ static const struct file_operations xt_target_ops = { #endif /* CONFIG_PROC_FS */ /** - * xt_hook_link - set up hooks for a new table + * xt_hook_ops_alloc - set up hooks for a new table * @table: table with metadata needed to set up hooks * @fn: Hook function * - * This function will take care of creating and registering the necessary - * Netfilter hooks for XT tables. + * This function will create the nf_hook_ops that the x_table needs + * to hand to xt_hook_link_net(). */ -struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) +struct nf_hook_ops * +xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) { unsigned int hook_mask = table->valid_hooks; uint8_t i, num_hooks = hweight32(hook_mask); uint8_t hooknum; struct nf_hook_ops *ops; - int ret; ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL); if (ops == NULL) @@ -1200,27 +1233,9 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn) ++i; } - ret = nf_register_hooks(ops, num_hooks); - if (ret < 0) { - kfree(ops); - return ERR_PTR(ret); - } - return ops; } -EXPORT_SYMBOL_GPL(xt_hook_link); - -/** - * xt_hook_unlink - remove hooks for a table - * @ops: nf_hook_ops array as returned by nf_hook_link - * @hook_mask: the very same mask that was passed to nf_hook_link - */ -void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops) -{ - nf_unregister_hooks(ops, hweight32(table->valid_hooks)); - kfree(ops); -} -EXPORT_SYMBOL_GPL(xt_hook_unlink); +EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); int xt_proto_init(struct net *net, u_int8_t af) { diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 4e3c3affd285..2455b69b5810 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -262,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) if (f->opt[optnum].kind == (*optp)) { __u32 len = f->opt[optnum].length; const __u8 *optend = optp + len; - int loop_cont = 0; fmatch = FMATCH_OK; @@ -275,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: - loop_cont = 1; break; } diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index f0cb92f3ddaf..ada67422234b 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl { static DEFINE_SPINLOCK(netlbl_domhsh_lock); #define netlbl_domhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock)) -static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL; -static struct netlbl_dom_map *netlbl_domhsh_def = NULL; +static struct netlbl_domhsh_tbl *netlbl_domhsh; +static struct netlbl_dom_map *netlbl_domhsh_def; /* * Domain Hash Table Helper Functions diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index b0380927f05f..9eaa9a1e8629 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg { static DEFINE_SPINLOCK(netlbl_unlhsh_lock); #define netlbl_unlhsh_rcu_deref(p) \ rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock)) -static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL; -static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL; +static struct netlbl_unlhsh_tbl *netlbl_unlhsh; +static struct netlbl_unlhsh_iface *netlbl_unlhsh_def; /* Accept unlabeled packets flag */ -static u8 netlabel_unlabel_acceptflg = 0; +static u8 netlabel_unlabel_acceptflg; /* NetLabel Generic NETLINK unlabeled family */ static struct genl_family netlbl_unlabel_gnl_family = { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index d41b1074cb2d..1ecfa710ca98 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1915,6 +1915,10 @@ retry: goto retry; } + if (!dev_validate_header(dev, skb->data, len)) { + err = -EINVAL; + goto out_unlock; + } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; @@ -2393,18 +2397,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static bool ll_header_truncated(const struct net_device *dev, int len) -{ - /* net device doesn't like empty head */ - if (unlikely(len < dev->hard_header_len)) { - net_warn_ratelimited("%s: packet size is too short (%d < %d)\n", - current->comm, len, dev->hard_header_len); - return true; - } - - return false; -} - static void tpacket_set_protocol(const struct net_device *dev, struct sk_buff *skb) { @@ -2522,16 +2514,20 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { + int hdrlen = min_t(int, copylen, tp_len); + skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); - err = skb_store_bits(skb, 0, data, copylen); + err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; + if (!dev_validate_header(dev, skb->data, hdrlen)) + return -EINVAL; if (!skb->protocol) tpacket_set_protocol(dev, skb); - data += copylen; - to_write -= copylen; + data += hdrlen; + to_write -= hdrlen; } offset = offset_in_page(data); @@ -2703,13 +2699,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } - if (dev->hard_header_len) { - if (ll_header_truncated(dev, tp_len)) { - tp_len = -EINVAL; - goto tpacket_error; - } - copylen = max_t(int, copylen, dev->hard_header_len); - } + copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), @@ -2905,9 +2895,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; - } else { - if (ll_header_truncated(dev, len)) - goto out_free; } /* Returns -EFAULT on error */ @@ -2915,6 +2902,12 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (err) goto out_free; + if (sock->type == SOCK_RAW && + !dev_validate_header(dev, skb->data, len)) { + err = -EINVAL; + goto out_free; + } + sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 6e7ec257790d..c589a9ba506a 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(ife_get_meta_u16); int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval) { - mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL); + mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL); if (!mi->metaval) return -ENOMEM; @@ -118,7 +118,7 @@ EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval) { - mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL); + mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL); if (!mi->metaval) return -ENOMEM; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 89c41a1f3589..350e134cffb3 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -66,6 +66,7 @@ static void ipt_destroy_target(struct xt_entry_target *t) struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, + .family = NFPROTO_IPV4, }; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -219,6 +220,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, par.hooknum = ipt->tcfi_hook; par.target = ipt->tcfi_t->u.kernel.target; par.targinfo = ipt->tcfi_t->data; + par.family = NFPROTO_IPV4; ret = par.target->target(skb, &par); switch (ret) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 95b021243233..2181ffc76638 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head) kfree(f); } +static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, 0)) + return; + + offload.command = TC_CLSFLOWER_DESTROY; + offload.cookie = cookie; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + +static void fl_hw_replace_filter(struct tcf_proto *tp, + struct flow_dissector *dissector, + struct fl_flow_key *mask, + struct fl_flow_key *key, + struct tcf_exts *actions, + unsigned long cookie, u32 flags) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, flags)) + return; + + offload.command = TC_CLSFLOWER_REPLACE; + offload.cookie = cookie; + offload.dissector = dissector; + offload.mask = mask; + offload.key = key; + offload.exts = actions; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force) return false; list_for_each_entry_safe(f, next, &head->filters, list) { + fl_hw_destroy_filter(tp, (unsigned long)f); list_del_rcu(&f->list); call_rcu(&f->rcu, fl_destroy_filter); } @@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr *tb[TCA_FLOWER_MAX + 1]; struct fl_flow_mask mask = {}; + u32 flags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } fnew->handle = handle; + if (tb[TCA_FLOWER_FLAGS]) + flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); + err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) goto errout; @@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, head->ht_params); if (err) goto errout; - if (fold) + + fl_hw_replace_filter(tp, + &head->dissector, + &mask.key, + &fnew->key, + &fnew->exts, + (unsigned long)fnew, + flags); + + if (fold) { rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); + fl_hw_destroy_filter(tp, (unsigned long)fold); + } *arg = (unsigned long) fnew; @@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); list_del_rcu(&f->list); + fl_hw_destroy_filter(tp, (unsigned long)f); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fl_destroy_filter); return 0; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index d0dff0cd8186..34b4ddaca27c 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -276,7 +276,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - skb = p->q->ops->dequeue(p->q); + skb = qdisc_dequeue_peeked(p->q); if (skb == NULL) return NULL; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ec529121f38a..ce46f1c7f133 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, } return 0; } + if (addr1->v6.sin6_port != addr2->v6.sin6_port) + return 0; if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ diff --git a/net/sctp/proc.c b/net/sctp/proc.c index cfc3c7101a38..5cfac8d5d3b3 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -480,7 +480,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; - struct sctp_transport *tsp; + struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " @@ -488,10 +488,10 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) return 0; } - tsp = (struct sctp_transport *)v; - if (!sctp_transport_hold(tsp)) + transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) return 0; - assoc = tsp->asoc; + assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { @@ -544,7 +544,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(tsp); + sctp_transport_put(transport); return 0; } diff --git a/net/socket.c b/net/socket.c index c044d1e8508c..886649c88d8f 100644 --- a/net/socket.c +++ b/net/socket.c @@ -533,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = { * NULL is returned. */ -static struct socket *sock_alloc(void) +struct socket *sock_alloc(void) { struct inode *inode; struct socket *sock; @@ -554,6 +554,7 @@ static struct socket *sock_alloc(void) this_cpu_add(sockets_in_use, 1); return sock; } +EXPORT_SYMBOL(sock_alloc); /** * sock_release - close a socket @@ -1874,7 +1875,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, - struct used_address *used_address) + struct used_address *used_address, + unsigned int allowed_msghdr_flags) { struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; @@ -1900,6 +1902,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, if (msg_sys->msg_controllen > INT_MAX) goto out_freeiov; + flags |= (msg_sys->msg_flags & allowed_msghdr_flags); ctl_len = msg_sys->msg_controllen; if ((MSG_CMSG_COMPAT & flags) && ctl_len) { err = @@ -1978,7 +1981,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags) if (!sock) goto out; - err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0); fput_light(sock->file, fput_needed); out: @@ -2005,6 +2008,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; struct used_address used_address; + unsigned int oflags = flags; if (vlen > UIO_MAXIOV) vlen = UIO_MAXIOV; @@ -2019,11 +2023,15 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, entry = mmsg; compat_entry = (struct compat_mmsghdr __user *)mmsg; err = 0; + flags |= MSG_BATCH; while (datagrams < vlen) { + if (datagrams == vlen - 1) + flags = oflags; + if (MSG_CMSG_COMPAT & flags) { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); @@ -2031,7 +2039,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, } else { err = ___sys_sendmsg(sock, (struct user_msghdr __user *)entry, - &msg_sys, flags, &used_address); + &msg_sys, flags, &used_address, MSG_EOR); if (err < 0) break; err = put_user(err, &entry->msg_len); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 799e65b944b9..cabf586f47d7 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -740,7 +740,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) default: printk(KERN_CRIT "%s: bad return from " "gss_fill_context: %zd\n", __func__, err); - BUG(); + gss_msg->msg.errno = -EIO; } goto err_release_msg; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 2b32fd602669..273bc3a35425 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1225,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize) if (bp[0] == '\\' && bp[1] == 'x') { /* HEX STRING */ bp += 2; - while (len < bufsize) { + while (len < bufsize - 1) { int h, l; h = hex_to_bin(bp[0]); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index cc1251d07297..2dcd7640eeb5 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -341,6 +341,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; rqst->rq_xid = headerp->rm_xid; + + rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); buf = &rqst->rq_rcv_buf; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 47f7da58a7f0..8b5833c1ff2e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1093,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; + int err; - switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); + err = switchdev_port_obj_dump(dev, &dump.fdb.obj, + switchdev_port_fdb_dump_cb); + cb->args[1] = err; return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index e401108360a2..ae469b37d852 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -412,11 +412,6 @@ enomem: return -ENOMEM; } -void tipc_bcast_reinit(struct net *net) -{ - tipc_link_reinit(tipc_bc_sndlink(net), tipc_own_addr(net)); -} - void tipc_bcast_stop(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 1944c6c00bb9..d5e79b3767fd 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -46,7 +46,6 @@ struct tipc_node_map; extern const char tipc_bclink_name[]; int tipc_bcast_init(struct net *net); -void tipc_bcast_reinit(struct net *net); void tipc_bcast_stop(struct net *net); void tipc_bcast_add_peer(struct net *net, struct tipc_link *l, struct sk_buff_head *xmitq); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 802ffad3200d..27a5406213c6 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -40,6 +40,7 @@ #include "link.h" #include "discover.h" #include "bcast.h" +#include "netlink.h" #define MAX_ADDR_STR 60 @@ -54,23 +55,6 @@ static struct tipc_media * const media_info_array[] = { NULL }; -static const struct nla_policy -tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { - [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_BEARER_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_BEARER_NAME - }, - [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { - [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, - [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } -}; - static void bearer_disable(struct net *net, struct tipc_bearer *b); /** diff --git a/net/tipc/link.c b/net/tipc/link.c index e31d92f80572..7d2bb3e70baa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1,7 +1,7 @@ /* * net/tipc/link.c: TIPC link code * - * Copyright (c) 1996-2007, 2012-2015, Ericsson AB + * Copyright (c) 1996-2007, 2012-2016, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -127,6 +127,7 @@ struct tipc_link { /* Management and link supervision data */ u32 peer_session; + u32 session; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; @@ -136,11 +137,7 @@ struct tipc_link { u16 peer_caps; bool active; u32 silent_intv_cnt; - struct { - unchar hdr[INT_H_SIZE]; - unchar body[TIPC_MAX_IF_NAME]; - } proto_msg; - struct tipc_msg *pmsg; + char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; @@ -195,14 +192,6 @@ struct tipc_link { static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -/* Properties valid for media, bearar and link */ -static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { - [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, - [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } -}; - /* Send states for broadcast NACKs */ enum { @@ -215,10 +204,11 @@ enum { * Interval between NACKs when packets arrive out of order */ #define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) -/* - * Out-of-range value for link session numbers + +/* Wildcard value for link session numbers. When it is known that + * peer endpoint is down, any session number must be accepted. */ -#define WILDCARD_SESSION 0x10000 +#define ANY_SESSION 0x10000 /* Link FSM states: */ @@ -398,16 +388,6 @@ char *tipc_link_name(struct tipc_link *l) return l->name; } -static u32 link_own_addr(struct tipc_link *l) -{ - return msg_prevnode(l->pmsg); -} - -void tipc_link_reinit(struct tipc_link *l, u32 addr) -{ - msg_set_prevnode(l->pmsg, addr); -} - /** * tipc_link_create - create a new link * @n: pointer to associated node @@ -441,29 +421,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct tipc_link **link) { struct tipc_link *l; - struct tipc_msg *hdr; l = kzalloc(sizeof(*l), GFP_ATOMIC); if (!l) return false; *link = l; - l->pmsg = (struct tipc_msg *)&l->proto_msg; - hdr = l->pmsg; - tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer); - msg_set_size(hdr, sizeof(l->proto_msg)); - msg_set_session(hdr, session); - msg_set_bearer_id(hdr, l->bearer_id); + l->session = session; /* Note: peer i/f name is completed by reset/activate message */ sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - strcpy((char *)msg_data(hdr), if_name); - + strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; l->net = net; - l->peer_session = WILDCARD_SESSION; + l->peer_session = ANY_SESSION; l->bearer_id = bearer_id; l->tolerance = tolerance; l->net_plane = net_plane; @@ -790,7 +763,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) struct tipc_msg *msg = buf_msg(skb_peek(list)); int imp = msg_importance(msg); u32 oport = msg_origport(msg); - u32 addr = link_own_addr(link); + u32 addr = tipc_own_addr(link->net); struct sk_buff *skb; /* This really cannot happen... */ @@ -839,16 +812,9 @@ void link_prepare_wakeup(struct tipc_link *l) void tipc_link_reset(struct tipc_link *l) { - /* Link is down, accept any session */ - l->peer_session = WILDCARD_SESSION; - - /* If peer is up, it only accepts an incremented session number */ - msg_set_session(l->pmsg, msg_session(l->pmsg) + 1); - - /* Prepare for renewed mtu size negotiation */ + l->peer_session = ANY_SESSION; + l->session++; l->mtu = l->advertised_mtu; - - /* Clean up all queues and counters: */ __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); skb_queue_splice_init(&l->wakeupq, l->inputq); @@ -1156,7 +1122,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) /* Broadcast ACK must be sent via a unicast link => defer to caller */ if (link_is_bc_rcvlink(l)) { - if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf) + if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) return 0; l->rcv_unacked = 0; return TIPC_LINK_SND_BC_ACK; @@ -1268,15 +1234,30 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) { - struct sk_buff *skb = NULL; - struct tipc_msg *hdr = l->pmsg; + struct sk_buff *skb; + struct tipc_msg *hdr; + struct sk_buff_head *dfq = &l->deferdq; bool node_up = link_is_up(l->bc_rcvlink); /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) return; - msg_set_type(hdr, mtyp); + if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) + return; + + if (!skb_queue_empty(dfq)) + rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; + + skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, + TIPC_MAX_IF_NAME, l->addr, + tipc_own_addr(l->net), 0, 0, 0); + if (!skb) + return; + + hdr = buf_msg(skb); + msg_set_session(hdr, l->session); + msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); msg_set_next_sent(hdr, l->snd_nxt); msg_set_ack(hdr, l->rcv_nxt - 1); @@ -1286,36 +1267,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, msg_set_linkprio(hdr, priority); msg_set_redundant_link(hdr, node_up); msg_set_seq_gap(hdr, 0); - - /* Compatibility: created msg must not be in sequence with pkt flow */ msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { - if (!tipc_link_is_up(l)) - return; - - /* Override rcvgap if there are packets in deferred queue */ - if (!skb_queue_empty(&l->deferdq)) - rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt; - if (rcvgap) { - msg_set_seq_gap(hdr, rcvgap); - l->stats.sent_nacks++; - } + msg_set_seq_gap(hdr, rcvgap); + msg_set_size(hdr, INT_H_SIZE); msg_set_probe(hdr, probe); - if (probe) - l->stats.sent_probes++; l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - msg_set_ack(hdr, l->rcv_nxt - 1); - msg_set_next_sent(hdr, 1); + strcpy(msg_data(hdr), l->if_name); } - skb = tipc_buf_acquire(msg_size(hdr)); - if (!skb) - return; - skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + if (probe) + l->stats.sent_probes++; + if (rcvgap) + l->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); } @@ -1340,7 +1308,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, - BASIC_H_SIZE, 0, l->addr, link_own_addr(l), + BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, TIPC_ERR_NO_PORT); if (!skb) { pr_warn("%sunable to create tunnel packet\n", link_co_err); @@ -1351,7 +1319,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, __skb_queue_purge(&tmpxq); /* Initialize reusable tunnel packet header */ - tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL, + tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); msg_set_msgcnt(&tnlhdr, pktcnt); @@ -1410,7 +1378,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (tipc_link_is_blocked(l) || !xmitq) goto exit; - if (link_own_addr(l) > msg_prevnode(hdr)) + if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); switch (mtyp) { @@ -1418,7 +1386,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* Ignore duplicate RESET with old session number */ if ((less_eq(msg_session(hdr), l->peer_session)) && - (l->peer_session != WILDCARD_SESSION)) + (l->peer_session != ANY_SESSION)) break; /* fall thru' */ @@ -1515,7 +1483,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast, u16 gap_to = peers_snd_nxt - 1; skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, - 0, l->addr, link_own_addr(l), 0, 0, 0); + 0, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return false; hdr = buf_msg(skb); @@ -1670,7 +1638,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, if (mtyp != STATE_MSG) return 0; - if (dnode == link_own_addr(l)) { + if (dnode == tipc_own_addr(l->net)) { tipc_link_bc_ack_rcv(l, acked, xmitq); rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq); l->stats.recv_nacks++; diff --git a/net/tipc/link.h b/net/tipc/link.h index b4ee9d6e181d..6a94175ee20a 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -86,7 +86,6 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, struct tipc_link **link); -void tipc_link_reinit(struct tipc_link *l, u32 addr); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 777b979b8463..e190460fe0d3 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -47,12 +47,6 @@ #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ -static const struct nla_policy -tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { - [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } -}; - /** * struct name_info - name sequence publication info * @node_list: circular list of publications made by own node diff --git a/net/tipc/net.c b/net/tipc/net.c index 77bf9113c7a7..28bf4feeb81c 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -41,11 +41,7 @@ #include "socket.h" #include "node.h" #include "bcast.h" - -static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { - [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NET_ID] = { .type = NLA_U32 } -}; +#include "netlink.h" /* * The TIPC locking policy is designed to ensure a very fine locking @@ -116,7 +112,6 @@ int tipc_net_start(struct net *net, u32 addr) tn->own_addr = addr; tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_bcast_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, TIPC_ZONE_SCOPE, 0, tn->own_addr); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 8975b0135b76..56935df2167a 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } }; +const struct nla_policy +tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { + [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { + [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, + [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } +}; + +const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { + [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NET_ID] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_LINK_NAME }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { + [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } +}; + +/* Properties valid for media, bearer and link */ +const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { + [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { + [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, + .len = TIPC_MAX_BEARER_NAME }, + [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } +}; + +const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { + [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, + [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } +}; + +const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { + [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, + [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, + [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, + .len = sizeof(struct sockaddr_storage)}, +}; + /* Users of the legacy API (tipc-config) can't handle that we add operations, * so we have a separate genl handling for the new API. */ diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index 08a1db67b927..ed1dbcb4afbd 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -35,6 +35,7 @@ #ifndef _TIPC_NETLINK_H #define _TIPC_NETLINK_H +#include <net/netlink.h> extern struct genl_family tipc_genl_family; int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); @@ -45,6 +46,16 @@ struct tipc_nl_msg { u32 seq; }; +extern const struct nla_policy tipc_nl_name_table_policy[]; +extern const struct nla_policy tipc_nl_sock_policy[]; +extern const struct nla_policy tipc_nl_net_policy[]; +extern const struct nla_policy tipc_nl_link_policy[]; +extern const struct nla_policy tipc_nl_node_policy[]; +extern const struct nla_policy tipc_nl_prop_policy[]; +extern const struct nla_policy tipc_nl_bearer_policy[]; +extern const struct nla_policy tipc_nl_media_policy[]; +extern const struct nla_policy tipc_nl_udp_policy[]; + int tipc_netlink_start(void); int tipc_netlink_compat_start(void); void tipc_netlink_stop(void); diff --git a/net/tipc/node.c b/net/tipc/node.c index cdb79503d890..ace178fd3850 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -41,6 +41,7 @@ #include "socket.h" #include "bcast.h" #include "discover.h" +#include "netlink.h" #define INVALID_NODE_SIG 0x10000 @@ -164,28 +165,6 @@ struct tipc_sock_conn { struct list_head list; }; -static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { - [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_LINK_NAME] = { - .type = NLA_STRING, - .len = TIPC_MAX_LINK_NAME - }, - [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, - [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, - [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, - [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } -}; - -static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { - [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } -}; - static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; @@ -843,7 +822,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); - if (reset && !tipc_link_is_reset(l)) + if (reset && l && !tipc_link_is_reset(l)) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 69c29050f14a..3eeb50a27b89 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -42,6 +42,7 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "netlink.h" #define SS_LISTENING -1 /* socket is listening */ #define SS_READY -2 /* socket is connectionless */ @@ -126,14 +127,6 @@ static const struct proto_ops stream_ops; static const struct proto_ops msg_ops; static struct proto tipc_proto; -static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { - [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, - [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, - [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, - [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } -}; - static const struct rhashtable_params tsk_rht_params; /* @@ -673,7 +666,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct iov_iter save = msg->msg_iter; uint mtu; int rc; @@ -687,14 +680,16 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, msg_set_nameupper(mhdr, seq->upper); msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + skb_queue_head_init(&pktchain); + new_mtu: mtu = tipc_bcast_get_mtu(net); - rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); if (unlikely(rc < 0)) return rc; do { - rc = tipc_bcast_xmit(net, pktchain); + rc = tipc_bcast_xmit(net, &pktchain); if (likely(!rc)) return dsz; @@ -704,7 +699,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; @@ -863,7 +858,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; u32 dnode, dport; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct sk_buff *skb; struct tipc_name_seq *seq; struct iov_iter save; @@ -924,17 +919,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) msg_set_hdr_sz(mhdr, BASIC_H_SIZE); } + skb_queue_head_init(&pktchain); save = m->msg_iter; new_mtu: mtu = tipc_node_get_mtu(net, dnode, tsk->portid); - rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain); if (rc < 0) return rc; do { - skb = skb_peek(pktchain); + skb = skb_peek(&pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); + rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; @@ -946,7 +942,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; @@ -1016,7 +1012,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); u32 portid = tsk->portid; int rc = -EINVAL; @@ -1044,17 +1040,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); dnode = tsk_peer_node(tsk); + skb_queue_head_init(&pktchain); next: save = m->msg_iter; mtu = tsk->max_pkt; send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); if (unlikely(rc < 0)) return rc; + do { if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_node_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; @@ -1063,7 +1061,7 @@ next: goto next; } if (rc == -EMSGSIZE) { - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1077,7 +1075,7 @@ next: rc = tipc_wait_for_sndpkt(sock, &timeo); } while (!rc); - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); return sent ? sent : rc; } diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 22963cafd5ed..e6cb386fbf34 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -326,7 +326,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, return tipc_subscrp_cancel(s, subscriber); } - tipc_subscrp_subscribe(net, s, subscriber, swap); + if (s) + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index d63a911e7fe2..c94f9a15e2cd 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -48,20 +48,13 @@ #include <linux/tipc_netlink.h> #include "core.h" #include "bearer.h" +#include "netlink.h" /* IANA assigned UDP port */ #define UDP_PORT_DEFAULT 6118 #define UDP_MIN_HEADROOM 28 -static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = { - [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC}, - [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, - [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY, - .len = sizeof(struct sockaddr_storage)}, -}; - /** * struct udp_media_addr - IP/UDP addressing information * @@ -181,6 +174,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, err = PTR_ERR(rt); goto tx_error; } + + skb->dev = rt->dst.dev; ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->udp_port, @@ -201,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, ndst->dev, &src->ipv6, - &dst->ipv6, 0, ttl, src->udp_port, + &dst->ipv6, 0, ttl, 0, src->udp_port, dst->udp_port, false); #endif } @@ -274,7 +269,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, struct udp_media_addr *remote) { struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; - struct sockaddr_storage *sa_local, *sa_remote; + struct sockaddr_storage sa_local, sa_remote; if (!attrs[TIPC_NLA_BEARER_UDP_OPTS]) goto err; @@ -283,41 +278,48 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub, tipc_nl_udp_policy)) goto err; if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) { - sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]); - sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]); + nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL], + sizeof(sa_local)); + nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE], + sizeof(sa_remote)); } else { err: pr_err("Invalid UDP bearer configuration"); return -EINVAL; } - if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) { + if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) { struct sockaddr_in *ip4; - ip4 = (struct sockaddr_in *)sa_local; + ip4 = (struct sockaddr_in *)&sa_local; local->proto = htons(ETH_P_IP); local->udp_port = ip4->sin_port; local->ipv4.s_addr = ip4->sin_addr.s_addr; - ip4 = (struct sockaddr_in *)sa_remote; + ip4 = (struct sockaddr_in *)&sa_remote; remote->proto = htons(ETH_P_IP); remote->udp_port = ip4->sin_port; remote->ipv4.s_addr = ip4->sin_addr.s_addr; return 0; #if IS_ENABLED(CONFIG_IPV6) - } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) { + } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) { + int atype; struct sockaddr_in6 *ip6; - ip6 = (struct sockaddr_in6 *)sa_local; + ip6 = (struct sockaddr_in6 *)&sa_local; + atype = ipv6_addr_type(&ip6->sin6_addr); + if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id) + return -EINVAL; + local->proto = htons(ETH_P_IPV6); local->udp_port = ip6->sin6_port; - local->ipv6 = ip6->sin6_addr; + memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); ub->ifindex = ip6->sin6_scope_id; - ip6 = (struct sockaddr_in6 *)sa_remote; + ip6 = (struct sockaddr_in6 *)&sa_remote; remote->proto = htons(ETH_P_IPV6); remote->udp_port = ip6->sin6_port; - remote->ipv6 = ip6->sin6_addr; + memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr)); return 0; #endif } diff --git a/net/wireless/core.c b/net/wireless/core.c index 3a9c41bc849a..9f1c4aa851ef 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1157,6 +1157,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, return NOTIFY_DONE; } + wireless_nlevent_flush(); + return NOTIFY_OK; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 90890f183c0e..98c924260b3d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7554,7 +7554,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) && no_ht) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 79bd3a171caa..544558171787 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); + /* stop critical protocol if supported */ + if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { + rdev->crit_proto_nlportid = 0; + rdev_crit_proto_stop(rdev, wdev); + } + /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index c8717c1d082e..b50ee5d622e1 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -342,6 +342,40 @@ static const int compat_event_type_size[] = { /* IW event code */ +void wireless_nlevent_flush(void) +{ + struct sk_buff *skb; + struct net *net; + + ASSERT_RTNL(); + + for_each_net(net) { + while ((skb = skb_dequeue(&net->wext_nlevents))) + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, + GFP_KERNEL); + } +} +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); + +static int wext_netdev_notifier_call(struct notifier_block *nb, + unsigned long state, void *ptr) +{ + /* + * When a netdev changes state in any way, flush all pending messages + * to avoid them going out in a strange order, e.g. RTM_NEWLINK after + * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() + * or similar - all of which could otherwise happen due to delays from + * schedule_work(). + */ + wireless_nlevent_flush(); + + return NOTIFY_OK; +} + +static struct notifier_block wext_netdev_notifier = { + .notifier_call = wext_netdev_notifier_call, +}; + static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); @@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = { static int __init wireless_nlevent_init(void) { - return register_pernet_subsys(&wext_pernet_ops); + int err = register_pernet_subsys(&wext_pernet_ops); + + if (err) + return err; + + return register_netdevice_notifier(&wext_netdev_notifier); } subsys_initcall(wireless_nlevent_init); @@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - struct sk_buff *skb; - struct net *net; - rtnl_lock(); - - for_each_net(net) { - while ((skb = skb_dequeue(&net->wext_nlevents))) - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, - GFP_KERNEL); - } - + wireless_nlevent_flush(); rtnl_unlock(); } |