Diffstat (limited to 'net')
189 files changed, 10308 insertions, 2642 deletions
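Editor's note: the first hunks below (net/8021q/vlan_dev.c, net/bridge/br_device.c) adapt drivers to the ndo_get_stats64 callback returning void instead of a pointer to the stats structure. A minimal sketch of the new callback shape is shown here for orientation; the driver name and body are hypothetical and not part of this patch, only the signature change is taken from the diff.

#include <linux/netdevice.h>

/* Hypothetical driver callback illustrating the void-returning
 * ndo_get_stats64 signature used throughout this series: the caller
 * supplies *stats, the driver fills it in place, nothing is returned.
 */
static void example_get_stats64(struct net_device *dev,
				struct rtnl_link_stats64 *stats)
{
	stats->rx_packets = dev->stats.rx_packets;
	stats->tx_packets = dev->stats.tx_packets;
}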
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 10da6c588bf8..116455ac3db5 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -671,7 +671,8 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, return 0; } -static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +static void vlan_dev_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; @@ -702,8 +703,6 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; - - return stats; } #ifdef CONFIG_NET_POLL_CONTROLLER diff --git a/net/Kconfig b/net/Kconfig index a29bb4b41c50..92ae1500d9e1 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -57,6 +57,7 @@ source "net/packet/Kconfig" source "net/unix/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" +source "net/smc/Kconfig" config INET bool "TCP/IP networking" diff --git a/net/Makefile b/net/Makefile index 4cafaa2b4667..5d6e0e5ff7f8 100644 --- a/net/Makefile +++ b/net/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_MAC80211) += mac80211/ obj-$(CONFIG_TIPC) += tipc/ obj-$(CONFIG_NETLABEL) += netlabel/ obj-$(CONFIG_IUCV) += iucv/ +obj-$(CONFIG_SMC) += smc/ obj-$(CONFIG_RFKILL) += rfkill/ obj-$(CONFIG_NET_9P) += 9p/ obj-$(CONFIG_CAIF) += caif/ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed3b3192fb00..6c46d1b4cdbb 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -153,8 +153,8 @@ static int br_dev_stop(struct net_device *dev) return 0; } -static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void br_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct net_bridge *br = netdev_priv(dev); struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -178,8 +178,6 @@ static struct rtnl_link_stats64 *br_get_stats64(struct net_device *dev, stats->tx_packets = sum.tx_packets; stats->rx_bytes = sum.rx_bytes; stats->rx_packets = sum.rx_packets; - - return stats; } static int br_change_mtu(struct net_device *dev, int new_mtu) diff --git a/net/compat.c b/net/compat.c index 96c544b05b15..ba3ac722714d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -90,11 +90,11 @@ int get_compat_msghdr(struct msghdr *kmsg, #define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) #define CMSG_COMPAT_DATA(cmsg) \ - ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) + ((void __user *)((char __user *)(cmsg) + sizeof(struct compat_cmsghdr))) #define CMSG_COMPAT_SPACE(len) \ - (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) + (sizeof(struct compat_cmsghdr) + CMSG_COMPAT_ALIGN(len)) #define CMSG_COMPAT_LEN(len) \ - (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) + (sizeof(struct compat_cmsghdr) + (len)) #define CMSG_COMPAT_FIRSTHDR(msg) \ (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? 
\ @@ -130,6 +130,9 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, __kernel_size_t kcmlen, tmp; int err = -EFAULT; + BUILD_BUG_ON(sizeof(struct compat_cmsghdr) != + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr))); + kcmlen = 0; kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); @@ -141,8 +144,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) return -EINVAL; - tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + - CMSG_ALIGN(sizeof(struct cmsghdr))); + tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); tmp = CMSG_ALIGN(tmp); kcmlen += tmp; ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); @@ -168,8 +170,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, goto Efault; if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + - CMSG_ALIGN(sizeof(struct cmsghdr))); + tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; @@ -178,7 +179,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) + (ucmlen - sizeof(*ucmsg)))) goto Efault; /* Advance. */ diff --git a/net/core/dev.c b/net/core/dev.c index 07b307b0b414..ad5959e56116 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2408,28 +2408,6 @@ void netif_schedule_queue(struct netdev_queue *txq) } EXPORT_SYMBOL(netif_schedule_queue); -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. - */ -void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { - struct Qdisc *q; - - rcu_read_lock(); - q = rcu_dereference(txq->qdisc); - __netif_schedule(q); - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(netif_wake_subqueue); - void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { @@ -3153,9 +3131,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) if (!cl) return skb; - /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set - * earlier by the caller. - */ + /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res, false)) { @@ -3320,7 +3296,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS if (static_key_false(&egress_needed)) { skb = sch_handle_egress(skb, &rc, dev); @@ -3427,7 +3403,11 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ +int dev_rx_weight __read_mostly = 64; +int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -3916,7 +3896,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res, false)) { @@ -4089,12 +4069,8 @@ another_round: goto out; } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; - } -#endif + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; @@ -4122,10 +4098,8 @@ skip_taps: goto out; } #endif -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -ncls: -#endif + skb_reset_tc(skb); +skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -4835,7 +4809,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = weight_p; + napi->weight = dev_rx_weight; while (again) { struct sk_buff *skb; diff --git a/net/core/filter.c b/net/core/filter.c index 1969b3f118c1..90383860e224 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1416,8 +1416,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1447,8 +1447,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_RAW_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -1601,10 +1601,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2306,8 +2306,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = 
ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -2377,8 +2377,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2412,8 +2412,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; @@ -2483,8 +2483,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2509,8 +2509,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * @@ -2593,8 +2593,8 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * @@ -2776,11 +2776,33 @@ static bool __is_valid_access(int off, int size) { if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + if (size == sizeof(__u16) && + off > offsetof(struct __sk_buff, cb[4]) + sizeof(__u16)) + return false; + if (size == sizeof(__u32) && + off > offsetof(struct __sk_buff, cb[4])) + return false; + if (size == sizeof(__u64) && + off > offsetof(struct __sk_buff, cb[2])) + return false; + if (size != sizeof(__u8) && + size != sizeof(__u16) && + size != sizeof(__u32) && + size != sizeof(__u64)) + return false; + break; + default: + if (size != sizeof(__u32)) + return false; + } return true; } @@ -2799,7 +2821,7 @@ static bool sk_filter_is_valid_access(int off, int size, if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2823,7 +2845,7 @@ static bool lwt_is_valid_access(int off, int size, case offsetof(struct __sk_buff, mark): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: break; default: return false; @@ -2915,7 +2937,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case offsetof(struct __sk_buff, tc_index): case offsetof(struct __sk_buff, priority): case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: case offsetof(struct __sk_buff, tc_classid): break; default: @@ -2972,32 +2994,33 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; + int off; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, len): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, len)); break; case offsetof(struct __sk_buff, protocol): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, protocol)); break; case offsetof(struct __sk_buff, vlan_proto): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, vlan_proto)); break; @@ -3005,17 +3028,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, priority)); break; case offsetof(struct __sk_buff, ingress_ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, skb_iif)); break; @@ -3023,17 +3046,17 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct __sk_buff, hash): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, hash)); break; @@ -3041,63 +3064,77 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sk_buff, mark)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, + 
si->src_reg, insn); case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, + si->src_reg, insn); case offsetof(struct __sk_buff, vlan_present): return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, - dst_reg, src_reg, insn); + si->dst_reg, si->src_reg, insn); case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); prog->cb_access = 1; - ctx_off -= offsetof(struct __sk_buff, cb[0]); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, data); + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): - ctx_off -= offsetof(struct __sk_buff, tc_classid); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_end): - ctx_off -= offsetof(struct __sk_buff, data_end); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, - ctx_off); + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): @@ -3105,110 +3142,107 @@ static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sk_buff, tc_index)); - break; #else if (type == BPF_WRITE) - *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else - *insn++ = BPF_MOV64_IMM(dst_reg, 0); - 
break; + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif + break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - int dst_reg, int src_reg, - int ctx_off, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; } return insn - insn_buf; } -static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, ifindex): BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; default: - return sk_filter_convert_ctx_access(type, dst_reg, src_reg, - ctx_off, insn_buf, prog); + return sk_filter_convert_ctx_access(type, si, insn_buf, prog); } return insn - insn_buf; } -static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), - dst_reg, src_reg, + 
si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1b7673aac59d..c35aae13c8d2 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -138,6 +138,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_arp *key_arp; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; @@ -379,6 +380,62 @@ mpls: nhoff += FCOE_HEADER_LEN; goto out_good; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): { + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr *_arp; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + goto out_bad; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + goto out_bad; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, + &_arp_eth); + if (!arp_eth) + goto out_bad; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ARP)) { + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, + sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, + sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. + */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + } + + goto out_good; + } + default: goto out_bad; } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 71bb3e2eca08..40ef8ae8d93d 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -352,7 +352,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) 0; } -int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is currently rebuilt for delete requests which diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8e69ce472236..96947f5d41e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3439,9 +3439,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; /* reset reclass/redir ttl */ -#endif + skb_reset_tc(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5d26056b6d8f..9b8727c67b58 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -34,8 +34,6 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. 
*/ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 75e3ea7bda08..f538f764fca6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3829,6 +3829,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = 0; } + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC); + if (!attr) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + int err; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + goto nla_put_failure; + + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3885,6 +3918,23 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + } + return size; } diff --git a/net/core/scm.c b/net/core/scm.c index d8820438ba37..b6d83686e149 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -71,7 +71,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) struct file **fpp; int i, num; - num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 88a8e429fc3e..758f140b6bed 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -1,3 +1,7 @@ +/* + * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
+ */ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/cryptohash.h> @@ -8,18 +12,18 @@ #include <linux/ktime.h> #include <linux/string.h> #include <linux/net.h> - +#include <linux/siphash.h> #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <linux/in6.h> #include <net/tcp.h> -#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; +static siphash_key_t net_secret __read_mostly; static __always_inline void net_secret_init(void) { - net_get_random_once(net_secret, sizeof(net_secret)); + net_get_random_once(&net_secret, sizeof(net_secret)); } #endif @@ -44,80 +48,70 @@ static u32 seq_scale(u32 seq) u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport, u32 *tsoff) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u64 hash; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; - return seq_scale(hash[0]); + hash = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0; + return seq_scale(hash); } EXPORT_SYMBOL(secure_tcpv6_sequence_number); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .dport = dport + }; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32) daddr[i]; - secret[4] = net_secret[4] + (__force u32)dport; - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - return hash[0]; + return siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); } EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET +/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), + * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, + * it would be easy enough to have the former function use siphash_4u32, passing + * the arguments as separate u32. + */ + u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 *tsoff) { - u32 hash[MD5_DIGEST_WORDS]; - + u64 hash; net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - *tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0; - return seq_scale(hash[0]); + hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + *tsoff = sysctl_tcp_timestamps == 1 ? 
(hash >> 32) : 0; + return seq_scale(hash); } u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = (__force u32)dport ^ net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; + return siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif @@ -126,21 +120,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; u64 seq; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccp_sequence_number); @@ -149,26 +135,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; u64 seq; - u32 i; - net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccpv6_sequence_number); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 734c71468b01..f8dbe4a7ab46 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -878,9 +878,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); -#ifdef CONFIG_NET_CLS_ACT - CHECK_SKB_FIELD(tc_verd); -#endif #endif } diff --git a/net/core/sock.c b/net/core/sock.c index 4eca27dc5c94..8b35debfe454 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -222,7 +222,7 @@ static const char *const af_family_key_strings[AF_MAX+1] = { "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX" + "sk_lock-AF_QIPCRTR", "sk_lock-AF_SMC" , "sk_lock-AF_MAX" }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , @@ -239,7 +239,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = { "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_QIPCRTR", 
"slock-AF_MAX" + "slock-AF_QIPCRTR", "slock-AF_SMC" , "slock-AF_MAX" }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , @@ -256,7 +256,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = { "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_QIPCRTR", "clock-AF_MAX" + "clock-AF_QIPCRTR", "clock-AF_SMC" , "clock-AF_MAX" }; /* @@ -762,11 +762,8 @@ set_rcvbuf: goto set_rcvbuf; case SO_KEEPALIVE: -#ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) - tcp_set_keepalive(sk, valbool); -#endif + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2a46e4009f62..eaa72eb0399c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif +static int proc_do_dev_weight(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret != 0) + return ret; + + dev_rx_weight = weight_p * dev_weight_rx_bias; + dev_tx_weight = weight_p * dev_weight_tx_bias; + + return ret; +} + static int proc_do_rss_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = { .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_rx_bias", + .data = &dev_weight_rx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_tx_bias", + .data = &dev_weight_tx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 96e47c539bee..39bb5b3a82f2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,12 +1,13 @@ config HAVE_NET_DSA def_bool y - depends on NETDEVICES && !S390 + depends on INET && NETDEVICES && !S390 # Drivers must select NET_DSA and the appropriate tagging format config NET_DSA tristate "Distributed Switch Architecture" - depends on HAVE_NET_DSA && NET_SWITCHDEV + depends on HAVE_NET_DSA + select NET_SWITCHDEV select PHYLIB ---help--- Say Y if you want to enable support for the hardware switches supported diff --git a/net/dsa/Makefile b/net/dsa/Makefile index a3380ed0e0be..560b6747c276 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,7 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += dsa.o slave.o dsa2.o +dsa_core-$(CONFIG_NET_DSA_HWMON) += hwmon.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 7899919cd9f0..fd532487dfdf 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -9,9 +9,7 @@ * (at your option) any later version. 
*/ -#include <linux/ctype.h> #include <linux/device.h> -#include <linux/hwmon.h> #include <linux/list.h> #include <linux/platform_device.h> #include <linux/slab.h> @@ -27,8 +25,6 @@ #include <linux/gpio/consumer.h> #include "dsa_priv.h" -char dsa_driver_version[] = "0.1"; - static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -64,27 +60,27 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { static DEFINE_MUTEX(dsa_switch_drivers_mutex); static LIST_HEAD(dsa_switch_drivers); -void register_switch_driver(struct dsa_switch_ops *ops) +void register_switch_driver(struct dsa_switch_driver *drv) { mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&ops->list, &dsa_switch_drivers); + list_add_tail(&drv->list, &dsa_switch_drivers); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(register_switch_driver); -void unregister_switch_driver(struct dsa_switch_ops *ops) +void unregister_switch_driver(struct dsa_switch_driver *drv) { mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&ops->list); + list_del_init(&drv->list); mutex_unlock(&dsa_switch_drivers_mutex); } EXPORT_SYMBOL_GPL(unregister_switch_driver); -static struct dsa_switch_ops * +static const struct dsa_switch_ops * dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, const char **_name, void **priv) { - struct dsa_switch_ops *ret; + const struct dsa_switch_ops *ret; struct list_head *list; const char *name; @@ -93,9 +89,11 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, mutex_lock(&dsa_switch_drivers_mutex); list_for_each(list, &dsa_switch_drivers) { - struct dsa_switch_ops *ops; + const struct dsa_switch_ops *ops; + struct dsa_switch_driver *drv; - ops = list_entry(list, struct dsa_switch_ops, list); + drv = list_entry(list, struct dsa_switch_driver, list); + ops = drv->ops; name = ops->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { @@ -110,105 +108,6 @@ dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, return ret; } -/* hwmon support ************************************************************/ - -#ifdef CONFIG_NET_DSA_HWMON - -static ssize_t temp1_input_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = ds->ops->get_temp(ds, &temp); - if (ret < 0) - return ret; - - return sprintf(buf, "%d\n", temp * 1000); -} -static DEVICE_ATTR_RO(temp1_input); - -static ssize_t temp1_max_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = ds->ops->get_temp_limit(ds, &temp); - if (ret < 0) - return ret; - - return sprintf(buf, "%d\n", temp * 1000); -} - -static ssize_t temp1_max_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - int temp, ret; - - ret = kstrtoint(buf, 0, &temp); - if (ret < 0) - return ret; - - ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); - if (ret < 0) - return ret; - - return count; -} -static DEVICE_ATTR_RW(temp1_max); - -static ssize_t temp1_max_alarm_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dsa_switch *ds = dev_get_drvdata(dev); - bool alarm; - int ret; - - ret = ds->ops->get_temp_alarm(ds, &alarm); - if (ret < 0) - return ret; - - return sprintf(buf, "%d\n", alarm); -} -static DEVICE_ATTR_RO(temp1_max_alarm); - -static 
struct attribute *dsa_hwmon_attrs[] = { - &dev_attr_temp1_input.attr, /* 0 */ - &dev_attr_temp1_max.attr, /* 1 */ - &dev_attr_temp1_max_alarm.attr, /* 2 */ - NULL -}; - -static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, - struct attribute *attr, int index) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct dsa_switch *ds = dev_get_drvdata(dev); - struct dsa_switch_ops *ops = ds->ops; - umode_t mode = attr->mode; - - if (index == 1) { - if (!ops->get_temp_limit) - mode = 0; - else if (!ops->set_temp_limit) - mode &= ~S_IWUSR; - } else if (index == 2 && !ops->get_temp_alarm) { - mode = 0; - } - return mode; -} - -static const struct attribute_group dsa_hwmon_group = { - .attrs = dsa_hwmon_attrs, - .is_visible = dsa_hwmon_attrs_visible, -}; -__ATTRIBUTE_GROUPS(dsa_hwmon); - -#endif /* CONFIG_NET_DSA_HWMON */ - /* basic switch operations **************************************************/ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port) @@ -308,7 +207,7 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { - struct dsa_switch_ops *ops = ds->ops; + const struct dsa_switch_ops *ops = ds->ops; struct dsa_switch_tree *dst = ds->dst; struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; @@ -329,8 +228,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (dst->cpu_switch != -1) { netdev_err(dst->master_netdev, "multiple cpu ports?!\n"); - ret = -EINVAL; - goto out; + return -EINVAL; } dst->cpu_switch = index; dst->cpu_port = i; @@ -343,10 +241,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) valid_name_found = true; } - if (!valid_name_found && i == DSA_MAX_PORTS) { - ret = -EINVAL; - goto out; - } + if (!valid_name_found && i == DSA_MAX_PORTS) + return -EINVAL; /* Make the built-in MII bus mask match the number of ports, * switch drivers can override this later @@ -363,10 +259,8 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) tag_protocol = ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) { - ret = PTR_ERR(dst->tag_ops); - goto out; - } + if (IS_ERR(dst->tag_ops)) + return PTR_ERR(dst->tag_ops); dst->rcv = dst->tag_ops->rcv; } @@ -378,25 +272,23 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) */ ret = ops->setup(ds); if (ret < 0) - goto out; + return ret; if (ops->set_addr) { ret = ops->set_addr(ds, dst->master_netdev->dev_addr); if (ret < 0) - goto out; + return ret; } if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(parent); - if (!ds->slave_mii_bus) { - ret = -ENOMEM; - goto out; - } + if (!ds->slave_mii_bus) + return -ENOMEM; dsa_slave_mii_bus_init(ds); ret = mdiobus_register(ds->slave_mii_bus); if (ret < 0) - goto out; + return ret; } /* @@ -409,54 +301,24 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) continue; ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); - if (ret < 0) { + if (ret < 0) netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", index, i, cd->port_names[i], ret); - ret = 0; - } } /* Perform configuration of the CPU and DSA ports */ ret = dsa_cpu_dsa_setups(ds, parent); - if (ret < 0) { + if (ret < 0) netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", index); - ret 
= 0; - } ret = dsa_cpu_port_ethtool_setup(ds); if (ret) return ret; -#ifdef CONFIG_NET_DSA_HWMON - /* If the switch provides a temperature sensor, - * register with hardware monitoring subsystem. - * Treat registration error as non-fatal and ignore it. - */ - if (ops->get_temp) { - const char *netname = netdev_name(dst->master_netdev); - char hname[IFNAMSIZ + 1]; - int i, j; - - /* Create valid hwmon 'name' attribute */ - for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) { - if (isalnum(netname[i])) - hname[j++] = netname[i]; - } - hname[j] = '\0'; - scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", - hname, index); - ds->hwmon_dev = hwmon_device_register_with_groups(NULL, - ds->hwmon_name, ds, dsa_hwmon_groups); - if (IS_ERR(ds->hwmon_dev)) - ds->hwmon_dev = NULL; - } -#endif /* CONFIG_NET_DSA_HWMON */ - - return ret; + dsa_hwmon_register(ds); -out: - return ret; + return 0; } static struct dsa_switch * @@ -464,7 +326,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { struct dsa_chip_data *cd = dst->pd->chip + index; - struct dsa_switch_ops *ops; + const struct dsa_switch_ops *ops; struct dsa_switch *ds; int ret; const char *name; @@ -514,10 +376,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { int port; -#ifdef CONFIG_NET_DSA_HWMON - if (ds->hwmon_dev) - hwmon_device_unregister(ds->hwmon_dev); -#endif + dsa_hwmon_unregister(ds); /* Destroy network devices for physical switch ports. */ for (port = 0; port < DSA_MAX_PORTS; port++) { @@ -940,9 +799,6 @@ static int dsa_probe(struct platform_device *pdev) struct dsa_switch_tree *dst; int ret; - pr_notice_once("Distributed Switch Architecture driver version %s\n", - dsa_driver_version); - if (pdev->dev.of_node) { ret = dsa_of_probe(&pdev->dev); if (ret) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index da3862124545..42a41d84053c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -81,30 +81,12 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst, static bool dsa_port_is_dsa(struct device_node *port) { - const char *name; - - name = of_get_property(port, "label", NULL); - if (!name) - return false; - - if (!strcmp(name, "dsa")) - return true; - - return false; + return !!of_parse_phandle(port, "link", 0); } static bool dsa_port_is_cpu(struct device_node *port) { - const char *name; - - name = of_get_property(port, "label", NULL); - if (!name) - return false; - - if (!strcmp(name, "cpu")) - return true; - - return false; + return !!of_parse_phandle(port, "ethernet", 0); } static bool dsa_ds_find_port(struct dsa_switch *ds, @@ -268,6 +250,8 @@ static int dsa_user_port_apply(struct device_node *port, u32 index, int err; name = of_get_property(port, "label", NULL); + if (!name) + name = "eth%d"; err = dsa_slave_create(ds, ds->dev, index, name); if (err) { @@ -650,8 +634,14 @@ static int _dsa_register_switch(struct dsa_switch *ds, struct device_node *np) } err = dsa_dst_parse(dst); - if (err) + if (err) { + if (err == -EPROBE_DEFER) { + dsa_dst_del_ds(dst, ds, ds->index); + return err; + } + goto out_del_dst; + } err = dsa_dst_apply(dst); if (err) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 6cfd7388834e..7e3385ec73f4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -49,7 +49,6 @@ struct dsa_slave_priv { }; /* dsa.c */ -extern char dsa_driver_version[]; int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct device_node *port_dn, int port); void dsa_cpu_dsa_destroy(struct device_node *port_dn); @@ -57,6 +56,15 @@ const 
struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); +/* hwmon.c */ +#ifdef CONFIG_NET_DSA_HWMON +void dsa_hwmon_register(struct dsa_switch *ds); +void dsa_hwmon_unregister(struct dsa_switch *ds); +#else +static inline void dsa_hwmon_register(struct dsa_switch *ds) { } +static inline void dsa_hwmon_unregister(struct dsa_switch *ds) { } +#endif + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); diff --git a/net/dsa/hwmon.c b/net/dsa/hwmon.c new file mode 100644 index 000000000000..08831a811278 --- /dev/null +++ b/net/dsa/hwmon.c @@ -0,0 +1,147 @@ +/* + * net/dsa/hwmon.c - HWMON subsystem support + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/ctype.h> +#include <linux/hwmon.h> +#include <net/dsa.h> + +#include "dsa_priv.h" + +static ssize_t temp1_input_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = ds->ops->get_temp(ds, &temp); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", temp * 1000); +} +static DEVICE_ATTR_RO(temp1_input); + +static ssize_t temp1_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = ds->ops->get_temp_limit(ds, &temp); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", temp * 1000); +} + +static ssize_t temp1_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + int temp, ret; + + ret = kstrtoint(buf, 0, &temp); + if (ret < 0) + return ret; + + ret = ds->ops->set_temp_limit(ds, DIV_ROUND_CLOSEST(temp, 1000)); + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR_RW(temp1_max); + +static ssize_t temp1_max_alarm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dsa_switch *ds = dev_get_drvdata(dev); + bool alarm; + int ret; + + ret = ds->ops->get_temp_alarm(ds, &alarm); + if (ret < 0) + return ret; + + return sprintf(buf, "%d\n", alarm); +} +static DEVICE_ATTR_RO(temp1_max_alarm); + +static struct attribute *dsa_hwmon_attrs[] = { + &dev_attr_temp1_input.attr, /* 0 */ + &dev_attr_temp1_max.attr, /* 1 */ + &dev_attr_temp1_max_alarm.attr, /* 2 */ + NULL +}; + +static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct dsa_switch *ds = dev_get_drvdata(dev); + const struct dsa_switch_ops *ops = ds->ops; + umode_t mode = attr->mode; + + if (index == 1) { + if (!ops->get_temp_limit) + mode = 0; + else if (!ops->set_temp_limit) + mode &= ~S_IWUSR; + } else if (index == 2 && !ops->get_temp_alarm) { + mode = 0; + } + return mode; +} + +static const struct attribute_group dsa_hwmon_group = { + .attrs = dsa_hwmon_attrs, + .is_visible = dsa_hwmon_attrs_visible, +}; +__ATTRIBUTE_GROUPS(dsa_hwmon); + +void dsa_hwmon_register(struct dsa_switch *ds) +{ + const char *netname = netdev_name(ds->dst->master_netdev); + char hname[IFNAMSIZ + 1]; + int i, j; + + /* If the switch provides temperature 
accessors, register with hardware + * monitoring subsystem. Treat registration error as non-fatal. + */ + if (!ds->ops->get_temp) + return; + + /* Create valid hwmon 'name' attribute */ + for (i = j = 0; i < IFNAMSIZ && netname[i]; i++) { + if (isalnum(netname[i])) + hname[j++] = netname[i]; + } + hname[j] = '\0'; + scnprintf(ds->hwmon_name, sizeof(ds->hwmon_name), "%s_dsa%d", hname, + ds->index); + ds->hwmon_dev = hwmon_device_register_with_groups(NULL, ds->hwmon_name, + ds, dsa_hwmon_groups); + if (IS_ERR(ds->hwmon_dev)) { + pr_warn("DSA: failed to register HWMON subsystem for switch %d\n", + ds->index); + ds->hwmon_dev = NULL; + } else { + pr_info("DSA: registered HWMON subsystem for switch %d\n", + ds->index); + } +} + +void dsa_hwmon_unregister(struct dsa_switch *ds) +{ + if (ds->hwmon_dev) { + hwmon_device_unregister(ds->hwmon_dev); + ds->hwmon_dev = NULL; + } +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 68c9eea00518..0cdcaf526987 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -673,7 +673,6 @@ static void dsa_slave_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); - strlcpy(drvinfo->version, dsa_driver_version, sizeof(drvinfo->version)); strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } @@ -984,6 +983,17 @@ static void dsa_slave_poll_controller(struct net_device *dev) } #endif +static int dsa_slave_get_phys_port_name(struct net_device *dev, + char *name, size_t len) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + if (snprintf(name, len, "p%d", p->port) >= len) + return -EINVAL; + + return 0; +} + void dsa_cpu_port_ethtool_init(struct ethtool_ops *ops) { ops->get_sset_count = dsa_cpu_port_get_sset_count; @@ -1031,6 +1041,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_bridge_getlink = switchdev_port_bridge_getlink, .ndo_bridge_setlink = switchdev_port_bridge_setlink, .ndo_bridge_dellink = switchdev_port_bridge_dellink, + .ndo_get_phys_port_name = dsa_slave_get_phys_port_name, }; static const struct switchdev_ops dsa_slave_switchdev_ops = { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f75069883f2b..aae410bb655a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1831,8 +1831,6 @@ static int __init inet_init(void) ip_init(); - tcp_v4_init(); - /* Setup TCP slab cache for open requests. */ tcp_init(); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9a375b908d01..319c66de92eb 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1437,7 +1437,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) } /* Must be invoked inside of an RCU protected region. 
*/ -void fib_select_default(const struct flowi4 *flp, struct fib_result *res) +static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 0777ea949223..fc310db2708b 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -209,19 +209,17 @@ static struct sock *icmp_sk(struct net *net) return *this_cpu_ptr(net->ipv4.icmp_sk); } +/* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmp_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ - local_bh_enable(); return NULL; } return sk; @@ -229,7 +227,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net) static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } int sysctl_icmp_msgs_per_sec __read_mostly = 1000; @@ -282,6 +280,33 @@ bool icmp_global_allow(void) } EXPORT_SYMBOL(icmp_global_allow); +static bool icmpv4_mask_allow(struct net *net, int type, int code) +{ + if (type > NR_ICMP_TYPES) + return true; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + return true; + + /* Limit if icmp type is enabled in ratemask. */ + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + return true; + + return false; +} + +static bool icmpv4_global_allow(struct net *net, int type, int code) +{ + if (icmpv4_mask_allow(net, type, code)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Send an ICMP frame. */ @@ -290,34 +315,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; + struct inet_peer *peer; bool rc = true; + int vif; - if (type > NR_ICMP_TYPES) - goto out; - - /* Don't limit PMTU discovery. */ - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + if (icmpv4_mask_allow(net, type, code)) goto out; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - /* Limit if icmp type is enabled in ratemask. 
*/ - if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) - goto out; - - rc = false; - if (icmp_global_allow()) { - int vif = l3mdev_master_ifindex(dst->dev); - struct inet_peer *peer; - - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); - rc = inet_peer_xrlim_allow(peer, - net->ipv4.sysctl_icmp_ratelimit); - if (peer) - inet_putpeer(peer); - } + vif = l3mdev_master_ifindex(dst->dev); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); + if (peer) + inet_putpeer(peer); out: return rc; } @@ -396,13 +409,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); + int type = icmp_param->data.icmph.type; + int code = icmp_param->data.icmph.code; if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* global icmp_msgs_per_sec */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; + sk = icmp_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; @@ -433,12 +455,13 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, - icmp_param->data.icmph.code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) icmp_push_reply(icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -571,7 +594,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct iphdr *iph; int room; - struct icmp_bxm *icmp_param; + struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); struct ipcm_cookie ipc; struct flowi4 fl4; @@ -648,13 +671,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); - if (!icmp_param) - return; + /* Needed by both icmp_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv4_global_allow(net, type, code)) + goto out_bh_enable; sk = icmp_xmit_lock(net); if (!sk) - goto out_free; + goto out_bh_enable; /* * Construct source address and options. @@ -681,7 +707,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) + if (ip_options_echo(&icmp_param.replyopts.opt.opt, skb_in)) goto out_unlock; @@ -689,25 +715,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. 
*/ - icmp_param->data.icmph.type = type; - icmp_param->data.icmph.code = code; - icmp_param->data.icmph.un.gateway = info; - icmp_param->data.icmph.checksum = 0; - icmp_param->skb = skb_in; - icmp_param->offset = skb_network_offset(skb_in); + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_network_offset(skb_in); inet_sk(sk)->tos = tos; sk->sk_mark = mark; ipc.addr = iph->saddr; - ipc.opt = &icmp_param->replyopts.opt; + ipc.opt = &icmp_param.replyopts.opt; ipc.tx_flags = 0; ipc.ttl = 0; ipc.tos = -1; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark, - type, code, icmp_param); + type, code, &icmp_param); if (IS_ERR(rt)) goto out_unlock; + /* peer icmp_ratelimit */ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) goto ende; @@ -716,21 +743,21 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); - icmp_param->data_len = skb_in->len - icmp_param->offset; - if (icmp_param->data_len > room) - icmp_param->data_len = room; - icmp_param->head_len = sizeof(struct icmphdr); + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); -out_free: - kfree(icmp_param); +out_bh_enable: + local_bh_enable(); out:; } EXPORT_SYMBOL(icmp_send); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4dea33e5f295..3828b3a805cd 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -215,7 +215,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ddcd56c08d14..f8aff2c71cde 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -257,8 +257,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, - struct inet_timewait_death_row *twdr, int family) +void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { struct inet_timewait_sock *tw; struct sock *sk; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 53ae0c6315ad..8a4409dd390a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -272,7 +272,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, continue; switch (cmsg->cmsg_type) { case IP_RETOPTS: - err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = cmsg->cmsg_len - sizeof(struct cmsghdr); /* Our caller is responsible for freeing ipc->opt */ err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg), @@ -843,6 +843,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, { struct ip_mreqn mreq; struct net_device *dev = NULL; + int midx; 
if (sk->sk_type == SOCK_STREAM) goto e_inval; @@ -887,11 +888,15 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EADDRNOTAVAIL; if (!dev) break; + + midx = l3mdev_master_ifindex(dev); + dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && - mreq.imr_ifindex != sk->sk_bound_dev_if) + mreq.imr_ifindex != sk->sk_bound_dev_if && + (!midx || midx != sk->sk_bound_dev_if)) break; inet->mc_index = mreq.imr_ifindex; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index fed3d29f9eb3..5476110598f7 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -188,8 +188,8 @@ int iptunnel_handle_offloads(struct sk_buff *skb, EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); /* Often modified stats are per cpu, other are shared (netdev->stats) */ -struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) +void ip_tunnel_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *tot) { int i; @@ -214,8 +214,6 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, tot->rx_bytes += rx_bytes; tot->tx_bytes += tx_bytes; } - - return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index efc1e76d4977..beacd028848c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -299,10 +299,29 @@ static void __net_exit ipmr_rules_exit(struct net *net) } #endif +static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct mfc_cache_cmp_arg *cmparg = arg->key; + struct mfc_cache *c = (struct mfc_cache *)ptr; + + return cmparg->mfc_mcastgrp != c->mfc_mcastgrp || + cmparg->mfc_origin != c->mfc_origin; +} + +static const struct rhashtable_params ipmr_rht_params = { + .head_offset = offsetof(struct mfc_cache, mnode), + .key_offset = offsetof(struct mfc_cache, cmparg), + .key_len = sizeof(struct mfc_cache_cmp_arg), + .nelem_hint = 3, + .locks_mul = 1, + .obj_cmpfn = ipmr_hash_cmp, + .automatic_shrinking = true, +}; + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; - unsigned int i; /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (id != RT_TABLE_DEFAULT && id >= 1000000000) @@ -318,10 +337,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) write_pnet(&mrt->net, net); mrt->id = id; - /* Forwarding cache */ - for (i = 0; i < MFC_LINES; i++) - INIT_LIST_HEAD(&mrt->mfc_cache_array[i]); - + rhltable_init(&mrt->mfc_hash, &ipmr_rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process, @@ -338,6 +355,7 @@ static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, true); + rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -839,13 +857,17 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) { - int line = MFC_HASH(mcastgrp, origin); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = origin + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) - return c; - } + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + return c; + return NULL; } @@ -853,13 +875,16 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, static struct mfc_cache 
*ipmr_cache_find_any_parent(struct mr_table *mrt, int vifi) { - int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY)); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = htonl(INADDR_ANY), + .mfc_origin = htonl(INADDR_ANY) + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) - if (c->mfc_origin == htonl(INADDR_ANY) && - c->mfc_mcastgrp == htonl(INADDR_ANY) && - c->mfc_un.res.ttls[vifi] < 255) + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[vifi] < 255) return c; return NULL; @@ -869,29 +894,51 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, __be32 mcastgrp, int vifi) { - int line = MFC_HASH(mcastgrp, htonl(INADDR_ANY)); + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = htonl(INADDR_ANY) + }; + struct rhlist_head *tmp, *list; struct mfc_cache *c, *proxy; if (mcastgrp == htonl(INADDR_ANY)) goto skip; - list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) - if (c->mfc_origin == htonl(INADDR_ANY) && - c->mfc_mcastgrp == mcastgrp) { - if (c->mfc_un.res.ttls[vifi] < 255) - return c; - - /* It's ok if the vifi is part of the static tree */ - proxy = ipmr_cache_find_any_parent(mrt, - c->mfc_parent); - if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) - return c; - } + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } skip: return ipmr_cache_find_any_parent(mrt, vifi); } +/* Look for a (S,G,iif) entry if parent != -1 */ +static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, + __be32 origin, __be32 mcastgrp, + int parent) +{ + struct mfc_cache_cmp_arg arg = { + .mfc_mcastgrp = mcastgrp, + .mfc_origin = origin, + }; + struct rhlist_head *tmp, *list; + struct mfc_cache *c; + + list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mfc_parent) + return c; + + return NULL; +} + /* Allocate a multicast cache entry */ static struct mfc_cache *ipmr_cache_alloc(void) { @@ -1028,10 +1075,10 @@ static int ipmr_cache_report(struct mr_table *mrt, static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) { + const struct iphdr *iph = ip_hdr(skb); + struct mfc_cache *c; bool found = false; int err; - struct mfc_cache *c; - const struct iphdr *iph = ip_hdr(skb); spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, list) { @@ -1095,46 +1142,39 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) { - int line; - struct mfc_cache *c, *next; + struct mfc_cache *c; - line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, + mfc->mfcc_mcastgrp.s_addr, parent); + rcu_read_unlock(); + if (!c) + return -ENOENT; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); + 
ipmr_cache_free(c); - list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && - (parent == -1 || parent == c->mfc_parent)) { - list_del_rcu(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); - return 0; - } - } - return -ENOENT; + return 0; } static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { - bool found = false; - int line; struct mfc_cache *uc, *c; + bool found; + int ret; if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); - - list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { - if (c->mfc_origin == mfc->mfcc_origin.s_addr && - c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr && - (parent == -1 || parent == c->mfc_parent)) { - found = true; - break; - } - } - - if (found) { + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, + mfc->mfcc_mcastgrp.s_addr, parent); + rcu_read_unlock(); + if (c) { write_lock_bh(&mrt_lock); c->mfc_parent = mfc->mfcc_parent; ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); @@ -1160,8 +1200,14 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); - + ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + ipmr_rht_params); + if (ret) { + pr_err("ipmr: rhtable insert error %d\n", ret); + ipmr_cache_free(c); + return ret; + } + list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. 
*/ @@ -1191,9 +1237,9 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, /* Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, bool all) { - int i; + struct mfc_cache *c, *tmp; LIST_HEAD(list); - struct mfc_cache *c, *next; + int i; /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { @@ -1204,19 +1250,18 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) unregister_netdevice_many(&list); /* Wipe the cache */ - for (i = 0; i < MFC_LINES; i++) { - list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - list_del_rcu(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_free(c); - } + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { + if (!all && (c->mfc_flags & MFC_STATIC)) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); + mroute_netlink_event(mrt, c, RTM_DELROUTE); + ipmr_cache_free(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_destroy_unres(mrt, c); @@ -1791,9 +1836,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local) { + int true_vifi = ipmr_find_vif(mrt, skb->dev); int psend = -1; int vif, ct; - int true_vifi = ipmr_find_vif(mrt, skb->dev); vif = cache->mfc_parent; cache->mfc_un.res.pkt++; @@ -2091,8 +2136,10 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (VIF_EXISTS(mrt, c->mfc_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) @@ -2134,7 +2181,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, - struct rtmsg *rtm, int nowait, u32 portid) + struct rtmsg *rtm, u32 portid) { struct mfc_cache *cache; struct mr_table *mrt; @@ -2158,11 +2205,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, struct net_device *dev; int vif = -1; - if (nowait) { - rcu_read_unlock(); - return -EAGAIN; - } - dev = skb->dev; read_lock(&mrt_lock); if (dev) @@ -2296,34 +2338,30 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) struct mr_table *mrt; struct mfc_cache *mfc; unsigned int t = 0, s_t; - unsigned int h = 0, s_h; unsigned int e = 0, s_e; s_t = cb->args[0]; - s_h = cb->args[1]; - s_e = cb->args[2]; + s_e = cb->args[1]; rcu_read_lock(); ipmr_for_each_table(mrt, net) { if (t < s_t) goto next_table; - if (t > s_t) - s_h = 0; - for (h = s_h; h < MFC_LINES; h++) { - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { - if (e < s_e) - goto next_entry; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto next_entry; + if (ipmr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) + goto 
done; next_entry: - e++; - } - e = s_e = 0; + e++; } + e = 0; + s_e = 0; + spin_lock_bh(&mfc_unres_lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) @@ -2340,16 +2378,15 @@ next_entry2: e++; } spin_unlock_bh(&mfc_unres_lock); - e = s_e = 0; - s_h = 0; + e = 0; + s_e = 0; next_table: t++; } done: rcu_read_unlock(); - cb->args[2] = e; - cb->args[1] = h; + cb->args[1] = e; cb->args[0] = t; return skb->len; @@ -2593,10 +2630,8 @@ struct ipmr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; struct list_head *cache; - int ct; }; - static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { @@ -2604,12 +2639,10 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct mfc_cache *mfc; rcu_read_lock(); - for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { - it->cache = &mrt->mfc_cache_array[it->ct]; - list_for_each_entry_rcu(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - } + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + if (pos-- == 0) + return mfc; rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); @@ -2636,17 +2669,16 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) it->mrt = mrt; it->cache = NULL; - it->ct = 0; return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) : SEQ_START_TOKEN; } static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct mfc_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = it->mrt; + struct mfc_cache *mfc = v; ++*pos; @@ -2659,19 +2691,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; - BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]); - - while (++it->ct < MFC_LINES) { - it->cache = &mrt->mfc_cache_array[it->ct]; - if (list_empty(it->cache)) - continue; - return list_first_entry(it->cache, struct mfc_cache, list); - } - /* exhausted cache_array, show unresolved */ rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; - it->ct = 0; spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) @@ -2691,7 +2713,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_array[it->ct]) + else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 7143ca1a6af9..0247ca032232 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -65,7 +65,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - atomic_read(&tcp_death_row.tw_count), sockets, + atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 709ffe67d1de..4b7c231c1aef 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1758,7 +1758,6 @@ standard_hash: static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { @@ -1883,7 +1882,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type != RTN_UNICAST) goto martian_destination; - err = 
ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -2454,7 +2453,7 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, - u32 seq, int event, int nowait, unsigned int flags) + u32 seq, int event) { struct rtable *rt = skb_rtable(skb); struct rtmsg *r; @@ -2463,7 +2462,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, u32 error; u32 metrics[RTAX_MAX]; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags); + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), 0); if (!nlh) return -EMSGSIZE; @@ -2541,18 +2540,12 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, - r, nowait, portid); + r, portid); if (err <= 0) { - if (!nowait) { - if (err == 0) - return 0; - goto nla_put_failure; - } else { - if (err == -EMSGSIZE) - goto nla_put_failure; - error = err; - } + if (err == 0) + return 0; + goto nla_put_failure; } } else #endif @@ -2638,9 +2631,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) skb->protocol = htons(ETH_P_IP); skb->dev = dev; skb->mark = mark; - local_bh_disable(); err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); - local_bh_enable(); rt = skb_rtable(skb); if (err == 0 && rt->dst.error) @@ -2665,7 +2656,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWROUTE, 0, 0); + RTM_NEWROUTE); if (err < 0) goto errout_free; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 3e88467d70ee..496b97e17aaf 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -13,13 +13,13 @@ #include <linux/tcp.h> #include <linux/slab.h> #include <linux/random.h> -#include <linux/cryptohash.h> +#include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> #include <net/tcp.h> #include <net/route.h> -static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; +static siphash_key_t syncookie_secret[2] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -48,24 +48,13 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); - static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp; - net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); - - tmp = this_cpu_ptr(ipv4_cookie_scratch); - memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); - tmp[0] = (__force u32)saddr; - tmp[1] = (__force u32)daddr; - tmp[2] = ((__force u32)sport << 16) + (__force u32)dport; - tmp[3] = count; - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); - - return tmp[17]; + return siphash_4u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + count, &syncookie_secret[c]); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index b2fa498b15d1..c8d283615c6f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -290,13 +290,6 @@ static struct ctl_table 
ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_max_tw_buckets", - .data = &tcp_death_row.sysctl_max_tw_buckets, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -310,13 +303,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_fastopen_key, }, { - .procname = "tcp_tw_recycle", - .data = &tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -338,13 +324,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_max_syn_backlog", - .data = &sysctl_max_syn_backlog, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -558,13 +537,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "tcp_thin_dupack", - .data = &sysctl_tcp_thin_dupack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, .maxlen = sizeof(int), @@ -960,6 +932,27 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_tw_buckets", + .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_tw_recycle", + .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_max_syn_backlog", + .data = &init_net.ipv4.sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4a044964da66..aba6ea76338e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -406,7 +406,6 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; - tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); tp->tsoffset = 0; @@ -429,7 +428,7 @@ EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (tsflags) { + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -958,10 +957,8 @@ new_segment: copied += copy; offset += copy; size -= copy; - if (!size) { - tcp_tx_timestamp(sk, sk->sk_tsflags, skb); + if (!size) goto out; - } if (skb->len < size_goal || (flags & MSG_OOB)) continue; @@ -987,8 +984,11 @@ wait_for_memory: } out: - if (copied && !(flags & MSG_SENDPAGE_NOTLAST)) - tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + if (copied) { + tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + if (!(flags & MSG_SENDPAGE_NOTLAST)) + tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + } return copied; do_error: @@ -1281,7 +1281,6 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, sockc.tsflags, skb); if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; @@ -1312,8 +1311,10 @@ wait_for_memory: } out: - if (copied) + if (copied) { + tcp_tx_timestamp(sk, sockc.tsflags, 
tcp_write_queue_tail(sk)); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); + } out_nopush: release_sock(sk); return copied + copied_syn; @@ -2473,11 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else { - tp->thin_dupack = val; - if (tp->thin_dupack) - tcp_disable_early_retrans(tp); - } break; case TCP_REPAIR: @@ -2764,6 +2760,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_sacked = sk->sk_max_ack_backlog; return; } + + slow = lock_sock_fast(sk); + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; @@ -2814,15 +2813,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_total_retrans = tp->total_retrans; - slow = lock_sock_fast(sk); - info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); - unlock_sock_fast(sk, slow); - info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; @@ -2838,6 +2833,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) do_div(rate64, intv); info->tcpi_delivery_rate = rate64; } + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2967,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: @@ -3334,6 +3331,7 @@ void __init tcp_init(void) percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); + inet_hashinfo_init(&tcp_hashinfo); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -3377,10 +3375,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; - - tcp_death_row.sysctl_max_tw_buckets = cnt / 2; sysctl_tcp_max_orphans = cnt / 2; - sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ @@ -3399,6 +3394,7 @@ void __init tcp_init(void) pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); + tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6c790754ae3e..1a34e9278c07 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,7 +79,7 @@ int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; -int sysctl_tcp_fack __read_mostly = 1; +int sysctl_tcp_fack __read_mostly; int sysctl_tcp_max_reordering __read_mostly = 300; int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -904,8 +901,6 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tcp_disable_fack(tp); } - if (metric > 0) - tcp_disable_early_retrans(tp); 
tp->rack.reord = 1; } @@ -916,10 +911,6 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) tp->retransmit_skb_hint = skb; - - if (!tp->lost_out || - after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } /* Sum the number of packets on the wire we have marked as lost. @@ -1135,6 +1126,7 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; + struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1217,7 +1209,8 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, xmit_time, sacked); + tcp_rack_advance(tp, sacked, end_seq, + xmit_time, &state->ack_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -1937,7 +1930,6 @@ void tcp_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sk_buff *skb; - bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ bool mark_lost; @@ -1982,7 +1974,6 @@ void tcp_enter_loss(struct sock *sk) TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; } } tcp_verify_left_out(tp); @@ -1998,13 +1989,15 @@ void tcp_enter_loss(struct sock *sk) tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); - /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous - * loss recovery is underway except recurring timeout(s) on - * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO + * if a previous recovery is underway, otherwise it may incorrectly + * call a timeout spurious if some previously retransmitted packets + * are s/acked (sec 3.2). We do not apply that retriction since + * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS + * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO + * on PTMU discovery to avoid sending new data. */ - tp->frto = sysctl_tcp_frto && - (new_recovery || icsk->icsk_retransmits) && - !inet_csk(sk)->icsk_mtup.probe_size; + tp->frto = sysctl_tcp_frto && !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2056,30 +2049,6 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } -static bool tcp_pause_early_retransmit(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned long delay; - - /* Delay early retransmit and entering fast recovery for - * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples - * available, or RTO is scheduled to fire first. - */ - if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt_us) - return false; - - delay = max(usecs_to_jiffies(tp->srtt_us >> 5), - msecs_to_jiffies(2)); - - if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) - return false; - - inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, - TCP_RTO_MAX); - return true; -} - /* Linux NewReno/SACK/FACK/ECN state machine. * -------------------------------------- * @@ -2127,10 +2096,26 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * F.e. 
after RTO, when all the queue is considered as lost, * lost_out = packets_out and in_flight = retrans_out. * - * Essentially, we have now two algorithms counting + * Essentially, we have now a few algorithms detecting * lost packets. * - * FACK: It is the simplest heuristics. As soon as we decided + * If the receiver supports SACK: + * + * RFC6675/3517: It is the conventional algorithm. A packet is + * considered lost if the number of higher sequence packets + * SACKed is greater than or equal the DUPACK thoreshold + * (reordering). This is implemented in tcp_mark_head_lost and + * tcp_update_scoreboard. + * + * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm + * (2017-) that checks timing instead of counting DUPACKs. + * Essentially a packet is considered lost if it's not S/ACKed + * after RTT + reordering_window, where both metrics are + * dynamically measured and adjusted. This is implemented in + * tcp_rack_mark_lost. + * + * FACK (Disabled by default. Subsumbed by RACK): + * It is the simplest heuristics. As soon as we decided * that something is lost, we decide that _all_ not SACKed * packets until the most forward SACK are lost. I.e. * lost_out = fackets_out - sacked_out and left_out = fackets_out. @@ -2139,16 +2124,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * takes place. We use FACK by default until reordering * is suspected on the path to this destination. * - * NewReno: when Recovery is entered, we assume that one segment + * If the receiver does not support SACK: + * + * NewReno (RFC6582): in Recovery we assume that one segment * is lost (classic Reno). While we are in Recovery and * a partial ACK arrives, we assume that one more packet * is lost (NewReno). This heuristics are the same in NewReno * and SACK. * - * Imagine, that's all! Forget about all this shamanism about CWND inflation - * deflation etc. CWND is real congestion window, never inflated, changes - * only according to classic VJ rules. - * * Really tricky (and requiring careful tuning) part of algorithm * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). * The first determines the moment _when_ we should reduce CWND and, @@ -2176,8 +2159,6 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out; - int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2187,39 +2168,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* Trick#4: It is still not OK... But will it be useful to delay - * recovery more? - */ - packets_out = tp->packets_out; - if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && - !tcp_may_send_now(sk)) { - /* We have nothing to send. This connection is limited - * either by receiver window or by application. - */ - return true; - } - - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - - /* Trick#6: TCP early retransmit, per RFC5827. 
To avoid spurious - * retransmissions due to small network reorderings, we implement - * Mitigation A.3 in the RFC and delay the retransmission for a short - * interval if appropriate. - */ - if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); - return false; } @@ -2521,8 +2469,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, - int flag) +void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2690,7 +2637,7 @@ void tcp_simple_retransmit(struct sock *sk) } EXPORT_SYMBOL(tcp_simple_retransmit); -static void tcp_enter_recovery(struct sock *sk, bool ece_ack) +void tcp_enter_recovery(struct sock *sk, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); int mib_idx; @@ -2726,14 +2673,18 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, tcp_try_undo_loss(sk, false)) return; - if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - if ((flag & FLAG_ORIG_SACK_ACKED) && - tcp_try_undo_loss(sk, true)) - return; + /* The ACK (s)acks some never-retransmitted data meaning not all + * the data packets before the timeout were lost. Therefore we + * undo the congestion window and state. This is essentially + * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since + * a retransmitted skb is permantly marked, we can apply such an + * operation even if F-RTO was not used. + */ + if ((flag & FLAG_ORIG_SACK_ACKED) && + tcp_try_undo_loss(sk, tp->undo_marker)) + return; + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ if (after(tp->snd_nxt, tp->high_seq)) { if (flag & FLAG_DATA_SACKED || is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ @@ -2800,6 +2751,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, + const struct skb_mstamp *ack_time) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Use RACK to detect loss */ + if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { + u32 prior_retrans = tp->retrans_out; + + tcp_rack_mark_lost(sk, ack_time); + if (prior_retrans > tp->retrans_out) + *ack_flag |= FLAG_LOST_RETRANS; + } +} + /* Process an event, which can update packets-in-flight not trivially. * Main goal of this function is to calculate new estimate for left_out, * taking into account both packets sitting in receiver's buffer and @@ -2813,7 +2779,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit) + bool is_dupack, int *ack_flag, int *rexmit, + const struct skb_mstamp *ack_time) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2864,13 +2831,6 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } } - /* Use RACK to detect loss */ - if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) { - flag |= FLAG_LOST_RETRANS; - *ack_flag |= FLAG_LOST_RETRANS; - } - /* E. Process state. 
*/ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: @@ -2888,11 +2848,13 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } + tcp_rack_identify_loss(sk, ack_flag, ack_time); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - if (icsk->icsk_ca_state != TCP_CA_Open && - !(flag & FLAG_LOST_RETRANS)) + tcp_rack_identify_loss(sk, ack_flag, ack_time); + if (!(icsk->icsk_ca_state == TCP_CA_Open || + (*ack_flag & FLAG_LOST_RETRANS))) return; /* Change state if cwnd is undone or retransmits are lost */ default: @@ -2906,6 +2868,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + tcp_rack_identify_loss(sk, ack_flag, ack_time); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3024,7 +2987,7 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = @@ -3041,24 +3004,6 @@ void tcp_rearm_rto(struct sock *sk) } } -/* This function is called when the delayed ER timer fires. TCP enters - * fast recovery and performs fast-retransmit. - */ -void tcp_resume_early_retransmit(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_rearm_rto(sk); - - /* Stop if ER is disabled after the delayed ER timer is scheduled */ - if (!tp->do_early_retrans) - return; - - tcp_enter_recovery(sk, false); - tcp_update_scoreboard(sk, 1); - tcp_xmit_retransmit_queue(sk); -} - /* If we get here, the whole TSO packet has not been acked. 
*/ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3101,11 +3046,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, u32 prior_snd_una, int *acked, - struct tcp_sacktag_state *sack, - struct skb_mstamp *now) + struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; + struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; @@ -3165,7 +3110,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } else if (tcp_is_sack(tp)) { tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + tcp_rack_advance(tp, sacked, scb->end_seq, + &skb->skb_mstamp, + &sack->ack_time); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3595,7 +3542,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 lost = tp->lost; int acked = 0; /* Number of packets newly acked */ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ - struct skb_mstamp now; sack_state.first_sackt.v64 = 0; sack_state.rate = &rs; @@ -3621,10 +3567,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&now); + skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) + if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) { @@ -3689,11 +3634,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, - &sack_state, &now); + &sack_state); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3708,15 +3654,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &now, &rs); - tcp_cong_control(sk, ack, delivered, flag, &rs); + tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, + sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3737,9 +3685,11 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. 
*/ if (TCP_SKB_CB(skb)->sacked) { + skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, + &sack_state.ack_time); tcp_xmit_recovery(sk, rexmit); } @@ -6363,7 +6313,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timewait bucket, so that all the necessary checks * are made in the function processing timewait state. */ - if (tcp_death_row.sysctl_tw_recycle) { + if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { bool strict; dst = af_ops->route_req(sk, &fl, req, &strict); @@ -6377,8 +6327,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } /* Kill the following clause, if you dislike this way. */ else if (!net->ipv4.sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fe9da4fb96bf..63214136cf1c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -146,6 +146,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct rtable *rt; int err; struct ip_options_rcu *inet_opt; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -196,7 +197,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) tcp_fetch_timewait_stamp(sk, &rt->dst); @@ -215,7 +216,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. 
*/ tcp_set_state(sk, TCP_SYN_SENT); - err = inet_hash_connect(&tcp_death_row, sk); + err = inet_hash_connect(tcp_death_row, sk); if (err) goto failure; @@ -2228,7 +2229,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) int state; if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; @@ -2375,6 +2376,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, @@ -2418,7 +2420,7 @@ static void __net_exit tcp_sk_exit(struct net *net) static int __net_init tcp_sk_init(struct net *net) { - int res, cpu; + int res, cpu, cnt; net->ipv4.tcp_sk = alloc_percpu(struct sock *); if (!net->ipv4.tcp_sk) @@ -2457,6 +2459,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 0; + cnt = tcp_hashinfo.ehash_mask + 1; + net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; + net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + return 0; fail: tcp_sk_exit(net); @@ -2466,7 +2475,7 @@ fail: static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); + inet_twsk_purge(&tcp_hashinfo, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { @@ -2477,7 +2486,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { void __init tcp_v4_init(void) { - inet_hashinfo_init(&tcp_hashinfo); if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index ba8f02d0f283..b9ed0d50aead 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -522,7 +522,6 @@ void tcp_init_metrics(struct sock *sk) val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) { tcp_disable_fack(tp); - tcp_disable_early_retrans(tp); tp->reordering = val; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 28ce5ee831f5..bdb443471c39 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -29,12 +29,6 @@ int sysctl_tcp_abort_on_overflow __read_mostly; -struct inet_timewait_death_row tcp_death_row = { - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &tcp_hashinfo, -}; -EXPORT_SYMBOL_GPL(tcp_death_row); - static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -100,6 +94,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; + struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -153,7 +148,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) inet_twsk_reschedule(tw, tw->tw_timeout); 
@@ -264,11 +259,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; bool recycle_ok = false; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; - if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tcp_remember_stamp(sk); - tw = inet_twsk_alloc(sk, &tcp_death_row, state); + tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -472,7 +468,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->sacked_out = 0; newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack.stamp_jiffies; newsk->sk_txhash = treq->txhash; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d5331a1b1dc..9a1a1494b9dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -76,10 +76,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT, tcp_skb_pcount(skb)); @@ -2289,8 +2287,6 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) - return false; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { tcp_rearm_rto(sk); @@ -2309,8 +2305,9 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !tp->packets_out || - !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) || + !tp->packets_out || !tcp_is_sack(tp) || + icsk->icsk_ca_state != TCP_CA_Open) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && @@ -2831,36 +2828,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) return err; } -/* Check if we forward retransmits are possible in the current - * window/congestion state. - */ -static bool tcp_can_forward_retransmit(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - const struct tcp_sock *tp = tcp_sk(sk); - - /* Forward retransmissions are possible only during Recovery. */ - if (icsk->icsk_ca_state != TCP_CA_Recovery) - return false; - - /* No forward retransmissions in Reno are possible. */ - if (tcp_is_reno(tp)) - return false; - - /* Yeah, we have to make difficult choice between forward transmission - * and retransmission... Both ways have their merits... - * - * For now we do not retransmit anything, while we have some new - * segments to send. In the other cases, follow rule 3 for - * NextSeg() specified in RFC3517. - */ - - if (tcp_may_send_now(sk)) - return false; - - return true; -} - /* This gets called after a retransmit timeout, and the initially * retransmitted data is acknowledged. 
It tries to continue * resending the rest of the retransmit queue, until either @@ -2875,24 +2842,16 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 max_segs, last_lost; + u32 max_segs; int mib_idx; - int fwd_rexmitting = 0; if (!tp->packets_out) return; - if (!tp->lost_out) - tp->retransmit_high = tp->snd_una; - if (tp->retransmit_skb_hint) { skb = tp->retransmit_skb_hint; - last_lost = TCP_SKB_CB(skb)->end_seq; - if (after(last_lost, tp->retransmit_high)) - last_lost = tp->retransmit_high; } else { skb = tcp_write_queue_head(sk); - last_lost = tp->snd_una; } max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); @@ -2915,31 +2874,14 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ segs = min_t(int, segs, max_segs); - if (fwd_rexmitting) { -begin_fwd: - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) - break; - mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - - } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { - tp->retransmit_high = last_lost; - if (!tcp_can_forward_retransmit(sk)) - break; - /* Backtrack if necessary to non-L'ed skb */ - if (hole) { - skb = hole; - hole = NULL; - } - fwd_rexmitting = 1; - goto begin_fwd; - + if (tp->retrans_out >= tp->lost_out) { + break; } else if (!(sacked & TCPCB_LOST)) { if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED))) hole = skb; continue; } else { - last_lost = TCP_SKB_CB(skb)->end_seq; if (icsk->icsk_ca_state != TCP_CA_Loss) mib_idx = LINUX_MIB_TCPFASTRETRANS; else @@ -2960,7 +2902,8 @@ begin_fwd: if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk)) + if (skb == tcp_write_queue_head(sk) && + icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index e36df4fcfeba..4ecb38ae8504 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -1,9 +1,32 @@ #include <linux/tcp.h> #include <net/tcp.h> -int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; +int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION; -/* Marks a packet lost, if some packet sent later has been (s)acked. +static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_skb_mark_lost_uncond_verify(tp, skb); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { + /* Account for retransmits that are lost again */ + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); + } +} + +static bool tcp_rack_sent_after(const struct skb_mstamp *t1, + const struct skb_mstamp *t2, + u32 seq1, u32 seq2) +{ + return skb_mstamp_after(t1, t2) || + (t1->v64 == t2->v64 && after(seq1, seq2)); +} + +/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01): + * + * Marks a packet lost, if some packet sent later has been (s)acked. * The underlying idea is similar to the traditional dupthresh and FACK * but they look at different metrics: * @@ -16,31 +39,26 @@ int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS; * is being more resilient to reordering by simply allowing some * "settling delay", instead of tweaking the dupthresh. * - * The current version is only used after recovery starts but can be - * easily extended to detect the first loss. 
+ * When tcp_rack_detect_loss() detects some packets are lost and we + * are not already in the CA_Recovery state, either tcp_rack_reo_timeout() + * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will + * make us enter the CA_Recovery state. */ -int tcp_rack_mark_lost(struct sock *sk) +static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, + u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - u32 reo_wnd, prior_retrans = tp->retrans_out; - - if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced) - return 0; - - /* Reset the advanced flag to avoid unnecessary queue scanning */ - tp->rack.advanced = 0; + u32 reo_wnd; + *reo_timeout = 0; /* To be more reordering resilient, allow min_rtt/4 settling delay * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed * RTT because reordering is often a path property and less related * to queuing or delayed ACKs. - * - * TODO: measure and adapt to the observed reordering delay, and - * use a timer to retransmit like the delayed early retransmit. */ reo_wnd = 1000; - if (tp->rack.reord && tcp_min_rtt(tp) != ~0U) + if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U) reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd); tcp_for_write_queue(skb, sk) { @@ -54,20 +72,29 @@ int tcp_rack_mark_lost(struct sock *sk) scb->sacked & TCPCB_SACKED_ACKED) continue; - if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) { + if (tcp_rack_sent_after(&tp->rack.mstamp, &skb->skb_mstamp, + tp->rack.end_seq, scb->end_seq)) { + /* Step 3 in draft-cheng-tcpm-rack-00.txt: + * A packet is lost if its elapsed time is beyond + * the recent RTT plus the reordering window. + */ + u32 elapsed = skb_mstamp_us_delta(now, + &skb->skb_mstamp); + s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; - if (skb_mstamp_us_delta(&tp->rack.mstamp, - &skb->skb_mstamp) <= reo_wnd) + if (remaining < 0) { + tcp_rack_mark_skb_lost(sk, skb); continue; - - /* skb is lost if packet sent later is sacked */ - tcp_skb_mark_lost_uncond_verify(tp, skb); - if (scb->sacked & TCPCB_SACKED_RETRANS) { - scb->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); } + + /* Skip ones marked lost but not yet retransmitted */ + if ((scb->sacked & TCPCB_LOST) && + !(scb->sacked & TCPCB_SACKED_RETRANS)) + continue; + + /* Record maximum wait time (+1 to avoid 0) */ + *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining); + } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early * b/c the rest are all sent after rack_sent @@ -75,20 +102,43 @@ int tcp_rack_mark_lost(struct sock *sk) break; } } - return prior_retrans - tp->retrans_out; } -/* Record the most recently (re)sent time among the (s)acked packets */ -void tcp_rack_advance(struct tcp_sock *tp, - const struct skb_mstamp *xmit_time, u8 sacked) +void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout; + + if (!tp->rack.advanced) + return; + + /* Reset the advanced flag to avoid unnecessary queue scanning */ + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk, now, &timeout); + if (timeout) { + timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } +} + +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and 
update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, + const struct skb_mstamp *xmit_time, + const struct skb_mstamp *ack_time) { + u32 rtt_us; + if (tp->rack.mstamp.v64 && - !skb_mstamp_after(xmit_time, &tp->rack.mstamp)) + !tcp_rack_sent_after(xmit_time, &tp->rack.mstamp, + end_seq, tp->rack.end_seq)) return; + rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); if (sacked & TCPCB_RETRANS) { - struct skb_mstamp now; - /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. @@ -99,11 +149,35 @@ void tcp_rack_advance(struct tcp_sock *tp, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ - skb_mstamp_get(&now); - if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp)) + if (rtt_us < tcp_min_rtt(tp)) return; } - + tp->rack.rtt_us = rtt_us; tp->rack.mstamp = *xmit_time; + tp->rack.end_seq = end_seq; tp->rack.advanced = 1; } + +/* We have waited long enough to accommodate reordering. Mark the expired + * packets lost and retransmit them. + */ +void tcp_rack_reo_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + u32 timeout, prior_inflight; + + skb_mstamp_get(&now); + prior_inflight = tcp_packets_in_flight(tp); + tcp_rack_detect_loss(sk, &now, &timeout); + if (prior_inflight != tcp_packets_in_flight(tp)) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { + tcp_enter_recovery(sk, false); + if (!inet_csk(sk)->icsk_ca_ops->cong_control) + tcp_cwnd_reduction(sk, 1, 0); + } + tcp_xmit_retransmit_queue(sk); + } + if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS) + tcp_rearm_rto(sk); +} diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3705075f42c3..40d893556e67 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -563,8 +563,8 @@ void tcp_write_timer_handler(struct sock *sk) event = icsk->icsk_pending; switch (event) { - case ICSK_TIME_EARLY_RETRANS: - tcp_resume_early_retransmit(sk); + case ICSK_TIME_REO_TIMEOUT: + tcp_rack_reo_timeout(sk); break; case ICSK_TIME_LOSS_PROBE: tcp_send_loss_probe(sk); @@ -617,6 +617,7 @@ void tcp_set_keepalive(struct sock *sk, int val) else if (!val) inet_csk_delete_keepalive_timer(sk); } +EXPORT_SYMBOL_GPL(tcp_set_keepalive); static void tcp_keepalive_timer (unsigned long data) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1307a7c2e544..4318d72e0248 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -153,13 +153,18 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (!sk2->sk_reuseport || !sk->sk_reuseport || - rcu_access_pointer(sk->sk_reuseport_cb) || - !uid_eq(uid, sock_i_uid(sk2))) && saddr_comp(sk, sk2, true)) { - if (!bitmap) - return 1; - __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); + if (sk2->sk_reuseport && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(uid, sock_i_uid(sk2))) { + if (!bitmap) + return 0; + } else { + if (!bitmap) + return 1; + __set_bit(udp_sk(sk2)->udp_port_hash >> log, + bitmap); + } } } return 0; @@ -188,11 +193,14 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (!sk2->sk_reuseport || !sk->sk_reuseport || - 
rcu_access_pointer(sk->sk_reuseport_cb) || - !uid_eq(uid, sock_i_uid(sk2))) && saddr_comp(sk, sk2, true)) { - res = 1; + if (sk2->sk_reuseport && sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(uid, sock_i_uid(sk2))) { + res = 0; + } else { + res = 1; + } break; } } @@ -285,6 +293,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); + cond_resched(); } while (++first != last); goto fail; } else { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1e124bc8e1e..ac9bd5620f81 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4888,6 +4888,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) struct net *net = dev_net(ifa->idev->dev); int err = -ENOBUFS; + /* Don't send DELADDR notification for TENTATIVE address, + * since NEWADDR notification is sent only after removing + * TENTATIVE flag. + */ + if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + return; + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3036f665e6c8..230b5aac9f03 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -110,19 +110,17 @@ static const struct inet6_protocol icmpv6_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +/* Called with BH disabled */ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - sk = icmpv6_sk(net); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ - local_bh_enable(); return NULL; } return sk; @@ -130,7 +128,7 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net) static __inline__ void icmpv6_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + spin_unlock(&sk->sk_lock.slock); } /* @@ -168,6 +166,30 @@ static bool is_ineligible(const struct sk_buff *skb) return false; } +static bool icmpv6_mask_allow(int type) +{ + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return true; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return true; + + return false; +} + +static bool icmpv6_global_allow(int type) +{ + if (icmpv6_mask_allow(type)) + return true; + + if (icmp_global_allow()) + return true; + + return false; +} + /* * Check the ICMP output rate limit */ @@ -178,12 +200,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct dst_entry *dst; bool res = false; - /* Informational messages are not limited. */ - if (type & ICMPV6_INFOMSG_MASK) - return true; - - /* Do not limit pmtu discovery, it would break it. */ - if (type == ICMPV6_PKT_TOOBIG) + if (icmpv6_mask_allow(type)) return true; /* @@ -200,20 +217,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; + struct inet_peer *peer; /* Give more bandwidth to wider prefixes. 
*/ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - if (icmp_global_allow()) { - struct inet_peer *peer; - - peer = inet_getpeer_v6(net->ipv6.peers, - &fl6->daddr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); - } + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); } dst_release(dst); return res; @@ -474,6 +487,13 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } + /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit */ + if (!icmpv6_global_allow(type)) + goto out_bh_enable; + mip6_addr_swap(skb); memset(&fl6, 0, sizeof(fl6)); @@ -492,7 +512,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; + sk->sk_mark = mark; np = inet6_sk(sk); @@ -552,6 +573,8 @@ out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } /* Slightly more convenient version of icmp6_send. @@ -665,9 +688,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb) fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); + local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) - return; + goto out_bh_enable; sk->sk_mark = mark; np = inet6_sk(sk); @@ -709,6 +733,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) dst_release(dst); out: icmpv6_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 604d8953c775..e275077e8af2 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2243,8 +2243,10 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mf6c_parent >= MAXMIFS) + if (c->mf6c_parent >= MAXMIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; + } if (MIF_EXISTS(mrt, c->mf6c_parent) && nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index ee97c44e2aa0..a531ba032b85 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -595,16 +595,24 @@ done: if (val) { struct net_device *dev; + int midx; - if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) - goto e_inval; + rcu_read_lock(); - dev = dev_get_by_index(net, val); + dev = dev_get_by_index_rcu(net, val); if (!dev) { + rcu_read_unlock(); retv = -ENODEV; break; } - dev_put(dev); + midx = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != val && + (!midx || midx != sk->sk_bound_dev_if)) + goto e_inval; } np->mcast_oif = val; retv = 0; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index e1f8b34d7a2e..9b522fa90e6d 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -126,12 +126,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return PTR_ERR(dst); rt = (struct rt6_info *) dst; - np = inet6_sk(sk); - if (!np) { - err = -EBADF; - goto dst_err_out; - } - if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = np->mcast_oif; else if (!fl6.flowi6_oif) @@ -166,7 +160,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } release_sock(sk); -dst_err_out: 
dst_release(dst); if (err) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 03a064803626..5215e1eba010 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -45,7 +45,7 @@ #include <net/seg6_hmac.h> #include <linux/random.h> -static char * __percpu *hmac_ring; +static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -192,7 +192,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = *this_cpu_ptr(hmac_ring); + ring = this_cpu_ptr(hmac_ring); off = ring; /* source address */ @@ -353,27 +353,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_ring(void) -{ - int i; - - hmac_ring = alloc_percpu(char *); - - if (!hmac_ring) - return -ENOMEM; - - for_each_possible_cpu(i) { - char *ring = kzalloc(SEG6_HMAC_RING_SIZE, GFP_KERNEL); - - if (!ring) - return -ENOMEM; - - *per_cpu_ptr(hmac_ring, i) = ring; - } - - return 0; -} - static int seg6_hmac_init_algo(void) { struct seg6_hmac_algo *algo; @@ -422,16 +401,7 @@ static int seg6_hmac_init_algo(void) int __init seg6_hmac_init(void) { - int ret; - - ret = seg6_hmac_init_ring(); - if (ret < 0) - goto out; - - ret = seg6_hmac_init_algo(); - -out: - return ret; + return seg6_hmac_init_algo(); } EXPORT_SYMBOL(seg6_hmac_init); @@ -450,13 +420,6 @@ void seg6_hmac_exit(void) struct seg6_hmac_algo *algo = NULL; int i, alg_count, cpu; - for_each_possible_cpu(i) { - char *ring = *per_cpu_ptr(hmac_ring, i); - - kfree(ring); - } - free_percpu(hmac_ring); - alg_count = sizeof(hmac_algos) / sizeof(struct seg6_hmac_algo); for (i = 0; i < alg_count; i++) { algo = &hmac_algos[i]; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index a4d49760bf43..895ff650db43 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,7 +16,7 @@ #include <linux/tcp.h> #include <linux/random.h> -#include <linux/cryptohash.h> +#include <linux/siphash.h> #include <linux/kernel.h> #include <net/ipv6.h> #include <net/tcp.h> @@ -24,7 +24,7 @@ #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; +static siphash_key_t syncookie6_secret[2] __read_mostly; /* RFC 2460, Section 8.3: * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..] 
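Note: the syncookies.c hunk that follows replaces the per-cpu SHA scratch buffer with a single siphash over a packed struct holding the address pair, counter and ports, hashed up to offsetofend(..., dport). A rough userspace sketch of that input layout is below; the struct and the offsetofend() helper are reproduced here only so the sketch builds outside the kernel, and the in6_addr fields are modeled as plain 16-byte arrays.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Same shape as the kernel helper, reproduced for a userspace build. */
#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct cookie_input {
	uint8_t  saddr[16];	/* stands in for struct in6_addr */
	uint8_t  daddr[16];
	uint32_t count;
	uint16_t sport;
	uint16_t dport;
};

int main(void)
{
	/* The hash length stops right after dport, so trailing padding the
	 * compiler might add never reaches the hash; with this layout the
	 * two lengths happen to coincide, but offsetofend() states the
	 * intent explicitly. */
	printf("hashed bytes: %zu, sizeof: %zu\n",
	       offsetofend(struct cookie_input, dport),
	       sizeof(struct cookie_input));
	return 0;
}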
@@ -41,30 +41,27 @@ static __u16 const msstab[] = { 9000 - 60, }; -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch); - -static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr, +static u32 cookie_hash(const struct in6_addr *saddr, + const struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + u32 count; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *saddr, + .daddr = *daddr, + .count = count, + .sport = sport, + .dport = dport + }; net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret)); - - tmp = this_cpu_ptr(ipv6_cookie_scratch); - - /* - * we have 320 bits of information to hash, copy in the remaining - * 192 bits required for sha_transform, from the syncookie6_secret - * and overwrite the digest with the secret - */ - memcpy(tmp + 10, syncookie6_secret[c], 44); - memcpy(tmp, saddr, 16); - memcpy(tmp + 4, daddr, 16); - tmp[8] = ((__force u32)sport << 16) + (__force u32)dport; - tmp[9] = count; - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); - - return tmp[17]; + return siphash(&combined, offsetofend(typeof(combined), dport), + &syncookie6_secret[c]); } static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 73bc8fc68acd..fc14e04028bf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -123,6 +123,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct dst_entry *dst; int addr_type; int err; + struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -258,7 +259,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - if (tcp_death_row.sysctl_tw_recycle && + if (tcp_death_row->sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) tcp_fetch_timewait_stamp(sk, dst); @@ -273,7 +274,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = inet6_hash_connect(&tcp_death_row, sk); + err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; @@ -1744,7 +1745,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || - icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; @@ -1888,6 +1889,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .sendpage = tcp_sendpage, @@ -1948,7 +1950,7 @@ static void __net_exit tcpv6_net_exit(struct net *net) static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET6); + inet_twsk_purge(&tcp_hashinfo, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index e2c6ae024565..8bf18a5f66e0 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -106,8 +106,8 @@ static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct 
net_device *dev) return NETDEV_TX_OK; } -static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void l2tp_eth_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct l2tp_eth *priv = netdev_priv(dev); @@ -117,10 +117,8 @@ static struct rtnl_link_stats64 *l2tp_eth_get_stats64(struct net_device *dev, stats->rx_bytes = atomic_long_read(&priv->rx_bytes); stats->rx_packets = atomic_long_read(&priv->rx_packets); stats->rx_errors = atomic_long_read(&priv->rx_errors); - return stats; } - static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 3d73278b86ca..b07a859f21bd 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -53,19 +53,26 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, struct sock *sk; sk_for_each_bound(sk, &l2tp_ip_bind_table) { - struct inet_sock *inet = inet_sk(sk); - struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); + const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); + const struct inet_sock *inet = inet_sk(sk); - if (l2tp == NULL) + if (!net_eq(sock_net(sk), net)) continue; - if ((l2tp->conn_id == tunnel_id) && - net_eq(sock_net(sk), net) && - !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && - (!inet->inet_daddr || !raddr || inet->inet_daddr == raddr) && - (!sk->sk_bound_dev_if || !dif || - sk->sk_bound_dev_if == dif)) - goto found; + if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + continue; + + if (inet->inet_rcv_saddr && laddr && + inet->inet_rcv_saddr != laddr) + continue; + + if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) + continue; + + if (l2tp->conn_id != tunnel_id) + continue; + + goto found; } sk = NULL; @@ -258,7 +265,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!sock_flag(sk, SOCK_ZAPPED)) goto out; - if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip)) + if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 331ccf5a7bad..4b06eb415f68 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -57,8 +57,8 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk) return (struct l2tp_ip6_sock *)sk; } -static struct sock *__l2tp_ip6_bind_lookup(struct net *net, - struct in6_addr *laddr, +static struct sock *__l2tp_ip6_bind_lookup(const struct net *net, + const struct in6_addr *laddr, const struct in6_addr *raddr, int dif, u32 tunnel_id) { @@ -67,18 +67,26 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net, sk_for_each_bound(sk, &l2tp_ip6_bind_table) { const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk); const struct in6_addr *sk_raddr = &sk->sk_v6_daddr; - struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); + const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk); - if (l2tp == NULL) + if (!net_eq(sock_net(sk), net)) continue; - if ((l2tp->conn_id == tunnel_id) && - net_eq(sock_net(sk), net) && - (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) && - (!raddr || ipv6_addr_any(sk_raddr) || ipv6_addr_equal(sk_raddr, raddr)) && - (!sk->sk_bound_dev_if || !dif || - sk->sk_bound_dev_if == dif)) - goto found; + if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif) + continue; + + if (sk_laddr && !ipv6_addr_any(sk_laddr) && + !ipv6_addr_any(laddr) && 
!ipv6_addr_equal(sk_laddr, laddr)) + continue; + + if (!ipv6_addr_any(sk_raddr) && raddr && + !ipv6_addr_any(raddr) && !ipv6_addr_equal(sk_raddr, raddr)) + continue; + + if (l2tp->conn_id != tunnel_id) + continue; + + goto found; } sk = NULL; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index e91e503bf992..a0be2f6cd121 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3563,6 +3563,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, } EXPORT_SYMBOL(ieee80211_nan_func_match); +static int ieee80211_set_multicast_to_unicast(struct wiphy *wiphy, + struct net_device *dev, + const bool enabled) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + sdata->u.ap.multicast_to_unicast = enabled; + + return 0; +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -3653,4 +3664,5 @@ const struct cfg80211_ops mac80211_config_ops = { .nan_change_conf = ieee80211_nan_change_conf, .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, + .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a0d901d8992e..89178b46b32f 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1267,7 +1267,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata, *sdata_tmp; struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx; struct ieee80211_chanctx *new_ctx = NULL; - int i, err, n_assigned, n_reserved, n_ready; + int err, n_assigned, n_reserved, n_ready; int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0; lockdep_assert_held(&local->mtx); @@ -1388,8 +1388,6 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * Update all structures, values and pointers to point to new channel * context(s). 
*/ - - i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e02ba42ca827..f62cd0e13c58 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -243,6 +243,31 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, return rv; } +static ssize_t misc_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */ + size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9; + char *buf = kzalloc(bufsz, GFP_KERNEL); + char *pos = buf, *end = buf + bufsz - 1; + ssize_t rv; + int i; + int ln; + + pos += scnprintf(pos, end - pos, "pending:\n"); + + for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { + ln = skb_queue_len(&local->pending[i]); + pos += scnprintf(pos, end - pos, "[%i] %d\n", + i, ln); + } + + rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); + kfree(buf); + return rv; +} + static ssize_t queues_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -263,6 +288,7 @@ static ssize_t queues_read(struct file *file, char __user *user_buf, DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); +DEBUGFS_READONLY_FILE_OPS(misc); /* statistics stuff */ @@ -331,6 +357,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); DEBUGFS_ADD(queues); + DEBUGFS_ADD(misc); #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1a05f85cb1f0..8f5fff8b2040 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -519,6 +519,8 @@ static ssize_t ieee80211_if_fmt_aqm( } IEEE80211_IF_FILE_R(aqm); +IEEE80211_IF_FILE(multicast_to_unicast, u.ap.multicast_to_unicast, HEX); + /* IBSS attributes */ static ssize_t ieee80211_if_fmt_tsf( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) @@ -683,6 +685,7 @@ static void add_ap_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(dtim_count); DEBUGFS_ADD(num_buffered_multicast); DEBUGFS_ADD_MODE(tkip_mic_test, 0200); + DEBUGFS_ADD_MODE(multicast_to_unicast, 0600); } static void add_vlan_files(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b2069fbd60f9..159a1a733725 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -297,6 +297,7 @@ struct ieee80211_if_ap { driver_smps_mode; /* smps mode request */ struct work_struct request_smps_work; + bool multicast_to_unicast; }; struct ieee80211_if_wds { @@ -624,8 +625,8 @@ struct ieee80211_mesh_sync_ops { struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ - void (*adjust_tbtt)(struct ieee80211_sub_if_data *sdata, - struct beacon_data *beacon); + void (*adjust_tsf)(struct ieee80211_sub_if_data *sdata, + struct beacon_data *beacon); /* add other framework functions here */ }; @@ -688,7 +689,6 @@ struct ieee80211_if_mesh { const struct ieee80211_mesh_sync_ops *sync_ops; s64 sync_offset_clockdrift_max; spinlock_t sync_offset_lock; - bool adjusting_tbtt; /* mesh power save */ enum nl80211_mesh_power_mode nonpeer_pm; int ps_peers_light_sleep; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d37ae7dc114b..40813dd3301c 100644 --- a/net/mac80211/iface.c +++ 
b/net/mac80211/iface.c @@ -1123,7 +1123,7 @@ static u16 ieee80211_netdev_select_queue(struct net_device *dev, return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } -static struct rtnl_link_stats64 * +static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; @@ -1148,8 +1148,6 @@ ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->rx_bytes += rx_bytes; stats->tx_bytes += tx_bytes; } - - return stats; } static const struct net_device_ops ieee80211_dataif_ops = { diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 42120d965263..9c23172feba0 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -279,10 +279,6 @@ int mesh_add_meshconf_ie(struct ieee80211_sub_if_data *sdata, /* Mesh PS mode. See IEEE802.11-2012 8.4.2.100.8 */ *pos |= ifmsh->ps_peers_deep_sleep ? IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL : 0x00; - *pos++ |= ifmsh->adjusting_tbtt ? - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING : 0x00; - *pos++ = 0x00; - return 0; } @@ -850,7 +846,6 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) ifmsh->mesh_cc_id = 0; /* Disabled */ /* register sync ops from extensible synchronization framework */ ifmsh->sync_ops = ieee80211_mesh_sync_ops_get(ifmsh->mesh_sp_id); - ifmsh->adjusting_tbtt = false; ifmsh->sync_offset_clockdrift_max = 0; set_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags); ieee80211_mesh_root_setup(ifmsh); @@ -1349,7 +1344,7 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) ieee80211_mesh_rootpath(sdata); if (test_and_clear_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags)) - mesh_sync_adjust_tbtt(sdata); + mesh_sync_adjust_tsf(sdata); if (test_and_clear_bit(MESH_WORK_MBSS_CHANGED, &ifmsh->wrkq_flags)) mesh_bss_info_changed(sdata); diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 26b9ccbe1fce..7e5f271e3c30 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -341,7 +341,7 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) } void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata); -void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata); +void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata); void ieee80211s_stop(void); #else static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata) diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 7fcdcf622655..fcba70e57073 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -505,12 +505,14 @@ mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *addr, /* Userspace handles station allocation */ if (sdata->u.mesh.user_mpm || - sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) - cfg80211_notify_new_peer_candidate(sdata->dev, addr, - elems->ie_start, - elems->total_len, - GFP_KERNEL); - else + sdata->u.mesh.security & IEEE80211_MESH_SEC_AUTHED) { + if (mesh_peer_accepts_plinks(elems) && + mesh_plink_availables(sdata)) + cfg80211_notify_new_peer_candidate(sdata->dev, addr, + elems->ie_start, + elems->total_len, + GFP_KERNEL); + } else sta = __mesh_sta_info_alloc(sdata, addr); return sta; diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index faca22cd02b5..a435f094a82e 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -12,7 +12,7 @@ #include "mesh.h" #include "driver-ops.h" -/* This is not in the standard. It represents a tolerable tbtt drift below +/* This is not in the standard. 
It represents a tolerable tsf drift below * which we do no TSF adjustment. */ #define TOFFSET_MINIMUM_ADJUSTMENT 10 @@ -46,7 +46,7 @@ static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie) IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; } -void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) +void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -57,12 +57,12 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) spin_lock_bh(&ifmsh->sync_offset_lock); if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { - msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", + msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting\n", (long long) ifmsh->sync_offset_clockdrift_max); tsfdelta = -ifmsh->sync_offset_clockdrift_max; ifmsh->sync_offset_clockdrift_max = 0; } else { - msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting by %llu\n", + msync_dbg(sdata, "TSF : max clockdrift=%lld; adjusting by %llu\n", (long long) ifmsh->sync_offset_clockdrift_max, (unsigned long long) beacon_int_fraction); tsfdelta = -beacon_int_fraction; @@ -123,7 +123,6 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, */ if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { - clear_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", sta->sta.addr); goto no_sync; @@ -168,15 +167,13 @@ no_sync: rcu_read_unlock(); } -static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, +static void mesh_sync_offset_adjust_tsf(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - u8 cap; WARN_ON(ifmsh->mesh_sp_id != IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET); WARN_ON(!rcu_read_lock_held()); - cap = beacon->meshconf->meshconf_cap; spin_lock_bh(&ifmsh->sync_offset_lock); @@ -187,24 +184,16 @@ static void mesh_sync_offset_adjust_tbtt(struct ieee80211_sub_if_data *sdata, * the tsf adjustment to the mesh tasklet */ msync_dbg(sdata, - "TBTT : kicking off TBTT adjustment with clockdrift_max=%lld\n", + "TSF : kicking off TSF adjustment with clockdrift_max=%lld\n", ifmsh->sync_offset_clockdrift_max); set_bit(MESH_WORK_DRIFT_ADJUST, &ifmsh->wrkq_flags); - - ifmsh->adjusting_tbtt = true; } else { msync_dbg(sdata, - "TBTT : max clockdrift=%lld; too small to adjust\n", + "TSF : max clockdrift=%lld; too small to adjust\n", (long long)ifmsh->sync_offset_clockdrift_max); ifmsh->sync_offset_clockdrift_max = 0; - - ifmsh->adjusting_tbtt = false; } spin_unlock_bh(&ifmsh->sync_offset_lock); - - beacon->meshconf->meshconf_cap = ifmsh->adjusting_tbtt ? 
- IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING | cap : - ~IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING & cap; } static const struct sync_method sync_methods[] = { @@ -212,7 +201,7 @@ static const struct sync_method sync_methods[] = { .method = IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET, .ops = { .rx_bcn_presp = &mesh_sync_offset_rx_bcn_presp, - .adjust_tbtt = &mesh_sync_offset_adjust_tbtt, + .adjust_tsf = &mesh_sync_offset_adjust_tsf, } }, }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 098ce9b179ee..8a6344518674 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1486,10 +1486,6 @@ void ieee80211_recalc_ps(struct ieee80211_local *local) if (count == 1 && ieee80211_powersave_allowed(found)) { u8 dtimper = found->u.mgd.dtim_period; - s32 beaconint_us; - - beaconint_us = ieee80211_tu_to_usec( - found->vif.bss_conf.beacon_int); timeout = local->dynamic_ps_forced_timeout; if (timeout < 0) diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 14c5ba3a1b1c..3ebe4405a2d4 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -159,21 +159,23 @@ minstrel_update_rates(struct minstrel_priv *mp, struct minstrel_sta_info *mi) void minstrel_calc_rate_stats(struct minstrel_rate_stats *mrs) { + unsigned int cur_prob; + if (unlikely(mrs->attempts > 0)) { mrs->sample_skipped = 0; - mrs->cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); + cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); if (unlikely(!mrs->att_hist)) { - mrs->prob_ewma = mrs->cur_prob; + mrs->prob_ewma = cur_prob; } else { /* update exponential weighted moving variance */ - mrs->prob_ewmsd = minstrel_ewmsd(mrs->prob_ewmsd, - mrs->cur_prob, - mrs->prob_ewma, - EWMA_LEVEL); + mrs->prob_ewmv = minstrel_ewmv(mrs->prob_ewmv, + cur_prob, + mrs->prob_ewma, + EWMA_LEVEL); /*update exponential weighted moving avarage */ mrs->prob_ewma = minstrel_ewma(mrs->prob_ewma, - mrs->cur_prob, + cur_prob, EWMA_LEVEL); } mrs->att_hist += mrs->attempts; @@ -365,6 +367,11 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, return; #endif + /* Don't use EAPOL frames for sampling on non-mrr hw */ + if (mp->hw->max_rates == 1 && + (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) + return; + delta = (mi->total_packets * sampling_ratio / 100) - (mi->sample_packets + mi->sample_deferred / 2); diff --git a/net/mac80211/rc80211_minstrel.h b/net/mac80211/rc80211_minstrel.h index c230bbe93262..be6c3f35f48b 100644 --- a/net/mac80211/rc80211_minstrel.h +++ b/net/mac80211/rc80211_minstrel.h @@ -14,7 +14,7 @@ #define SAMPLE_COLUMNS 10 /* number of columns in sample table */ /* scaled fraction values */ -#define MINSTREL_SCALE 16 +#define MINSTREL_SCALE 12 #define MINSTREL_FRAC(val, div) (((val) << MINSTREL_SCALE) / div) #define MINSTREL_TRUNC(val) ((val) >> MINSTREL_SCALE) @@ -36,21 +36,16 @@ minstrel_ewma(int old, int new, int weight) } /* - * Perform EWMSD (Exponentially Weighted Moving Standard Deviation) calculation + * Perform EWMV (Exponentially Weighted Moving Variance) calculation */ static inline int -minstrel_ewmsd(int old_ewmsd, int cur_prob, int prob_ewma, int weight) +minstrel_ewmv(int old_ewmv, int cur_prob, int prob_ewma, int weight) { - int diff, incr, tmp_var; + int diff, incr; - /* calculate exponential weighted moving variance */ - diff = MINSTREL_TRUNC((cur_prob - prob_ewma) * 1000000); + diff = cur_prob - prob_ewma; incr = (EWMA_DIV - weight) * diff / EWMA_DIV; - tmp_var = old_ewmsd * old_ewmsd; - tmp_var = weight * (tmp_var + diff * incr / 1000000) / 
EWMA_DIV; - - /* return standard deviation */ - return (u16) int_sqrt(tmp_var); + return weight * (old_ewmv + MINSTREL_TRUNC(diff * incr)) / EWMA_DIV; } struct minstrel_rate_stats { @@ -59,15 +54,13 @@ struct minstrel_rate_stats { u16 success, last_success; /* total attempts/success counters */ - u64 att_hist, succ_hist; + u32 att_hist, succ_hist; /* statistis of packet delivery probability - * cur_prob - current prob within last update intervall * prob_ewma - exponential weighted moving average of prob * prob_ewmsd - exp. weighted moving standard deviation of prob */ - unsigned int cur_prob; - unsigned int prob_ewma; - u16 prob_ewmsd; + u16 prob_ewma; + u16 prob_ewmv; /* maximum retry counts */ u8 retry_count; @@ -153,6 +146,14 @@ struct minstrel_debugfs_info { char buf[]; }; +/* Get EWMSD (Exponentially Weighted Moving Standard Deviation) * 10 */ +static inline int +minstrel_get_ewmsd10(struct minstrel_rate_stats *mrs) +{ + unsigned int ewmv = mrs->prob_ewmv; + return int_sqrt(MINSTREL_TRUNC(ewmv * 1000 * 1000)); +} + extern const struct rate_control_ops mac80211_minstrel; void minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir); void minstrel_remove_sta_debugfs(void *priv, void *priv_sta); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 820b0abc9c0d..36fc971deb86 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -75,7 +75,7 @@ minstrel_stats_open(struct inode *inode, struct file *file) { struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, prob, eprob; + unsigned int i, tp_max, tp_avg, eprob; char *p; ms = kmalloc(2048, GFP_KERNEL); @@ -86,13 +86,14 @@ minstrel_stats_open(struct inode *inode, struct file *file) p = ms->buf; p += sprintf(p, "\n"); p += sprintf(p, - "best __________rate_________ ________statistics________ ________last_______ ______sum-of________\n"); + "best __________rate_________ ________statistics________ ____last_____ ______sum-of________\n"); p += sprintf(p, - "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); + "rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; + unsigned int prob_ewmsd; *(p++) = (i == mi->max_tp_rate[0]) ? 'A' : ' '; *(p++) = (i == mi->max_tp_rate[1]) ? 
'B' : ' '; @@ -107,17 +108,16 @@ minstrel_stats_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" - " %3u.%1u %3u %3u %-3u " + " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -148,7 +148,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) { struct minstrel_sta_info *mi = inode->i_private; struct minstrel_debugfs_info *ms; - unsigned int i, tp_max, tp_avg, prob, eprob; + unsigned int i, tp_max, tp_avg, eprob; char *p; ms = kmalloc(2048, GFP_KERNEL); @@ -161,6 +161,7 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) for (i = 0; i < mi->n_rates; i++) { struct minstrel_rate *mr = &mi->r[i]; struct minstrel_rate_stats *mrs = &mi->r[i].stats; + unsigned int prob_ewmsd; p += sprintf(p, "%s" ,((i == mi->max_tp_rate[0]) ? "A" : "")); p += sprintf(p, "%s" ,((i == mi->max_tp_rate[1]) ? "B" : "")); @@ -175,16 +176,15 @@ minstrel_stats_csv_open(struct inode *inode, struct file *file) tp_max = minstrel_get_tp_avg(mr, MINSTREL_FRAC(100,100)); tp_avg = minstrel_get_tp_avg(mr, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u,%u," "%llu,%llu,%d,%d\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 30fbabf4bcbc..8e783e197e93 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -14,6 +14,7 @@ #include <linux/ieee80211.h> #include <net/mac80211.h> #include "rate.h" +#include "sta_info.h" #include "rc80211_minstrel.h" #include "rc80211_minstrel_ht.h" @@ -154,67 +155,47 @@ MODULE_PARM_DESC(minstrel_vht_only, const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(1, 0, BW_20), MCS_GROUP(2, 0, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 0, BW_20), -#endif MCS_GROUP(1, 1, BW_20), MCS_GROUP(2, 1, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 1, BW_20), -#endif MCS_GROUP(1, 0, BW_40), MCS_GROUP(2, 0, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 0, BW_40), -#endif MCS_GROUP(1, 1, BW_40), MCS_GROUP(2, 1, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 MCS_GROUP(3, 1, BW_40), -#endif CCK_GROUP, #ifdef CONFIG_MAC80211_RC_MINSTREL_VHT VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_20), -#endif VHT_GROUP(1, 1, BW_20), VHT_GROUP(2, 1, BW_20), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_20), -#endif VHT_GROUP(1, 0, BW_40), VHT_GROUP(2, 0, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_40), -#endif VHT_GROUP(1, 1, BW_40), VHT_GROUP(2, 1, BW_40), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_40), -#endif VHT_GROUP(1, 0, BW_80), VHT_GROUP(2, 0, BW_80), -#if 
MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 0, BW_80), -#endif VHT_GROUP(1, 1, BW_80), VHT_GROUP(2, 1, BW_80), -#if MINSTREL_MAX_STREAMS >= 3 VHT_GROUP(3, 1, BW_80), #endif -#endif }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; @@ -301,7 +282,7 @@ minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, break; /* short preamble */ - if (!(mi->groups[group].supported & BIT(idx))) + if (!(mi->supported[group] & BIT(idx))) idx += 4; } return &mi->groups[group].rates[idx]; @@ -486,7 +467,7 @@ minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) MCS_GROUP_RATES].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; - if (!mg->supported || group == MINSTREL_CCK_GROUP) + if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; tmp_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; @@ -540,7 +521,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; - if (!mg->supported) + if (!mi->supported[group]) continue; mi->sample_count++; @@ -550,7 +531,7 @@ minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) tmp_group_tp_rate[j] = group; for (i = 0; i < MCS_GROUP_RATES; i++) { - if (!(mg->supported & BIT(i))) + if (!(mi->supported[group] & BIT(i))) continue; index = MCS_GROUP_RATES * group + i; @@ -636,7 +617,7 @@ minstrel_set_next_sample_idx(struct minstrel_ht_sta *mi) mi->sample_group %= ARRAY_SIZE(minstrel_mcs_groups); mg = &mi->groups[mi->sample_group]; - if (!mg->supported) + if (!mi->supported[mi->sample_group]) continue; if (++mg->index >= MCS_GROUP_RATES) { @@ -657,7 +638,7 @@ minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) while (group > 0) { group--; - if (!mi->groups[group].supported) + if (!mi->supported[group]) continue; if (minstrel_mcs_groups[group].streams > @@ -994,7 +975,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) sample_idx = sample_table[mg->column][mg->index]; minstrel_set_next_sample_idx(mi); - if (!(mg->supported & BIT(sample_idx))) + if (!(mi->supported[sample_group] & BIT(sample_idx))) return -1; mrs = &mg->rates[sample_idx]; @@ -1049,22 +1030,6 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) } static void -minstrel_ht_check_cck_shortpreamble(struct minstrel_priv *mp, - struct minstrel_ht_sta *mi, bool val) -{ - u8 supported = mi->groups[MINSTREL_CCK_GROUP].supported; - - if (!supported || !mi->cck_supported_short) - return; - - if (supported & (mi->cck_supported_short << (val * 4))) - return; - - supported ^= mi->cck_supported_short | (mi->cck_supported_short << 4); - mi->groups[MINSTREL_CCK_GROUP].supported = supported; -} - -static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { @@ -1087,7 +1052,6 @@ minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, minstrel_aggr_check(sta, txrc->skb); info->flags |= mi->tx_flags; - minstrel_ht_check_cck_shortpreamble(mp, mi, txrc->short_preamble); #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) @@ -1154,7 +1118,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, mi->cck_supported_short |= BIT(i); } - mi->groups[MINSTREL_CCK_GROUP].supported = mi->cck_supported; + mi->supported[MINSTREL_CCK_GROUP] = mi->cck_supported; } static void @@ -1168,6 
+1132,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; u16 sta_cap = sta->ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; + struct sta_info *sinfo = container_of(sta, struct sta_info, sta); int use_vht; int n_supported = 0; int ack_dur; @@ -1224,7 +1189,7 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; - mi->groups[i].supported = 0; + mi->supported[i] = 0; if (i == MINSTREL_CCK_GROUP) { minstrel_ht_update_cck(mp, mi, sband, sta); continue; @@ -1256,8 +1221,8 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, if (use_vht && minstrel_vht_only) continue; #endif - mi->groups[i].supported = mcs->rx_mask[nss - 1]; - if (mi->groups[i].supported) + mi->supported[i] = mcs->rx_mask[nss - 1]; + if (mi->supported[i]) n_supported++; continue; } @@ -1283,16 +1248,19 @@ minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, else bw = BW_20; - mi->groups[i].supported = minstrel_get_valid_vht_rates(bw, nss, + mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss, vht_cap->vht_mcs.tx_mcs_map); - if (mi->groups[i].supported) + if (mi->supported[i]) n_supported++; } if (!n_supported) goto use_legacy; + if (test_sta_flag(sinfo, WLAN_STA_SHORT_PREAMBLE)) + mi->cck_supported_short |= mi->cck_supported_short << 4; + /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index e8b52a94d24b..de1646c42e82 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -52,9 +52,6 @@ struct minstrel_mcs_group_data { u8 index; u8 column; - /* bitfield of supported MCS rates of this group */ - u16 supported; - /* sorted rate set within a MCS group*/ u16 max_group_tp_rate[MAX_THR_RATES]; u16 max_group_prob_rate; @@ -101,6 +98,9 @@ struct minstrel_ht_sta { u8 cck_supported; u8 cck_supported_short; + /* Bitfield of supported MCS rates of all groups */ + u16 supported[MINSTREL_GROUPS_NB]; + /* MCS rate group info and statistics */ struct minstrel_mcs_group_data groups[MINSTREL_GROUPS_NB]; }; diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 5320e35ed3d0..7d969e300fb3 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -19,12 +19,12 @@ static char * minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) { const struct mcs_group *mg; - unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; + unsigned int j, tp_max, tp_avg, eprob, tx_time; char htmode = '2'; char gimode = 'L'; u32 gflags; - if (!mi->groups[i].supported) + if (!mi->supported[i]) return p; mg = &minstrel_mcs_groups[i]; @@ -41,8 +41,9 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; + unsigned int prob_ewmsd; - if (!(mi->groups[i].supported & BIT(j))) + if (!(mi->supported[i] & BIT(j))) continue; if (gflags & IEEE80211_TX_RC_MCS) { @@ -83,17 +84,16 @@ minstrel_ht_stats_dump(struct minstrel_ht_sta *mi, int i, char *p) tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - prob = 
MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); p += sprintf(p, "%4u.%1u %4u.%1u %3u.%1u %3u.%1u" - " %3u.%1u %3u %3u %-3u " + " %3u %3u %-3u " "%9llu %-9llu\n", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, @@ -130,9 +130,9 @@ minstrel_ht_stats_open(struct inode *inode, struct file *file) p += sprintf(p, "\n"); p += sprintf(p, - " best ____________rate__________ ________statistics________ ________last_______ ______sum-of________\n"); + " best ____________rate__________ ________statistics________ _____last____ ______sum-of________\n"); p += sprintf(p, - "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [prob.|retry|suc|att] [#success | #attempts]\n"); + "mode guard # rate [name idx airtime max_tp] [avg(tp) avg(prob) sd(prob)] [retry|suc|att] [#success | #attempts]\n"); p = minstrel_ht_stats_dump(mi, MINSTREL_CCK_GROUP, p); for (i = 0; i < MINSTREL_CCK_GROUP; i++) @@ -165,12 +165,12 @@ static char * minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) { const struct mcs_group *mg; - unsigned int j, tp_max, tp_avg, prob, eprob, tx_time; + unsigned int j, tp_max, tp_avg, eprob, tx_time; char htmode = '2'; char gimode = 'L'; u32 gflags; - if (!mi->groups[i].supported) + if (!mi->supported[i]) return p; mg = &minstrel_mcs_groups[i]; @@ -187,8 +187,9 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) struct minstrel_rate_stats *mrs = &mi->groups[i].rates[j]; static const int bitrates[4] = { 10, 20, 55, 110 }; int idx = i * MCS_GROUP_RATES + j; + unsigned int prob_ewmsd; - if (!(mi->groups[i].supported & BIT(j))) + if (!(mi->supported[i] & BIT(j))) continue; if (gflags & IEEE80211_TX_RC_MCS) { @@ -226,16 +227,15 @@ minstrel_ht_stats_csv_dump(struct minstrel_ht_sta *mi, int i, char *p) tp_max = minstrel_ht_get_tp_avg(mi, i, j, MINSTREL_FRAC(100, 100)); tp_avg = minstrel_ht_get_tp_avg(mi, i, j, mrs->prob_ewma); - prob = MINSTREL_TRUNC(mrs->cur_prob * 1000); eprob = MINSTREL_TRUNC(mrs->prob_ewma * 1000); + prob_ewmsd = minstrel_get_ewmsd10(mrs); - p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," + p += sprintf(p, "%u.%u,%u.%u,%u.%u,%u.%u,%u,%u," "%u,%llu,%llu,", tp_max / 10, tp_max % 10, tp_avg / 10, tp_avg % 10, eprob / 10, eprob % 10, - mrs->prob_ewmsd / 10, mrs->prob_ewmsd % 10, - prob / 10, prob % 10, + prob_ewmsd / 10, prob_ewmsd % 10, mrs->retry_count, mrs->last_success, mrs->last_attempts, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3090dd4342f6..b791c4190564 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1908,7 +1908,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) unsigned int frag, seq; struct ieee80211_fragment_entry *entry; struct sk_buff *skb; - struct ieee80211_rx_status *status; hdr = (struct ieee80211_hdr *)rx->skb->data; fc = hdr->frame_control; @@ -2034,9 +2033,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) dev_kfree_skb(skb); } - /* Complete frame has been reassembled - process it now */ - status = IEEE80211_SKB_RXCB(rx->skb); - out: ieee80211_led_rx(rx->local); out_no_led: diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 23d8ac829279..faab3c490d2b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -1120,7 +1120,6 @@ int __ieee80211_request_sched_scan_start(struct 
ieee80211_sub_if_data *sdata, u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; - size_t len; iebufsz = local->scan_ies_len + req->ie_len; @@ -1145,10 +1144,9 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - len = ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, - &sched_scan_ies, req->ie, - req->ie_len, bands_used, - rate_masks, &chandef); + ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + &sched_scan_ies, req->ie, + req->ie_len, bands_used, rate_masks, &chandef); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 50c309094c37..4774e663a411 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -513,23 +513,23 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - struct station_info *sinfo; + struct station_info *sinfo = NULL; int err = 0; lockdep_assert_held(&local->sta_mtx); - sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); - if (!sinfo) { - err = -ENOMEM; - goto out_err; - } - /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_err; } + sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); + if (!sinfo) { + err = -ENOMEM; + goto out_err; + } + local->num_sta++; local->sta_generation++; smp_mb(); @@ -2051,16 +2051,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; - struct rate_control_ref *ref = NULL; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); - if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - ref = local->rate_ctrl; - sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a diff --git a/net/mac80211/status.c b/net/mac80211/status.c index ddf71c648cab..d6a1bfaa7a81 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -541,6 +541,11 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, } else if (info->ack_frame_id) { ieee80211_report_ack_skb(local, info, acked, dropped); } + + if (!dropped && skb->destructor) { + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; + } } /* @@ -633,10 +638,9 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_supported_band *sband; int retry_count; - int rates_idx; bool acked, noack_success; - rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); + ieee80211_tx_get_rates(hw, info, &retry_count); sband = hw->wiphy->bands[info->band]; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 797e847cbc49..986de098803d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -16,6 +16,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/skbuff.h> +#include <linux/if_vlan.h> #include <linux/etherdevice.h> #include <linux/bitmap.h> #include <linux/rcupdate.h> @@ -63,6 +64,10 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_chanctx_conf *chanctx_conf; u32 rate_flags = 0; + /* assume HW handles this */ + if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) + return 0; + rcu_read_lock(); chanctx_conf = 
rcu_dereference(tx->sdata->vif.chanctx_conf); if (chanctx_conf) { @@ -71,10 +76,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, } rcu_read_unlock(); - /* assume HW handles this */ - if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) - return 0; - /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; @@ -3571,6 +3572,115 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, rcu_read_unlock(); } +static int ieee80211_change_da(struct sk_buff *skb, struct sta_info *sta) +{ + struct ethhdr *eth; + int err; + + err = skb_ensure_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + eth = (void *)skb->data; + ether_addr_copy(eth->h_dest, sta->sta.addr); + + return 0; +} + +static bool ieee80211_multicast_to_unicast(struct sk_buff *skb, + struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + const struct ethhdr *eth = (void *)skb->data; + const struct vlan_ethhdr *ethvlan = (void *)skb->data; + __be16 ethertype; + + if (likely(!is_multicast_ether_addr(eth->h_dest))) + return false; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + if (sdata->u.vlan.sta) + return false; + if (sdata->wdev.use_4addr) + return false; + /* fall through */ + case NL80211_IFTYPE_AP: + /* check runtime toggle for this bss */ + if (!sdata->bss->multicast_to_unicast) + return false; + break; + default: + return false; + } + + /* multicast to unicast conversion only for some payload */ + ethertype = eth->h_proto; + if (ethertype == htons(ETH_P_8021Q) && skb->len >= VLAN_ETH_HLEN) + ethertype = ethvlan->h_vlan_encapsulated_proto; + switch (ethertype) { + case htons(ETH_P_ARP): + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + break; + default: + return false; + } + + return true; +} + +static void +ieee80211_convert_to_unicast(struct sk_buff *skb, struct net_device *dev, + struct sk_buff_head *queue) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + const struct ethhdr *eth = (struct ethhdr *)skb->data; + struct sta_info *sta, *first = NULL; + struct sk_buff *cloned_skb; + + rcu_read_lock(); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata) + /* AP-VLAN mismatch */ + continue; + if (unlikely(ether_addr_equal(eth->h_source, sta->sta.addr))) + /* do not send back to source */ + continue; + if (!first) { + first = sta; + continue; + } + cloned_skb = skb_clone(skb, GFP_ATOMIC); + if (!cloned_skb) + goto multicast; + if (unlikely(ieee80211_change_da(cloned_skb, sta))) { + dev_kfree_skb(cloned_skb); + goto multicast; + } + __skb_queue_tail(queue, cloned_skb); + } + + if (likely(first)) { + if (unlikely(ieee80211_change_da(skb, first))) + goto multicast; + __skb_queue_tail(queue, skb); + } else { + /* no STA connected, drop */ + kfree_skb(skb); + skb = NULL; + } + + goto out; +multicast: + __skb_queue_purge(queue); + __skb_queue_tail(queue, skb); +out: + rcu_read_unlock(); +} + /** * ieee80211_subif_start_xmit - netif start_xmit function for 802.3 vifs * @skb: packet to be sent @@ -3581,7 +3691,17 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) { - __ieee80211_subif_start_xmit(skb, dev, 0); + if (unlikely(ieee80211_multicast_to_unicast(skb, dev))) { + struct sk_buff_head queue; + + __skb_queue_head_init(&queue); + ieee80211_convert_to_unicast(skb, dev, &queue); + while ((skb = __skb_dequeue(&queue))) + 
__ieee80211_subif_start_xmit(skb, dev, 0); + } else { + __ieee80211_subif_start_xmit(skb, dev, 0); + } + return NETDEV_TX_OK; } @@ -4074,7 +4194,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, } if (ifmsh->sync_ops) - ifmsh->sync_ops->adjust_tbtt(sdata, beacon); + ifmsh->sync_ops->adjust_tsf(sdata, beacon); skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 43e45bb660bc..19ec2189d3ac 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -436,14 +436,10 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband; enum ieee80211_sta_rx_bandwidth new_bw; u32 changed = 0; u8 nss; - sband = local->hw.wiphy->bands[band]; - /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index efa3f48f1ec5..73e8f347802e 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -293,7 +293,8 @@ ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ - if (pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) + if (!(status->flag & RX_FLAG_ICV_STRIPPED) && + pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_UNUSABLE; } diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 8af6dd388d11..c1ef22df865f 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -294,7 +294,8 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; /* Trim ICV */ - skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); + if (!(status->flag & RX_FLAG_ICV_STRIPPED)) + skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 15fe97644ffe..4dc81963af8f 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -8,6 +8,7 @@ #include <linux/ipv6.h> #include <linux/mpls.h> #include <linux/vmalloc.h> +#include <linux/percpu.h> #include <net/ip.h> #include <net/dst.h> #include <net/sock.h> @@ -17,8 +18,8 @@ #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> -#include <net/addrconf.h> #endif +#include <net/addrconf.h> #include <net/nexthop.h> #include "internal.h" @@ -48,11 +49,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) return rt; } -static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->mpls_ptr); -} - bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); @@ -98,6 +94,31 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb) +{ + struct mpls_dev *mdev; + + if (skb->protocol == htons(ETH_P_MPLS_UC)) { + mdev = mpls_dev_get(dev); + if (mdev) + MPLS_INC_STATS_LEN(mdev, skb->len, + tx_packets, + tx_bytes); + } else if (skb->protocol == htons(ETH_P_IP)) { + IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); +#if IS_ENABLED(CONFIG_IPV6) + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct inet6_dev *in6dev = __in6_dev_get(dev); + + if (in6dev) + IP6_UPD_PO_STATS(dev_net(dev), in6dev, 
+ IPSTATS_MIB_OUT, skb->len); +#endif + } +} +EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts); + static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb, bool bos) { @@ -253,6 +274,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct mpls_nh *nh; struct mpls_entry_decoded dec; struct net_device *out_dev; + struct mpls_dev *out_mdev; struct mpls_dev *mdev; unsigned int hh_len; unsigned int new_header_size; @@ -262,17 +284,25 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ mdev = mpls_dev_get(dev); - if (!mdev || !mdev->input_enabled) + if (!mdev) goto drop; - if (skb->pkt_type != PACKET_HOST) + MPLS_INC_STATS_LEN(mdev, skb->len, rx_packets, + rx_bytes); + + if (!mdev->input_enabled) { + MPLS_INC_STATS(mdev, rx_dropped); goto drop; + } + + if (skb->pkt_type != PACKET_HOST) + goto err; if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) - goto drop; + goto err; if (!pskb_may_pull(skb, sizeof(*hdr))) - goto drop; + goto err; /* Read and decode the label */ hdr = mpls_hdr(skb); @@ -285,33 +315,35 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, skb_orphan(skb); rt = mpls_route_input_rcu(net, dec.label); - if (!rt) + if (!rt) { + MPLS_INC_STATS(mdev, rx_noroute); goto drop; + } nh = mpls_select_multipath(rt, skb, dec.bos); if (!nh) - goto drop; - - /* Find the output device */ - out_dev = rcu_dereference(nh->nh_dev); - if (!mpls_output_possible(out_dev)) - goto drop; + goto err; if (skb_warn_if_lro(skb)) - goto drop; + goto err; skb_forward_csum(skb); /* Verify ttl is valid */ if (dec.ttl <= 1) - goto drop; + goto err; dec.ttl -= 1; + /* Find the output device */ + out_dev = rcu_dereference(nh->nh_dev); + if (!mpls_output_possible(out_dev)) + goto tx_err; + /* Verify the destination can hold the packet */ new_header_size = mpls_nh_header_size(nh); mtu = mpls_dev_mtu(out_dev); if (mpls_pkt_too_big(skb, mtu - new_header_size)) - goto drop; + goto tx_err; hh_len = LL_RESERVED_SPACE(out_dev); if (!out_dev->header_ops) @@ -319,7 +351,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Ensure there is enough space for the headers in the skb */ if (skb_cow(skb, hh_len + new_header_size)) - goto drop; + goto tx_err; skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); @@ -327,7 +359,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(rt, skb, dec)) - goto drop; + goto err; } else { bool bos; int i; @@ -343,6 +375,8 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } + mpls_stats_inc_outucastpkts(out_dev, skb); + /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) err = neigh_xmit(NEIGH_LINK_TABLE, out_dev, @@ -355,6 +389,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, __func__, err); return 0; +tx_err: + out_mdev = out_dev ? 
mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); + goto drop; +err: + MPLS_INC_STATS(mdev, rx_errors); drop: kfree_skb(skb); return NET_RX_DROP; @@ -853,6 +894,70 @@ errout: return err; } +static void mpls_get_stats(struct mpls_dev *mdev, + struct mpls_link_stats *stats) +{ + struct mpls_pcpu_stats *p; + int i; + + memset(stats, 0, sizeof(*stats)); + + for_each_possible_cpu(i) { + struct mpls_link_stats local; + unsigned int start; + + p = per_cpu_ptr(mdev->stats, i); + do { + start = u64_stats_fetch_begin(&p->syncp); + local = p->stats; + } while (u64_stats_fetch_retry(&p->syncp, start)); + + stats->rx_packets += local.rx_packets; + stats->rx_bytes += local.rx_bytes; + stats->tx_packets += local.tx_packets; + stats->tx_bytes += local.tx_bytes; + stats->rx_errors += local.rx_errors; + stats->tx_errors += local.tx_errors; + stats->rx_dropped += local.rx_dropped; + stats->tx_dropped += local.tx_dropped; + stats->rx_noroute += local.rx_noroute; + } +} + +static int mpls_fill_stats_af(struct sk_buff *skb, + const struct net_device *dev) +{ + struct mpls_link_stats *stats; + struct mpls_dev *mdev; + struct nlattr *nla; + + mdev = mpls_dev_get(dev); + if (!mdev) + return -ENODATA; + + nla = nla_reserve_64bit(skb, MPLS_STATS_LINK, + sizeof(struct mpls_link_stats), + MPLS_STATS_UNSPEC); + if (!nla) + return -EMSGSIZE; + + stats = nla_data(nla); + mpls_get_stats(mdev, stats); + + return 0; +} + +static size_t mpls_get_stats_af_size(const struct net_device *dev) +{ + struct mpls_dev *mdev; + + mdev = mpls_dev_get(dev); + if (!mdev) + return 0; + + return nla_total_size_64bit(sizeof(struct mpls_link_stats)); +} + #define MPLS_PERDEV_SYSCTL_OFFSET(field) \ (&((struct mpls_dev *)0)->field) @@ -911,6 +1016,7 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) { struct mpls_dev *mdev; int err = -ENOMEM; + int i; ASSERT_RTNL(); @@ -918,6 +1024,17 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) if (!mdev) return ERR_PTR(err); + mdev->stats = alloc_percpu(struct mpls_pcpu_stats); + if (!mdev->stats) + goto free; + + for_each_possible_cpu(i) { + struct mpls_pcpu_stats *mpls_stats; + + mpls_stats = per_cpu_ptr(mdev->stats, i); + u64_stats_init(&mpls_stats->syncp); + } + err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; @@ -927,10 +1044,19 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) return mdev; free: + free_percpu(mdev->stats); kfree(mdev); return ERR_PTR(err); } +static void mpls_dev_destroy_rcu(struct rcu_head *head) +{ + struct mpls_dev *mdev = container_of(head, struct mpls_dev, rcu); + + free_percpu(mdev->stats); + kfree(mdev); +} + static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; @@ -1045,7 +1171,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, if (mdev) { mpls_dev_sysctl_unregister(mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); - kfree_rcu(mdev, rcu); + call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } break; case NETDEV_CHANGENAME: @@ -1706,6 +1832,12 @@ static struct pernet_operations mpls_net_ops = { .exit = mpls_net_exit, }; +static struct rtnl_af_ops mpls_af_ops __read_mostly = { + .family = AF_MPLS, + .fill_stats_af = mpls_fill_stats_af, + .get_stats_af_size = mpls_get_stats_af_size, +}; + static int __init mpls_init(void) { int err; @@ -1722,6 +1854,8 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); + rtnl_af_register(&mpls_af_ops); + rtnl_register(PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 
NULL); rtnl_register(PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, NULL); rtnl_register(PF_MPLS, RTM_GETROUTE, NULL, mpls_dump_routes, NULL); @@ -1738,6 +1872,7 @@ module_init(mpls_init); static void __exit mpls_exit(void) { rtnl_unregister_all(PF_MPLS); + rtnl_af_unregister(&mpls_af_ops); dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index bdfef6c3271a..d97243034605 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -9,13 +9,58 @@ struct mpls_entry_decoded { u8 bos; }; +struct mpls_pcpu_stats { + struct mpls_link_stats stats; + struct u64_stats_sync syncp; +}; + struct mpls_dev { - int input_enabled; + int input_enabled; - struct ctl_table_header *sysctl; - struct rcu_head rcu; + struct mpls_pcpu_stats __percpu *stats; + + struct ctl_table_header *sysctl; + struct rcu_head rcu; }; +#if BITS_PER_LONG == 32 + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.pkts_field++; \ + ptr->stats.bytes_field += (len); \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + do { \ + __typeof__(*(mdev)->stats) *ptr = \ + raw_cpu_ptr((mdev)->stats); \ + local_bh_disable(); \ + u64_stats_update_begin(&ptr->syncp); \ + ptr->stats.field++; \ + u64_stats_update_end(&ptr->syncp); \ + local_bh_enable(); \ + } while (0) + +#else + +#define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ + do { \ + this_cpu_inc((mdev)->stats->stats.pkts_field); \ + this_cpu_add((mdev)->stats->stats.bytes_field, (len)); \ + } while (0) + +#define MPLS_INC_STATS(mdev, field) \ + this_cpu_inc((mdev)->stats->stats.field) + +#endif + struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) @@ -114,6 +159,11 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } +static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +{ + return rcu_dereference_rtnl(dev->mpls_ptr); +} + int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, @@ -123,5 +173,7 @@ int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); +void mpls_stats_inc_outucastpkts(struct net_device *dev, + const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2f7ccd934416..02531284bc49 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,11 +48,15 @@ static int mpls_xmit(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; + struct mpls_dev *out_mdev; int err = 0; bool bos; int i; unsigned int ttl; + /* Find the output device */ + out_dev = dst->dev; + /* Obtain the ttl */ if (dst->ops->family == AF_INET) { ttl = ip_hdr(skb)->ttl; @@ -66,8 +70,6 @@ static int mpls_xmit(struct sk_buff *skb) skb_orphan(skb); - /* Find the output device */ - out_dev = dst->dev; if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; @@ -109,6 +111,8 @@ static int 
mpls_xmit(struct sk_buff *skb) bos = false; } + mpls_stats_inc_outucastpkts(out_dev, skb); + if (rt) err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, skb); @@ -122,6 +126,9 @@ static int mpls_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_DONE; drop: + out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + if (out_mdev) + MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); return -EINVAL; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 161b628ab2b0..edcc1e19ad53 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1210,7 +1210,9 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) skb = nskb; } - if (!pskb_expand_head(skb, 0, -delta, allocation)) + if (!pskb_expand_head(skb, 0, -delta, + (allocation & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOWARN | __GFP_NORETRY)) skb->truesize -= delta; return skb; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index d5d6caecd072..09141a18ee2d 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -97,7 +97,7 @@ static void internal_dev_destructor(struct net_device *dev) free_netdev(dev); } -static struct rtnl_link_stats64 * +static void internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { int i; @@ -125,8 +125,6 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_bytes += local_stats.tx_bytes; stats->tx_packets += local_stats.tx_packets; } - - return stats; } static void internal_set_rx_headroom(struct net_device *dev, int new_hr) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b9e1a13b4ba3..ddbda255b6ae 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: + h.h3->tp_status = status; + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame) flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return h.h2->tp_status; case TPACKET_V3: + flush_dcache_page(pgv_to_page(&h.h3->tp_status)); + return h.h3->tp_status; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -476,6 +481,9 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: + h.h3->tp_sec = ts.tv_sec; + h.h3->tp_nsec = ts.tv_nsec; + break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); @@ -2497,6 +2505,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, ph.raw = frame; switch (po->tp_version) { + case TPACKET_V3: + if (ph.h3->tp_next_offset != 0) { + pr_warn_once("variable sized slot not supported"); + return -EINVAL; + } + tp_len = ph.h3->tp_len; + break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; @@ -2516,6 +2531,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_net; + break; case TPACKET_V2: off = ph.h2->tp_net; break; @@ -2525,6 +2543,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, } } else { switch (po->tp_version) { + case TPACKET_V3: + off = ph.h3->tp_mac; + break; case TPACKET_V2: off = ph.h2->tp_mac; break; @@ 
-4113,11 +4134,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, struct tpacket_req *req = &req_u->req; lock_sock(sk); - /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ - if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { - net_warn_ratelimited("Tx-ring is not supported.\n"); - goto out; - } rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; @@ -4177,11 +4193,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; switch (po->tp_version) { case TPACKET_V3: - /* Transmit path is not supported. We checked - * it above but just being paranoid - */ - if (!tx_ring) + /* Block transmit is not supported yet */ + if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); + } else { + struct tpacket_req3 *req3 = &req_u->req3; + + if (req3->tp_retire_blk_tov || + req3->tp_sizeof_priv || + req3->tp_feature_req_word) { + err = -EINVAL; + goto out; + } + } break; default: break; diff --git a/net/packet/diag.c b/net/packet/diag.c index 0ed68f0238bf..7ef1c881ae74 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -73,8 +73,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, { struct packet_diag_ring pdr; - if (!ring->pg_vec || ((ver > TPACKET_V2) && - (nl_type == PACKET_DIAG_TX_RING))) + if (!ring->pg_vec) return 0; pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 2ac1e6194be3..b405f77d664c 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -298,6 +298,33 @@ static int rds_enable_recvtstamp(struct sock *sk, char __user *optval, return 0; } +static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval, + int optlen) +{ + struct rds_rx_trace_so trace; + int i; + + if (optlen != sizeof(struct rds_rx_trace_so)) + return -EFAULT; + + if (copy_from_user(&trace, optval, sizeof(trace))) + return -EFAULT; + + if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX) + return -EFAULT; + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } + rs->rs_rx_trace[i] = trace.rx_trace_pos[i]; + } + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -338,6 +365,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ret = rds_enable_recvtstamp(sock->sk, optval, optlen); release_sock(sock->sk); break; + case SO_RDS_MSG_RXPATH_LATENCY: + ret = rds_recv_track_latency(rs, optval, optlen); + break; default: ret = -ENOPROTOOPT; } @@ -484,6 +514,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_cong_list); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; + rs->rs_rx_traces = 0; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/bind.c b/net/rds/bind.c index 095f6ce583fe..3a915bedb76c 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -176,8 +176,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); - printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, " - "load rds_tcp or rds_rdma?\n"); + pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", + __func__, &sin->sin_addr.s_addr); goto out; } diff --git 
a/net/rds/connection.c b/net/rds/connection.c index fe9d31c0b22d..0e04dcceb1d4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -545,11 +545,11 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -void rds_walk_conn_path_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int (*visitor)(struct rds_conn_path *, void *), - size_t item_len) +static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int (*visitor)(struct rds_conn_path *, void *), + size_t item_len) { u64 buffer[(item_len + 7) / 8]; struct hlist_head *head; diff --git a/net/rds/ib.c b/net/rds/ib.c index 5680d90b0b77..8d70884d7bb6 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -111,6 +111,9 @@ static void rds_ib_dev_free(struct work_struct *work) kfree(i_ipaddr); } + if (rds_ibdev->vector_load) + kfree(rds_ibdev->vector_load); + kfree(rds_ibdev); } @@ -159,6 +162,14 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; + rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors, + GFP_KERNEL); + if (!rds_ibdev->vector_load) { + pr_err("RDS/IB: %s failed to allocate vector memory\n", + __func__); + goto put_dev; + } + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { diff --git a/net/rds/ib.h b/net/rds/ib.h index 45ac8e8e58f4..540458928f3c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -14,9 +14,10 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 -#define RDS_IB_DEFAULT_FR_WR 512 +#define RDS_IB_DEFAULT_FR_WR 256 +#define RDS_IB_DEFAULT_FR_INV_WR 256 -#define RDS_IB_DEFAULT_RETRY_COUNT 2 +#define RDS_IB_DEFAULT_RETRY_COUNT 1 #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ @@ -125,6 +126,7 @@ struct rds_ib_connection { /* To control the number of wrs from fastreg */ atomic_t i_fastreg_wrs; + atomic_t i_fastunreg_wrs; /* interrupt handling */ struct tasklet_struct i_send_tasklet; @@ -149,6 +151,7 @@ struct rds_ib_connection { u64 i_ack_recv; /* last ACK received */ struct rds_ib_refill_cache i_cache_incs; struct rds_ib_refill_cache i_cache_frags; + atomic_t i_cache_allocs; /* sending acks */ unsigned long i_ack_flags; @@ -179,6 +182,14 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; + + /* Endpoint role in connection */ + bool i_active_side; + atomic_t i_cq_quiesce; + + /* Send/Recv vectors */ + int i_scq_vector; + int i_rcq_vector; }; /* This assumes that atomic_t is at least 32 bits */ @@ -221,6 +232,7 @@ struct rds_ib_device { spinlock_t spinlock; /* protect the above */ atomic_t refcount; struct work_struct free_work; + int *vector_load; }; #define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device) @@ -249,6 +261,8 @@ struct rds_ib_statistics { uint64_t s_ib_rx_refill_from_cq; uint64_t s_ib_rx_refill_from_thread; uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_total_frags; + uint64_t s_ib_rx_total_incs; uint64_t s_ib_rx_credit_updates; uint64_t s_ib_ack_sent; uint64_t s_ib_ack_send_failure; @@ -271,6 +285,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_1m_reused; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; + uint64_t s_ib_recv_added_to_cache; + uint64_t s_ib_recv_removed_from_cache; }; extern struct 
workqueue_struct *rds_ib_wq; @@ -401,6 +417,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) +#define rds_ib_stats_add(member, count) \ + rds_stats_add_which(rds_ib_stats, member, count) unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 5b2ab95afa07..ce3775abc6e7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -113,24 +113,26 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } if (conn->c_version < RDS_PROTOCOL(3, 1)) { - printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," - " no longer supported\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version)); + pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); rds_conn_destroy(conn); return; } else { - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", + ic->i_active_side ? "Active" : "Passive", + &conn->c_laddr, &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); } - /* - * Init rings and fill recv. this needs to wait until protocol negotiation - * is complete, since ring layout is different from 3.0 to 3.1. + atomic_set(&ic->i_cq_quiesce, 0); + + /* Init rings and fill recv. this needs to wait until protocol + * negotiation is complete, since ring layout is different + * from 3.1 to 4.1. 
*/ rds_ib_send_init_ring(ic); rds_ib_recv_init_ring(ic); @@ -267,6 +269,10 @@ static void rds_ib_tasklet_fn_send(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + poll_scq(ic, ic->i_send_cq, ic->i_send_wc); ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); poll_scq(ic, ic->i_send_cq, ic->i_send_wc); @@ -308,6 +314,10 @@ static void rds_ib_tasklet_fn_recv(unsigned long data) rds_ib_stats_inc(s_ib_tasklet_call); + /* if cq has been already reaped, ignore incoming cq event */ + if (atomic_read(&ic->i_cq_quiesce)) + return; + memset(&state, 0, sizeof(state)); poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state); ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); @@ -358,6 +368,28 @@ static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) tasklet_schedule(&ic->i_send_tasklet); } +static inline int ibdev_get_unused_vector(struct rds_ib_device *rds_ibdev) +{ + int min = rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1]; + int index = rds_ibdev->dev->num_comp_vectors - 1; + int i; + + for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) { + if (rds_ibdev->vector_load[i] < min) { + index = i; + min = rds_ibdev->vector_load[i]; + } + } + + rds_ibdev->vector_load[index]++; + return index; +} + +static inline void ibdev_put_vector(struct rds_ib_device *rds_ibdev, int index) +{ + rds_ibdev->vector_load[index]--; +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -383,7 +415,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) * completion queue and send queue. This extra space is used for FRMR * registration and invalidation work requests */ - fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + fr_queue_space = rds_ibdev->use_fastreg ? + (RDS_IB_DEFAULT_FR_WR + 1) + + (RDS_IB_DEFAULT_FR_INV_WR + 1) + : 0; /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -396,25 +431,30 @@ static int rds_ib_setup_qp(struct rds_connection *conn) /* Protection domain and memory range */ ic->i_pd = rds_ibdev->pd; + ic->i_scq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1; - + cq_attr.comp_vector = ic->i_scq_vector; ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_send_cq)) { ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_scq_vector); rdsdebug("ib_create_cq send failed: %d\n", ret); goto out; } + ic->i_rcq_vector = ibdev_get_unused_vector(rds_ibdev); cq_attr.cqe = ic->i_recv_ring.w_nr; + cq_attr.comp_vector = ic->i_rcq_vector; ic->i_recv_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, rds_ib_cq_event_handler, conn, &cq_attr); if (IS_ERR(ic->i_recv_cq)) { ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; + ibdev_put_vector(rds_ibdev, ic->i_rcq_vector); rdsdebug("ib_create_cq recv failed: %d\n", ret); goto out; } @@ -445,6 +485,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); + atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR); /* * XXX this can fail if max_*_wr is too large? 
Are we supposed @@ -682,6 +723,7 @@ out: if (ic->i_cm_id == cm_id) ret = 0; } + ic->i_active_side = true; return ret; } @@ -767,17 +809,27 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0) && - (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) && + (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); + atomic_set(&ic->i_cq_quiesce, 1); + /* first destroy the ib state that generates callbacks */ if (ic->i_cm_id->qp) rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) + if (ic->i_send_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_scq_vector); ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) + } + + if (ic->i_recv_cq) { + if (ic->rds_ibdev) + ibdev_put_vector(ic->rds_ibdev, ic->i_rcq_vector); ib_destroy_cq(ic->i_recv_cq); + } /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) @@ -855,6 +907,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_sends = NULL; vfree(ic->i_recvs); ic->i_recvs = NULL; + ic->i_active_side = false; } int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index d921adc62765..48332a6ed738 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -104,14 +104,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) struct rds_ib_frmr *frmr = &ibmr->u.frmr; struct ib_send_wr *failed_wr; struct ib_reg_wr reg_wr; - int ret; + int ret, off = 0; while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { atomic_inc(&ibmr->ic->i_fastreg_wrs); cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + &off, PAGE_SIZE); if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? 
ret : -EINVAL; @@ -240,8 +241,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (frmr->fr_state != FRMR_IS_INUSE) goto out; - while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { - atomic_inc(&ibmr->ic->i_fastreg_wrs); + while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastunreg_wrs); cpu_relax(); } @@ -260,7 +261,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (unlikely(ret)) { frmr->fr_state = FRMR_IS_STALE; frmr->fr_inv = false; - atomic_inc(&ibmr->ic->i_fastreg_wrs); + atomic_inc(&ibmr->ic->i_fastunreg_wrs); pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; } @@ -288,9 +289,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; + atomic_inc(&ic->i_fastreg_wrs); + } else { + atomic_inc(&ic->i_fastunreg_wrs); } - - atomic_inc(&ic->i_fastreg_wrs); } void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 606a11f681d2..e10624aa6959 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -194,6 +194,8 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic, rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); + atomic_add(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } /* Recycle inc after freeing attached frags */ @@ -261,6 +263,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i atomic_dec(&rds_ib_allocation); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); @@ -278,6 +281,8 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); if (cache_item) { frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + atomic_sub(RDS_FRAG_SIZE / SZ_1K, &ic->i_cache_allocs); + rds_ib_stats_add(s_ib_recv_added_to_cache, RDS_FRAG_SIZE); } else { frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); if (!frag) @@ -290,6 +295,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic kmem_cache_free(rds_ib_frag_slab, frag); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_frags); } INIT_LIST_HEAD(&frag->f_item); @@ -905,8 +911,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_ibinc = ibinc; hdr = &ibinc->ii_inc.i_hdr; + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); memcpy(hdr, ihdr, sizeof(*hdr)); ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + ibinc->ii_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc, ic->i_recv_data_rem, hdr->h_flags); @@ -980,8 +990,8 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, + rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 84d90c97332f..5e72de10c484 100644 --- a/net/rds/ib_send.c +++ 
b/net/rds/ib_send.c @@ -69,16 +69,6 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); -} - static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -139,6 +129,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + /* * Unmap the resources associated with a struct send_work. * @@ -300,8 +305,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc->status, + rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", + &conn->c_laddr, &conn->c_faddr, wc->status, ib_wc_status_msg(wc->status)); } } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 7e78dca1f252..9252ad126335 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -55,6 +55,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", + "ib_rx_total_frags", + "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", diff --git a/net/rds/rdma.c b/net/rds/rdma.c index ea961144084f..f06fac4886b0 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -40,7 +40,6 @@ /* * XXX * - build with sparse - * - should we limit the size of a mr region? let transport return failure? * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? 
*/ @@ -200,6 +199,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } + /* Restrict the size of mr irrespective of underlying transport + * To account for unaligned mr regions, subtract one from nr_pages + */ + if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { + ret = -EMSGSIZE; + goto out; + } + rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); @@ -415,7 +422,8 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { - printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", + r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } @@ -626,6 +634,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. + */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index d5f311767157..fc59821f0a27 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -206,18 +206,13 @@ static int rds_rdma_init(void) { int ret; - ret = rds_rdma_listen_init(); + ret = rds_ib_init(); if (ret) goto out; - ret = rds_ib_init(); + ret = rds_rdma_listen_init(); if (ret) - goto err_ib_init; - - goto out; - -err_ib_init: - rds_rdma_listen_stop(); + rds_ib_exit(); out: return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index ebbf909b87ec..07fff73dd4f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -50,6 +50,9 @@ void rdsdebug(char *fmt, ...) 
#define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) +/* Used to limit both RDMA and non-RDMA RDS message to 1MB */ +#define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) + #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) @@ -250,6 +253,11 @@ struct rds_ext_header_rdma_dest { #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ +#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) +#define RDS_MSG_RX_HDR 0 +#define RDS_MSG_RX_START 1 +#define RDS_MSG_RX_END 2 +#define RDS_MSG_RX_CMSG 3 struct rds_incoming { atomic_t i_refcount; @@ -262,6 +270,7 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; struct timeval i_rx_tstamp; + u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { @@ -419,6 +428,7 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; @@ -571,6 +581,10 @@ struct rds_sock { unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; + + /* Socket receive path trace points*/ + u8 rs_rx_traces; + u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -630,6 +644,9 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; + uint64_t s_recv_bytes_added_to_socket; + uint64_t s_recv_bytes_removed_from_socket; + }; /* af_rds.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 9d0666e5fe35..8b7e7b7f2c2d 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,6 +43,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) { + int i; + atomic_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; @@ -50,6 +52,9 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, inc->i_rdma_cookie = 0; inc->i_rx_tstamp.tv_sec = 0; inc->i_rx_tstamp.tv_usec = 0; + + for (i = 0; i < RDS_RX_MAX_TRACES; i++) + inc->i_rx_lat_trace[i] = 0; } EXPORT_SYMBOL_GPL(rds_inc_init); @@ -94,6 +99,10 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, return; rs->rs_rcv_bytes += delta; + if (delta > 0) + rds_stats_add(s_recv_bytes_added_to_socket, delta); + else + rds_stats_add(s_recv_bytes_removed_from_socket, -delta); now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " @@ -369,6 +378,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (sock_flag(sk, SOCK_RCVTSTAMP)) do_gettimeofday(&inc->i_rx_tstamp); rds_inc_addref(inc); + inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); __rds_wake_sk_sleep(sk); } else { @@ -530,7 +540,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); if (ret) - return ret; + goto out; } if ((inc->i_rx_tstamp.tv_sec != 0) && @@ -539,10 +549,30 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, sizeof(struct timeval), &inc->i_rx_tstamp); if (ret) - return ret; + goto out; } - return 0; + if (rs->rs_rx_traces) { + struct rds_cmsg_rx_trace t; + int i, j; + + inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); + t.rx_traces = rs->rs_rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { + j = 
rs->rs_rx_trace[i]; + t.rx_trace_pos[i] = j; + t.rx_trace[i] = inc->i_rx_lat_trace[j + 1] - + inc->i_rx_lat_trace[j]; + } + + ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RXPATH_LATENCY, + sizeof(t), &t); + if (ret) + goto out; + } + +out: + return ret; } int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/rds/send.c b/net/rds/send.c index 77c8c6e613ad..5cc64039caf7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -476,12 +476,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; + unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); + notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -945,6 +947,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_rdma_map(rs, rm, cmsg); if (!ret) *allocated_mr = 1; + else if (ret == -ENODEV) + /* Accommodate the get_mr() case which can fail + * if connection isn't established yet. + */ + ret = -EAGAIN; break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: @@ -987,6 +994,26 @@ static int rds_send_mprds_hash(struct rds_sock *rs, struct rds_connection *conn) return hash; } +static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) +{ + struct rds_rdma_args *args; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) { + args = CMSG_DATA(cmsg); + *rdma_bytes += args->remote_vec.bytes; + } + } + return 0; +} + int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; @@ -1001,6 +1028,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) int nonblock = msg->msg_flags & MSG_DONTWAIT; long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; + size_t total_payload_len = payload_len, rdma_payload_len = 0; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1033,6 +1061,16 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) } release_sock(sk); + ret = rds_rdma_bytes(msg, &rdma_payload_len); + if (ret) + goto out; + + total_payload_len += rdma_payload_len; + if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { + ret = -EMSGSIZE; + goto out; + } + if (payload_len > rds_sk_sndbuf(rs)) { ret = -EMSGSIZE; goto out; @@ -1082,8 +1120,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Parse any control messages the user may have included. 
*/ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) + if (ret) { + /* Trigger connection so that its ready for the next retry */ + if (ret == -EAGAIN) + rds_conn_connect_if_down(conn); goto out; + } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", @@ -1169,7 +1211,7 @@ out: * or * RDS_FLAG_HB_PONG|RDS_FLAG_ACK_REQUIRED */ -int +static int rds_send_probe(struct rds_conn_path *cp, __be16 sport, __be16 dport, u8 h_flags) { @@ -1238,7 +1280,7 @@ rds_send_pong(struct rds_conn_path *cp, __be16 dport) return rds_send_probe(cp, 0, dport, 0); } -void +static void rds_send_ping(struct rds_connection *conn) { unsigned long flags; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index f74bab3ecdca..67d0929c7d3d 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -79,6 +79,7 @@ bail: * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side * by moving them to CONNECTING in this function. */ +static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index ad4892e97f91..e006ef8e6d40 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -180,6 +180,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, cp->cp_conn->c_faddr); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = + local_clock(); + /* * XXX * we might be able to use the __ variants when * we've already serialized at a higher level. @@ -204,6 +207,8 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, /* could be 0 for a 0 len message */ tc->t_tinc_data_rem = be32_to_cpu(tinc->ti_inc.i_hdr.h_len); + tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_START] = + local_clock(); } } diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 884027f62783..2064c3a35ef8 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -176,6 +176,50 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill) { led_trigger_unregister(&rfkill->led_trigger); } + +static struct led_trigger rfkill_any_led_trigger; +static struct work_struct rfkill_any_work; + +static void rfkill_any_led_trigger_worker(struct work_struct *work) +{ + enum led_brightness brightness = LED_OFF; + struct rfkill *rfkill; + + mutex_lock(&rfkill_global_mutex); + list_for_each_entry(rfkill, &rfkill_list, node) { + if (!(rfkill->state & RFKILL_BLOCK_ANY)) { + brightness = LED_FULL; + break; + } + } + mutex_unlock(&rfkill_global_mutex); + + led_trigger_event(&rfkill_any_led_trigger, brightness); +} + +static void rfkill_any_led_trigger_event(void) +{ + schedule_work(&rfkill_any_work); +} + +static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev) +{ + rfkill_any_led_trigger_event(); +} + +static int rfkill_any_led_trigger_register(void) +{ + INIT_WORK(&rfkill_any_work, rfkill_any_led_trigger_worker); + rfkill_any_led_trigger.name = "rfkill-any"; + rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate; + return led_trigger_register(&rfkill_any_led_trigger); +} + +static void rfkill_any_led_trigger_unregister(void) +{ + led_trigger_unregister(&rfkill_any_led_trigger); + cancel_work_sync(&rfkill_any_work); +} #else static void rfkill_led_trigger_event(struct rfkill *rfkill) { @@ -189,6 +233,19 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill) static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill) { } 
+ +static void rfkill_any_led_trigger_event(void) +{ +} + +static int rfkill_any_led_trigger_register(void) +{ + return 0; +} + +static void rfkill_any_led_trigger_unregister(void) +{ +} #endif /* CONFIG_RFKILL_LEDS */ static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill, @@ -297,6 +354,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); if (prev != curr) rfkill_event(rfkill); @@ -477,11 +535,9 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked) spin_unlock_irqrestore(&rfkill->lock, flags); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); - if (!rfkill->registered) - return ret; - - if (prev != blocked) + if (rfkill->registered && prev != blocked) schedule_work(&rfkill->uevent_work); return ret; @@ -523,6 +579,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); return blocked; } @@ -572,6 +629,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw) schedule_work(&rfkill->uevent_work); rfkill_led_trigger_event(rfkill); + rfkill_any_led_trigger_event(); } } EXPORT_SYMBOL(rfkill_set_states); @@ -988,6 +1046,7 @@ int __must_check rfkill_register(struct rfkill *rfkill) #endif } + rfkill_any_led_trigger_event(); rfkill_send_events(rfkill, RFKILL_OP_ADD); mutex_unlock(&rfkill_global_mutex); @@ -1020,6 +1079,7 @@ void rfkill_unregister(struct rfkill *rfkill) mutex_lock(&rfkill_global_mutex); rfkill_send_events(rfkill, RFKILL_OP_DEL); list_del_init(&rfkill->node); + rfkill_any_led_trigger_event(); mutex_unlock(&rfkill_global_mutex); rfkill_led_trigger_unregister(rfkill); @@ -1266,24 +1326,33 @@ static int __init rfkill_init(void) error = class_register(&rfkill_class); if (error) - goto out; + goto error_class; error = misc_register(&rfkill_miscdev); - if (error) { - class_unregister(&rfkill_class); - goto out; - } + if (error) + goto error_misc; + + error = rfkill_any_led_trigger_register(); + if (error) + goto error_led_trigger; #ifdef CONFIG_RFKILL_INPUT error = rfkill_handler_init(); - if (error) { - misc_deregister(&rfkill_miscdev); - class_unregister(&rfkill_class); - goto out; - } + if (error) + goto error_input; #endif - out: + return 0; + +#ifdef CONFIG_RFKILL_INPUT +error_input: + rfkill_any_led_trigger_unregister(); +#endif +error_led_trigger: + misc_deregister(&rfkill_miscdev); +error_misc: + class_unregister(&rfkill_class); +error_class: return error; } subsys_initcall(rfkill_init); @@ -1293,6 +1362,7 @@ static void __exit rfkill_exit(void) #ifdef CONFIG_RFKILL_INPUT rfkill_handler_exit(); #endif + rfkill_any_led_trigger_unregister(); misc_deregister(&rfkill_miscdev); class_unregister(&rfkill_class); } diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 5f63f6dcaabb..199b46e93e64 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -224,6 +224,14 @@ static int rxrpc_listen(struct socket *sock, int backlog) else sk->sk_max_ack_backlog = old; break; + case RXRPC_SERVER_LISTENING: + if (backlog == 0) { + rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED; + sk->sk_max_ack_backlog = 0; + rxrpc_discard_prealloc(rx); + ret = 0; + break; + } default: ret = -EBUSY; break; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f60e35576526..12be432be9b2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -60,6 +60,7 @@ enum { 
RXRPC_CLIENT_BOUND, /* client local address bound */ RXRPC_SERVER_BOUND, /* server local address bound */ RXRPC_SERVER_LISTENING, /* server listening for connections */ + RXRPC_SERVER_LISTEN_DISABLED, /* server listening disabled */ RXRPC_CLOSE, /* socket is being closed */ }; @@ -593,200 +594,6 @@ struct rxrpc_ack_summary { u8 cumulative_acks; }; -enum rxrpc_skb_trace { - rxrpc_skb_rx_cleaned, - rxrpc_skb_rx_freed, - rxrpc_skb_rx_got, - rxrpc_skb_rx_lost, - rxrpc_skb_rx_received, - rxrpc_skb_rx_rotated, - rxrpc_skb_rx_purged, - rxrpc_skb_rx_seen, - rxrpc_skb_tx_cleaned, - rxrpc_skb_tx_freed, - rxrpc_skb_tx_got, - rxrpc_skb_tx_new, - rxrpc_skb_tx_rotated, - rxrpc_skb_tx_seen, - rxrpc_skb__nr_trace -}; - -extern const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7]; - -enum rxrpc_conn_trace { - rxrpc_conn_new_client, - rxrpc_conn_new_service, - rxrpc_conn_queued, - rxrpc_conn_seen, - rxrpc_conn_got, - rxrpc_conn_put_client, - rxrpc_conn_put_service, - rxrpc_conn__nr_trace -}; - -extern const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4]; - -enum rxrpc_client_trace { - rxrpc_client_activate_chans, - rxrpc_client_alloc, - rxrpc_client_chan_activate, - rxrpc_client_chan_disconnect, - rxrpc_client_chan_pass, - rxrpc_client_chan_unstarted, - rxrpc_client_cleanup, - rxrpc_client_count, - rxrpc_client_discard, - rxrpc_client_duplicate, - rxrpc_client_exposed, - rxrpc_client_replace, - rxrpc_client_to_active, - rxrpc_client_to_culled, - rxrpc_client_to_idle, - rxrpc_client_to_inactive, - rxrpc_client_to_waiting, - rxrpc_client_uncount, - rxrpc_client__nr_trace -}; - -extern const char rxrpc_client_traces[rxrpc_client__nr_trace][7]; -extern const char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5]; - -enum rxrpc_call_trace { - rxrpc_call_new_client, - rxrpc_call_new_service, - rxrpc_call_queued, - rxrpc_call_queued_ref, - rxrpc_call_seen, - rxrpc_call_connected, - rxrpc_call_release, - rxrpc_call_got, - rxrpc_call_got_userid, - rxrpc_call_got_kernel, - rxrpc_call_put, - rxrpc_call_put_userid, - rxrpc_call_put_kernel, - rxrpc_call_put_noqueue, - rxrpc_call_error, - rxrpc_call__nr_trace -}; - -extern const char rxrpc_call_traces[rxrpc_call__nr_trace][4]; - -enum rxrpc_transmit_trace { - rxrpc_transmit_wait, - rxrpc_transmit_queue, - rxrpc_transmit_queue_last, - rxrpc_transmit_rotate, - rxrpc_transmit_rotate_last, - rxrpc_transmit_await_reply, - rxrpc_transmit_end, - rxrpc_transmit__nr_trace -}; - -extern const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4]; - -enum rxrpc_receive_trace { - rxrpc_receive_incoming, - rxrpc_receive_queue, - rxrpc_receive_queue_last, - rxrpc_receive_front, - rxrpc_receive_rotate, - rxrpc_receive_end, - rxrpc_receive__nr_trace -}; - -extern const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4]; - -enum rxrpc_recvmsg_trace { - rxrpc_recvmsg_enter, - rxrpc_recvmsg_wait, - rxrpc_recvmsg_dequeue, - rxrpc_recvmsg_hole, - rxrpc_recvmsg_next, - rxrpc_recvmsg_cont, - rxrpc_recvmsg_full, - rxrpc_recvmsg_data_return, - rxrpc_recvmsg_terminal, - rxrpc_recvmsg_to_be_accepted, - rxrpc_recvmsg_return, - rxrpc_recvmsg__nr_trace -}; - -extern const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5]; - -enum rxrpc_rtt_tx_trace { - rxrpc_rtt_tx_ping, - rxrpc_rtt_tx_data, - rxrpc_rtt_tx__nr_trace -}; - -extern const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5]; - -enum rxrpc_rtt_rx_trace { - rxrpc_rtt_rx_ping_response, - rxrpc_rtt_rx_requested_ack, - rxrpc_rtt_rx__nr_trace -}; - -extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5]; - -enum 
rxrpc_timer_trace { - rxrpc_timer_begin, - rxrpc_timer_init_for_reply, - rxrpc_timer_init_for_send_reply, - rxrpc_timer_expired, - rxrpc_timer_set_for_ack, - rxrpc_timer_set_for_ping, - rxrpc_timer_set_for_resend, - rxrpc_timer_set_for_send, - rxrpc_timer__nr_trace -}; - -extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8]; - -enum rxrpc_propose_ack_trace { - rxrpc_propose_ack_client_tx_end, - rxrpc_propose_ack_input_data, - rxrpc_propose_ack_ping_for_lost_ack, - rxrpc_propose_ack_ping_for_lost_reply, - rxrpc_propose_ack_ping_for_params, - rxrpc_propose_ack_processing_op, - rxrpc_propose_ack_respond_to_ack, - rxrpc_propose_ack_respond_to_ping, - rxrpc_propose_ack_retry_tx, - rxrpc_propose_ack_rotate_rx, - rxrpc_propose_ack_terminal_ack, - rxrpc_propose_ack__nr_trace -}; - -enum rxrpc_propose_ack_outcome { - rxrpc_propose_ack_use, - rxrpc_propose_ack_update, - rxrpc_propose_ack_subsume, - rxrpc_propose_ack__nr_outcomes -}; - -extern const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8]; -extern const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes]; - -enum rxrpc_congest_change { - rxrpc_cong_begin_retransmission, - rxrpc_cong_cleared_nacks, - rxrpc_cong_new_low_nack, - rxrpc_cong_no_change, - rxrpc_cong_progress, - rxrpc_cong_retransmit_again, - rxrpc_cong_rtt_window_end, - rxrpc_cong_saw_nack, - rxrpc_congest__nr_change -}; - -extern const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10]; -extern const char rxrpc_congest_changes[rxrpc_congest__nr_change][9]; - -extern const char *const rxrpc_pkts[]; -extern const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4]; - #include <trace/events/rxrpc.h> /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 832d854c2d5c..7c4c64ab8da2 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -349,7 +349,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, found_service: spin_lock(&rx->incoming_lock); - if (rx->sk.sk_state == RXRPC_CLOSE) { + if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || + rx->sk.sk_state == RXRPC_CLOSE) { trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1ed18d8c9c9f..8b94db3c9b2e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -43,24 +43,6 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = { [RXRPC_CALL_NETWORK_ERROR] = "NetError", }; -const char rxrpc_call_traces[rxrpc_call__nr_trace][4] = { - [rxrpc_call_new_client] = "NWc", - [rxrpc_call_new_service] = "NWs", - [rxrpc_call_queued] = "QUE", - [rxrpc_call_queued_ref] = "QUR", - [rxrpc_call_connected] = "CON", - [rxrpc_call_release] = "RLS", - [rxrpc_call_seen] = "SEE", - [rxrpc_call_got] = "GOT", - [rxrpc_call_got_userid] = "Gus", - [rxrpc_call_got_kernel] = "Gke", - [rxrpc_call_put] = "PUT", - [rxrpc_call_put_userid] = "Pus", - [rxrpc_call_put_kernel] = "Pke", - [rxrpc_call_put_noqueue] = "PNQ", - [rxrpc_call_error] = "*E*", -}; - struct kmem_cache *rxrpc_call_jar; LIST_HEAD(rxrpc_calls); DEFINE_RWLOCK(rxrpc_call_lock); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 6cbcdcc29853..40a1ef2adeb4 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -105,14 +105,6 @@ static void rxrpc_discard_expired_client_conns(struct work_struct *); static DECLARE_DELAYED_WORK(rxrpc_client_conn_reap, rxrpc_discard_expired_client_conns); -const 
char rxrpc_conn_cache_states[RXRPC_CONN__NR_CACHE_STATES][5] = { - [RXRPC_CONN_CLIENT_INACTIVE] = "Inac", - [RXRPC_CONN_CLIENT_WAITING] = "Wait", - [RXRPC_CONN_CLIENT_ACTIVE] = "Actv", - [RXRPC_CONN_CLIENT_CULLED] = "Cull", - [RXRPC_CONN_CLIENT_IDLE] = "Idle", -}; - /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index e1e83af47866..b0ecb770fdce 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -173,6 +173,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, /* Save the result of the call so that we can repeat it if necessary * through the channel, whilst disposing of the actual call record. */ + trace_rxrpc_disconnect_call(call); chan->last_service_id = call->service_id; if (call->abort_code) { chan->last_abort = call->abort_code; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 1d87b5453ef7..78ec33477adf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -481,6 +481,7 @@ next_subpacket: return rxrpc_proto_abort("LSA", call, seq); } + trace_rxrpc_rx_data(call, seq, serial, flags, annotation); if (before_eq(seq, hard_ack)) { ack = RXRPC_ACK_DUPLICATE; ack_serial = serial; @@ -765,16 +766,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? buf.ack.reason : RXRPC_ACK__INVALID); - trace_rxrpc_rx_ack(call, first_soft_ack, summary.ack_reason, nr_acks); - - _proto("Rx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", - sp->hdr.serial, - ntohs(buf.ack.maxSkew), - first_soft_ack, - ntohl(buf.ack.previousPacket), - acked_serial, - rxrpc_ack_names[summary.ack_reason], - buf.ack.nAcks); + trace_rxrpc_rx_ack(call, sp->hdr.serial, acked_serial, + first_soft_ack, ntohl(buf.ack.previousPacket), + summary.ack_reason, nr_acks); if (buf.ack.reason == RXRPC_ACK_PING_RESPONSE) rxrpc_input_ping_response(call, skb->tstamp, acked_serial, @@ -931,7 +925,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, break; default: - _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial); break; } @@ -961,6 +954,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, break; } + trace_rxrpc_improper_term(call); __rxrpc_disconnect_call(conn, call); rxrpc_notify_socket(call); } diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 6dee55fad2d3..1a2d4b112064 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -77,12 +77,6 @@ unsigned int rxrpc_rx_jumbo_max = 4; */ unsigned int rxrpc_resend_timeout = 4 * 1000; -const char *const rxrpc_pkts[] = { - "?00", - "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" -}; - const s8 rxrpc_ack_priority[] = { [0] = 0, [RXRPC_ACK_DELAY] = 1, @@ -94,148 +88,3 @@ const s8 rxrpc_ack_priority[] = { [RXRPC_ACK_NOSPACE] = 7, [RXRPC_ACK_PING_RESPONSE] = 8, }; - -const char rxrpc_ack_names[RXRPC_ACK__INVALID + 1][4] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" -}; - -const char rxrpc_skb_traces[rxrpc_skb__nr_trace][7] = { - [rxrpc_skb_rx_cleaned] = "Rx CLN", - [rxrpc_skb_rx_freed] = "Rx FRE", - [rxrpc_skb_rx_got] = "Rx GOT", - [rxrpc_skb_rx_lost] = "Rx *L*", - [rxrpc_skb_rx_received] = "Rx RCV", - [rxrpc_skb_rx_purged] = "Rx PUR", - [rxrpc_skb_rx_rotated] = "Rx ROT", - [rxrpc_skb_rx_seen] = "Rx SEE", - [rxrpc_skb_tx_cleaned] = "Tx CLN", - 
[rxrpc_skb_tx_freed] = "Tx FRE", - [rxrpc_skb_tx_got] = "Tx GOT", - [rxrpc_skb_tx_new] = "Tx NEW", - [rxrpc_skb_tx_rotated] = "Tx ROT", - [rxrpc_skb_tx_seen] = "Tx SEE", -}; - -const char rxrpc_conn_traces[rxrpc_conn__nr_trace][4] = { - [rxrpc_conn_new_client] = "NWc", - [rxrpc_conn_new_service] = "NWs", - [rxrpc_conn_queued] = "QUE", - [rxrpc_conn_seen] = "SEE", - [rxrpc_conn_got] = "GOT", - [rxrpc_conn_put_client] = "PTc", - [rxrpc_conn_put_service] = "PTs", -}; - -const char rxrpc_client_traces[rxrpc_client__nr_trace][7] = { - [rxrpc_client_activate_chans] = "Activa", - [rxrpc_client_alloc] = "Alloc ", - [rxrpc_client_chan_activate] = "ChActv", - [rxrpc_client_chan_disconnect] = "ChDisc", - [rxrpc_client_chan_pass] = "ChPass", - [rxrpc_client_chan_unstarted] = "ChUnst", - [rxrpc_client_cleanup] = "Clean ", - [rxrpc_client_count] = "Count ", - [rxrpc_client_discard] = "Discar", - [rxrpc_client_duplicate] = "Duplic", - [rxrpc_client_exposed] = "Expose", - [rxrpc_client_replace] = "Replac", - [rxrpc_client_to_active] = "->Actv", - [rxrpc_client_to_culled] = "->Cull", - [rxrpc_client_to_idle] = "->Idle", - [rxrpc_client_to_inactive] = "->Inac", - [rxrpc_client_to_waiting] = "->Wait", - [rxrpc_client_uncount] = "Uncoun", -}; - -const char rxrpc_transmit_traces[rxrpc_transmit__nr_trace][4] = { - [rxrpc_transmit_wait] = "WAI", - [rxrpc_transmit_queue] = "QUE", - [rxrpc_transmit_queue_last] = "QLS", - [rxrpc_transmit_rotate] = "ROT", - [rxrpc_transmit_rotate_last] = "RLS", - [rxrpc_transmit_await_reply] = "AWR", - [rxrpc_transmit_end] = "END", -}; - -const char rxrpc_receive_traces[rxrpc_receive__nr_trace][4] = { - [rxrpc_receive_incoming] = "INC", - [rxrpc_receive_queue] = "QUE", - [rxrpc_receive_queue_last] = "QLS", - [rxrpc_receive_front] = "FRN", - [rxrpc_receive_rotate] = "ROT", - [rxrpc_receive_end] = "END", -}; - -const char rxrpc_recvmsg_traces[rxrpc_recvmsg__nr_trace][5] = { - [rxrpc_recvmsg_enter] = "ENTR", - [rxrpc_recvmsg_wait] = "WAIT", - [rxrpc_recvmsg_dequeue] = "DEQU", - [rxrpc_recvmsg_hole] = "HOLE", - [rxrpc_recvmsg_next] = "NEXT", - [rxrpc_recvmsg_cont] = "CONT", - [rxrpc_recvmsg_full] = "FULL", - [rxrpc_recvmsg_data_return] = "DATA", - [rxrpc_recvmsg_terminal] = "TERM", - [rxrpc_recvmsg_to_be_accepted] = "TBAC", - [rxrpc_recvmsg_return] = "RETN", -}; - -const char rxrpc_rtt_tx_traces[rxrpc_rtt_tx__nr_trace][5] = { - [rxrpc_rtt_tx_ping] = "PING", - [rxrpc_rtt_tx_data] = "DATA", -}; - -const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5] = { - [rxrpc_rtt_rx_ping_response] = "PONG", - [rxrpc_rtt_rx_requested_ack] = "RACK", -}; - -const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8] = { - [rxrpc_timer_begin] = "Begin ", - [rxrpc_timer_expired] = "*EXPR*", - [rxrpc_timer_init_for_reply] = "IniRpl", - [rxrpc_timer_init_for_send_reply] = "SndRpl", - [rxrpc_timer_set_for_ack] = "SetAck", - [rxrpc_timer_set_for_ping] = "SetPng", - [rxrpc_timer_set_for_send] = "SetTx ", - [rxrpc_timer_set_for_resend] = "SetRTx", -}; - -const char rxrpc_propose_ack_traces[rxrpc_propose_ack__nr_trace][8] = { - [rxrpc_propose_ack_client_tx_end] = "ClTxEnd", - [rxrpc_propose_ack_input_data] = "DataIn ", - [rxrpc_propose_ack_ping_for_lost_ack] = "LostAck", - [rxrpc_propose_ack_ping_for_lost_reply] = "LostRpl", - [rxrpc_propose_ack_ping_for_params] = "Params ", - [rxrpc_propose_ack_processing_op] = "ProcOp ", - [rxrpc_propose_ack_respond_to_ack] = "Rsp2Ack", - [rxrpc_propose_ack_respond_to_ping] = "Rsp2Png", - [rxrpc_propose_ack_retry_tx] = "RetryTx", - [rxrpc_propose_ack_rotate_rx] = "RxAck ", - 
[rxrpc_propose_ack_terminal_ack] = "ClTerm ", -}; - -const char *const rxrpc_propose_ack_outcomes[rxrpc_propose_ack__nr_outcomes] = { - [rxrpc_propose_ack_use] = "", - [rxrpc_propose_ack_update] = " Update", - [rxrpc_propose_ack_subsume] = " Subsume", -}; - -const char rxrpc_congest_modes[NR__RXRPC_CONGEST_MODES][10] = { - [RXRPC_CALL_SLOW_START] = "SlowStart", - [RXRPC_CALL_CONGEST_AVOIDANCE] = "CongAvoid", - [RXRPC_CALL_PACKET_LOSS] = "PktLoss ", - [RXRPC_CALL_FAST_RETRANSMIT] = "FastReTx ", -}; - -const char rxrpc_congest_changes[rxrpc_congest__nr_change][9] = { - [rxrpc_cong_begin_retransmission] = " Retrans", - [rxrpc_cong_cleared_nacks] = " Cleared", - [rxrpc_cong_new_low_nack] = " NewLowN", - [rxrpc_cong_no_change] = "", - [rxrpc_cong_progress] = " Progres", - [rxrpc_cong_retransmit_again] = " ReTxAgn", - [rxrpc_cong_rtt_window_end] = " RttWinE", - [rxrpc_cong_saw_nack] = " SawNack", -}; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 65cd980767fa..b9bcfbfb095c 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,6 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_sock *rx; struct rxrpc_peer *peer; struct rxrpc_call *call; + rxrpc_seq_t tx_hard_ack, rx_hard_ack; char lbuff[50], rbuff[50]; if (v == &rxrpc_calls) { @@ -82,9 +83,11 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) else strcpy(rbuff, "no_connection"); + tx_hard_ack = READ_ONCE(call->tx_hard_ack); + rx_hard_ack = READ_ONCE(call->rx_hard_ack); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %lx\n", + " %-8.8s %08x %lx %08x %02x %08x %02x\n", lbuff, rbuff, call->service_id, @@ -94,7 +97,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) atomic_read(&call->usage), rxrpc_call_states[call->state], call->abort_code, - call->user_call_ID); + call->user_call_ID, + tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, + rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack); return 0; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index b214a4d4a641..0a6ef217aa8a 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -376,7 +376,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, if (!CMSG_OK(msg, cmsg)) return -EINVAL; - len = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + len = cmsg->cmsg_len - sizeof(struct cmsghdr); _debug("CMSG %d, %d, %d", cmsg->cmsg_level, cmsg->cmsg_type, len); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 87956a768d1b..a9aa38d43fa7 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -707,6 +707,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET + select LIBCRC32C ---help--- Say Y here to update some common checksum after some direct packet alterations. 
diff --git a/net/sched/act_api.c b/net/sched/act_api.c index e10456ef6f7a..cd08df91351d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -426,11 +426,9 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, { int ret = -1, i; - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - ret = TC_ACT_OK; - goto exec_done; - } + if (skb_skip_tc_classify(skb)) + return TC_ACT_OK; + for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; @@ -439,9 +437,8 @@ repeat: if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ if (ret != TC_ACT_PIPE) - goto exec_done; + break; } -exec_done: return ret; } EXPORT_SYMBOL(tcf_action_exec); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a0edd80a44db..e978ccd4402c 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -30,6 +30,7 @@ #include <net/tcp.h> #include <net/udp.h> #include <net/ip6_checksum.h> +#include <net/sctp/checksum.h> #include <net/act_api.h> @@ -322,6 +323,25 @@ ignore_obscure_skb: return 1; } +static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, + unsigned int ipl) +{ + struct sctphdr *sctph; + + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) + return 1; + + sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph)); + if (!sctph) + return 0; + + sctph->checksum = sctp_compute_cksum(skb, + skb_network_offset(skb) + ihl); + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { const struct iphdr *iph; @@ -365,6 +385,11 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) ntohs(iph->tot_len), 1)) goto fail; break; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { @@ -481,6 +506,11 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) pl + sizeof(*ip6h), 1)) goto fail; goto done; + case IPPROTO_SCTP: + if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && + !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h))) + goto fail; + goto done; default: goto ignore_skb; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 80b848d3f096..921fb20eaa7c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -736,12 +736,11 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, u16 metalen = ife_get_sz(skb, ife); int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN; unsigned int skboff = skb->dev->hard_header_len; - u32 at = G_TC_AT(skb->tc_verd); int new_len = skb->len + hdrm; bool exceed_mtu = false; int err; - if (at & AT_EGRESS) { + if (!skb_at_tc_ingress(skb)) { if (new_len > skb->dev->mtu) exceed_mtu = true; } @@ -773,7 +772,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return TC_ACT_SHOT; } - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); iethh = (struct ethhdr *)skb->data; @@ -816,7 +815,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(oethh->h_dest, iethh->h_dest); oethh->h_proto = htons(ife->eth_type); - if (!(at & AT_EGRESS)) + if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); spin_unlock(&ife->tcf_lock); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2d9fa6e0a1b4..84682f02b611 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -39,15 +39,15 @@ static bool 
tcf_mirred_is_act_redirect(int action) return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } -static u32 tcf_mirred_act_direction(int action) +static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: - return AT_EGRESS; + return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: - return AT_INGRESS; + return true; default: BUG(); } @@ -170,7 +170,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, int retval, err = 0; int m_eaction; int mac_len; - u32 at; tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -191,7 +190,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, goto out; } - at = G_TC_AT(skb->tc_verd); skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) goto out; @@ -200,8 +198,9 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, * and devices expect a mac header on xmit, then mac push/pull is * needed. */ - if (at != tcf_mirred_act_direction(m_eaction) && m_mac_header_xmit) { - if (at & AT_EGRESS) { + if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) && + m_mac_header_xmit) { + if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); skb_pull_rcsum(skb2, mac_len); @@ -212,12 +211,14 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } /* mirror is always swallowed */ - if (tcf_mirred_is_act_redirect(m_eaction)) - skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + if (tcf_mirred_is_act_redirect(m_eaction)) { + skb2->tc_redirected = 1; + skb2->tc_from_ingress = skb2->tc_at_ingress; + } skb2->skb_iif = skb->dev->ifindex; skb2->dev = dev; - if (tcf_mirred_act_direction(m_eaction) & AT_EGRESS) + if (!tcf_mirred_act_wants_ingress(m_eaction)) err = dev_queue_xmit(skb2); else err = netif_receive_skb(skb2); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 970db7a41684..27934456d984 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -40,6 +40,7 @@ struct fl_flow_key { }; struct flow_dissector_key_ports tp; struct flow_dissector_key_icmp icmp; + struct flow_dissector_key_arp arp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; @@ -133,6 +134,14 @@ static void fl_clear_masked_range(struct fl_flow_key *key, memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } +static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head, + struct fl_flow_key *mkey) +{ + return rhashtable_lookup_fast(&head->ht, + fl_key_get_start(mkey, &head->mask), + head->ht_params); +} + static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { @@ -180,9 +189,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); - f = rhashtable_lookup_fast(&head->ht, - fl_key_get_start(&skb_mkey, &head->mask), - head->ht_params); + f = fl_lookup(head, &skb_mkey); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -401,6 +408,16 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP] = { 
.type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, }; static void fl_set_key_val(struct nlattr **tb, @@ -572,6 +589,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE_MASK, sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) { + fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, + &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)); + fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP, + &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)); + fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP, + &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)); + fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)); + fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || @@ -689,6 +723,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ARP, arp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); @@ -845,6 +881,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; if (!tc_skip_sw(fnew->flags)) { + if (!fold && fl_lookup(head, &fnew->mkey)) { + err = -EEXIST; + goto errout; + } + err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) @@ -1112,6 +1153,27 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)))) goto nla_put_failure; + else if ((key->basic.n_proto == htons(ETH_P_ARP) || + key->basic.n_proto == htons(ETH_P_RARP)) && + (fl_dump_key_val(skb, &key->arp.sip, + TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip, + TCA_FLOWER_KEY_ARP_SIP_MASK, + sizeof(key->arp.sip)) || + fl_dump_key_val(skb, &key->arp.tip, + TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip, + TCA_FLOWER_KEY_ARP_TIP_MASK, + sizeof(key->arp.tip)) || + fl_dump_key_val(skb, &key->arp.op, + TCA_FLOWER_KEY_ARP_OP, &mask->arp.op, + TCA_FLOWER_KEY_ARP_OP_MASK, + sizeof(key->arp.op)) || + fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, + mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, + sizeof(key->arp.sha)) || + fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, + mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, + sizeof(key->arp.tha)))) + goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index f935429bd5ef..fcecf5aac666 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -141,10 +141,12 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct tcf_exts e; int err; - tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + err = tcf_exts_init(&e, TCA_MATCHALL_ACT, 0); + if (err) + return 
err; err = tcf_exts_validate(net, tp, tb, est, &e, ovr); if (err < 0) - return err; + goto errout; if (tb[TCA_MATCHALL_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); @@ -154,6 +156,9 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, tcf_exts_change(tp, &f->exts, &e); return 0; +errout: + tcf_exts_destroy(&e); + return err; } static int mall_change(struct net *net, struct sk_buff *in_skb, @@ -193,7 +198,9 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!f) return -ENOBUFS; - tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); + err = tcf_exts_init(&f->exts, TCA_MATCHALL_ACT, 0); + if (err) + goto err_exts_init; if (!handle) handle = 1; @@ -202,13 +209,13 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, err = mall_set_parms(net, tp, f, base, tb, tca[TCA_RATE], ovr); if (err) - goto errout; + goto err_set_parms; if (tc_should_offload(dev, tp, flags)) { err = mall_replace_hw_filter(tp, f, (unsigned long) f); if (err) { if (tc_skip_sw(flags)) - goto errout; + goto err_replace_hw_filter; else err = 0; } @@ -219,7 +226,10 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return 0; -errout: +err_replace_hw_filter: +err_set_parms: + tcf_exts_destroy(&f->exts); +err_exts_init: kfree(f); return err; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ae83c3aec308..a6ec3e4b57ab 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -334,7 +334,6 @@ static int u32_init(struct tcf_proto *tp) if (root_ht == NULL) return -ENOBUFS; - root_ht->divisor = 0; root_ht->refcnt++; root_ht->handle = tp_c ? gen_new_htid(tp_c) : 0x80000000; root_ht->prio = tp->prio; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7b93429f0cc..ef53ede11590 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1861,6 +1861,7 @@ int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, { __be16 protocol = tc_skb_protocol(skb); #ifdef CONFIG_NET_CLS_ACT + const int max_reclassify_loop = 4; const struct tcf_proto *old_tp = tp; int limit = 0; @@ -1885,7 +1886,7 @@ reclassify: return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: - if (unlikely(limit++ >= MAX_REC_LOOP)) { + if (unlikely(limit++ >= max_reclassify_loop)) { net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", tp->q->ops->id, tp->prio & 0xffff, ntohs(tp->protocol)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6eb9c8e88519..b052b27a984e 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -247,7 +247,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets) void __qdisc_run(struct Qdisc *q) { - int quota = weight_p; + int quota = dev_tx_weight; int packets; while (qdisc_restart(q, &packets)) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index bcfadfdea8e0..c8bb62a1e744 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -626,7 +626,7 @@ deliver: * If it's at ingress let's pretend the delay is * from the network (tstamp will be updated). 
*/ - if (G_TC_FROM(skb->tc_verd) & AT_INGRESS) + if (skb->tc_redirected && skb->tc_from_ingress) skb->tstamp = 0; #endif diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index b0196366d58d..9fe6b427afed 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -401,8 +401,8 @@ static int teql_master_close(struct net_device *dev) return 0; } -static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, - struct rtnl_link_stats64 *stats) +static void teql_master_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats) { struct teql_master *m = netdev_priv(dev); @@ -410,7 +410,6 @@ static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev, stats->tx_bytes = m->tx_bytes; stats->tx_errors = m->tx_errors; stats->tx_dropped = m->tx_dropped; - return stats; } static int teql_master_mtu(struct net_device *dev, int new_mtu) diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 6c4f7496cec6..70f1b570bab9 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o \ + output.o input.o debug.o stream.o auth.o \ offload.o sctp_probe-y := probe.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c index d3cc30c25c41..36294f7fb9a7 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -358,8 +358,8 @@ void sctp_association_free(struct sctp_association *asoc) sctp_tsnmap_free(&asoc->peer.tsn_map); - /* Free ssnmap storage. */ - sctp_ssnmap_free(asoc->ssnmap); + /* Free stream information. */ + sctp_stream_free(asoc->stream); /* Clean up the bound address list. */ sctp_bind_addr_free(&asoc->base.bind_addr); @@ -1137,7 +1137,7 @@ void sctp_assoc_update(struct sctp_association *asoc, /* Reinitialize SSN for both local streams * and peer's streams. */ - sctp_ssnmap_clear(asoc->ssnmap); + sctp_stream_clear(asoc->stream); /* Flush the ULP reassembly and ordered queue. * Any data there will now be stale and will @@ -1162,10 +1162,9 @@ void sctp_assoc_update(struct sctp_association *asoc, asoc->ctsn_ack_point = asoc->next_tsn - 1; asoc->adv_peer_ack_point = asoc->ctsn_ack_point; - if (!asoc->ssnmap) { - /* Move the ssnmap. 
*/ - asoc->ssnmap = new->ssnmap; - new->ssnmap = NULL; + if (!asoc->stream) { + asoc->stream = new->stream; + new->stream = NULL; } if (!asoc->assoc_id) { diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 615f0ddd41df..e3621cb4827f 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -165,14 +165,12 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { - int max, whole, i, offset, over, err; - int len, first_len; - int max_data; + size_t len, first_len, max_data, remaining; + size_t msg_len = iov_iter_count(from); + struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; - struct list_head *pos, *temp; - size_t msg_len = iov_iter_count(from); - __u8 frag; + int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) @@ -185,7 +183,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + - msecs_to_jiffies(sinfo->sinfo_timetolive); + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet @@ -195,7 +193,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, sizeof(struct sctphdr) - sizeof(struct sctp_data_chunk); max_data = SCTP_TRUNC4(max_data); - max = asoc->frag_point; /* If the the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. @@ -208,12 +205,11 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, hmac_desc->hmac_len); } - /* Now, check if we need to reduce our max */ - if (max > max_data) - max = max_data; + /* Check what's our max considering the above */ + max_data = min_t(size_t, max_data, asoc->frag_point); - whole = 0; - first_len = max; + /* Set first_len and then account for possible bundles on first frag */ + first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. @@ -224,40 +220,38 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && - msg_len > max) - max_data -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); + msg_len > max_data) + first_len -= SCTP_PAD4(sizeof(sctp_sack_chunk_t)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) - max_data -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; - - /* Now that we adjusted completely, reset first_len */ - if (first_len > max_data) - first_len = max_data; + first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { - msg_len -= first_len; - whole = 1; msg->can_delay = 0; - } - - /* How many full sized? How many bytes leftover? */ - whole += msg_len / max; - over = msg_len % max; - offset = 0; - - if ((whole > 1) || (whole && over)) SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); + } else { + /* Which may be the only one... */ + first_len = msg_len; + } - /* Create chunks for all the full sized DATA chunks. */ - for (i = 0, len = first_len; i < whole; i++) { - frag = SCTP_DATA_MIDDLE_FRAG; + /* Create chunks for all DATA chunks. 
*/ + for (remaining = msg_len; remaining; remaining -= len) { + u8 frag = SCTP_DATA_MIDDLE_FRAG; - if (0 == i) + if (remaining == msg_len) { + /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; + len = first_len; + } else { + /* Middle frags */ + len = max_data; + } - if ((i == (whole - 1)) && !over) { + if (len >= remaining) { + /* Last frag, which may also be the first */ + len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the @@ -271,7 +265,6 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0, GFP_KERNEL); - if (!chunk) { err = -ENOMEM; goto errout; @@ -282,45 +275,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, goto errout_chunk_free; /* Put the chunk->skb back into the form expected by send. */ - __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - - (__u8 *)chunk->skb->data); - - sctp_datamsg_assign(msg, chunk); - list_add_tail(&chunk->frag_list, &msg->chunks); - - /* The first chunk, the first chunk was likely short - * to allow bundling, so reset to full size. - */ - if (0 == i) - len = max; - } - - /* .. now the leftover bytes. */ - if (over) { - if (!whole) - frag = SCTP_DATA_NOT_FRAG; - else - frag = SCTP_DATA_LAST_FRAG; - - if ((sinfo->sinfo_flags & SCTP_EOF) || - (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) - frag |= SCTP_DATA_SACK_IMM; - - chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, - 0, GFP_KERNEL); - - if (!chunk) { - err = -ENOMEM; - goto errout; - } - - err = sctp_user_addto_chunk(chunk, over, from); - - /* Put the chunk->skb back into the form expected by send. */ - __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - - (__u8 *)chunk->skb->data); - if (err < 0) - goto errout_chunk_free; + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - + chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); @@ -338,6 +294,7 @@ errout: sctp_chunk_free(chunk); } sctp_datamsg_put(msg); + return ERR_PTR(err); } diff --git a/net/sctp/input.c b/net/sctp/input.c index 458e506ef84b..704ad19c1565 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1229,13 +1229,26 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + if (asoc) + goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. 
*/ - if (!asoc) - asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + if (asoc) + goto out; + if (paddr->sa.sa_family == AF_INET) + pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", + &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), + &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); + else + pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", + &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), + &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); + +out: return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 5ed8e79bf102..6619367bb6ca 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -412,22 +412,20 @@ static void sctp_v6_copy_addrlist(struct list_head *addrlist, static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { - __be16 *port; - struct sctphdr *sh; + /* Always called on head skb, so this is safe */ + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in6 *sa = &addr->v6; - port = &addr->v6.sin6_port; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; - /* Always called on head skb, so this is safe */ - sh = sctp_hdr(skb); if (is_saddr) { - *port = sh->source; - addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; + sa->sin6_port = sh->source; + sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { - *port = sh->dest; - addr->v6.sin6_addr = ipv6_hdr(skb)->daddr; + sa->sin6_port = sh->dest; + sa->sin6_addr = ipv6_hdr(skb)->daddr; } } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index 40e7fac96c41..105ac3327b28 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr); SCTP_DBG_OBJCNT(bind_bucket); SCTP_DBG_OBJCNT(chunk); SCTP_DBG_OBJCNT(addr); -SCTP_DBG_OBJCNT(ssnmap); SCTP_DBG_OBJCNT(datamsg); SCTP_DBG_OBJCNT(keys); @@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { SCTP_DBG_OBJCNT_ENTRY(bind_addr), SCTP_DBG_OBJCNT_ENTRY(bind_bucket), SCTP_DBG_OBJCNT_ENTRY(addr), - SCTP_DBG_OBJCNT_ENTRY(ssnmap), SCTP_DBG_OBJCNT_ENTRY(datamsg), SCTP_DBG_OBJCNT_ENTRY(keys), }; diff --git a/net/sctp/output.c b/net/sctp/output.c index f5320a87341e..07ab5062e541 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -81,8 +81,8 @@ static void sctp_packet_reset(struct sctp_packet *packet) /* Config a packet. * This appears to be a followup set of initializations. */ -struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, - __u32 vtag, int ecn_capable) +void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, + int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; @@ -123,14 +123,12 @@ struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, if (chunk) sctp_packet_append_chunk(packet, chunk); } - - return packet; } /* Initialize the packet structure. */ -struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, - struct sctp_transport *transport, - __u16 sport, __u16 dport) +void sctp_packet_init(struct sctp_packet *packet, + struct sctp_transport *transport, + __u16 sport, __u16 dport) { struct sctp_association *asoc = transport->asoc; size_t overhead; @@ -151,8 +149,6 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, packet->overhead = overhead; sctp_packet_reset(packet); packet->vtag = 0; - - return packet; } /* Free a packet. 
*/ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 616a9428e0c4..f9c3c37c9ae0 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -237,23 +237,19 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { - void *from; - __be16 *port; - struct sctphdr *sh; + /* Always called on head skb, so this is safe */ + struct sctphdr *sh = sctp_hdr(skb); + struct sockaddr_in *sa = &addr->v4; - port = &addr->v4.sin_port; addr->v4.sin_family = AF_INET; - /* Always called on head skb, so this is safe */ - sh = sctp_hdr(skb); if (is_saddr) { - *port = sh->source; - from = &ip_hdr(skb)->saddr; + sa->sin_port = sh->source; + sa->sin_addr.s_addr = ip_hdr(skb)->saddr; } else { - *port = sh->dest; - from = &ip_hdr(skb)->daddr; + sa->sin_port = sh->dest; + sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } - memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr)); } /* Initialize an sctp_addr from a socket. */ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 9e9690b7afe1..80a9088084ac 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1536,7 +1536,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); - stream = &chunk->asoc->ssnmap->out; + stream = chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. @@ -1547,9 +1547,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) - ssn = sctp_ssn_next(stream, sid); + ssn = sctp_ssn_next(stream, out, sid); else - ssn = sctp_ssn_peek(stream, sid); + ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); @@ -2444,9 +2444,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, if (!asoc->temp) { int error; - asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams, + asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams, asoc->c.sinit_num_ostreams, gfp); - if (!asoc->ssnmap) + if (!asoc->stream) goto clean_up; error = sctp_assoc_set_id(asoc, gfp); @@ -3210,7 +3210,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, union sctp_params param; sctp_addiphdr_t *hdr; union sctp_addr_param *addr_param; - sctp_addip_param_t *asconf_param; struct sctp_chunk *asconf_ack; __be16 err_code; int length = 0; @@ -3230,7 +3229,6 @@ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, * asconf parameter. */ length = ntohs(addr_param->p.length); - asconf_param = (void *)addr_param + length; chunk_len -= length; /* create an ASCONF_ACK chunk. diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8ec20a64a3f8..0ceded37d20b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -160,23 +160,22 @@ static sctp_disposition_t __sctp_sf_do_9_1_abort(struct net *net, /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument * is set to be the size of a specific chunk we are testing. 
- * Return Values: 1 = Valid length - * 0 = Invalid length + * Return Values: true = Valid length + * false = Invalid length * */ -static inline int -sctp_chunk_length_valid(struct sctp_chunk *chunk, - __u16 required_length) +static inline bool +sctp_chunk_length_valid(struct sctp_chunk *chunk, __u16 required_length) { __u16 chunk_length = ntohs(chunk->chunk_hdr->length); /* Previously already marked? */ if (unlikely(chunk->pdiscard)) - return 0; + return false; if (unlikely(chunk_length < required_length)) - return 0; + return false; - return 1; + return true; } /********************************************************** @@ -3237,36 +3236,34 @@ static sctp_disposition_t sctp_sf_tabort_8_4_8(struct net *net, struct sctp_chunk *abort; packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; - if (packet) { - /* Make an ABORT. The T bit will be set if the asoc - * is NULL. - */ - abort = sctp_make_abort(asoc, chunk, 0); - if (!abort) { - sctp_ootb_pkt_free(packet); - return SCTP_DISPOSITION_NOMEM; - } + /* Make an ABORT. The T bit will be set if the asoc + * is NULL. + */ + abort = sctp_make_abort(asoc, chunk, 0); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } - /* Reflect vtag if T-Bit is set */ - if (sctp_test_T_bit(abort)) - packet->vtag = ntohl(chunk->sctp_hdr->vtag); + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(abort)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); - /* Set the skb to the belonging sock for accounting. */ - abort->skb->sk = ep->base.sk; - - sctp_packet_append_chunk(packet, abort); + /* Set the skb to the belonging sock for accounting. */ + abort->skb->sk = ep->base.sk; - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, - SCTP_PACKET(packet)); + sctp_packet_append_chunk(packet, abort); - SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); - sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - return SCTP_DISPOSITION_CONSUME; - } + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - return SCTP_DISPOSITION_NOMEM; + sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return SCTP_DISPOSITION_CONSUME; } /* @@ -3503,45 +3500,43 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(struct net *net, struct sctp_chunk *shut; packet = sctp_ootb_pkt_new(net, asoc, chunk); + if (!packet) + return SCTP_DISPOSITION_NOMEM; - if (packet) { - /* Make an SHUTDOWN_COMPLETE. - * The T bit will be set if the asoc is NULL. - */ - shut = sctp_make_shutdown_complete(asoc, chunk); - if (!shut) { - sctp_ootb_pkt_free(packet); - return SCTP_DISPOSITION_NOMEM; - } - - /* Reflect vtag if T-Bit is set */ - if (sctp_test_T_bit(shut)) - packet->vtag = ntohl(chunk->sctp_hdr->vtag); + /* Make an SHUTDOWN_COMPLETE. + * The T bit will be set if the asoc is NULL. + */ + shut = sctp_make_shutdown_complete(asoc, chunk); + if (!shut) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } - /* Set the skb to the belonging sock for accounting. */ - shut->skb->sk = ep->base.sk; + /* Reflect vtag if T-Bit is set */ + if (sctp_test_T_bit(shut)) + packet->vtag = ntohl(chunk->sctp_hdr->vtag); - sctp_packet_append_chunk(packet, shut); + /* Set the skb to the belonging sock for accounting. 
*/ + shut->skb->sk = ep->base.sk; - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, - SCTP_PACKET(packet)); + sctp_packet_append_chunk(packet, shut); - SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); - /* If the chunk length is invalid, we don't want to process - * the reset of the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - /* We need to discard the rest of the packet to prevent - * potential bomming attacks from additional bundled chunks. - * This is documented in SCTP Threats ID. - */ + /* If the chunk length is invalid, we don't want to process + * the reset of the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - } - return SCTP_DISPOSITION_NOMEM; + /* We need to discard the rest of the packet to prevent + * potential bomming attacks from additional bundled chunks. + * This is documented in SCTP Threats ID. + */ + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } /* @@ -6036,8 +6031,9 @@ static struct sctp_packet *sctp_ootb_pkt_new(struct net *net, sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, sctp_sk(net->sctp.ctl_sock)); - packet = sctp_packet_init(&transport->packet, transport, sport, dport); - packet = sctp_packet_config(packet, vtag, 0); + packet = &transport->packet; + sctp_packet_init(packet, transport, sport, dport); + sctp_packet_config(packet, vtag, 0); return packet; @@ -6278,9 +6274,8 @@ static int sctp_eat_data(const struct sctp_association *asoc, * and is invalid. */ ssn = ntohs(data_hdr->ssn); - if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) { + if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid))) return SCTP_IERROR_PROTO_VIOLATION; - } /* Send the data up to the user. Note: Schedule the * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 318c6786d653..635e03412693 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2430,7 +2430,6 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); } else if (asoc) { asoc->pathmtu = params->spp_pathmtu; - sctp_frag_point(asoc, params->spp_pathmtu); } else { sp->pathmtu = params->spp_pathmtu; } diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c deleted file mode 100644 index b9c8521c1a98..000000000000 --- a/net/sctp/ssnmap.c +++ /dev/null @@ -1,125 +0,0 @@ -/* SCTP kernel implementation - * Copyright (c) 2003 International Business Machines, Corp. - * - * This file is part of the SCTP kernel implementation - * - * These functions manipulate sctp SSN tracker. - * - * This SCTP implementation is free software; - * you can redistribute it and/or modify it under the terms of - * the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This SCTP implementation is distributed in the hope that it - * will be useful, but WITHOUT ANY WARRANTY; without even the implied - * ************************ - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. 
If not, see - * <http://www.gnu.org/licenses/>. - * - * Please send any bug reports or fixes you make to the - * email address(es): - * lksctp developers <linux-sctp@vger.kernel.org> - * - * Written or modified by: - * Jon Grimm <jgrimm@us.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <net/sctp/sctp.h> -#include <net/sctp/sm.h> - -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, - __u16 out); - -/* Storage size needed for map includes 2 headers and then the - * specific needs of in or out streams. - */ -static inline size_t sctp_ssnmap_size(__u16 in, __u16 out) -{ - return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16); -} - - -/* Create a new sctp_ssnmap. - * Allocate room to store at least 'len' contiguous TSNs. - */ -struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, - gfp_t gfp) -{ - struct sctp_ssnmap *retval; - int size; - - size = sctp_ssnmap_size(in, out); - if (size <= KMALLOC_MAX_SIZE) - retval = kmalloc(size, gfp); - else - retval = (struct sctp_ssnmap *) - __get_free_pages(gfp, get_order(size)); - if (!retval) - goto fail; - - if (!sctp_ssnmap_init(retval, in, out)) - goto fail_map; - - SCTP_DBG_OBJCNT_INC(ssnmap); - - return retval; - -fail_map: - if (size <= KMALLOC_MAX_SIZE) - kfree(retval); - else - free_pages((unsigned long)retval, get_order(size)); -fail: - return NULL; -} - - -/* Initialize a block of memory as a ssnmap. */ -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, - __u16 out) -{ - memset(map, 0x00, sctp_ssnmap_size(in, out)); - - /* Start 'in' stream just after the map header. */ - map->in.ssn = (__u16 *)&map[1]; - map->in.len = in; - - /* Start 'out' stream just after 'in'. */ - map->out.ssn = &map->in.ssn[in]; - map->out.len = out; - - return map; -} - -/* Clear out the ssnmap streams. */ -void sctp_ssnmap_clear(struct sctp_ssnmap *map) -{ - size_t size; - - size = (map->in.len + map->out.len) * sizeof(__u16); - memset(map->in.ssn, 0x00, size); -} - -/* Dispose of a ssnmap. */ -void sctp_ssnmap_free(struct sctp_ssnmap *map) -{ - int size; - - if (unlikely(!map)) - return; - - size = sctp_ssnmap_size(map->in.len, map->out.len); - if (size <= KMALLOC_MAX_SIZE) - kfree(map); - else - free_pages((unsigned long)map, get_order(size)); - - SCTP_DBG_OBJCNT_DEC(ssnmap); -} diff --git a/net/sctp/stream.c b/net/sctp/stream.c new file mode 100644 index 000000000000..f86de43cbbe5 --- /dev/null +++ b/net/sctp/stream.c @@ -0,0 +1,85 @@ +/* SCTP kernel implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel implementation + * + * These functions manipulate sctp tsn mapping array. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * <http://www.gnu.org/licenses/>. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers <linux-sctp@vger.kernel.org> + * + * Written or modified by: + * Xin Long <lucien.xin@gmail.com> + */ + +#include <net/sctp/sctp.h> + +struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp) +{ + struct sctp_stream *stream; + int i; + + stream = kzalloc(sizeof(*stream), gfp); + if (!stream) + return NULL; + + stream->outcnt = outcnt; + stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); + if (!stream->out) { + kfree(stream); + return NULL; + } + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + stream->incnt = incnt; + stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp); + if (!stream->in) { + kfree(stream->out); + kfree(stream); + return NULL; + } + + return stream; +} + +void sctp_stream_free(struct sctp_stream *stream) +{ + if (unlikely(!stream)) + return; + + kfree(stream->out); + kfree(stream->in); + kfree(stream); +} + +void sctp_stream_clear(struct sctp_stream *stream) +{ + int i; + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].ssn = 0; + + for (i = 0; i < stream->incnt; i++) + stream->in[i].ssn = 0; +} diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 84d0fdaf7de9..aa3624d50278 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; - struct sctp_stream *in; + struct sctp_stream *stream; __u16 sid, csid, cssn; sid = event->stream; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; @@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, if (csid < sid) continue; - if (cssn != sctp_ssn_peek(in, sid)) + if (cssn != sctp_ssn_peek(stream, in, sid)) break; - /* Found it, so mark in the ssnmap. */ - sctp_ssn_next(in, sid); + /* Found it, so mark in the stream. */ + sctp_ssn_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); @@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { __u16 sid, ssn; - struct sctp_stream *in; + struct sctp_stream *stream; /* Check if this message needs ordering. */ if (SCTP_DATA_UNORDERED & event->msg_flags) @@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, /* Note: The stream ID must be verified before this routine. */ sid = event->stream; ssn = event->ssn; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* Is this the expected SSN for this stream ID? */ - if (ssn != sctp_ssn_peek(in, sid)) { + if (ssn != sctp_ssn_peek(stream, in, sid)) { /* We've received something out of order, so find where it * needs to be placed. We order by stream and then by SSN. */ @@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, } /* Mark that the next chunk has been found. */ - sctp_ssn_next(in, sid); + sctp_ssn_next(stream, in, sid); /* Go find any other chunks that were waiting for * ordering. 
@@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_ulpevent *event; - struct sctp_stream *in; + struct sctp_stream *stream; struct sk_buff_head temp; struct sk_buff_head *lobby = &ulpq->lobby; __u16 csid, cssn; - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* We are holding the chunks by stream, by SSN. */ skb_queue_head_init(&temp); @@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) continue; /* see if this ssn has been marked by skipping */ - if (!SSN_lt(cssn, sctp_ssn_peek(in, csid))) + if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); @@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) csid = cevent->stream; cssn = cevent->ssn; - if (csid == sid && cssn == sctp_ssn_peek(in, csid)) { - sctp_ssn_next(in, csid); + if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) { + sctp_ssn_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); @@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) */ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) { - struct sctp_stream *in; + struct sctp_stream *stream; /* Note: The stream ID must be verified before this routine. */ - in = &ulpq->asoc->ssnmap->in; + stream = ulpq->asoc->stream; /* Is this an old SSN? If so ignore. */ - if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) + if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) return; /* Mark that we are no longer expecting this SSN or lower. */ - sctp_ssn_skip(in, sid, ssn); + sctp_ssn_skip(stream, in, sid, ssn); /* Go find any other chunks that were waiting for * ordering and deliver them if needed. diff --git a/net/smc/Kconfig b/net/smc/Kconfig new file mode 100644 index 000000000000..c717ef0896aa --- /dev/null +++ b/net/smc/Kconfig @@ -0,0 +1,20 @@ +config SMC + tristate "SMC socket protocol family" + depends on INET && INFINIBAND + ---help--- + SMC-R provides a "sockets over RDMA" solution making use of + RDMA over Converged Ethernet (RoCE) technology to upgrade + AF_INET TCP connections transparently. + The Linux implementation of the SMC-R solution is designed as + a separate socket family SMC. + + Select this option if you want to run SMC socket applications + +config SMC_DIAG + tristate "SMC: socket monitoring interface" + depends on SMC + ---help--- + Support for SMC socket monitoring interface used by tools such as + smcss. + + if unsure, say Y. 
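
The Kconfig help text above notes that CONFIG_SMC is needed to run SMC socket applications. The following is a minimal userspace sketch, not part of this patch, of how such an application opens an SMC stream socket. The numeric AF_SMC value is an assumption (the installed libc headers may not define it yet), and the accepted protocol values follow the smc_create() checks further down in this diff.

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43		/* assumed value; check linux/socket.h on your tree */
#endif

int main(void)
{
	/* smc_create() only accepts SOCK_STREAM with IPPROTO_IP or IPPROTO_TCP */
	int fd = socket(AF_SMC, SOCK_STREAM, IPPROTO_TCP);

	if (fd < 0) {
		perror("socket(AF_SMC)");
		return 1;
	}
	/* bind()/connect()/listen() then behave like AF_INET TCP sockets;
	 * the CLC handshake and a possible TCP fallback happen inside the
	 * kernel, transparently to the application.
	 */
	close(fd);
	return 0;
}
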
diff --git a/net/smc/Makefile b/net/smc/Makefile new file mode 100644 index 000000000000..188104654b54 --- /dev/null +++ b/net/smc/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_SMC) += smc.o +obj-$(CONFIG_SMC_DIAG) += smc_diag.o +smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o +smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c new file mode 100644 index 000000000000..5d4208ad029e --- /dev/null +++ b/net/smc/af_smc.c @@ -0,0 +1,1407 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * AF_SMC protocol family socket handler keeping the AF_INET sock address type + * applies to SOCK_STREAM sockets only + * offers an alternative communication option for TCP-protocol sockets + * applicable with RoCE-cards only + * + * Initial restrictions: + * - non-blocking connect postponed + * - IPv6 support postponed + * - support for alternate links postponed + * - partial support for non-blocking sockets only + * - support for urgent data postponed + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + * based on prototype from Frank Blaschka + */ + +#define KMSG_COMPONENT "smc" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/inetdevice.h> +#include <linux/workqueue.h> +#include <linux/in.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/smc.h> + +#include "smc.h" +#include "smc_clc.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_pnet.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" + +static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group + * creation + */ + +struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = LIST_HEAD_INIT(smc_lgr_list.list), +}; + +static void smc_tcp_listen_work(struct work_struct *); + +static void smc_set_keepalive(struct sock *sk, int val) +{ + struct smc_sock *smc = smc_sk(sk); + + smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val); +} + +static struct smc_hashinfo smc_v4_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), +}; + +int smc_hash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + struct hlist_head *head; + + head = &h->ht; + + write_lock_bh(&h->lock); + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + write_unlock_bh(&h->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(smc_hash_sk); + +void smc_unhash_sk(struct sock *sk) +{ + struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; + + write_lock_bh(&h->lock); + if (sk_del_node_init(sk)) + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + write_unlock_bh(&h->lock); +} +EXPORT_SYMBOL_GPL(smc_unhash_sk); + +struct proto smc_proto = { + .name = "SMC", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_DESTROY_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto); + +static int smc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = 0; + + if (!sk) + goto out; + + smc = smc_sk(sk); + sock_hold(sk); + if (sk->sk_state == SMC_LISTEN) + /* smc_close_non_accepted() is called and acquires + * sock lock for child sockets again + */ + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + else + lock_sock(sk); + + 
if (smc->use_fallback) { + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); + } else { + rc = smc_close_active(smc); + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } + if (smc->clcsock) { + sock_release(smc->clcsock); + smc->clcsock = NULL; + } + + /* detach socket */ + sock_orphan(sk); + sock->sk = NULL; + if (smc->use_fallback) { + schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); + } else if (sk->sk_state == SMC_CLOSED) { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } + sk->sk_prot->unhash(sk); + release_sock(sk); + + sock_put(sk); +out: + return rc; +} + +static void smc_destruct(struct sock *sk) +{ + if (sk->sk_state != SMC_CLOSED) + return; + if (!sock_flag(sk, SOCK_DEAD)) + return; + + sk_refcnt_debug_dec(sk); +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +{ + struct smc_sock *smc; + struct sock *sk; + + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + sk->sk_state = SMC_INIT; + sk->sk_destruct = smc_destruct; + sk->sk_protocol = SMCPROTO_SMC; + smc = smc_sk(sk); + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_LIST_HEAD(&smc->accept_q); + spin_lock_init(&smc->accept_q_lock); + INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work); + sk->sk_prot->hash(sk); + sk_refcnt_debug_inc(sk); + + return sk; +} + +static int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + + /* replicate tests from inet_bind(), to be safe wrt. future changes */ + rc = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + rc = -EAFNOSUPPORT; + /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ + if ((addr->sin_family != AF_INET) && + ((addr->sin_family != AF_UNSPEC) || + (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + goto out; + + lock_sock(sk); + + /* Check if socket is already active */ + rc = -EINVAL; + if (sk->sk_state != SMC_INIT) + goto out_rel; + + smc->clcsock->sk->sk_reuse = sk->sk_reuse; + rc = kernel_bind(smc->clcsock, uaddr, addr_len); + +out_rel: + release_sock(sk); +out: + return rc; +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndbuf = osk->sk_sndbuf; + nsk->sk_rcvbuf = osk->sk_rcvbuf; + nsk->sk_sndtimeo = osk->sk_sndtimeo; + nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_mark = osk->sk_mark; + nsk->sk_priority = osk->sk_priority; + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; +} + +#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_BROADCAST) | \ + (1UL << SOCK_TIMESTAMP) | \ + (1UL << SOCK_DBG) | \ + (1UL << SOCK_RCVTSTAMP) | \ + (1UL << SOCK_RCVTSTAMPNS) | \ + (1UL << SOCK_LOCALROUTE) | \ + (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \ + (1UL << SOCK_RXQ_OVFL) | \ + (1UL << SOCK_WIFI_STATUS) | \ + (1UL << SOCK_NOFCS) | \ + (1UL << SOCK_FILTER_LOCKED)) +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from 
net/core) + */ +static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) +{ + smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); +} + +#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \ + (1UL << SOCK_KEEPOPEN) | \ + (1UL << SOCK_LINGER) | \ + (1UL << SOCK_DBG)) +/* copy only settings and flags relevant for smc from clc to smc socket */ +static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) +{ + smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); +} + +/* determine subnet and mask of internal TCP socket */ +int smc_netinfo_by_tcpsk(struct socket *clcsock, + __be32 *subnet, u8 *prefix_len) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct sockaddr_in addr; + int rc = -ENOENT; + int len; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + /* get address to which the internal TCP socket is bound */ + kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); + /* analyze IPv4 specific data of net_device belonging to TCP socket */ + for_ifa(dst->dev->ip_ptr) { + if (ifa->ifa_address != addr.sin_addr.s_addr) + continue; + *prefix_len = inet_mask_len(ifa->ifa_mask); + *subnet = ifa->ifa_address & ifa->ifa_mask; + rc = 0; + break; + } endfor_ifa(dst->dev->ip_ptr); + +out_rel: + dst_release(dst); +out: + return rc; +} + +static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) +{ + struct smc_link_group *lgr = smc->conn.lgr; + struct smc_link *link; + int rest; + int rc; + + link = &lgr->lnk[SMC_SINGLE_LINK]; + /* receive CONFIRM LINK request from server over RoCE fabric */ + rest = wait_for_completion_interruptible_timeout( + &link->llc_confirm, + SMC_LLC_WAIT_FIRST_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + rc = smc_ib_modify_qp_rts(link); + if (rc) + return SMC_CLC_DECL_INTERR; + + smc_wr_remember_qp_attr(link); + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link, + link->smcibdev->mac[link->ibport - 1], + gid, SMC_LLC_RESP); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + return rc; +} + +static void smc_conn_save_peer_info(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + smc->conn.peer_conn_idx = clc->conn_idx; + smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); + smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); +} + +static void smc_link_save_peer_info(struct smc_link *link, + struct smc_clc_msg_accept_confirm *clc) +{ + link->peer_qpn = ntoh24(clc->qpn); + memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE); + memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac)); + link->peer_psn = ntoh24(clc->psn); + link->peer_mtu = clc->qp_mtu; +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc) +{ + struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr; + struct smc_clc_msg_accept_confirm aclc; + int local_contact = SMC_FIRST_CONTACT; + struct smc_ib_device *smcibdev; + struct smc_link *link; + u8 srv_first_contact; + int reason_code = 0; + int rc = 0; + u8 ibport; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + 
* used for the internal TCP socket + */ + smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* do inband token exchange */ + reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) /* configuration error */ + goto decline_rdma; + /* receive SMC Accept CLC message */ + reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), + SMC_CLC_ACCEPT); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) + goto decline_rdma; + + srv_first_contact = aclc.hdr.flag; + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev, + ibport, &aclc.lcl, srv_first_contact); + if (local_contact < 0) { + rc = local_contact; + if (rc == -ENOMEM) + reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ + else if (rc == -ENOLINK) + reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ + goto decline_rdma_unlock; + } + link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + + smc_conn_save_peer_info(smc, &aclc); + + rc = smc_sndbuf_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + rc = smc_rmb_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + + if (local_contact == SMC_FIRST_CONTACT) + smc_link_save_peer_info(link, &aclc); + + rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma_unlock; + } + + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_ib_ready_link(link); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma_unlock; + } + } + + rc = smc_clc_send_confirm(smc); + if (rc) + goto out_err_unlock; + + if (local_contact == SMC_FIRST_CONTACT) { + /* QP confirmation over RoCE fabric */ + reason_code = smc_clnt_conf_first_link( + smc, &smcibdev->gid[ibport - 1]); + if (reason_code < 0) { + rc = reason_code; + goto out_err_unlock; + } + if (reason_code > 0) + goto decline_rdma_unlock; + } + + mutex_unlock(&smc_create_lgr_pending); + smc_tx_init(smc); + smc_rx_init(smc); + +out_connected: + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + + return rc ? 
rc : local_contact; + +decline_rdma_unlock: + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err_unlock: + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); +out_err: + return rc; +} + +static int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + + smc = smc_sk(sk); + + /* separate smc parameter checking to be safe */ + if (alen < sizeof(addr->sa_family)) + goto out_err; + if (addr->sa_family != AF_INET) + goto out_err; + smc->addr = addr; /* needed for nonblocking connect */ + + lock_sock(sk); + switch (sk->sk_state) { + default: + goto out; + case SMC_ACTIVE: + rc = -EISCONN; + goto out; + case SMC_INIT: + rc = 0; + break; + } + + smc_copy_sock_settings_to_clc(smc); + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc) + goto out; + + /* setup RDMA connection */ + rc = smc_connect_rdma(smc); + if (rc < 0) + goto out; + else + rc = 0; /* success cases including fallback */ + +out: + release_sock(sk); +out_err: + return rc; +} + +static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) +{ + struct sock *sk = &lsmc->sk; + struct socket *new_clcsock; + struct sock *new_sk; + int rc; + + release_sock(&lsmc->sk); + new_sk = smc_sock_alloc(sock_net(sk), NULL); + if (!new_sk) { + rc = -ENOMEM; + lsmc->sk.sk_err = ENOMEM; + *new_smc = NULL; + lock_sock(&lsmc->sk); + goto out; + } + *new_smc = smc_sk(new_sk); + + rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); + lock_sock(&lsmc->sk); + if (rc < 0) { + lsmc->sk.sk_err = -rc; + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + if (lsmc->sk.sk_state == SMC_CLOSED) { + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sk->sk_prot->unhash(new_sk); + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + + (*new_smc)->clcsock = new_clcsock; +out: + return rc; +} + +/* add a just created sock to the accept queue of the listen sock as + * candidate for a following socket accept call from user space + */ +static void smc_accept_enqueue(struct sock *parent, struct sock *sk) +{ + struct smc_sock *par = smc_sk(parent); + + sock_hold(sk); + spin_lock(&par->accept_q_lock); + list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); +} + +/* remove a socket from the accept queue of its parental listening socket */ +static void smc_accept_unlink(struct sock *sk) +{ + struct smc_sock *par = smc_sk(sk)->listen_smc; + + spin_lock(&par->accept_q_lock); + list_del_init(&smc_sk(sk)->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + sock_put(sk); +} + +/* remove a sock from the accept queue to bind it to a new socket created + * for a socket accept call from user space + */ +struct sock *smc_accept_dequeue(struct sock *parent, + struct socket *new_sock) +{ + struct smc_sock *isk, *n; + struct sock *new_sk; + + list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { + new_sk = (struct sock *)isk; + + 
smc_accept_unlink(new_sk); + if (new_sk->sk_state == SMC_CLOSED) { + /* tbd in follow-on patch: close this sock */ + continue; + } + if (new_sock) + sock_graft(new_sk, new_sock); + return new_sk; + } + return NULL; +} + +/* clean up for a created but never accepted sock */ +void smc_close_non_accepted(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + sock_hold(sk); + lock_sock(sk); + if (!sk->sk_lingertime) + /* wait for peer closing */ + sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; + if (!smc->use_fallback) + smc_close_active(smc); + if (smc->clcsock) { + struct socket *tcp; + + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + if (smc->use_fallback) { + schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); + } else { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } + release_sock(sk); + sock_put(sk); +} + +static int smc_serv_conf_first_link(struct smc_sock *smc) +{ + struct smc_link_group *lgr = smc->conn.lgr; + struct smc_link *link; + int rest; + int rc; + + link = &lgr->lnk[SMC_SINGLE_LINK]; + /* send CONFIRM LINK request to client over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + /* receive CONFIRM LINK response from client over the RoCE fabric */ + rest = wait_for_completion_interruptible_timeout( + &link->llc_confirm_resp, + SMC_LLC_WAIT_FIRST_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + } + + return rc; +} + +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_sock *lsmc = new_smc->listen_smc; + struct smc_clc_msg_accept_confirm cclc; + int local_contact = SMC_REUSE_CONTACT; + struct sock *newsmcsk = &new_smc->sk; + struct smc_clc_msg_proposal pclc; + struct smc_ib_device *smcibdev; + struct sockaddr_in peeraddr; + struct smc_link *link; + int reason_code = 0; + int rc = 0, len; + __be32 subnet; + u8 prefix_len; + u8 ibport; + + /* do inband token exchange - + *wait for and receive SMC Proposal CLC message + */ + reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + SMC_CLC_PROPOSAL); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + * used for the internal TCP socket + */ + smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); + if (rc) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + if ((pclc.outgoing_subnet != subnet) || + (pclc.prefix_len != prefix_len)) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* get address of the peer connected to 
the internal TCP socket */ + kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len); + + /* allocate connection / link group */ + mutex_lock(&smc_create_lgr_pending); + local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, + smcibdev, ibport, &pclc.lcl, 0); + if (local_contact == SMC_REUSE_CONTACT) + /* lock no longer needed, free it due to following + * smc_clc_wait_msg() call + */ + mutex_unlock(&smc_create_lgr_pending); + if (local_contact < 0) { + rc = local_contact; + if (rc == -ENOMEM) + reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ + else if (rc == -ENOLINK) + reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ + goto decline_rdma; + } + link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + + rc = smc_sndbuf_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } + rc = smc_rmb_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } + + rc = smc_clc_send_accept(new_smc, local_contact); + if (rc) + goto out_err; + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + smc_conn_save_peer_info(new_smc, &cclc); + if (local_contact == SMC_FIRST_CONTACT) + smc_link_save_peer_info(link, &cclc); + + rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma; + } + + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_ib_ready_link(link); + if (rc) { + reason_code = SMC_CLC_DECL_INTERR; + goto decline_rdma; + } + /* QP confirmation over RoCE fabric */ + reason_code = smc_serv_conf_first_link(new_smc); + if (reason_code < 0) { + /* peer is not aware of a problem */ + rc = reason_code; + goto out_err; + } + if (reason_code > 0) + goto decline_rdma; + } + + smc_tx_init(new_smc); + smc_rx_init(new_smc); + +out_connected: + sk_refcnt_debug_inc(newsmcsk); + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; +enqueue: + if (local_contact == SMC_FIRST_CONTACT) + mutex_unlock(&smc_create_lgr_pending); + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); + } + release_sock(&lsmc->sk); + + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ + return; + +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + smc_conn_free(&new_smc->conn); + new_smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(new_smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err: + newsmcsk->sk_state = SMC_CLOSED; + smc_conn_free(&new_smc->conn); + goto enqueue; /* queue new sock with sk_err set */ +} + +static void smc_tcp_listen_work(struct work_struct *work) +{ + struct smc_sock *lsmc = container_of(work, struct smc_sock, + tcp_listen_work); + struct smc_sock *new_smc; + int rc = 0; + + lock_sock(&lsmc->sk); + while (lsmc->sk.sk_state == SMC_LISTEN) { + rc = smc_clcsock_accept(lsmc, &new_smc); + if (rc) + goto out; + if (!new_smc) + continue; + + new_smc->listen_smc = lsmc; + new_smc->use_fallback = false; /* assume rdma capability first*/ + sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */ + INIT_WORK(&new_smc->smc_listen_work, 
smc_listen_work); + smc_copy_sock_settings_to_smc(new_smc); + schedule_work(&new_smc->smc_listen_work); + } + +out: + release_sock(&lsmc->sk); + lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */ +} + +static int smc_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sk); + lock_sock(sk); + + rc = -EINVAL; + if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN)) + goto out; + + rc = 0; + if (sk->sk_state == SMC_LISTEN) { + sk->sk_max_ack_backlog = backlog; + goto out; + } + /* some socket options are handled in core, so we could not apply + * them to the clc socket -- copy smc socket options to clc socket + */ + smc_copy_sock_settings_to_clc(smc); + + rc = kernel_listen(smc->clcsock, backlog); + if (rc) + goto out; + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = SMC_LISTEN; + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + schedule_work(&smc->tcp_listen_work); + +out: + release_sock(sk); + return rc; +} + +static int smc_accept(struct socket *sock, struct socket *new_sock, + int flags) +{ + struct sock *sk = sock->sk, *nsk; + DECLARE_WAITQUEUE(wait, current); + struct smc_sock *lsmc; + long timeo; + int rc = 0; + + lsmc = smc_sk(sk); + lock_sock(sk); + + if (lsmc->sk.sk_state != SMC_LISTEN) { + rc = -EINVAL; + goto out; + } + + /* Wait for an incoming connection */ + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (!(nsk = smc_accept_dequeue(sk, new_sock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + rc = -EAGAIN; + break; + } + release_sock(sk); + timeo = schedule_timeout(timeo); + /* wakeup by sk_data_ready in smc_listen_work() */ + sched_annotate_sleep(); + lock_sock(sk); + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + + if (!rc) + rc = sock_error(nsk); + +out: + release_sock(sk); + return rc; +} + +static int smc_getname(struct socket *sock, struct sockaddr *addr, + int *len, int peer) +{ + struct smc_sock *smc; + + if (peer && (sock->sk->sk_state != SMC_ACTIVE) && + (sock->sk->sk_state != SMC_APPCLOSEWAIT1)) + return -ENOTCONN; + + smc = smc_sk(sock->sk); + + return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer); +} + +static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) + goto out; + if (smc->use_fallback) + rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); + else + rc = smc_tx_sendmsg(smc, msg, len); +out: + release_sock(sk); + return rc; +} + +static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state == SMC_INIT) || + (sk->sk_state == SMC_LISTEN) || + (sk->sk_state == SMC_CLOSED)) + goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; + goto out; + } + + if (smc->use_fallback) + rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); + else + rc = smc_rx_recvmsg(smc, msg, len, flags); + +out: + release_sock(sk); + return rc; +} + +static unsigned int smc_accept_poll(struct sock *parent) +{ + struct smc_sock 
*isk; + struct sock *sk; + + lock_sock(parent); + list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { + sk = (struct sock *)isk; + + if (sk->sk_state == SMC_ACTIVE) { + release_sock(parent); + return POLLIN | POLLRDNORM; + } + } + release_sock(parent); + + return 0; +} + +static unsigned int smc_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + struct smc_sock *smc; + int rc; + + smc = smc_sk(sock->sk); + if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + /* delegate to CLC child sock */ + mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); + /* if non-blocking connect finished ... */ + lock_sock(sk); + if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) { + sk->sk_err = smc->clcsock->sk->sk_err; + if (sk->sk_err) { + mask |= POLLERR; + } else { + rc = smc_connect_rdma(smc); + if (rc < 0) + mask |= POLLERR; + else + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; + } + } + release_sock(sk); + } else { + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == SMC_LISTEN) + /* woken up by sk_data_ready in smc_listen_work() */ + mask |= smc_accept_poll(sk); + if (sk->sk_err) + mask |= POLLERR; + if (atomic_read(&smc->conn.sndbuf_space) || + (sk->sk_shutdown & SEND_SHUTDOWN)) { + mask |= POLLOUT | POLLWRNORM; + } else { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + if (atomic_read(&smc->conn.bytes_to_rcv)) + mask |= POLLIN | POLLRDNORM; + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM | POLLRDHUP; + if (sk->sk_state == SMC_APPCLOSEWAIT1) + mask |= POLLIN; + + } + + return mask; +} + +static int smc_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EINVAL; + int rc1 = 0; + + smc = smc_sk(sk); + + if ((how < SHUT_RD) || (how > SHUT_RDWR)) + return rc; + + lock_sock(sk); + + rc = -ENOTCONN; + if ((sk->sk_state != SMC_LISTEN) && + (sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_PEERCLOSEWAIT1) && + (sk->sk_state != SMC_PEERCLOSEWAIT2) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_APPCLOSEWAIT2) && + (sk->sk_state != SMC_APPFINCLOSEWAIT)) + goto out; + if (smc->use_fallback) { + rc = kernel_sock_shutdown(smc->clcsock, how); + sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; + if (sk->sk_shutdown == SHUTDOWN_MASK) + sk->sk_state = SMC_CLOSED; + goto out; + } + switch (how) { + case SHUT_RDWR: /* shutdown in both directions */ + rc = smc_close_active(smc); + break; + case SHUT_WR: + rc = smc_close_shutdown_write(smc); + break; + case SHUT_RD: + if (sk->sk_state == SMC_LISTEN) + rc = smc_close_active(smc); + else + rc = 0; + /* nothing more to do because peer is not involved */ + break; + } + rc1 = kernel_sock_shutdown(smc->clcsock, how); + /* map sock_shutdown_cmd constants to sk_shutdown value range */ + sk->sk_shutdown |= how + 1; + +out: + release_sock(sk); + return rc ? 
rc : rc1; +} + +static int smc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + + smc = smc_sk(sk); + + /* generic setsockopts reaching us here always apply to the + * CLC socket + */ + return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + /* socket options apply to the CLC socket */ + return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname, + optval, optlen); +} + +static int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct smc_sock *smc; + + smc = smc_sk(sock->sk); + if (smc->use_fallback) + return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); + else + return sock_no_ioctl(sock, cmd, arg); +} + +static ssize_t smc_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -EPIPE; + + smc = smc_sk(sk); + lock_sock(sk); + if (sk->sk_state != SMC_ACTIVE) + goto out; + if (smc->use_fallback) + rc = kernel_sendpage(smc->clcsock, page, offset, + size, flags); + else + rc = sock_no_sendpage(sock, page, offset, size, flags); + +out: + release_sock(sk); + return rc; +} + +static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int rc = -ENOTCONN; + + smc = smc_sk(sk); + lock_sock(sk); + if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + goto out; + if (smc->use_fallback) { + rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, + pipe, len, flags); + } else { + rc = -EOPNOTSUPP; + } +out: + release_sock(sk); + return rc; +} + +/* must look like tcp */ +static const struct proto_ops smc_sock_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = smc_sendpage, + .splice_read = smc_splice_read, +}; + +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct smc_sock *smc; + struct sock *sk; + int rc; + + rc = -ESOCKTNOSUPPORT; + if (sock->type != SOCK_STREAM) + goto out; + + rc = -EPROTONOSUPPORT; + if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + goto out; + + rc = -ENOBUFS; + sock->ops = &smc_sock_ops; + sk = smc_sock_alloc(net, sock); + if (!sk) + goto out; + + /* create internal TCP socket for CLC handshake and fallback */ + smc = smc_sk(sk); + smc->use_fallback = false; /* assume rdma capability first */ + rc = sock_create_kern(net, PF_INET, SOCK_STREAM, + IPPROTO_TCP, &smc->clcsock); + if (rc) + sk_common_release(sk); + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); + +out: + return rc; +} + +static const struct net_proto_family smc_sock_family_ops = { + .family = PF_SMC, + .owner = THIS_MODULE, + .create = smc_create, +}; + +static int __init 
smc_init(void) +{ + int rc; + + rc = smc_pnet_init(); + if (rc) + return rc; + + rc = smc_llc_init(); + if (rc) { + pr_err("%s: smc_llc_init fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = smc_cdc_init(); + if (rc) { + pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = proto_register(&smc_proto, 1); + if (rc) { + pr_err("%s: proto_register fails with %d\n", __func__, rc); + goto out_pnet; + } + + rc = sock_register(&smc_sock_family_ops); + if (rc) { + pr_err("%s: sock_register fails with %d\n", __func__, rc); + goto out_proto; + } + INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + + rc = smc_ib_register_client(); + if (rc) { + pr_err("%s: ib_register fails with %d\n", __func__, rc); + goto out_sock; + } + + return 0; + +out_sock: + sock_unregister(PF_SMC); +out_proto: + proto_unregister(&smc_proto); +out_pnet: + smc_pnet_exit(); + return rc; +} + +static void __exit smc_exit(void) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_freeing_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!list_empty(&smc_lgr_list.list)) + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + spin_unlock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { + list_del_init(&lgr->list); + smc_lgr_free(lgr); /* free link group */ + } + smc_ib_unregister_client(); + sock_unregister(PF_SMC); + proto_unregister(&smc_proto); + smc_pnet_exit(); +} + +module_init(smc_init); +module_exit(smc_exit); + +MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("smc socket address family"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h new file mode 100644 index 000000000000..ee5fbea24549 --- /dev/null +++ b/net/smc/smc.h @@ -0,0 +1,274 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the SMC module (socket related) + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ +#ifndef __SMC_H +#define __SMC_H + +#include <linux/socket.h> +#include <linux/types.h> +#include <linux/compiler.h> /* __aligned */ +#include <net/sock.h> + +#include "smc_ib.h" + +#define SMCPROTO_SMC 0 /* SMC protocol */ + +#define SMC_MAX_PORTS 2 /* Max # of ports */ + +extern struct proto smc_proto; + +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + +enum smc_state { /* possible states of an SMC socket */ + SMC_ACTIVE = 1, + SMC_INIT = 2, + SMC_CLOSED = 7, + SMC_LISTEN = 10, + /* normal close */ + SMC_PEERCLOSEWAIT1 = 20, + SMC_PEERCLOSEWAIT2 = 21, + SMC_APPFINCLOSEWAIT = 24, + SMC_APPCLOSEWAIT1 = 22, + SMC_APPCLOSEWAIT2 = 23, + SMC_PEERFINCLOSEWAIT = 25, + /* abnormal close */ + SMC_PEERABORTWAIT = 26, + SMC_PROCESSABORT = 27, +}; + +struct smc_link_group; + +struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ + u8 type; +} __aligned(1); + +struct smc_cdc_conn_state_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 peer_done_writing : 1; /* Sending done indicator */ + u8 peer_conn_closed : 1; /* Peer connection closed indicator */ + u8 peer_conn_abort : 1; /* Abnormal close indicator */ + u8 reserved : 5; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 5; + u8 peer_conn_abort : 1; + u8 peer_conn_closed : 1; + u8 peer_done_writing : 1; +#endif +}; + +struct smc_cdc_producer_flags { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ + u8 urg_data_pending : 1; /* Urgent Data Pending */ + u8 urg_data_present : 1; /* Urgent Data Present */ + u8 cons_curs_upd_req : 1; /* cursor update requested */ + u8 failover_validation : 1;/* message replay due to failover */ + u8 reserved : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 3; + u8 failover_validation : 1; + u8 cons_curs_upd_req : 1; + u8 urg_data_present : 1; + u8 urg_data_pending : 1; + u8 write_blocked : 1; +#endif +}; + +/* in host byte order */ +union smc_host_cursor { /* SMC cursor - an offset in an RMBE */ + struct { + u16 reserved; + u16 wrap; /* window wrap sequence number */ + u32 count; /* cursor (= offset) part */ + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in host byte order, except for flag bitfields in network byte order */ +struct smc_host_cdc_msg { /* Connection Data Control message */ + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* length = 44 */ + u16 seqno; /* connection seq # */ + u32 token; /* alert_token */ + union smc_host_cursor prod; /* producer cursor */ + union smc_host_cursor cons; /* consumer cursor, + * piggy backed "ack" + */ + struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ + struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ + u8 reserved[18]; +} __aligned(8); + +struct smc_connection { + struct rb_node alert_node; + struct smc_link_group *lgr; /* link group of connection */ + u32 alert_token_local; /* unique conn. 
id */ + u8 peer_conn_idx; /* from tcp handshake */ + int peer_rmbe_size; /* size of peer rx buffer */ + atomic_t peer_rmbe_space;/* remaining free bytes in peer + * rmbe + */ + int rtoken_idx; /* idx to peer RMB rkey/addr */ + + struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ + int sndbuf_size; /* sndbuf size <== sock wmem */ + struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ + int rmbe_size; /* RMBE size <== sock rmem */ + int rmbe_size_short;/* compressed notation */ + int rmbe_update_limit; + /* lower limit for consumer + * cursor update + */ + + struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging + * buffer for CDC msg send + * .prod cf. TCP snd_nxt + * .cons cf. TCP sends ack + */ + union smc_host_cursor tx_curs_prep; /* tx - prepared data + * snd_max..wmem_alloc + */ + union smc_host_cursor tx_curs_sent; /* tx - sent data + * snd_nxt ? + */ + union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer + * snd-wnd-begin ? + */ + atomic_t sndbuf_space; /* remaining space in sndbuf */ + u16 tx_cdc_seq; /* sequence # for CDC send */ + spinlock_t send_lock; /* protect wr_sends */ + struct work_struct tx_work; /* retry of smc_cdc_msg_send */ + + struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. + * .prod cf. TCP rcv_nxt + * .cons cf. TCP snd_una + */ + union smc_host_cursor rx_curs_confirmed; /* confirmed to peer + * source of snd_una ? + */ + atomic_t bytes_to_rcv; /* arrived data, + * not yet received + */ +#ifndef KERNEL_HAS_ATOMIC64 + spinlock_t acurs_lock; /* protect cursors */ +#endif +}; + +struct smc_sock { /* smc sock container */ + struct sock sk; + struct socket *clcsock; /* internal tcp socket */ + struct smc_connection conn; /* smc connection */ + struct sockaddr *addr; /* inet connect address */ + struct smc_sock *listen_smc; /* listen parent */ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct work_struct smc_listen_work;/* prepare new accept socket */ + struct list_head accept_q; /* sockets to be accepted */ + spinlock_t accept_q_lock; /* protects accept_q */ + struct delayed_work sock_put_work; /* final socket freeing */ + bool use_fallback; /* fallback to tcp */ + u8 wait_close_tx_prepared : 1; + /* shutdown wr or close + * started, waiting for unsent + * data to be sent + */ +}; + +static inline struct smc_sock *smc_sk(const struct sock *sk) +{ + return (struct smc_sock *)sk; +} + +#define SMC_SYSTEMID_LEN 8 + +extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ + +/* convert an u32 value into network byte order, store it into a 3 byte field */ +static inline void hton24(u8 *net, u32 host) +{ + __be32 t; + + t = cpu_to_be32(host); + memcpy(net, ((u8 *)&t) + 1, 3); +} + +/* convert a received 3 byte field into host byte order*/ +static inline u32 ntoh24(u8 *net) +{ + __be32 t = 0; + + memcpy(((u8 *)&t) + 1, net, 3); + return be32_to_cpu(t); +} + +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ + +#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ + +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. 
+ */ +static inline u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +static inline int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + +#ifdef CONFIG_XFRM +static inline bool using_ipsec(struct smc_sock *smc) +{ + return (smc->clcsock->sk->sk_policy[0] || + smc->clcsock->sk->sk_policy[1]) ? 1 : 0; +} +#else +static inline bool using_ipsec(struct smc_sock *smc) +{ + return 0; +} +#endif + +struct smc_clc_msg_local; + +int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, + u8 *prefix_len); +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact); +struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); +void smc_close_non_accepted(struct sock *sk); + +#endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c new file mode 100644 index 000000000000..5a339493872e --- /dev/null +++ b/net/smc/smc_cdc.c @@ -0,0 +1,304 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * handles flow control + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/spinlock.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_tx.h" +#include "smc_rx.h" +#include "smc_close.h" + +/********************************** send *************************************/ + +struct smc_cdc_tx_pend { + struct smc_connection *conn; /* socket connection */ + union smc_host_cursor cursor; /* tx sndbuf cursor sent */ + union smc_host_cursor p_cursor; /* rx RMBE cursor produced */ + u16 ctrl_seq; /* conn. 
tx sequence # */ +}; + +/* handler for send/transmission completion of a CDC msg */ +static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_sock *smc; + int diff; + + if (!cdcpend->conn) + /* already dismissed */ + return; + + smc = container_of(cdcpend->conn, struct smc_sock, conn); + bh_lock_sock(&smc->sk); + if (!wc_status) { + diff = smc_curs_diff(cdcpend->conn->sndbuf_size, + &cdcpend->conn->tx_curs_fin, + &cdcpend->cursor); + /* sndbuf_space is decreased in smc_sendmsg */ + smp_mb__before_atomic(); + atomic_add(diff, &cdcpend->conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + smp_mb__after_atomic(); + smc_curs_write(&cdcpend->conn->tx_curs_fin, + smc_curs_read(&cdcpend->cursor, cdcpend->conn), + cdcpend->conn); + } + smc_tx_sndbuf_nonfull(smc); + if (smc->sk.sk_state != SMC_ACTIVE) + /* wake up smc_close_wait_tx_pends() */ + smc->sk.sk_state_change(&smc->sk); + bh_unlock_sock(&smc->sk); +} + +int smc_cdc_get_free_slot(struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_cdc_tx_pend **pend) +{ + return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, + (struct smc_wr_tx_pend_priv **)pend); +} + +static inline void smc_cdc_add_pending_send(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend) +{ + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); + BUILD_BUG_ON_MSG( + offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)"); + pend->conn = conn; + pend->cursor = conn->tx_curs_sent; + pend->p_cursor = conn->local_tx_ctrl.prod; + pend->ctrl_seq = conn->tx_cdc_seq; +} + +int smc_cdc_msg_send(struct smc_connection *conn, + struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend) +{ + struct smc_link *link; + int rc; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + smc_cdc_add_pending_send(conn, pend); + + conn->tx_cdc_seq++; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; + smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, + &conn->local_tx_ctrl, conn); + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + if (!rc) + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + + return rc; +} + +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn) +{ + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, + &pend); + if (rc) + return rc; + + return smc_cdc_msg_send(conn, wr_buf, pend); +} + +static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend, + unsigned long data) +{ + struct smc_connection *conn = (struct smc_connection *)data; + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + return cdc_pend->conn == conn; +} + +static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) +{ + struct smc_cdc_tx_pend *cdc_pend = + (struct smc_cdc_tx_pend *)tx_pend; + + cdc_pend->conn = NULL; +} + +void 
smc_cdc_tx_dismiss_slots(struct smc_connection *conn) +{ + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, + smc_cdc_tx_filter, smc_cdc_tx_dismisser, + (unsigned long)conn); +} + +bool smc_cdc_tx_has_pending(struct smc_connection *conn) +{ + struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + + return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE, + smc_cdc_tx_filter, (unsigned long)conn); +} + +/********************************* receive ***********************************/ + +static inline bool smc_cdc_before(u16 seq1, u16 seq2) +{ + return (s16)(seq1 - seq2) < 0; +} + +static void smc_cdc_msg_recv_action(struct smc_sock *smc, + struct smc_link *link, + struct smc_cdc_msg *cdc) +{ + union smc_host_cursor cons_old, prod_old; + struct smc_connection *conn = &smc->conn; + int diff_cons, diff_prod; + + if (!cdc->prod_flags.failover_validation) { + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + } + smc_curs_write(&prod_old, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + smc_curs_write(&cons_old, + smc_curs_read(&conn->local_rx_ctrl.cons, conn), + conn); + smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn); + + diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old, + &conn->local_rx_ctrl.cons); + if (diff_cons) { + /* peer_rmbe_space is decreased during data transfer with RDMA + * write + */ + smp_mb__before_atomic(); + atomic_add(diff_cons, &conn->peer_rmbe_space); + /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + } + + diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, + &conn->local_rx_ctrl.prod); + if (diff_prod) { + /* bytes_to_rcv is decreased in smc_recvmsg */ + smp_mb__before_atomic(); + atomic_add(diff_prod, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + smp_mb__after_atomic(); + smc->sk.sk_data_ready(&smc->sk); + } + + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + smc->sk.sk_err = ECONNRESET; + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + } + if (smc_cdc_rxed_any_close_or_senddone(conn)) + smc_close_passive_received(smc); + + /* piggy backed tx info */ + /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ + if (diff_cons && smc_tx_prepared_sends(conn)) { + smc_tx_sndbuf_nonempty(conn); + /* trigger socket release if connection closed */ + smc_close_wake_tx_prepared(smc); + } + + /* subsequent patch: trigger socket release if connection closed */ + + /* socket connected but not accepted */ + if (!smc->sk.sk_socket) + return; + + /* data available */ + if ((conn->local_rx_ctrl.prod_flags.write_blocked) || + (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) + smc_tx_consumer_update(conn); +} + +/* called under tasklet context */ +static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, + struct smc_link *link, u64 wr_id) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + struct smc_connection *connection; + struct smc_sock *smc; + + /* lookup connection */ + read_lock_bh(&lgr->conns_lock); + connection = smc_lgr_find_conn(ntohl(cdc->token), lgr); + if (!connection) { + read_unlock_bh(&lgr->conns_lock); + return; + } + smc = container_of(connection, struct smc_sock, conn); + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + bh_lock_sock(&smc->sk); + smc_cdc_msg_recv_action(smc, link, cdc); + bh_unlock_sock(&smc->sk); + sock_put(&smc->sk); /* no free sk in 
softirq-context */ +} + +/***************************** init, exit, misc ******************************/ + +static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_cdc_msg *cdc = buf; + + if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) + return; /* short message */ + if (cdc->len != sizeof(*cdc)) + return; /* invalid message */ + smc_cdc_msg_recv(cdc, link, wc->wr_id); +} + +static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { + { + .handler = smc_cdc_rx_handler, + .type = SMC_CDC_MSG_TYPE + }, + { + .handler = NULL, + } +}; + +int __init smc_cdc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_cdc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h new file mode 100644 index 000000000000..8e1d76f26007 --- /dev/null +++ b/net/smc/smc_cdc.h @@ -0,0 +1,218 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Connection Data Control (CDC) + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CDC_H +#define SMC_CDC_H + +#include <linux/kernel.h> /* max_t */ +#include <linux/atomic.h> +#include <linux/in.h> +#include <linux/compiler.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_wr.h" + +#define SMC_CDC_MSG_TYPE 0xFE + +/* in network byte order */ +union smc_cdc_cursor { /* SMC cursor */ + struct { + __be16 reserved; + __be16 wrap; + __be32 count; + }; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t acurs; /* for atomic processing */ +#else + u64 acurs; /* for atomic processing */ +#endif +} __aligned(8); + +/* in network byte order */ +struct smc_cdc_msg { + struct smc_wr_rx_hdr common; /* .type = 0xFE */ + u8 len; /* 44 */ + __be16 seqno; + __be32 token; + union smc_cdc_cursor prod; + union smc_cdc_cursor cons; /* piggy backed "ack" */ + struct smc_cdc_producer_flags prod_flags; + struct smc_cdc_conn_state_flags conn_state_flags; + u8 reserved[18]; +} __aligned(8); + +static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) +{ + return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort || + conn->local_rx_ctrl.conn_state_flags.peer_conn_closed; +} + +static inline bool smc_cdc_rxed_any_close_or_senddone( + struct smc_connection *conn) +{ + return smc_cdc_rxed_any_close(conn) || + conn->local_rx_ctrl.conn_state_flags.peer_done_writing; +} + +static inline void smc_curs_add(int size, union smc_host_cursor *curs, + int value) +{ + curs->count += value; + if (curs->count >= size) { + curs->wrap++; + curs->count -= size; + } +} + +/* SMC cursors are 8 bytes long and require atomic reading and writing */ +static inline u64 smc_curs_read(union smc_host_cursor *curs, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&conn->acurs_lock, flags); + ret = curs->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); + return ret; +#else + return atomic64_read(&curs->acurs); +#endif +} + +static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&conn->acurs_lock, flags); + ret = curs->acurs; + spin_unlock_irqrestore(&conn->acurs_lock, flags); + return ret; +#else + return atomic64_read(&curs->acurs); +#endif +} + 
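
As an aside, not part of this patch: the cursor helpers above and below encode a position in the RMBE as a 16-bit wrap sequence number plus a 32-bit byte count, and the pair is read and written as one 8-byte quantity so producer and consumer never observe a torn wrap/count combination. The following standalone sketch shows the wrap arithmetic that smc_curs_add() above implements, using plain types instead of the kernel union; with this encoding, smc_curs_diff() further down only has to distinguish the wrapped from the non-wrapped case when computing how many bytes lie between two cursors.

#include <stdint.h>
#include <stdio.h>

/* plain-types stand-in for union smc_host_cursor */
struct cursor {
	uint16_t wrap;		/* window wrap sequence number */
	uint32_t count;		/* byte offset within the RMBE */
};

/* same arithmetic as smc_curs_add() above */
static void curs_add(unsigned int size, struct cursor *curs, int value)
{
	curs->count += value;
	if (curs->count >= size) {
		curs->wrap++;
		curs->count -= size;
	}
}

int main(void)
{
	struct cursor prod = { .wrap = 0, .count = 16000 };

	/* advancing by 500 bytes crosses the end of a 16KB RMBE */
	curs_add(16384, &prod, 500);
	printf("wrap=%u count=%u\n",
	       (unsigned)prod.wrap, (unsigned)prod.count);	/* wrap=1 count=116 */
	return 0;
}
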
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + curs->acurs = val; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&curs->acurs, val); +#endif +} + +static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val, + struct smc_connection *conn) +{ +#ifndef KERNEL_HAS_ATOMIC64 + unsigned long flags; + + spin_lock_irqsave(&conn->acurs_lock, flags); + curs->acurs = val; + spin_unlock_irqrestore(&conn->acurs_lock, flags); +#else + atomic64_set(&curs->acurs, val); +#endif +} + +/* calculate cursor difference between old and new, where old <= new */ +static inline int smc_curs_diff(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap != new->wrap) + return max_t(int, 0, + ((size - old->count) + new->count)); + + return max_t(int, 0, (new->count - old->count)); +} + +static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, + union smc_host_cursor *local, + struct smc_connection *conn) +{ + union smc_host_cursor temp; + + smc_curs_write(&temp, smc_curs_read(local, conn), conn); + peer->count = htonl(temp.count); + peer->wrap = htons(temp.wrap); + /* peer->reserved = htons(0); must be ensured by caller */ +} + +static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer, + struct smc_host_cdc_msg *local, + struct smc_connection *conn) +{ + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(local->seqno); + peer->token = htonl(local->token); + smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn); + smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn); + peer->prod_flags = local->prod_flags; + peer->conn_state_flags = local->conn_state_flags; +} + +static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local, + union smc_cdc_cursor *peer, + struct smc_connection *conn) +{ + union smc_host_cursor temp, old; + union smc_cdc_cursor net; + + smc_curs_write(&old, smc_curs_read(local, conn), conn); + smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn); + temp.count = ntohl(net.count); + temp.wrap = ntohs(net.wrap); + if ((old.wrap > temp.wrap) && temp.wrap) + return; + if ((old.wrap == temp.wrap) && + (old.count > temp.count)) + return; + smc_curs_write(local, smc_curs_read(&temp, conn), conn); +} + +static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local, + struct smc_cdc_msg *peer, + struct smc_connection *conn) +{ + local->common.type = peer->common.type; + local->len = peer->len; + local->seqno = ntohs(peer->seqno); + local->token = ntohl(peer->token); + smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn); + smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn); + local->prod_flags = peer->prod_flags; + local->conn_state_flags = peer->conn_state_flags; +} + +struct smc_cdc_tx_pend; + +int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf, + struct smc_cdc_tx_pend **pend); +void smc_cdc_tx_dismiss_slots(struct smc_connection *conn); +int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, + struct smc_cdc_tx_pend *pend); +int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); +bool smc_cdc_tx_has_pending(struct smc_connection *conn); +int smc_cdc_init(void) __init; + +#endif /* SMC_CDC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c new file mode 100644 index 000000000000..cc6b6f8651eb --- /dev/null +++ 
b/net/smc/smc_clc.c @@ -0,0 +1,280 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/in.h> +#include <linux/if_ether.h> +#include <net/sock.h> +#include <net/tcp.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_ib.h" + +/* Wait for data on the tcp-socket, analyze received data + * Returns: + * 0 if success and it was not a decline that we received. + * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. + * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. + */ +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type) +{ + struct sock *clc_sk = smc->clcsock->sk; + struct smc_clc_msg_hdr *clcm = buf; + struct msghdr msg = {NULL, 0}; + int reason_code = 0; + struct kvec vec; + int len, datlen; + int krflags; + + /* peek the first few bytes to determine length of data to receive + * so we don't consume any subsequent CLC message or payload data + * in the TCP byte stream + */ + vec.iov_base = buf; + vec.iov_len = buflen; + krflags = MSG_PEEK | MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_hdr), krflags); + if (signal_pending(current)) { + reason_code = -EINTR; + clc_sk->sk_err = EINTR; + smc->sk.sk_err = EINTR; + goto out; + } + if (clc_sk->sk_err) { + reason_code = -clc_sk->sk_err; + smc->sk.sk_err = clc_sk->sk_err; + goto out; + } + if (!len) { /* peer has performed orderly shutdown */ + smc->sk.sk_err = ECONNRESET; + reason_code = -ECONNRESET; + goto out; + } + if (len < 0) { + smc->sk.sk_err = -len; + reason_code = len; + goto out; + } + datlen = ntohs(clcm->length); + if ((len < sizeof(struct smc_clc_msg_hdr)) || + (datlen < sizeof(struct smc_clc_msg_decline)) || + (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || + memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + ((clcm->type != SMC_CLC_DECLINE) && + (clcm->type != expected_type))) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + + /* receive the complete CLC message */ + vec.iov_base = buf; + vec.iov_len = buflen; + memset(&msg, 0, sizeof(struct msghdr)); + krflags = MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); + if (len < datlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + if (clcm->type == SMC_CLC_DECLINE) { + reason_code = SMC_CLC_DECL_REPLY; + if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis) + == SMC_CLC_DECL_SYNCERR) + smc->conn.lgr->sync_err = true; + } + +out: + return reason_code; +} + +/* send CLC DECLINE message across internal TCP socket */ +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync) +{ + struct smc_clc_msg_decline dclc; + struct msghdr msg; + struct kvec vec; + int len; + + memset(&dclc, 0, sizeof(dclc)); + memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + dclc.hdr.type = SMC_CLC_DECLINE; + dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); + dclc.hdr.version = SMC_CLC_V1; + dclc.hdr.flag = out_of_sync ? 
1 : 0; + memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + dclc.peer_diagnosis = htonl(peer_diag_info); + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &dclc; + vec.iov_len = sizeof(struct smc_clc_msg_decline); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_decline)); + if (len < sizeof(struct smc_clc_msg_decline)) + smc->sk.sk_err = EPROTO; + if (len < 0) + smc->sk.sk_err = -len; + return len; +} + +/* send CLC PROPOSAL message across internal TCP socket */ +int smc_clc_send_proposal(struct smc_sock *smc, + struct smc_ib_device *smcibdev, + u8 ibport) +{ + struct smc_clc_msg_proposal pclc; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len, rc; + + /* send SMC Proposal CLC message */ + memset(&pclc, 0, sizeof(pclc)); + memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc.hdr.type = SMC_CLC_PROPOSAL; + pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, + &pclc.prefix_len); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &pclc; + vec.iov_len = sizeof(pclc); + /* due to the few bytes needed for clc-handshake this cannot block */ + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + if (len < sizeof(pclc)) { + if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + + return reason_code; +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_clc_msg_accept_confirm cclc; + struct smc_link *link; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + /* send SMC Confirm CLC msg */ + memset(&cclc, 0, sizeof(cclc)); + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + cclc.hdr.type = SMC_CLC_CONFIRM; + cclc.hdr.length = htons(sizeof(cclc)); + cclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN); + hton24(cclc.qpn, link->roce_qp->qp_num); + cclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_alert_token = htonl(conn->alert_token_local); + cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); + cclc.rmbe_size = conn->rmbe_size_short; + cclc.rmb_dma_addr = + cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + hton24(cclc.psn, link->psn_initial); + + memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &cclc; + vec.iov_len = sizeof(cclc); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); + if (len < sizeof(cclc)) { + 
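/* a short but non-negative send is mapped to a reachability error below;
 * a negative return value propagates the clcsock error instead
 */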
if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + return reason_code; +} + +/* send CLC ACCEPT message across internal TCP socket */ +int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) +{ + struct smc_connection *conn = &new_smc->conn; + struct smc_clc_msg_accept_confirm aclc; + struct smc_link *link; + struct msghdr msg; + struct kvec vec; + int rc = 0; + int len; + + link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + memset(&aclc, 0, sizeof(aclc)); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + aclc.hdr.type = SMC_CLC_ACCEPT; + aclc.hdr.length = htons(sizeof(aclc)); + aclc.hdr.version = SMC_CLC_V1; /* SMC version */ + if (srv_first_contact) + aclc.hdr.flag = 1; + memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1], + SMC_GID_SIZE); + memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN); + hton24(aclc.qpn, link->roce_qp->qp_num); + aclc.rmb_rkey = + htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_alert_token = htonl(conn->alert_token_local); + aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = + cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); + hton24(aclc.psn, link->psn_initial); + memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &aclc; + vec.iov_len = sizeof(aclc); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); + if (len < sizeof(aclc)) { + if (len >= 0) + new_smc->sk.sk_err = EPROTO; + else + new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; + rc = sock_error(&new_smc->sk); + } + + return rc; +} diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h new file mode 100644 index 000000000000..13db8ce177c9 --- /dev/null +++ b/net/smc/smc_clc.h @@ -0,0 +1,116 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CLC_H +#define _SMC_CLC_H + +#include <rdma/ib_verbs.h> + +#include "smc.h" + +#define SMC_CLC_PROPOSAL 0x01 +#define SMC_CLC_ACCEPT 0x02 +#define SMC_CLC_CONFIRM 0x03 +#define SMC_CLC_DECLINE 0x04 + +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; + +#define SMC_CLC_V1 0x1 /* SMC version */ +#define CLC_WAIT_TIME (6 * HZ) /* max. 
wait time on clcsock */ +#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ +#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ +#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ +#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */ +#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ +#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */ +#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ +#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ +#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ + +struct smc_clc_msg_hdr { /* header1 of clc messages */ + u8 eyecatcher[4]; /* eye catcher */ + u8 type; /* proposal / accept / confirm / decline */ + __be16 length; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 version : 4, + flag : 1, + rsvd : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 rsvd : 3, + flag : 1, + version : 4; +#endif +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_trail { /* trailer of clc messages */ + u8 eyecatcher[4]; +}; + +struct smc_clc_msg_local { /* header2 of clc messages */ + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */ + u8 gid[16]; /* gid of ib_device port */ + u8 mac[6]; /* mac of ib_device port */ +}; + +struct smc_clc_msg_proposal { /* clc proposal message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ + __be32 outgoing_subnet; /* subnet mask */ + u8 prefix_len; /* number of significant bits in mask */ + u8 reserved[2]; + u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 conn_idx; /* Connection index, which RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ + qp_mtu : 4; /* QP mtu */ +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + rmbe_size : 4; +#endif + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* initial packet sequence number */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_decline { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ + u8 reserved2[4]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_sock; +struct smc_ib_device; + +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync); +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, + u8 ibport); +int smc_clc_send_confirm(struct smc_sock *smc); +int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact); + +#endif diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c new file mode 100644 index 000000000000..03dfcc6b7661 --- /dev/null +++ b/net/smc/smc_close.c @@ -0,0 +1,442 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing - normal and abnormal + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_tx.h" +#include "smc_cdc.h" +#include "smc_close.h" + +#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ) + +static void smc_close_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + /* Close non-accepted connections */ + while ((sk = smc_accept_dequeue(parent, NULL))) + smc_close_non_accepted(sk); +} + +static void smc_close_wait_tx_pends(struct smc_sock *smc) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + signed long timeout; + + timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME; + add_wait_queue(sk_sleep(sk), &wait); + while (!signal_pending(current) && timeout) { + int rc; + + rc = sk_wait_event(sk, &timeout, + !smc_cdc_tx_has_pending(&smc->conn), + &wait); + if (rc) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); +} + +/* wait for sndbuf data being transmitted */ +static void smc_close_stream_wait(struct smc_sock *smc, long timeout) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct sock *sk = &smc->sk; + + if (!timeout) + return; + + if (!smc_tx_prepared_sends(&smc->conn)) + return; + + smc->wait_close_tx_prepared = 1; + add_wait_queue(sk_sleep(sk), &wait); + while (!signal_pending(current) && timeout) { + int rc; + + rc = sk_wait_event(sk, &timeout, + !smc_tx_prepared_sends(&smc->conn) || + (sk->sk_err == ECONNABORTED) || + (sk->sk_err == ECONNRESET), + &wait); + if (rc) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + smc->wait_close_tx_prepared = 0; +} + +void smc_close_wake_tx_prepared(struct smc_sock *smc) +{ + if (smc->wait_close_tx_prepared) + /* wake up socket closing */ + smc->sk.sk_state_change(&smc->sk); +} + +static int smc_close_wr(struct smc_connection *conn) +{ + conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static int smc_close_final(struct smc_connection *conn) +{ + if (atomic_read(&conn->bytes_to_rcv)) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + else + conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +static int smc_close_abort(struct smc_connection *conn) +{ + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + + return smc_cdc_get_slot_and_msg_send(conn); +} + +/* terminate smc socket abnormally - active abort + * RDMA communication no longer possible + */ +void smc_close_active_abort(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + + bh_lock_sock(&smc->sk); + smc->sk.sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_err = ECONNABORTED; + smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + } + switch (smc->sk.sk_state) { + case SMC_INIT: + smc->sk.sk_state = SMC_PEERABORTWAIT; + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + txflags->peer_conn_abort = 1; + sock_release(smc->clcsock); + if (!smc_cdc_rxed_any_close(&smc->conn)) + smc->sk.sk_state = SMC_PEERABORTWAIT; + else + smc->sk.sk_state = SMC_CLOSED; + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (!txflags->peer_conn_closed) { + smc->sk.sk_state = SMC_PEERABORTWAIT; + txflags->peer_conn_abort = 1; + sock_release(smc->clcsock); + } else { + smc->sk.sk_state = SMC_CLOSED; + } + break; + case SMC_PROCESSABORT: + case SMC_APPFINCLOSEWAIT: + if (!txflags->peer_conn_closed) { + txflags->peer_conn_abort = 1; + 
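/* drop the internal clcsock as part of the abort */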
sock_release(smc->clcsock); + } + smc->sk.sk_state = SMC_CLOSED; + break; + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + case SMC_CLOSED: + break; + } + + sock_set_flag(&smc->sk, SOCK_DEAD); + bh_unlock_sock(&smc->sk); + smc->sk.sk_state_change(&smc->sk); +} + +int smc_close_active(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int old_state; + int rc = 0; + + if (sock_flag(sk, SOCK_LINGER) && + !(current->flags & PF_EXITING)) + timeout = sk->sk_lingertime; + +again: + old_state = sk->sk_state; + switch (old_state) { + case SMC_INIT: + sk->sk_state = SMC_CLOSED; + if (smc->smc_listen_work.func) + flush_work(&smc->smc_listen_work); + sock_put(sk); + break; + case SMC_LISTEN: + sk->sk_state = SMC_CLOSED; + sk->sk_state_change(sk); /* wake up accept */ + if (smc->clcsock && smc->clcsock->sk) { + rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR); + /* wake up kernel_accept of smc_tcp_listen_worker */ + smc->clcsock->sk->sk_data_ready(smc->clcsock->sk); + } + release_sock(sk); + smc_close_cleanup_listen(sk); + flush_work(&smc->tcp_listen_work); + lock_sock(sk); + break; + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_state == SMC_ACTIVE) { + /* send close request */ + rc = smc_close_final(conn); + sk->sk_state = SMC_PEERCLOSEWAIT1; + } else { + /* peer event has changed the state */ + goto again; + } + break; + case SMC_APPFINCLOSEWAIT: + /* socket already shutdown wr or both (active close) */ + if (txflags->peer_done_writing && + !txflags->peer_conn_closed) { + /* just shutdown wr done, send close request */ + rc = smc_close_final(conn); + } + sk->sk_state = SMC_CLOSED; + smc_close_wait_tx_pends(smc); + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + if (sk->sk_err != ECONNABORTED) { + /* confirm close from peer */ + rc = smc_close_final(conn); + if (rc) + break; + } + if (smc_cdc_rxed_any_close(conn)) + /* peer has closed the socket already */ + sk->sk_state = SMC_CLOSED; + else + /* peer has just issued a shutdown write */ + sk->sk_state = SMC_PEERFINCLOSEWAIT; + smc_close_wait_tx_pends(smc); + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + /* peer sending PeerConnectionClosed will cause transition */ + break; + case SMC_PROCESSABORT: + cancel_work_sync(&conn->tx_work); + smc_close_abort(conn); + sk->sk_state = SMC_CLOSED; + smc_close_wait_tx_pends(smc); + break; + case SMC_PEERABORTWAIT: + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(&smc->sk); + return rc; +} + +static void smc_close_passive_abort_received(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *txflags = + &smc->conn.local_tx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + + switch (sk->sk_state) { + case SMC_ACTIVE: + case SMC_APPFINCLOSEWAIT: + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + smc_close_abort(&smc->conn); + sk->sk_state = SMC_PROCESSABORT; + break; + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + if (txflags->peer_done_writing && + !txflags->peer_conn_closed) { + /* just shutdown, but not yet closed 
locally */ + smc_close_abort(&smc->conn); + sk->sk_state = SMC_PROCESSABORT; + } else { + sk->sk_state = SMC_CLOSED; + } + break; + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + sk->sk_state = SMC_CLOSED; + break; + case SMC_INIT: + case SMC_PROCESSABORT: + /* nothing to do, add tracing in future patch */ + break; + } +} + +/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort, + * or peer_done_writing. + * Called under tasklet context. + */ +void smc_close_passive_received(struct smc_sock *smc) +{ + struct smc_cdc_conn_state_flags *rxflags = + &smc->conn.local_rx_ctrl.conn_state_flags; + struct sock *sk = &smc->sk; + int old_state; + + sk->sk_shutdown |= RCV_SHUTDOWN; + if (smc->clcsock && smc->clcsock->sk) + smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(&smc->sk, SOCK_DONE); + + old_state = sk->sk_state; + + if (rxflags->peer_conn_abort) { + smc_close_passive_abort_received(smc); + goto wakeup; + } + + switch (sk->sk_state) { + case SMC_INIT: + if (atomic_read(&smc->conn.bytes_to_rcv) || + (rxflags->peer_done_writing && + !rxflags->peer_conn_closed)) + sk->sk_state = SMC_APPCLOSEWAIT1; + else + sk->sk_state = SMC_CLOSED; + break; + case SMC_ACTIVE: + sk->sk_state = SMC_APPCLOSEWAIT1; + break; + case SMC_PEERCLOSEWAIT1: + if (rxflags->peer_done_writing) + sk->sk_state = SMC_PEERCLOSEWAIT2; + /* fall through to check for closing */ + case SMC_PEERCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + if (!smc_cdc_rxed_any_close(&smc->conn)) + break; + if (sock_flag(sk, SOCK_DEAD) && + (sk->sk_shutdown == SHUTDOWN_MASK)) { + /* smc_release has already been called locally */ + sk->sk_state = SMC_CLOSED; + } else { + /* just shutdown, but not yet closed locally */ + sk->sk_state = SMC_APPFINCLOSEWAIT; + } + break; + case SMC_APPCLOSEWAIT1: + case SMC_APPCLOSEWAIT2: + case SMC_APPFINCLOSEWAIT: + case SMC_PEERABORTWAIT: + case SMC_PROCESSABORT: + case SMC_CLOSED: + /* nothing to do, add tracing in future patch */ + break; + } + +wakeup: + if (old_state != sk->sk_state) + sk->sk_state_change(sk); + sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ + sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ + + if ((sk->sk_state == SMC_CLOSED) && + (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } +} + +void smc_close_sock_put_work(struct work_struct *work) +{ + struct smc_sock *smc = container_of(to_delayed_work(work), + struct smc_sock, + sock_put_work); + + smc->sk.sk_prot->unhash(&smc->sk); + sock_put(&smc->sk); +} + +int smc_close_shutdown_write(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT; + struct sock *sk = &smc->sk; + int old_state; + int rc = 0; + + if (sock_flag(sk, SOCK_LINGER)) + timeout = sk->sk_lingertime; + +again: + old_state = sk->sk_state; + switch (old_state) { + case SMC_ACTIVE: + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + /* send close wr request */ + rc = smc_close_wr(conn); + if (sk->sk_state == SMC_ACTIVE) + sk->sk_state = SMC_PEERCLOSEWAIT1; + else + goto again; + break; + case SMC_APPCLOSEWAIT1: + /* passive close */ + if (!smc_cdc_rxed_any_close(conn)) + smc_close_stream_wait(smc, timeout); + release_sock(sk); + cancel_work_sync(&conn->tx_work); + lock_sock(sk); + /* confirm close from peer */ + rc = smc_close_wr(conn); + sk->sk_state = SMC_APPCLOSEWAIT2; + break; + case 
SMC_APPCLOSEWAIT2: + case SMC_PEERFINCLOSEWAIT: + case SMC_PEERCLOSEWAIT1: + case SMC_PEERCLOSEWAIT2: + case SMC_APPFINCLOSEWAIT: + case SMC_PROCESSABORT: + case SMC_PEERABORTWAIT: + /* nothing to do, add tracing in future patch */ + break; + } + + if (old_state != sk->sk_state) + sk->sk_state_change(&smc->sk); + return rc; +} diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h new file mode 100644 index 000000000000..bc9a2df3633c --- /dev/null +++ b/net/smc/smc_close.h @@ -0,0 +1,28 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Socket Closing + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_CLOSE_H +#define SMC_CLOSE_H + +#include <linux/workqueue.h> + +#include "smc.h" + +#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ) +#define SMC_CLOSE_SOCK_PUT_DELAY HZ + +void smc_close_wake_tx_prepared(struct smc_sock *smc); +void smc_close_active_abort(struct smc_sock *smc); +int smc_close_active(struct smc_sock *smc); +void smc_close_passive_received(struct smc_sock *smc); +void smc_close_sock_put_work(struct work_struct *work); +int smc_close_shutdown_write(struct smc_sock *smc); + +#endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c new file mode 100644 index 000000000000..8b1d34378829 --- /dev/null +++ b/net/smc/smc_core.c @@ -0,0 +1,677 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Basic Transport Functions exploiting Infiniband API + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/socket.h> +#include <linux/if_vlan.h> +#include <linux/random.h> +#include <linux/workqueue.h> +#include <net/tcp.h> +#include <net/sock.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_clc.h" +#include "smc_core.h" +#include "smc_ib.h" +#include "smc_wr.h" +#include "smc_llc.h" +#include "smc_cdc.h" +#include "smc_close.h" + +#define SMC_LGR_NUM_INCR 256 +#define SMC_LGR_FREE_DELAY (600 * HZ) + +static u32 smc_lgr_num; /* unique link group number */ + +/* Register connection's alert token in our lookup structure. + * To use rbtrees we have to implement our own insert core. + * Requires @conns_lock + * @smc connection to register + * Returns 0 on success, != otherwise. + */ +static void smc_lgr_add_alert_token(struct smc_connection *conn) +{ + struct rb_node **link, *parent = NULL; + u32 token = conn->alert_token_local; + + link = &conn->lgr->conns_all.rb_node; + while (*link) { + struct smc_connection *cur = rb_entry(*link, + struct smc_connection, alert_node); + + parent = *link; + if (cur->alert_token_local > token) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + /* Put the new node there */ + rb_link_node(&conn->alert_node, parent, link); + rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); +} + +/* Register connection in link group by assigning an alert token + * registered in a search tree. + * Requires @conns_lock + * Note that '0' is a reserved value and not assigned. 
+ */ +static void smc_lgr_register_conn(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + static atomic_t nexttoken = ATOMIC_INIT(0); + + /* find a new alert_token_local value not yet used by some connection + * in this link group + */ + sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ + while (!conn->alert_token_local) { + conn->alert_token_local = atomic_inc_return(&nexttoken); + if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) + conn->alert_token_local = 0; + } + smc_lgr_add_alert_token(conn); + conn->lgr->conns_num++; +} + +/* Unregister connection and reset the alert token of the given connection< + */ +static void __smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smc_link_group *lgr = conn->lgr; + + rb_erase(&conn->alert_node, &lgr->conns_all); + lgr->conns_num--; + conn->alert_token_local = 0; + conn->lgr = NULL; + sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ +} + +/* Unregister connection and trigger lgr freeing if applicable + */ +static void smc_lgr_unregister_conn(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + int reduced = 0; + + write_lock_bh(&lgr->conns_lock); + if (conn->alert_token_local) { + reduced = 1; + __smc_lgr_unregister_conn(conn); + } + write_unlock_bh(&lgr->conns_lock); + if (reduced && !lgr->conns_num) + schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY); +} + +static void smc_lgr_free_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(to_delayed_work(work), + struct smc_link_group, + free_work); + bool conns; + + spin_lock_bh(&smc_lgr_list.lock); + read_lock_bh(&lgr->conns_lock); + conns = RB_EMPTY_ROOT(&lgr->conns_all); + read_unlock_bh(&lgr->conns_lock); + if (!conns) { /* number of lgr connections is no longer zero */ + spin_unlock_bh(&smc_lgr_list.lock); + return; + } + list_del_init(&lgr->list); /* remove from smc_lgr_list */ + spin_unlock_bh(&smc_lgr_list.lock); + smc_lgr_free(lgr); +} + +/* create a new SMC link group */ +static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + char *peer_systemid, unsigned short vlan_id) +{ + struct smc_link_group *lgr; + struct smc_link *lnk; + u8 rndvec[3]; + int rc = 0; + int i; + + lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); + if (!lgr) { + rc = -ENOMEM; + goto out; + } + lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + lgr->sync_err = false; + lgr->daddr = peer_in_addr; + memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); + lgr->vlan_id = vlan_id; + rwlock_init(&lgr->sndbufs_lock); + rwlock_init(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + INIT_LIST_HEAD(&lgr->sndbufs[i]); + INIT_LIST_HEAD(&lgr->rmbs[i]); + } + smc_lgr_num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); + INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); + lgr->conns_all = RB_ROOT; + + lnk = &lgr->lnk[SMC_SINGLE_LINK]; + /* initialize link */ + lnk->smcibdev = smcibdev; + lnk->ibport = ibport; + lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; + if (!smcibdev->initialized) + smc_ib_setup_per_ibdev(smcibdev); + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto free_lgr; + init_waitqueue_head(&lnk->wr_tx_wait); + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + init_completion(&lnk->llc_confirm); + init_completion(&lnk->llc_confirm_resp); + + smc->conn.lgr = lgr; + rwlock_init(&lgr->conns_lock); + spin_lock_bh(&smc_lgr_list.lock); + list_add(&lgr->list, &smc_lgr_list.list); + spin_unlock_bh(&smc_lgr_list.lock); + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +free_lgr: + kfree(lgr); +out: + return rc; +} + +static void smc_sndbuf_unuse(struct smc_connection *conn) +{ + if (conn->sndbuf_desc) { + conn->sndbuf_desc->used = 0; + conn->sndbuf_size = 0; + } +} + +static void smc_rmb_unuse(struct smc_connection *conn) +{ + if (conn->rmb_desc) { + conn->rmb_desc->used = 0; + conn->rmbe_size = 0; + } +} + +/* remove a finished connection from its link group */ +void smc_conn_free(struct smc_connection *conn) +{ + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) + return; + smc_cdc_tx_dismiss_slots(conn); + smc_lgr_unregister_conn(conn); + smc_rmb_unuse(conn); + smc_sndbuf_unuse(conn); +} + +static void smc_link_clear(struct smc_link *lnk) +{ + lnk->peer_qpn = 0; + smc_ib_modify_qp_reset(lnk); + smc_wr_free_link(lnk); + smc_ib_destroy_queue_pair(lnk); + smc_ib_dealloc_protection_domain(lnk); + smc_wr_free_link_mem(lnk); +} + +static void smc_lgr_free_sndbufs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *sndbuf_desc, *bf_desc; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i], + list) { + list_del(&sndbuf_desc->list); + smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + smc_uncompress_bufsize(i), + sndbuf_desc, DMA_TO_DEVICE); + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + } + } +} + +static void smc_lgr_free_rmbs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *rmb_desc, *bf_desc; + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i], + list) { + list_del(&rmb_desc->list); + smc_ib_buf_unmap(lnk->smcibdev, + smc_uncompress_bufsize(i), + rmb_desc, DMA_FROM_DEVICE); + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + } + } +} + +/* remove a link group */ +void smc_lgr_free(struct smc_link_group *lgr) +{ + smc_lgr_free_rmbs(lgr); + smc_lgr_free_sndbufs(lgr); + 
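/* all buffers are unmapped and freed above; only then are QP, PD and
 * link memory torn down by smc_link_clear()
 */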
smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); + kfree(lgr); +} + +/* terminate linkgroup abnormally */ +void smc_lgr_terminate(struct smc_link_group *lgr) +{ + struct smc_connection *conn; + struct smc_sock *smc; + struct rb_node *node; + + spin_lock_bh(&smc_lgr_list.lock); + if (list_empty(&lgr->list)) { + /* termination already triggered */ + spin_unlock_bh(&smc_lgr_list.lock); + return; + } + /* do not use this link group for new connections */ + list_del_init(&lgr->list); + spin_unlock_bh(&smc_lgr_list.lock); + + write_lock_bh(&lgr->conns_lock); + node = rb_first(&lgr->conns_all); + while (node) { + conn = rb_entry(node, struct smc_connection, alert_node); + smc = container_of(conn, struct smc_sock, conn); + sock_hold(&smc->sk); + __smc_lgr_unregister_conn(conn); + smc_close_active_abort(smc); + sock_put(&smc->sk); + node = rb_first(&lgr->conns_all); + } + write_unlock_bh(&lgr->conns_lock); +} + +/* Determine vlan of internal TCP socket. + * @vlan_id: address to store the determined vlan id into + */ +static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc = 0; + + *vlan_id = 0; + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + if (is_vlan_dev(dst->dev)) + *vlan_id = vlan_dev_vlan_id(dst->dev); + +out_rel: + dst_release(dst); +out: + return rc; +} + +/* determine the link gid matching the vlan id of the link group */ +static int smc_link_determine_gid(struct smc_link_group *lgr) +{ + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct ib_gid_attr gattr; + union ib_gid gid; + int i; + + if (!lgr->vlan_id) { + lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1]; + return 0; + } + + for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len; + i++) { + if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, + &gattr)) + continue; + if (gattr.ndev && + (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) { + lnk->gid = gid; + return 0; + } + } + return -ENODEV; +} + +/* create a new SMC connection (and a new link group if necessary) */ +int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr; + unsigned short vlan_id; + enum smc_lgr_role role; + int local_contact = SMC_FIRST_CONTACT; + int rc = 0; + + role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; + rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); + if (rc) + return rc; + + if ((role == SMC_CLNT) && srv_first_contact) + /* create new link group as well */ + goto create; + + /* determine if an existing link group can be reused */ + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + write_lock_bh(&lgr->conns_lock); + if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, + SMC_SYSTEMID_LEN) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, + SMC_GID_SIZE) && + !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, + sizeof(lcl->mac)) && + !lgr->sync_err && + (lgr->role == role) && + (lgr->vlan_id == vlan_id) && + ((role == SMC_CLNT) || + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { + /* link group found */ + local_contact = SMC_REUSE_CONTACT; + conn->lgr = lgr; + smc_lgr_register_conn(conn); /* add smc conn to lgr */ + write_unlock_bh(&lgr->conns_lock); + break; + } + write_unlock_bh(&lgr->conns_lock); + } + spin_unlock_bh(&smc_lgr_list.lock); + + if (role == SMC_CLNT && !srv_first_contact && + (local_contact == SMC_FIRST_CONTACT)) { + /* Server reuses a link group, but Client wants to start + * a new one + * send out_of_sync decline, reason synchr. error + */ + return -ENOLINK; + } + +create: + if (local_contact == SMC_FIRST_CONTACT) { + rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport, + lcl->id_for_peer, vlan_id); + if (rc) + goto out; + smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_link_determine_gid(conn->lgr); + } + conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; + conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&conn->acurs_lock); +#endif + +out: + return rc ? rc : local_contact; +} + +/* try to reuse a sndbuf description slot of the sndbufs list for a certain + * buf_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *sndbuf_slot; + + read_lock_bh(&lgr->sndbufs_lock); + list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize], + list) { + if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->sndbufs_lock); + return sndbuf_slot; + } + } + read_unlock_bh(&lgr->sndbufs_lock); + return NULL; +} + +/* try to reuse an rmb description slot of the rmbs list for a certain + * rmbe_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *rmb_slot; + + read_lock_bh(&lgr->rmbs_lock); + list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize], + list) { + if (cmpxchg(&rmb_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->rmbs_lock); + return rmb_slot; + } + } + read_unlock_bh(&lgr->rmbs_lock); + return NULL; +} + +/* one of the conditions for announcing a receiver's current window size is + * that it "results in a minimum increase in the window size of 10% of the + * receive buffer space" [RFC7609] + */ +static inline int smc_rmb_wnd_update_limit(int rmbe_size) +{ + return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); +} + +/* create the tx buffer for an SMC socket */ +int smc_sndbuf_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *sndbuf_desc; + int rc; + + /* use socket send buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = 
smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable sndbuf_slot in the link group */ + sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short); + if (sndbuf_desc) { + memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new send buffer */ + sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL); + if (!sndbuf_desc) + break; /* give up with -ENOMEM */ + sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!sndbuf_desc->cpu_addr) { + kfree(sndbuf_desc); + /* if send buffer allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, sndbuf_desc, + DMA_TO_DEVICE); + if (rc) { + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + continue; /* if mapping failed, try smaller one */ + } + sndbuf_desc->used = 1; + write_lock_bh(&lgr->sndbufs_lock); + list_add(&sndbuf_desc->list, + &lgr->sndbufs[tmp_bufsize_short]); + write_unlock_bh(&lgr->sndbufs_lock); + break; + } + if (sndbuf_desc && sndbuf_desc->cpu_addr) { + conn->sndbuf_desc = sndbuf_desc; + conn->sndbuf_size = tmp_bufsize; + smc->sk.sk_sndbuf = tmp_bufsize * 2; + atomic_set(&conn->sndbuf_space, tmp_bufsize); + return 0; + } else { + return -ENOMEM; + } +} + +/* create the RMB for an SMC socket (even though the SMC protocol + * allows more than one RMB-element per RMB, the Linux implementation + * uses just one RMB-element per RMB, i.e. uses an extra RMB for every + * connection in a link group + */ +int smc_rmb_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *rmb_desc; + int rc; + + /* use socket recv buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable rmb_slot in the link group */ + rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short); + if (rmb_desc) { + memset(rmb_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new RMB */ + rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL); + if (!rmb_desc) + break; /* give up with -ENOMEM */ + rmb_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!rmb_desc->cpu_addr) { + kfree(rmb_desc); + /* if RMB allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, rmb_desc, + DMA_FROM_DEVICE); + if (rc) { + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + continue; /* if mapping failed, try smaller one */ + } + rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + &rmb_desc->mr_rx[SMC_SINGLE_LINK]); + if (rc) { + smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, rmb_desc, + DMA_FROM_DEVICE); + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + continue; + } + rmb_desc->used = 1; + write_lock_bh(&lgr->rmbs_lock); + list_add(&rmb_desc->list, + &lgr->rmbs[tmp_bufsize_short]); + write_unlock_bh(&lgr->rmbs_lock); + break; + } + if (rmb_desc && rmb_desc->cpu_addr) { + conn->rmb_desc = rmb_desc; + conn->rmbe_size = 
tmp_bufsize; + conn->rmbe_size_short = tmp_bufsize_short; + smc->sk.sk_rcvbuf = tmp_bufsize * 2; + atomic_set(&conn->bytes_to_rcv, 0); + conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize); + return 0; + } else { + return -ENOMEM; + } +} + +static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) +{ + int i; + + for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { + if (!test_and_set_bit(i, lgr->rtokens_used_mask)) + return i; + } + return -ENOSPC; +} + +/* save rkey and dma_addr received from peer during clc handshake */ +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc) +{ + u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr); + struct smc_link_group *lgr = conn->lgr; + u32 rkey = ntohl(clc->rmb_rkey); + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && + test_bit(i, lgr->rtokens_used_mask)) { + conn->rtoken_idx = i; + return 0; + } + } + conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr); + if (conn->rtoken_idx < 0) + return conn->rtoken_idx; + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey; + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr; + return 0; +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h new file mode 100644 index 000000000000..27eb38056a27 --- /dev/null +++ b/net/smc/smc_core.h @@ -0,0 +1,181 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for SMC Connections, Link Groups and Links + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_CORE_H +#define _SMC_CORE_H + +#include <linux/atomic.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_ib.h" + +#define SMC_RMBS_PER_LGR_MAX 255 /* max. 
# of RMBs per link group */ + +struct smc_lgr_list { /* list of link group definition */ + struct list_head list; + spinlock_t lock; /* protects list of link groups */ +}; + +extern struct smc_lgr_list smc_lgr_list; /* list of link groups */ + +enum smc_lgr_role { /* possible roles of a link group */ + SMC_CLNT, /* client */ + SMC_SERV /* server */ +}; + +#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ + +struct smc_wr_buf { + u8 raw[SMC_WR_BUF_SIZE]; +}; + +struct smc_link { + struct smc_ib_device *smcibdev; /* ib-device */ + u8 ibport; /* port - values 1 | 2 */ + struct ib_pd *roce_pd; /* IB protection domain, + * unique for every RoCE QP + */ + struct ib_qp *roce_qp; /* IB queue pair */ + struct ib_qp_attr qp_attr; /* IB queue pair attributes */ + + struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */ + struct ib_send_wr *wr_tx_ibs; /* WR send meta data */ + struct ib_sge *wr_tx_sges; /* WR send gather meta data */ + struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + /* above four vectors have wr_tx_cnt elements and use the same index */ + dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + atomic_long_t wr_tx_id; /* seq # of last sent WR */ + unsigned long *wr_tx_mask; /* bit mask of used indexes */ + u32 wr_tx_cnt; /* number of WR send buffers */ + wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + + struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ + struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ + /* above three vectors have wr_rx_cnt elements and use the same index */ + dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + u64 wr_rx_id; /* seq # of last recv WR */ + u32 wr_rx_cnt; /* number of WR recv buffers */ + + union ib_gid gid; /* gid matching used vlan id */ + u32 peer_qpn; /* QP number of peer */ + enum ib_mtu path_mtu; /* used mtu */ + enum ib_mtu peer_mtu; /* mtu size of peer */ + u32 psn_initial; /* QP tx initial packet seqno */ + u32 peer_psn; /* QP rx initial packet seqno */ + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ + u8 link_id; /* unique # within link group */ + struct completion llc_confirm; /* wait for rx of conf link */ + struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ +}; + +/* For now we just allow one parallel link per link group. The SMC protocol + * allows more (up to 8). 
+ */ +#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_SINGLE_LINK 0 + +#define SMC_FIRST_CONTACT 1 /* first contact to a peer */ +#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/ + +/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ +struct smc_buf_desc { + struct list_head list; + u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; + /* mapped address of buffer */ + void *cpu_addr; /* virtual address of buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: + * rkey provided to peer + */ + u32 used; /* currently used / unused */ +}; + +struct smc_rtoken { /* address/key of remote RMB */ + u64 dma_addr; + u32 rkey; +}; + +#define SMC_LGR_ID_SIZE 4 + +struct smc_link_group { + struct list_head list; + enum smc_lgr_role role; /* client or server */ + __be32 daddr; /* destination ip address */ + struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ + char peer_systemid[SMC_SYSTEMID_LEN]; + /* unique system_id of peer */ + struct rb_root conns_all; /* connection tree */ + rwlock_t conns_lock; /* protects conns_all */ + unsigned int conns_num; /* current # of connections */ + unsigned short vlan_id; /* vlan id of link group */ + + struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ + rwlock_t sndbufs_lock; /* protects tx buffers */ + struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ + rwlock_t rmbs_lock; /* protects rx buffers */ + struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] + [SMC_LINKS_PER_LGR_MAX]; + /* remote addr/key pairs */ + unsigned long rtokens_used_mask[BITS_TO_LONGS( + SMC_RMBS_PER_LGR_MAX)]; + /* used rtoken elements */ + + u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ + struct delayed_work free_work; /* delayed freeing of an lgr */ + bool sync_err; /* lgr no longer fits to peer */ +}; + +/* Find the connection associated with the given alert token in the link group. + * To use rbtrees we have to implement our own search core. + * Requires @conns_lock + * @token alert token to search for + * @lgr link group to search in + * Returns connection associated with token if found, NULL otherwise. + */ +static inline struct smc_connection *smc_lgr_find_conn( + u32 token, struct smc_link_group *lgr) +{ + struct smc_connection *res = NULL; + struct rb_node *node; + + node = lgr->conns_all.rb_node; + while (node) { + struct smc_connection *cur = rb_entry(node, + struct smc_connection, alert_node); + + if (cur->alert_token_local > token) { + node = node->rb_left; + } else { + if (cur->alert_token_local < token) { + node = node->rb_right; + } else { + res = cur; + break; + } + } + } + + return res; +} + +struct smc_sock; +struct smc_clc_msg_accept_confirm; + +void smc_lgr_free(struct smc_link_group *lgr); +void smc_lgr_terminate(struct smc_link_group *lgr); +int smc_sndbuf_create(struct smc_sock *smc); +int smc_rmb_create(struct smc_sock *smc); +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc); + +#endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c new file mode 100644 index 000000000000..d2d01cf70224 --- /dev/null +++ b/net/smc/smc_diag.c @@ -0,0 +1,215 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Monitoring SMC transport protocol sockets + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/sock_diag.h> +#include <linux/inet_diag.h> +#include <linux/smc_diag.h> +#include <net/netlink.h> +#include <net/smc.h> + +#include "smc.h" +#include "smc_core.h" + +static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) +{ + sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", + be16_to_cpu(((__be16 *)gid_raw)[0]), + be16_to_cpu(((__be16 *)gid_raw)[1]), + be16_to_cpu(((__be16 *)gid_raw)[2]), + be16_to_cpu(((__be16 *)gid_raw)[3]), + be16_to_cpu(((__be16 *)gid_raw)[4]), + be16_to_cpu(((__be16 *)gid_raw)[5]), + be16_to_cpu(((__be16 *)gid_raw)[6]), + be16_to_cpu(((__be16 *)gid_raw)[7])); +} + +static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + r->diag_family = sk->sk_family; + if (!smc->clcsock) + return; + r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); + r->id.idiag_dport = smc->clcsock->sk->sk_dport; + r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +} + +static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct smc_diag_msg *r, + struct user_namespace *user_ns) +{ + if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) + return 1; + + r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_inode = sock_i_ino(sk); + return 0; +} + +static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct smc_diag_req *req, + struct nlattr *bc) +{ + struct smc_sock *smc = smc_sk(sk); + struct user_namespace *user_ns; + struct smc_diag_msg *r; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + smc_diag_msg_common_fill(r, sk); + r->diag_state = sk->sk_state; + r->diag_fallback = smc->use_fallback; + user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); + if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) + goto errout; + + if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) { + struct smc_connection *conn = &smc->conn; + struct smc_diag_conninfo cinfo = { + .token = conn->alert_token_local, + .sndbuf_size = conn->sndbuf_size, + .rmbe_size = conn->rmbe_size, + .peer_rmbe_size = conn->peer_rmbe_size, + + .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, + .rx_prod.count = conn->local_rx_ctrl.prod.count, + .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, + .rx_cons.count = conn->local_rx_ctrl.cons.count, + + .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, + .tx_prod.count = conn->local_tx_ctrl.prod.count, + .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, + .tx_cons.count = conn->local_tx_ctrl.cons.count, + + .tx_prod_flags = + *(u8 *)&conn->local_tx_ctrl.prod_flags, + .tx_conn_state_flags = + *(u8 *)&conn->local_tx_ctrl.conn_state_flags, + .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, + .rx_conn_state_flags = + *(u8 *)&conn->local_rx_ctrl.conn_state_flags, + + .tx_prep.wrap = conn->tx_curs_prep.wrap, + .tx_prep.count = conn->tx_curs_prep.count, + .tx_sent.wrap = conn->tx_curs_sent.wrap, + .tx_sent.count = 
conn->tx_curs_sent.count, + .tx_fin.wrap = conn->tx_curs_fin.wrap, + .tx_fin.count = conn->tx_curs_fin.count, + }; + + if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) + goto errout; + } + + if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) { + struct smc_diag_lgrinfo linfo = { + .role = smc->conn.lgr->role, + .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport, + .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id, + }; + + memcpy(linfo.lnk[0].ibname, + smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, + smc->conn.lgr->lnk[0].gid.raw); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, + smc->conn.lgr->lnk[0].peer_gid); + + if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) + goto errout; + } + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *bc = NULL; + struct hlist_head *head; + struct sock *sk; + int rc = 0; + + read_lock(&smc_proto.h.smc_hash->lock); + head = &smc_proto.h.smc_hash->ht; + if (hlist_empty(head)) + goto out; + + sk_for_each(sk, head) { + if (!net_eq(sock_net(sk), net)) + continue; + rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); + if (rc) + break; + } + +out: + read_unlock(&smc_proto.h.smc_hash->lock); + return rc; +} + +static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) +{ + struct net *net = sock_net(skb->sk); + + if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && + h->nlmsg_flags & NLM_F_DUMP) { + { + struct netlink_dump_control c = { + .dump = smc_diag_dump, + .min_dump_alloc = SKB_WITH_OVERHEAD(32768), + }; + return netlink_dump_start(net->diag_nlsk, skb, h, &c); + } + } + return 0; +} + +static const struct sock_diag_handler smc_diag_handler = { + .family = AF_SMC, + .dump = smc_diag_handler_dump, +}; + +static int __init smc_diag_init(void) +{ + return sock_diag_register(&smc_diag_handler); +} + +static void __exit smc_diag_exit(void) +{ + sock_diag_unregister(&smc_diag_handler); +} + +module_init(smc_diag_init); +module_exit(smc_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c new file mode 100644 index 000000000000..e6743c008ac5 --- /dev/null +++ b/net/smc/smc_ib.c @@ -0,0 +1,466 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * IB infrastructure: + * Establish SMC-R as an Infiniband Client to be notified about added and + * removed IB devices of type RDMA. + * Determine device and port characteristics for these IB devices. + * + * Copyright IBM Corp. 
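The diag module above answers SOCK_DIAG_BY_FAMILY dump requests on a NETLINK_SOCK_DIAG socket. A rough user-space sketch of issuing such a dump, assuming the struct smc_diag_req layout from the matching uapi header <linux/smc_diag.h> and AF_SMC == 43 as stated in the module alias above:

	#include <linux/netlink.h>
	#include <linux/sock_diag.h>
	#include <linux/smc_diag.h>	/* struct smc_diag_req, SMC_DIAG_* (assumed layout) */
	#include <sys/socket.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#ifndef AF_SMC
	#define AF_SMC 43		/* matches the PF_PROTO_TYPE alias above */
	#endif

	int main(void)
	{
		struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
		struct {
			struct nlmsghdr nlh;
			struct smc_diag_req req;
		} msg;
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);

		if (fd < 0)
			return 1;
		memset(&msg, 0, sizeof(msg));
		msg.nlh.nlmsg_len = sizeof(msg);
		msg.nlh.nlmsg_type = SOCK_DIAG_BY_FAMILY;
		msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
		msg.req.diag_family = AF_SMC;
		/* ask for the optional attributes filled in __smc_diag_dump() */
		msg.req.diag_ext = (1 << (SMC_DIAG_CONNINFO - 1)) |
				   (1 << (SMC_DIAG_LGRINFO - 1));
		if (sendto(fd, &msg, sizeof(msg), 0,
			   (struct sockaddr *)&nladdr, sizeof(nladdr)) < 0)
			perror("sendto");
		/* replies (struct smc_diag_msg + attributes) would be read with recv() */
		close(fd);
		return 0;
	}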
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/random.h> +#include <linux/workqueue.h> +#include <rdma/ib_verbs.h> + +#include "smc_pnet.h" +#include "smc_ib.h" +#include "smc_core.h" +#include "smc_wr.h" +#include "smc.h" + +#define SMC_QP_MIN_RNR_TIMER 5 +#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ +#define SMC_QP_RETRY_CNT 7 /* 7: infinite */ +#define SMC_QP_RNR_RETRY 7 /* 7: infinite */ + +struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ + .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock), + .list = LIST_HEAD_INIT(smc_ib_devices.list), +}; + +#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%" + +u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system + * identifier + */ + +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct ib_mr **mr) +{ + int rc; + + if (*mr) + return 0; /* already done */ + + /* obtain unique key - + * next invocation of get_dma_mr returns a different key! + */ + *mr = pd->device->get_dma_mr(pd, access_flags); + rc = PTR_ERR_OR_ZERO(*mr); + if (IS_ERR(*mr)) + *mr = NULL; + return rc; +} + +static int smc_ib_modify_qp_init(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.pkey_index = 0; + qp_attr.port_num = lnk->ibport; + qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_REMOTE_WRITE; + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_ACCESS_FLAGS | IB_QP_PORT); +} + +static int smc_ib_modify_qp_rtr(struct smc_link *lnk) +{ + enum ib_qp_attr_mask qp_attr_mask = + IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | + IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTR; + qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); + qp_attr.ah_attr.port_num = lnk->ibport; + qp_attr.ah_attr.ah_flags = IB_AH_GRH; + qp_attr.ah_attr.grh.hop_limit = 1; + memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid, + sizeof(lnk->peer_gid)); + memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); + qp_attr.dest_qp_num = lnk->peer_qpn; + qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ + qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming + * requests + */ + qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER; + + return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask); +} + +int smc_ib_modify_qp_rts(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */ + qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */ + qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */ + qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */ + qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and + * atomic ops allowed + */ + return ib_modify_qp(lnk->roce_qp, &qp_attr, + IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | + IB_QP_SQ_PSN | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC); +} + +int smc_ib_modify_qp_reset(struct smc_link *lnk) +{ + struct ib_qp_attr qp_attr; + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IB_QPS_RESET; + return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE); +} + +int smc_ib_ready_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = + container_of(lnk, struct smc_link_group, 
lnk[0]); + int rc = 0; + + rc = smc_ib_modify_qp_init(lnk); + if (rc) + goto out; + + rc = smc_ib_modify_qp_rtr(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv, + IB_CQ_SOLICITED_MASK); + if (rc) + goto out; + rc = smc_wr_rx_post_init(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + + if (lgr->role == SMC_SERV) { + rc = smc_ib_modify_qp_rts(lnk); + if (rc) + goto out; + smc_wr_remember_qp_attr(lnk); + } +out: + return rc; +} + +/* process context wrapper for might_sleep smc_ib_remember_port_attr */ +static void smc_ib_port_event_work(struct work_struct *work) +{ + struct smc_ib_device *smcibdev = container_of( + work, struct smc_ib_device, port_event_work); + u8 port_idx; + + for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) { + smc_ib_remember_port_attr(smcibdev, port_idx + 1); + clear_bit(port_idx, &smcibdev->port_event_mask); + } +} + +/* can be called in IRQ context */ +static void smc_ib_global_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + struct smc_ib_device *smcibdev; + u8 port_idx; + + smcibdev = container_of(handler, struct smc_ib_device, event_handler); + if (!smc_pnet_find_ib(smcibdev->ibdev->name)) + return; + + switch (ibevent->event) { + case IB_EVENT_PORT_ERR: + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); + /* fall through */ + case IB_EVENT_DEVICE_FATAL: + /* tbd in follow-on patch: + * abnormal close of corresponding connections + */ + break; + case IB_EVENT_PORT_ACTIVE: + port_idx = ibevent->element.port_num - 1; + set_bit(port_idx, &smcibdev->port_event_mask); + schedule_work(&smcibdev->port_event_work); + break; + default: + break; + } +} + +void smc_ib_dealloc_protection_domain(struct smc_link *lnk) +{ + ib_dealloc_pd(lnk->roce_pd); + lnk->roce_pd = NULL; +} + +int smc_ib_create_protection_domain(struct smc_link *lnk) +{ + int rc; + + lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0); + rc = PTR_ERR_OR_ZERO(lnk->roce_pd); + if (IS_ERR(lnk->roce_pd)) + lnk->roce_pd = NULL; + return rc; +} + +static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv) +{ + switch (ibevent->event) { + case IB_EVENT_DEVICE_FATAL: + case IB_EVENT_GID_CHANGE: + case IB_EVENT_PORT_ERR: + case IB_EVENT_QP_ACCESS_ERR: + /* tbd in follow-on patch: + * abnormal close of corresponding connections + */ + break; + default: + break; + } +} + +void smc_ib_destroy_queue_pair(struct smc_link *lnk) +{ + ib_destroy_qp(lnk->roce_qp); + lnk->roce_qp = NULL; +} + +/* create a queue pair within the protection domain for a link */ +int smc_ib_create_queue_pair(struct smc_link *lnk) +{ + struct ib_qp_init_attr qp_attr = { + .event_handler = smc_ib_qp_event_handler, + .qp_context = lnk, + .send_cq = lnk->smcibdev->roce_cq_send, + .recv_cq = lnk->smcibdev->roce_cq_recv, + .srq = NULL, + .cap = { + .max_send_wr = SMC_WR_BUF_CNT, + /* include unsolicited rdma_writes as well, + * there are max. 
2 RDMA_WRITE per 1 WR_SEND + */ + .max_recv_wr = SMC_WR_BUF_CNT * 3, + .max_send_sge = SMC_IB_MAX_SEND_SGE, + .max_recv_sge = 1, + .max_inline_data = SMC_WR_TX_SIZE, + }, + .sq_sig_type = IB_SIGNAL_REQ_WR, + .qp_type = IB_QPT_RC, + }; + int rc; + + lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); + rc = PTR_ERR_OR_ZERO(lnk->roce_qp); + if (IS_ERR(lnk->roce_qp)) + lnk->roce_qp = NULL; + else + smc_wr_remember_qp_attr(lnk); + return rc; +} + +/* map a new TX or RX buffer to DMA */ +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + int rc = 0; + + if (buf_slot->dma_addr[SMC_SINGLE_LINK]) + return rc; /* already mapped */ + buf_slot->dma_addr[SMC_SINGLE_LINK] = + ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, + buf_size, data_direction); + if (ib_dma_mapping_error(smcibdev->ibdev, + buf_slot->dma_addr[SMC_SINGLE_LINK])) + rc = -EIO; + return rc; +} + +void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + if (!buf_slot->dma_addr[SMC_SINGLE_LINK]) + return; /* already unmapped */ + ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size, + data_direction); + buf_slot->dma_addr[SMC_SINGLE_LINK] = 0; +} + +static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct net_device *ndev; + int rc; + + rc = ib_query_gid(smcibdev->ibdev, ibport, 0, + &smcibdev->gid[ibport - 1], NULL); + /* the SMC protocol requires specification of the roce MAC address; + * if net_device cannot be determined, it can be derived from gid 0 + */ + ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); + if (ndev) { + memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); + } else if (!rc) { + memcpy(&smcibdev->mac[ibport - 1][0], + &smcibdev->gid[ibport - 1].raw[8], 3); + memcpy(&smcibdev->mac[ibport - 1][3], + &smcibdev->gid[ibport - 1].raw[13], 3); + smcibdev->mac[ibport - 1][0] &= ~0x02; + } + return rc; +} + +/* Create an identifier unique for this instance of SMC-R. + * The MAC-address of the first active registered IB device + * plus a random 2-byte number is used to create this identifier. + * This name is delivered to the peer during connection initialization. 
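The identifier described here is a small byte array: two random instance bytes followed by the MAC address of the first active port. A stand-alone illustration of that layout, assuming SMC_SYSTEMID_LEN is 8 (the constant itself is defined outside this hunk):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define SYSTEMID_LEN 8	/* assumed value of SMC_SYSTEMID_LEN */

	/* layout: [0..1] random bytes, [2..7] MAC of the first active port */
	static void build_systemid(uint8_t id[SYSTEMID_LEN],
				   const uint8_t rnd[2], const uint8_t mac[6])
	{
		memcpy(&id[0], rnd, 2);
		memcpy(&id[2], mac, 6);
	}

	int main(void)
	{
		uint8_t rnd[2] = { 0x4a, 0x7f };	/* stands in for get_random_bytes() */
		uint8_t mac[6] = { 0x02, 0x00, 0x00, 0x12, 0x34, 0x56 };
		uint8_t id[SYSTEMID_LEN];
		int i;

		build_systemid(id, rnd, mac);
		for (i = 0; i < SYSTEMID_LEN; i++)
			printf("%02x", id[i]);
		printf("\n");	/* prints 4a7f020000123456 */
		return 0;
	}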
+ */ +static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev, + u8 ibport) +{ + memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + get_random_bytes(&local_systemid[0], 2); +} + +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) +{ + return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; +} + +int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) +{ + int rc; + + memset(&smcibdev->pattr[ibport - 1], 0, + sizeof(smcibdev->pattr[ibport - 1])); + rc = ib_query_port(smcibdev->ibdev, ibport, + &smcibdev->pattr[ibport - 1]); + if (rc) + goto out; + rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); + if (rc) + goto out; + if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET, + sizeof(local_systemid)) && + smc_ib_port_active(smcibdev, ibport)) + /* create unique system identifier */ + smc_ib_define_local_systemid(smcibdev, ibport); +out: + return rc; +} + +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) +{ + struct ib_cq_init_attr cqattr = { + .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 }; + long rc; + + smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, + smc_wr_tx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send); + if (IS_ERR(smcibdev->roce_cq_send)) { + smcibdev->roce_cq_send = NULL; + return rc; + } + smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev, + smc_wr_rx_cq_handler, NULL, + smcibdev, &cqattr); + rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv); + if (IS_ERR(smcibdev->roce_cq_recv)) { + smcibdev->roce_cq_recv = NULL; + goto err; + } + INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev, + smc_ib_global_event_handler); + ib_register_event_handler(&smcibdev->event_handler); + smc_wr_add_dev(smcibdev); + smcibdev->initialized = 1; + return rc; + +err: + ib_destroy_cq(smcibdev->roce_cq_send); + return rc; +} + +static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) +{ + if (!smcibdev->initialized) + return; + smc_wr_remove_dev(smcibdev); + ib_unregister_event_handler(&smcibdev->event_handler); + ib_destroy_cq(smcibdev->roce_cq_recv); + ib_destroy_cq(smcibdev->roce_cq_send); +} + +static struct ib_client smc_ib_client; + +/* callback function for ib_register_client() */ +static void smc_ib_add_dev(struct ib_device *ibdev) +{ + struct smc_ib_device *smcibdev; + + if (ibdev->node_type != RDMA_NODE_IB_CA) + return; + + smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); + if (!smcibdev) + return; + + smcibdev->ibdev = ibdev; + INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); + + spin_lock(&smc_ib_devices.lock); + list_add_tail(&smcibdev->list, &smc_ib_devices.list); + spin_unlock(&smc_ib_devices.lock); + ib_set_client_data(ibdev, &smc_ib_client, smcibdev); +} + +/* callback function for ib_register_client() */ +static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) +{ + struct smc_ib_device *smcibdev; + + smcibdev = ib_get_client_data(ibdev, &smc_ib_client); + ib_set_client_data(ibdev, &smc_ib_client, NULL); + spin_lock(&smc_ib_devices.lock); + list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ + spin_unlock(&smc_ib_devices.lock); + smc_pnet_remove_by_ibdev(smcibdev); + smc_ib_cleanup_per_ibdev(smcibdev); + kfree(smcibdev); +} + +static struct ib_client smc_ib_client = { + .name = "smc_ib", + .add = smc_ib_add_dev, + .remove = smc_ib_remove_dev, +}; + +int __init smc_ib_register_client(void) +{ + return ib_register_client(&smc_ib_client); +} + +void 
smc_ib_unregister_client(void) +{ + ib_unregister_client(&smc_ib_client); +} diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h new file mode 100644 index 000000000000..a95f74bb5569 --- /dev/null +++ b/net/smc/smc_ib.h @@ -0,0 +1,71 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for IB environment + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com> + */ + +#ifndef _SMC_IB_H +#define _SMC_IB_H + +#include <linux/if_ether.h> +#include <rdma/ib_verbs.h> + +#define SMC_MAX_PORTS 2 /* Max # of ports */ +#define SMC_GID_SIZE sizeof(union ib_gid) + +#define SMC_IB_MAX_SEND_SGE 2 + +struct smc_ib_devices { /* list of smc ib devices definition */ + struct list_head list; + spinlock_t lock; /* protects list of smc ib devices */ +}; + +extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */ + +struct smc_ib_device { /* ib-device infos for smc */ + struct list_head list; + struct ib_device *ibdev; + struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */ + struct ib_event_handler event_handler; /* global ib_event handler */ + struct ib_cq *roce_cq_send; /* send completion queue */ + struct ib_cq *roce_cq_recv; /* recv completion queue */ + struct tasklet_struct send_tasklet; /* called by send cq handler */ + struct tasklet_struct recv_tasklet; /* called by recv cq handler */ + char mac[SMC_MAX_PORTS][ETH_ALEN]; + /* mac address per port*/ + union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */ + u8 initialized : 1; /* ib dev CQ, evthdl done */ + struct work_struct port_event_work; + unsigned long port_event_mask; +}; + +struct smc_buf_desc; +struct smc_link; + +int smc_ib_register_client(void) __init; +void smc_ib_unregister_client(void); +bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); +void smc_ib_dealloc_protection_domain(struct smc_link *lnk); +int smc_ib_create_protection_domain(struct smc_link *lnk); +void smc_ib_destroy_queue_pair(struct smc_link *lnk); +int smc_ib_create_queue_pair(struct smc_link *lnk); +int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, + struct ib_mr **mr); +int smc_ib_ready_link(struct smc_link *lnk); +int smc_ib_modify_qp_rts(struct smc_link *lnk); +int smc_ib_modify_qp_reset(struct smc_link *lnk); +long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); + + +#endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c new file mode 100644 index 000000000000..c2f9165d13ef --- /dev/null +++ b/net/smc/smc_llc.c @@ -0,0 +1,158 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Link Layer Control (LLC) + * + * For now, we only support the necessary "confirm link" functionality + * which happens for the first RoCE link after successful CLC handshake. + * + * Copyright IBM Corp. 
2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <net/tcp.h> +#include <rdma/ib_verbs.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_clc.h" +#include "smc_llc.h" + +/********************************** send *************************************/ + +struct smc_llc_tx_pend { +}; + +/* handler for send/transmission completion of an LLC msg */ +static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend, + struct smc_link *link, + enum ib_wc_status wc_status) +{ + /* future work: handle wc_status error for recovery and failover */ +} + +/** + * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits + * @link: Pointer to SMC link used for sending LLC control message. + * @wr_buf: Out variable returning pointer to work request payload buffer. + * @pend: Out variable returning pointer to private pending WR tracking. + * It's the context the transmit complete handler will get. + * + * Reserves and pre-fills an entry for a pending work request send/tx. + * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx. + * Can sleep due to smc_get_ctrl_buf (if not in softirq context). + * + * Return: 0 on success, otherwise an error value. + */ +static int smc_llc_add_pending_send(struct smc_link *link, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE, + "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)"); + BUILD_BUG_ON_MSG( + sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE, + "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); + BUILD_BUG_ON_MSG( + sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, + "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)"); + return 0; +} + +/* high-level API to send LLC confirm link */ +int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + struct smc_llc_msg_confirm_link *confllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + confllc = (struct smc_llc_msg_confirm_link *)wr_buf; + memset(confllc, 0, sizeof(*confllc)); + confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; + confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + if (reqresp == SMC_LLC_RESP) + confllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(confllc->sender_mac, mac, ETH_ALEN); + memcpy(confllc->sender_gid, gid, SMC_GID_SIZE); + hton24(confllc->sender_qp_num, link->roce_qp->qp_num); + /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */ + memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LINKS_PER_LGR_MAX; + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/********************************* receive ***********************************/ + +static void smc_llc_rx_confirm_link(struct smc_link *link, + struct smc_llc_msg_confirm_link *llc) +{ + struct smc_link_group *lgr; + + lgr = container_of(link, struct 
smc_link_group, lnk[SMC_SINGLE_LINK]); + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (lgr->role == SMC_SERV) + complete(&link->llc_confirm_resp); + } else { + if (lgr->role == SMC_CLNT) { + link->link_id = llc->link_num; + complete(&link->llc_confirm); + } + } +} + +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK) + smc_llc_rx_confirm_link(link, &llc->confirm_link); +} + +/***************************** init, exit, misc ******************************/ + +static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK + }, + { + .handler = NULL, + } +}; + +int __init smc_llc_init(void) +{ + struct smc_wr_rx_handler *handler; + int rc = 0; + + for (handler = smc_llc_rx_handlers; handler->handler; handler++) { + INIT_HLIST_NODE(&handler->list); + rc = smc_wr_rx_register_handler(handler); + if (rc) + break; + } + return rc; +} diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h new file mode 100644 index 000000000000..b472f853953a --- /dev/null +++ b/net/smc/smc_llc.h @@ -0,0 +1,63 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for LLC (link layer control) message handling + * + * Copyright IBM Corp. 2016 + * + * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com> + * Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_LLC_H +#define SMC_LLC_H + +#include "smc_wr.h" + +#define SMC_LLC_FLAG_RESP 0x80 + +#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) + +enum smc_llc_reqresp { + SMC_LLC_REQ, + SMC_LLC_RESP +}; + +enum smc_llc_msg_type { + SMC_LLC_CONFIRM_LINK = 0x01, +}; + +#define SMC_LLC_DATA_LEN 40 + +struct smc_llc_hdr { + struct smc_wr_rx_hdr common; + u8 length; /* 44 */ + u8 reserved; + u8 flags; +}; + +struct smc_llc_msg_confirm_link { /* type 0x01 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 link_uid[SMC_LGR_ID_SIZE]; + u8 max_links; + u8 reserved[9]; +}; + +union smc_llc_msg { + struct smc_llc_msg_confirm_link confirm_link; + struct { + struct smc_llc_hdr hdr; + u8 data[SMC_LLC_DATA_LEN]; + } raw; +}; + +/* transmit */ +int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, + enum smc_llc_reqresp reqresp); +int smc_llc_init(void) __init; + +#endif /* SMC_LLC_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c new file mode 100644 index 000000000000..9d3e7fb8348d --- /dev/null +++ b/net/smc/smc_pnet.c @@ -0,0 +1,534 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic netlink support functions to configure an SMC-R PNET table + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#include <linux/module.h> +#include <linux/list.h> +#include <linux/ctype.h> +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/if.h> +#include <uapi/linux/smc.h> + +#include <rdma/ib_verbs.h> + +#include "smc_pnet.h" +#include "smc_ib.h" + +#define SMC_MAX_PNET_ID_LEN 16 /* Max. 
length of PNET id */ + +static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { + [SMC_PNETID_NAME] = { + .type = NLA_NUL_STRING, + .len = SMC_MAX_PNET_ID_LEN - 1 + }, + [SMC_PNETID_ETHNAME] = { + .type = NLA_NUL_STRING, + .len = IFNAMSIZ - 1 + }, + [SMC_PNETID_IBNAME] = { + .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX - 1 + }, + [SMC_PNETID_IBPORT] = { .type = NLA_U8 } +}; + +static struct genl_family smc_pnet_nl_family; + +/** + * struct smc_pnettable - SMC PNET table anchor + * @lock: Lock for list action + * @pnetlist: List of PNETIDs + */ +static struct smc_pnettable { + rwlock_t lock; + struct list_head pnetlist; +} smc_pnettable = { + .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist), + .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock) +}; + +/** + * struct smc_pnetentry - pnet identifier name entry + * @list: List node. + * @pnet_name: Pnet identifier name + * @ndev: pointer to network device. + * @smcibdev: Pointer to IB device. + */ +struct smc_pnetentry { + struct list_head list; + char pnet_name[SMC_MAX_PNET_ID_LEN + 1]; + struct net_device *ndev; + struct smc_ib_device *smcibdev; + u8 ib_port; +}; + +/* Check if two RDMA device entries are identical. Use device name and port + * number for comparison. + */ +static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname, + u8 ibport) +{ + return pnetelem->ib_port == ibport && + !strncmp(pnetelem->smcibdev->ibdev->name, ibname, + sizeof(pnetelem->smcibdev->ibdev->name)); +} + +/* Find a pnetid in the pnet table. + */ +static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *found_pnetelem = NULL; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + found_pnetelem = pnetelem; + break; + } + } + read_unlock(&smc_pnettable.lock); + return found_pnetelem; +} + +/* Remove a pnetid from the pnet table. + */ +static int smc_pnet_remove_by_pnetid(char *pnet_name) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (!strncmp(pnetelem->pnet_name, pnet_name, + sizeof(pnetelem->pnet_name))) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given network device from the pnet table. + */ +static int smc_pnet_remove_by_ndev(struct net_device *ndev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->ndev == ndev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Remove a pnet entry mentioning a given ib device from the pnet table. 
+ */ +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + int rc = -ENOENT; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + if (pnetelem->smcibdev == ibdev) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + rc = 0; + break; + } + } + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. + */ +static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem) +{ + struct smc_pnetentry *pnetelem; + int rc = -EEXIST; + + write_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name, + sizeof(new_pnetelem->pnet_name)) || + !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name, + sizeof(new_pnetelem->ndev->name)) || + smc_pnet_same_ibname(pnetelem, + new_pnetelem->smcibdev->ibdev->name, + new_pnetelem->ib_port)) + goto found; + } + list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist); + rc = 0; +found: + write_unlock(&smc_pnettable.lock); + return rc; +} + +/* The limit for pnetid is 16 characters. + * Valid characters should be (single-byte character set) a-z, A-Z, 0-9. + * Lower case letters are converted to upper case. + * Interior blanks should not be used. + */ +static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) +{ + char *bf = skip_spaces(pnet_name); + size_t len = strlen(bf); + char *end = bf + len; + + if (!len) + return false; + while (--end >= bf && isspace(*end)) + ; + if (end - bf >= SMC_MAX_PNET_ID_LEN) + return false; + while (bf <= end) { + if (!isalnum(*bf)) + return false; + *pnetid++ = islower(*bf) ? toupper(*bf) : *bf; + bf++; + } + *pnetid = '\0'; + return true; +} + +/* Find an infiniband device by a given name. The device might not exist. */ +struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +{ + struct smc_ib_device *ibdev; + + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (!strncmp(ibdev->ibdev->name, ib_name, + sizeof(ibdev->ibdev->name))) { + goto out; + } + } + ibdev = NULL; +out: + spin_unlock(&smc_ib_devices.lock); + return ibdev; +} + +/* Parse the supplied netlink attributes and fill a pnetentry structure. + * For ethernet and infiniband device names verify that the devices exist. 
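The rules enforced by smc_pnetid_valid() above (strip surrounding blanks, reject interior blanks and non-alphanumerics, at most 16 characters, upper-cased result) can be exercised outside the kernel; a user-space re-implementation for illustration only:

	#include <ctype.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <string.h>

	#define MAX_PNET_ID_LEN 16	/* mirrors SMC_MAX_PNET_ID_LEN */

	/* returns true and fills pnetid (upper-cased) if pnet_name is acceptable */
	static bool pnetid_valid(const char *pnet_name, char *pnetid)
	{
		const char *bf = pnet_name, *end;

		while (isspace((unsigned char)*bf))		/* leading blanks */
			bf++;
		end = bf + strlen(bf);
		while (end > bf && isspace((unsigned char)end[-1]))	/* trailing */
			end--;
		if (end == bf || end - bf > MAX_PNET_ID_LEN)
			return false;
		for (; bf < end; bf++) {
			if (!isalnum((unsigned char)*bf))
				return false;	/* interior blanks rejected here too */
			*pnetid++ = toupper((unsigned char)*bf);
		}
		*pnetid = '\0';
		return true;
	}

	int main(void)
	{
		char out[MAX_PNET_ID_LEN + 1];

		printf("%d\n", pnetid_valid(" net1 ", out));	/* 1, out == "NET1" */
		printf("%d\n", pnetid_valid("net 1", out));	/* 0: interior blank */
		printf("%d\n", pnetid_valid("averyverylongpnetid17", out));	/* 0: too long */
		return 0;
	}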
+ */ +static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem, + struct nlattr *tb[]) +{ + char *string, *ibname = NULL; + int rc = 0; + + memset(pnetelem, 0, sizeof(*pnetelem)); + INIT_LIST_HEAD(&pnetelem->list); + if (tb[SMC_PNETID_NAME]) { + string = (char *)nla_data(tb[SMC_PNETID_NAME]); + if (!smc_pnetid_valid(string, pnetelem->pnet_name)) { + rc = -EINVAL; + goto error; + } + } + if (tb[SMC_PNETID_ETHNAME]) { + string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); + pnetelem->ndev = dev_get_by_name(net, string); + if (!pnetelem->ndev) + return -ENOENT; + } + if (tb[SMC_PNETID_IBNAME]) { + ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + ibname = strim(ibname); + pnetelem->smcibdev = smc_pnet_find_ib(ibname); + if (!pnetelem->smcibdev) { + rc = -ENOENT; + goto error; + } + } + if (tb[SMC_PNETID_IBPORT]) { + pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (pnetelem->ib_port > SMC_MAX_PORTS) { + rc = -EINVAL; + goto error; + } + } + return 0; + +error: + if (pnetelem->ndev) + dev_put(pnetelem->ndev); + return rc; +} + +/* Convert an smc_pnetentry to a netlink attribute sequence */ +static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem) +{ + if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) || + nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) || + nla_put_string(msg, SMC_PNETID_IBNAME, + pnetelem->smcibdev->ibdev->name) || + nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) + return -1; + return 0; +} + +/* Retrieve one PNETID entry */ +static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem; + struct sk_buff *msg; + void *hdr; + int rc; + + pnetelem = smc_pnet_find_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); + if (!pnetelem) + return -ENOENT; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, + &smc_pnet_nl_family, 0, SMC_PNETID_GET); + if (!hdr) { + rc = -EMSGSIZE; + goto err_out; + } + + if (smc_pnet_set_nla(msg, pnetelem)) { + rc = -ENOBUFS; + goto err_out; + } + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +err_out: + nlmsg_free(msg); + return rc; +} + +static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct smc_pnetentry *pnetelem; + int rc; + + pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); + if (!pnetelem) + return -ENOMEM; + rc = smc_pnet_fill_entry(net, pnetelem, info->attrs); + if (!rc) + rc = smc_pnet_enter(pnetelem); + if (rc) { + kfree(pnetelem); + return rc; + } + rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port); + if (rc) + smc_pnet_remove_by_pnetid(pnetelem->pnet_name); + return rc; +} + +static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) +{ + return smc_pnet_remove_by_pnetid( + (char *)nla_data(info->attrs[SMC_PNETID_NAME])); +} + +static int smc_pnet_dump_start(struct netlink_callback *cb) +{ + cb->args[0] = 0; + return 0; +} + +static int smc_pnet_dumpinfo(struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, + struct smc_pnetentry *pnetelem) +{ + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family, + flags, SMC_PNETID_GET); + if (!hdr) + return -ENOMEM; + if (smc_pnet_set_nla(skb, pnetelem) < 0) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct 
smc_pnetentry *pnetelem; + int idx = 0; + + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (idx++ < cb->args[0]) + continue; + if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + pnetelem)) { + --idx; + break; + } + } + cb->args[0] = idx; + read_unlock(&smc_pnettable.lock); + return skb->len; +} + +/* Remove and delete all pnetids from pnet table. + */ +static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info) +{ + struct smc_pnetentry *pnetelem, *tmp_pe; + + write_lock(&smc_pnettable.lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist, + list) { + list_del(&pnetelem->list); + dev_put(pnetelem->ndev); + kfree(pnetelem); + } + write_unlock(&smc_pnettable.lock); + return 0; +} + +/* SMC_PNETID generic netlink operation definition */ +static const struct genl_ops smc_pnet_ops[] = { + { + .cmd = SMC_PNETID_GET, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_get, + .dumpit = smc_pnet_dump, + .start = smc_pnet_dump_start + }, + { + .cmd = SMC_PNETID_ADD, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_add + }, + { + .cmd = SMC_PNETID_DEL, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_del + }, + { + .cmd = SMC_PNETID_FLUSH, + .flags = GENL_ADMIN_PERM, + .policy = smc_pnet_policy, + .doit = smc_pnet_flush + } +}; + +/* SMC_PNETID family definition */ +static struct genl_family smc_pnet_nl_family = { + .hdrsize = 0, + .name = SMCR_GENL_FAMILY_NAME, + .version = SMCR_GENL_FAMILY_VERSION, + .maxattr = SMC_PNETID_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = smc_pnet_ops, + .n_ops = ARRAY_SIZE(smc_pnet_ops) +}; + +static int smc_pnet_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REBOOT: + case NETDEV_UNREGISTER: + smc_pnet_remove_by_ndev(event_dev); + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block smc_netdev_notifier = { + .notifier_call = smc_pnet_netdev_event +}; + +int __init smc_pnet_init(void) +{ + int rc; + + rc = genl_register_family(&smc_pnet_nl_family); + if (rc) + return rc; + rc = register_netdevice_notifier(&smc_netdev_notifier); + if (rc) + genl_unregister_family(&smc_pnet_nl_family); + return rc; +} + +void smc_pnet_exit(void) +{ + smc_pnet_flush(NULL, NULL); + unregister_netdevice_notifier(&smc_netdev_notifier); + genl_unregister_family(&smc_pnet_nl_family); +} + +/* PNET table analysis for a given sock: + * determine ib_device and port belonging to used internal TCP socket + * ethernet interface. 
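Entries in this table are configured through the generic netlink family registered above; since the ops are marked GENL_ADMIN_PERM, the caller needs CAP_NET_ADMIN. A rough user-space sketch of adding one PNETID with libnl-genl (built with -lnl-3 -lnl-genl-3; attribute and command constants come from the uapi header <linux/smc.h> referenced by this file; the device names are placeholders):

	#include <netlink/netlink.h>
	#include <netlink/msg.h>
	#include <netlink/attr.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/smc.h>	/* SMCR_GENL_FAMILY_NAME, SMC_PNETID_* */

	int main(void)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg = NULL;
		int fam, rc = -1;

		if (!sk)
			return 1;
		if (genl_connect(sk))
			goto out;
		fam = genl_ctrl_resolve(sk, SMCR_GENL_FAMILY_NAME);
		if (fam < 0)
			goto out;
		msg = nlmsg_alloc();
		if (!msg)
			goto out;
		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
			    SMC_PNETID_ADD, SMCR_GENL_FAMILY_VERSION);
		nla_put_string(msg, SMC_PNETID_NAME, "NET1");
		nla_put_string(msg, SMC_PNETID_ETHNAME, "eth0");	/* placeholder */
		nla_put_string(msg, SMC_PNETID_IBNAME, "mlx5_0");	/* placeholder */
		nla_put_u8(msg, SMC_PNETID_IBPORT, 1);
		rc = nl_send_auto(sk, msg);	/* kernel replies with ack or error */
		nlmsg_free(msg);
	out:
		nl_socket_free(sk);
		return rc < 0;
	}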
+ */ +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport) +{ + struct dst_entry *dst = sk_dst_get(sk); + struct smc_pnetentry *pnetelem; + + *smcibdev = NULL; + *ibport = 0; + + if (!dst) + return; + if (!dst->dev) + goto out_rel; + read_lock(&smc_pnettable.lock); + list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { + if (dst->dev == pnetelem->ndev) { + *smcibdev = pnetelem->smcibdev; + *ibport = pnetelem->ib_port; + break; + } + } + read_unlock(&smc_pnettable.lock); +out_rel: + dst_release(dst); +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h new file mode 100644 index 000000000000..32ab3df928ca --- /dev/null +++ b/net/smc/smc_pnet.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * PNET table queries + * + * Copyright IBM Corp. 2016 + * + * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com> + */ + +#ifndef _SMC_PNET_H +#define _SMC_PNET_H + +struct smc_ib_device; + +int smc_pnet_init(void) __init; +void smc_pnet_exit(void); +int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); +struct smc_ib_device *smc_pnet_find_ib(char *ib_name); +void smc_pnet_find_roce_resource(struct sock *sk, + struct smc_ib_device **smcibdev, u8 *ibport); + +#endif diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c new file mode 100644 index 000000000000..5d1878732f46 --- /dev/null +++ b/net/smc/smc_rx.c @@ -0,0 +1,217 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * copy new RMBE data into user space + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_core.h" +#include "smc_cdc.h" +#include "smc_tx.h" /* smc_tx_consumer_update() */ +#include "smc_rx.h" + +/* callback implementation for sk.sk_data_ready() + * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data(). + * indirectly called by smc_cdc_msg_recv_action(). + */ +static void smc_rx_data_ready(struct sock *sk) +{ + struct socket_wq *wq; + + /* derived from sock_def_readable() */ + /* called already in smc_listen_work() */ + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | + POLLRDNORM | POLLRDBAND); + if ((sk->sk_shutdown == SHUTDOWN_MASK) || + (sk->sk_state == SMC_CLOSED)) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + rcu_read_unlock(); +} + +/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted + * @smc smc socket + * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * Returns: + * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. + * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). 
+ */ +static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + int rc; + + if (atomic_read(&conn->bytes_to_rcv)) + return 1; + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + add_wait_queue(sk_sleep(sk), &wait); + rc = sk_wait_event(sk, timeo, + sk->sk_err || + sk->sk_shutdown & RCV_SHUTDOWN || + sock_flag(sk, SOCK_DONE) || + atomic_read(&conn->bytes_to_rcv) || + smc_cdc_rxed_any_close_or_senddone(conn), + &wait); + remove_wait_queue(sk_sleep(sk), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + return rc; +} + +/* rcvbuf consumer: main API called by socket layer. + * called under sk lock. + */ +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, + int flags) +{ + size_t copylen, read_done = 0, read_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + int readable, chunk; + char *rcvbuf_base; + struct sock *sk; + long timeo; + int target; /* Read at least these many bytes */ + int rc; + + if (unlikely(flags & MSG_ERRQUEUE)) + return -EINVAL; /* future work for sk.sk_family == AF_SMC */ + if (flags & MSG_OOB) + return -EINVAL; /* future work */ + + sk = &smc->sk; + if (sk->sk_state == SMC_LISTEN) + return -ENOTCONN; + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + msg->msg_namelen = 0; + /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ + rcvbuf_base = conn->rmb_desc->cpu_addr; + + do { /* while (read_remaining) */ + if (read_done >= target) + break; + + if (atomic_read(&conn->bytes_to_rcv)) + goto copy; + + if (read_done) { + if (sk->sk_err || + sk->sk_state == SMC_CLOSED || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + read_done = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags. + peer_conn_abort) + break; + if (sk->sk_state == SMC_CLOSED) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. 
+ */ + read_done = -ENOTCONN; + break; + } + break; + } + if (signal_pending(current)) { + read_done = sock_intr_errno(timeo); + break; + } + } + + if (!atomic_read(&conn->bytes_to_rcv)) { + smc_rx_wait_data(smc, &timeo); + continue; + } + +copy: + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_rx_wait_data above */ + readable = atomic_read(&conn->bytes_to_rcv); + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + /* determine chunks where to read from rcvbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, + copylen, conn->rmbe_size - cons.count); + chunk_len_sum = chunk_len; + chunk_off = cons.count; + for (chunk = 0; chunk < 2; chunk++) { + if (!(flags & MSG_TRUNC)) { + rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, + chunk_len); + if (rc) { + if (!read_done) + read_done = -EFAULT; + goto out; + } + } + read_remaining -= chunk_len; + read_done += chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in recv ring buffer */ + } + + /* update cursors */ + if (!(flags & MSG_PEEK)) { + smc_curs_add(conn->rmbe_size, &cons, copylen); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->bytes_to_rcv); + /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + smp_mb__after_atomic(); + smc_curs_write(&conn->local_tx_ctrl.cons, + smc_curs_read(&cons, conn), + conn); + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn); + } + } while (read_remaining); +out: + return read_done; +} + +/* Initialize receive properties on connection establishment. NB: not __init! */ +void smc_rx_init(struct smc_sock *smc) +{ + smc->sk.sk_data_ready = smc_rx_data_ready; +} diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h new file mode 100644 index 000000000000..b5b80e1f8b0f --- /dev/null +++ b/net/smc/smc_rx.h @@ -0,0 +1,23 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage RMBE + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_RX_H +#define SMC_RX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" + +void smc_rx_init(struct smc_sock *smc); +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, + int flags); + +#endif /* SMC_RX_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c new file mode 100644 index 000000000000..6e73b28915ea --- /dev/null +++ b/net/smc/smc_tx.c @@ -0,0 +1,483 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer. + * Producer: + * Copy user space data into send buffer, if send buffer space available. + * Consumer: + * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available. + * + * Copyright IBM Corp. 
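Both the receive path above (smc_rx_recvmsg) and the send path in the file that follows (smc_tx_sendmsg) copy through a circular buffer in at most two chunks: one from the current offset up to the end of the buffer and, if the request wraps, a second one starting at offset 0. A stand-alone model of that split, pure arithmetic rather than the kernel code:

	#include <stddef.h>
	#include <stdio.h>

	/* split a copy of copylen bytes starting at offset in a ring of
	 * ring_size bytes into at most two (offset, length) chunks
	 */
	static int ring_chunks(size_t ring_size, size_t offset, size_t copylen,
			       size_t off_out[2], size_t len_out[2])
	{
		size_t first = copylen < ring_size - offset ?
			       copylen : ring_size - offset;

		off_out[0] = offset;
		len_out[0] = first;
		if (first == copylen)
			return 1;		/* unwrapped: single chunk */
		off_out[1] = 0;			/* wrapped: remainder from buffer start */
		len_out[1] = copylen - first;
		return 2;
	}

	int main(void)
	{
		size_t off[2], len[2];
		int n = ring_chunks(16384, 15000, 3000, off, len);
		int i;

		for (i = 0; i < n; i++)	/* prints 15000/1384 and 0/1616 */
			printf("chunk %d: offset %zu len %zu\n", i, off[i], len[i]);
		return 0;
	}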
2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/workqueue.h> +#include <net/sock.h> + +#include "smc.h" +#include "smc_wr.h" +#include "smc_cdc.h" +#include "smc_tx.h" + +/***************************** sndbuf producer *******************************/ + +/* callback implementation for sk.sk_write_space() + * to wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * called under sk_socket lock. + */ +static void smc_tx_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + struct smc_sock *smc = smc_sk(sk); + struct socket_wq *wq; + + /* similar to sk_stream_write_space */ + if (atomic_read(&smc->conn.sndbuf_space) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (skwq_has_sleeper(wq)) + wake_up_interruptible_poll(&wq->wait, + POLLOUT | POLLWRNORM | + POLLWRBAND); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); + rcu_read_unlock(); + } +} + +/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). + */ +void smc_tx_sndbuf_nonfull(struct smc_sock *smc) +{ + if (smc->sk.sk_socket && + test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) + smc->sk.sk_write_space(&smc->sk); +} + +/* blocks sndbuf producer until at least one byte of free space available */ +static int smc_tx_wait_memory(struct smc_sock *smc, int flags) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + bool noblock; + long timeo; + int rc = 0; + + /* similar to sk_stream_wait_memory */ + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + noblock = timeo ? false : true; + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { + rc = -EPIPE; + break; + } + if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -ECONNRESET; + break; + } + if (!timeo) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + rc = -EAGAIN; + break; + } + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + if (atomic_read(&conn->sndbuf_space)) + break; /* at least 1 byte of free space available */ + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, &timeo, + sk->sk_err || + (sk->sk_shutdown & SEND_SHUTDOWN) || + smc_cdc_rxed_any_close_or_senddone(conn) || + atomic_read(&conn->sndbuf_space), + &wait); + sk->sk_write_pending--; + } + remove_wait_queue(sk_sleep(sk), &wait); + return rc; +} + +/* sndbuf producer: main API called by socket layer. + * called under sock lock. 
+ */ +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) +{ + size_t copylen, send_done = 0, send_remaining = len; + size_t chunk_len, chunk_off, chunk_len_sum; + struct smc_connection *conn = &smc->conn; + union smc_host_cursor prep; + struct sock *sk = &smc->sk; + char *sndbuf_base; + int tx_cnt_prep; + int writespace; + int rc, chunk; + + /* This should be in poll */ + sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { + rc = -EPIPE; + goto out_err; + } + + while (msg_data_left(msg)) { + if (sk->sk_state == SMC_INIT) + return -ENOTCONN; + if (smc->sk.sk_shutdown & SEND_SHUTDOWN || + (smc->sk.sk_err == ECONNABORTED) || + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + return -EPIPE; + if (smc_cdc_rxed_any_close(conn)) + return send_done ?: -ECONNRESET; + + if (!atomic_read(&conn->sndbuf_space)) { + rc = smc_tx_wait_memory(smc, msg->msg_flags); + if (rc) { + if (send_done) + return send_done; + goto out_err; + } + continue; + } + + /* initialize variables for 1st iteration of subsequent loop */ + /* could be just 1 byte, even after smc_tx_wait_memory above */ + writespace = atomic_read(&conn->sndbuf_space); + /* not more than what user space asked for */ + copylen = min_t(size_t, send_remaining, writespace); + /* determine start of sndbuf */ + sndbuf_base = conn->sndbuf_desc->cpu_addr; + smc_curs_write(&prep, + smc_curs_read(&conn->tx_curs_prep, conn), + conn); + tx_cnt_prep = prep.count; + /* determine chunks where to write into sndbuf */ + /* either unwrapped case, or 1st chunk of wrapped case */ + chunk_len = min_t(size_t, + copylen, conn->sndbuf_size - tx_cnt_prep); + chunk_len_sum = chunk_len; + chunk_off = tx_cnt_prep; + for (chunk = 0; chunk < 2; chunk++) { + rc = memcpy_from_msg(sndbuf_base + chunk_off, + msg, chunk_len); + if (rc) { + if (send_done) + return send_done; + goto out_err; + } + send_done += chunk_len; + send_remaining -= chunk_len; + + if (chunk_len_sum == copylen) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + chunk_len = copylen - chunk_len; /* remainder */ + chunk_len_sum += chunk_len; + chunk_off = 0; /* modulo offset in send ring buffer */ + } + /* update cursors */ + smc_curs_add(conn->sndbuf_size, &prep, copylen); + smc_curs_write(&conn->tx_curs_prep, + smc_curs_read(&prep, conn), + conn); + /* increased in send tasklet smc_cdc_tx_handler() */ + smp_mb__before_atomic(); + atomic_sub(copylen, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + smp_mb__after_atomic(); + /* since we just produced more new data into sndbuf, + * trigger sndbuf consumer: RDMA write into peer RMBE and CDC + */ + smc_tx_sndbuf_nonempty(conn); + } /* while (msg_data_left(msg)) */ + + return send_done; + +out_err: + rc = sk_stream_error(sk, msg->msg_flags, rc); + /* make sure we wake any epoll edge trigger waiter */ + if (unlikely(rc == -EAGAIN)) + sk->sk_write_space(sk); + return rc; +} + +/***************************** sndbuf consumer *******************************/ + +/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ +static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, + int num_sges, struct ib_sge sges[]) +{ + struct smc_link_group *lgr = conn->lgr; + struct ib_send_wr *failed_wr = NULL; + struct ib_rdma_wr rdma_wr; + struct smc_link *link; + int rc; + + memset(&rdma_wr, 0, sizeof(rdma_wr)); + link = &lgr->lnk[SMC_SINGLE_LINK]; + rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link); + 
rdma_wr.wr.sg_list = sges; + rdma_wr.wr.num_sge = num_sges; + rdma_wr.wr.opcode = IB_WR_RDMA_WRITE; + rdma_wr.remote_addr = + lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + /* RMBE within RMB */ + ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + + /* offset within RMBE */ + peer_rmbe_offset; + rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr); + if (rc) + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + return rc; +} + +/* sndbuf consumer */ +static inline void smc_tx_advance_cursors(struct smc_connection *conn, + union smc_host_cursor *prod, + union smc_host_cursor *sent, + size_t len) +{ + smc_curs_add(conn->peer_rmbe_size, prod, len); + /* increased in recv tasklet smc_cdc_msg_rcv() */ + smp_mb__before_atomic(); + /* data in flight reduces usable snd_wnd */ + atomic_sub(len, &conn->peer_rmbe_space); + /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ + smp_mb__after_atomic(); + smc_curs_add(conn->sndbuf_size, sent, len); +} + +/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; + * usable snd_wnd as max transmit + */ +static int smc_tx_rdma_writes(struct smc_connection *conn) +{ + size_t src_off, src_len, dst_off, dst_len; /* current chunk values */ + size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk; + union smc_host_cursor sent, prep, prod, cons; + struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; + struct smc_link_group *lgr = conn->lgr; + int to_send, rmbespace; + struct smc_link *link; + int num_sges; + int rc; + + /* source: sndbuf */ + smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); + smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + /* cf. wmem_alloc - (snd_max - snd_una) */ + to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); + if (to_send <= 0) + return 0; + + /* destination: RMBE */ + /* cf. snd_wnd */ + rmbespace = atomic_read(&conn->peer_rmbe_space); + if (rmbespace <= 0) + return 0; + smc_curs_write(&prod, + smc_curs_read(&conn->local_tx_ctrl.prod, conn), + conn); + smc_curs_write(&cons, + smc_curs_read(&conn->local_rx_ctrl.cons, conn), + conn); + + /* if usable snd_wnd closes ask peer to advertise once it opens again */ + conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace); + /* cf. 
usable snd_wnd */ + len = min(to_send, rmbespace); + + /* initialize variables for first iteration of subsequent nested loop */ + link = &lgr->lnk[SMC_SINGLE_LINK]; + dst_off = prod.count; + if (prod.wrap == cons.wrap) { + /* the filled destination area is unwrapped, + * hence the available free destination space is wrapped + * and we need 2 destination chunks of sum len; start with 1st + * which is limited by what's available in sndbuf + */ + dst_len = min_t(size_t, + conn->peer_rmbe_size - prod.count, len); + } else { + /* the filled destination area is wrapped, + * hence the available free destination space is unwrapped + * and we need a single destination chunk of entire len + */ + dst_len = len; + } + dst_len_sum = dst_len; + src_off = sent.count; + /* dst_len determines the maximum src_len */ + if (sent.count + dst_len <= conn->sndbuf_size) { + /* unwrapped src case: single chunk of entire dst_len */ + src_len = dst_len; + } else { + /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ + src_len = conn->sndbuf_size - sent.count; + } + src_len_sum = src_len; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { + num_sges = 0; + for (srcchunk = 0; srcchunk < 2; srcchunk++) { + sges[srcchunk].addr = + conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] + + src_off; + sges[srcchunk].length = src_len; + sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; + num_sges++; + src_off += src_len; + if (src_off >= conn->sndbuf_size) + src_off -= conn->sndbuf_size; + /* modulo in send ring */ + if (src_len_sum == dst_len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + src_len = dst_len - src_len; /* remainder */ + src_len_sum += src_len; + } + rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges); + if (rc) + return rc; + if (dst_len_sum == len) + break; /* either on 1st or 2nd iteration */ + /* prepare next (== 2nd) iteration */ + dst_off = 0; /* modulo offset in RMBE ring buffer */ + dst_len = len - dst_len; /* remainder */ + dst_len_sum += dst_len; + src_len = min_t(int, + dst_len, conn->sndbuf_size - sent.count); + src_len_sum = src_len; + } + + smc_tx_advance_cursors(conn, &prod, &sent, len); + /* update connection's cursors with advanced local cursors */ + smc_curs_write(&conn->local_tx_ctrl.prod, + smc_curs_read(&prod, conn), + conn); + /* dst: peer RMBE */ + smc_curs_write(&conn->tx_curs_sent, + smc_curs_read(&sent, conn), + conn); + /* src: local sndbuf */ + + return 0; +} + +/* Wakeup sndbuf consumers from any context (IRQ or process) + * since there is more data to transmit; usable snd_wnd as max transmit + */ +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int rc; + + spin_lock_bh(&conn->send_lock); + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf, + &pend); + if (rc < 0) { + if (rc == -EBUSY) { + struct smc_sock *smc = + container_of(conn, struct smc_sock, conn); + + if (smc->sk.sk_err == ECONNABORTED) { + rc = sock_error(&smc->sk); + goto out_unlock; + } + rc = 0; + schedule_work(&conn->tx_work); + } + goto out_unlock; + } + + rc = smc_tx_rdma_writes(conn); + if (rc) { + smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + (struct smc_wr_tx_pend_priv *)pend); + goto out_unlock; + } + + rc = smc_cdc_msg_send(conn, wr_buf, pend); + +out_unlock: + spin_unlock_bh(&conn->send_lock); + return rc; +} + +/* Wakeup sndbuf consumers from process context + * since there is more data to transmit + */ +static void smc_tx_work(struct work_struct *work) +{ + 
struct smc_connection *conn = container_of(work, + struct smc_connection, + tx_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + lock_sock(&smc->sk); + smc_tx_sndbuf_nonempty(conn); + release_sock(&smc->sk); +} + +void smc_tx_consumer_update(struct smc_connection *conn) +{ + union smc_host_cursor cfed, cons; + struct smc_cdc_tx_pend *pend; + struct smc_wr_buf *wr_buf; + int to_confirm, rc; + + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_curs_write(&cfed, + smc_curs_read(&conn->rx_curs_confirmed, conn), + conn); + to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); + + if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + ((to_confirm > conn->rmbe_update_limit) && + ((to_confirm > (conn->rmbe_size / 2)) || + conn->local_rx_ctrl.prod_flags.write_blocked))) { + rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + &wr_buf, &pend); + if (!rc) + rc = smc_cdc_msg_send(conn, wr_buf, pend); + if (rc < 0) { + schedule_work(&conn->tx_work); + return; + } + smc_curs_write(&conn->rx_curs_confirmed, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } + if (conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; +} + +/***************************** send initialize *******************************/ + +/* Initialize send properties on connection establishment. NB: not __init! */ +void smc_tx_init(struct smc_sock *smc) +{ + smc->sk.sk_write_space = smc_tx_write_space; + INIT_WORK(&smc->conn.tx_work, smc_tx_work); + spin_lock_init(&smc->conn.send_lock); +} diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h new file mode 100644 index 000000000000..1d6a0dcdcfe6 --- /dev/null +++ b/net/smc/smc_tx.h @@ -0,0 +1,35 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Manage send buffer + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> + */ + +#ifndef SMC_TX_H +#define SMC_TX_H + +#include <linux/socket.h> +#include <linux/types.h> + +#include "smc.h" +#include "smc_cdc.h" + +static inline int smc_tx_prepared_sends(struct smc_connection *conn) +{ + union smc_host_cursor sent, prep; + + smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); + smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); + return smc_curs_diff(conn->sndbuf_size, &sent, &prep); +} + +void smc_tx_init(struct smc_sock *smc); +int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); +int smc_tx_sndbuf_nonempty(struct smc_connection *conn); +void smc_tx_sndbuf_nonfull(struct smc_sock *smc); +void smc_tx_consumer_update(struct smc_connection *conn); + +#endif /* SMC_TX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c new file mode 100644 index 000000000000..eadf157418dc --- /dev/null +++ b/net/smc/smc_wr.c @@ -0,0 +1,614 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Work requests (WR) of type ib_post_send or ib_post_recv respectively + * are submitted to either RC SQ or RC RQ respectively + * (reliably connected send/receive queue) + * and become work queue entries (WQEs). + * While an SQ WR/WQE is pending, we track it until transmission completion. + * Through a send or receive completion queue (CQ) respectively, + * we get completion queue entries (CQEs) [aka work completions (WCs)]. 
+ * Since the CQ callback is called from IRQ context, we split work by using + * bottom halves implemented by tasklets. + * + * SMC uses this to exchange LLC (link layer control) + * and CDC (connection data control) messages. + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#include <linux/atomic.h> +#include <linux/hashtable.h> +#include <linux/wait.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_wr.h" + +#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */ + +#define SMC_WR_RX_HASH_BITS 4 +static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS); +static DEFINE_SPINLOCK(smc_wr_rx_hash_lock); + +struct smc_wr_tx_pend { /* control data for a pending send request */ + u64 wr_id; /* work request id sent */ + smc_wr_tx_handler handler; + enum ib_wc_status wc_status; /* CQE status */ + struct smc_link *link; + u32 idx; + struct smc_wr_tx_pend_priv priv; +}; + +/******************************** send queue *********************************/ + +/*------------------------------- completion --------------------------------*/ + +static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) +{ + u32 i; + + for (i = 0; i < link->wr_tx_cnt; i++) { + if (link->wr_tx_pends[i].wr_id == wr_id) + return i; + } + return link->wr_tx_cnt; +} + +static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) +{ + struct smc_wr_tx_pend pnd_snd; + struct smc_link *link; + u32 pnd_snd_idx; + int i; + + link = wc->qp->qp_context; + pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); + if (pnd_snd_idx == link->wr_tx_cnt) + return; + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + if (wc->status) { + struct smc_link_group *lgr; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + /* clear full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[i], 0, + sizeof(link->wr_tx_pends[i])); + memset(&link->wr_tx_bufs[i], 0, + sizeof(link->wr_tx_bufs[i])); + clear_bit(i, link->wr_tx_mask); + } + /* terminate connections of this link group abnormally */ + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + } + if (pnd_snd.handler) + pnd_snd.handler(&pnd_snd.priv, link, wc->status); + wake_up(&link->wr_tx_wait); +} + +static void smc_wr_tx_tasklet_fn(unsigned long data) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int i = 0, rc; + int polled = 0; + +again: + polled++; + do { + rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + for (i = 0; i < rc; i++) + smc_wr_tx_process_cqe(&wc[i]); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->send_tasklet); +} + +/*---------------------------- request submission ---------------------------*/ + +static inline int 
smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) +{ + *idx = link->wr_tx_cnt; + for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { + if (!test_and_set_bit(*idx, link->wr_tx_mask)) + return 0; + } + *idx = link->wr_tx_cnt; + return -EBUSY; +} + +/** + * smc_wr_tx_get_free_slot() - returns buffer for message assembly, + * and sets info for pending transmit tracking + * @link: Pointer to smc_link used to later send the message. + * @handler: Send completion handler function pointer. + * @wr_buf: Out value returns pointer to message buffer. + * @wr_pend_priv: Out value returns pointer serving as handler context. + * + * Return: 0 on success, or -errno on error. + */ +int smc_wr_tx_get_free_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + u32 idx; + int rc; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + if (in_softirq()) { + rc = smc_wr_tx_get_free_slot_index(link, &idx); + if (rc) + return rc; + } else { + rc = wait_event_interruptible_timeout( + link->wr_tx_wait, + (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), + SMC_WR_TX_WAIT_FREE_SLOT_TIME); + if (!rc) { + /* timeout - terminate connections */ + struct smc_link_group *lgr; + + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + return -EPIPE; + } + if (rc == -ERESTARTSYS) + return -EINTR; + if (idx == link->wr_tx_cnt) + return -EPIPE; + } + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = &link->wr_tx_pends[idx]; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = idx; + wr_ib = &link->wr_tx_ibs[idx]; + wr_ib->wr_id = wr_id; + *wr_buf = &link->wr_tx_bufs[idx]; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv) +{ + struct smc_wr_tx_pend *pend; + + pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); + if (pend->idx < link->wr_tx_cnt) { + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pend->idx], 0, + sizeof(link->wr_tx_pends[pend->idx])); + memset(&link->wr_tx_bufs[pend->idx], 0, + sizeof(link->wr_tx_bufs[pend->idx])); + test_and_clear_bit(pend->idx, link->wr_tx_mask); + return 1; + } + + return 0; +} + +/* Send prepared WR slot via ib_post_send. 
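smc_wr_tx_get_free_slot_index() above claims one of the SMC_WR_BUF_CNT send slots by finding a clear bit in wr_tx_mask and setting it, and smc_wr_tx_put_slot() releases it again. A single-threaded userspace model of that bitmask bookkeeping follows; the kernel relies on test_and_set_bit()/test_and_clear_bit() for atomicity, which this sketch deliberately omits:

#include <stdio.h>
#include <errno.h>

#define SLOT_CNT 16		/* mirrors SMC_WR_BUF_CNT */

/* Find a clear bit in 'mask', set it and return its index,
 * or -EBUSY if every slot is taken.
 */
static int get_free_slot(unsigned long *mask, unsigned int cnt)
{
	unsigned int i;

	for (i = 0; i < cnt; i++) {
		if (!(*mask & (1UL << i))) {
			*mask |= 1UL << i;
			return (int)i;
		}
	}
	return -EBUSY;
}

static void put_slot(unsigned long *mask, int idx)
{
	*mask &= ~(1UL << idx);	/* analogous to clearing wr_tx_mask */
}

int main(void)
{
	unsigned long mask = 0;
	int a = get_free_slot(&mask, SLOT_CNT);
	int b = get_free_slot(&mask, SLOT_CNT);

	printf("got slots %d and %d\n", a, b);
	put_slot(&mask, a);
	printf("slot %d free again, mask=%#lx\n", a, mask);
	return 0;
}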
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) +{ + struct ib_send_wr *failed_wr = NULL; + struct smc_wr_tx_pend *pend; + int rc; + + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS); + pend = container_of(priv, struct smc_wr_tx_pend, priv); + rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], + &failed_wr); + if (rc) + smc_wr_tx_put_slot(link, priv); + return rc; +} + +void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data) +{ + struct smc_wr_tx_pend_priv *tx_pend; + struct smc_wr_rx_hdr *wr_rx; + int i; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; + if (wr_rx->type != wr_rx_hdr_type) + continue; + tx_pend = &link->wr_tx_pends[i].priv; + if (filter(tx_pend, data)) + dismisser(tx_pend); + } +} + +bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, unsigned long data) +{ + struct smc_wr_tx_pend_priv *tx_pend; + struct smc_wr_rx_hdr *wr_rx; + int i; + + for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i]; + if (wr_rx->type != wr_rx_hdr_type) + continue; + tx_pend = &link->wr_tx_pends[i].priv; + if (filter(tx_pend, data)) + return true; + } + return false; +} + +/****************************** receive queue ********************************/ + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) +{ + struct smc_wr_rx_handler *h_iter; + int rc = 0; + + spin_lock(&smc_wr_rx_hash_lock); + hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) { + if (h_iter->type == handler->type) { + rc = -EEXIST; + goto out_unlock; + } + } + hash_add(smc_wr_rx_hash, &handler->list, handler->type); +out_unlock: + spin_unlock(&smc_wr_rx_hash_lock); + return rc; +} + +/* Demultiplex a received work request based on the message type to its handler. + * Relies on smc_wr_rx_hash having been completely filled before any IB WRs, + * and not being modified any more afterwards so we don't need to lock it. 
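Receive dispatch keys off the one-byte type field at the start of each buffer: handlers register once, duplicate types are rejected with -EEXIST, and the demultiplexer calls every handler registered for the received type. A self-contained sketch of that register-then-dispatch pattern, using a flat table instead of the kernel hashtable and purely illustrative names:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <errno.h>

#define MAX_HANDLERS 8

struct rx_handler {
	uint8_t type;				/* message type this handler owns */
	void (*fn)(const void *msg, size_t len);
};

static struct rx_handler handlers[MAX_HANDLERS];
static int nr_handlers;

/* Register a handler for one message type; duplicates fail like the
 * kernel code does with -EEXIST.
 */
static int rx_register(uint8_t type, void (*fn)(const void *, size_t))
{
	int i;

	for (i = 0; i < nr_handlers; i++)
		if (handlers[i].type == type)
			return -EEXIST;
	if (nr_handlers == MAX_HANDLERS)
		return -ENOSPC;
	handlers[nr_handlers].type = type;
	handlers[nr_handlers].fn = fn;
	nr_handlers++;
	return 0;
}

/* Dispatch one received message by its leading type byte. */
static void rx_demultiplex(const void *msg, size_t len)
{
	const uint8_t *type = msg;
	int i;

	if (len < 1)
		return;			/* short message, drop it */
	for (i = 0; i < nr_handlers; i++)
		if (handlers[i].type == *type)
			handlers[i].fn(msg, len);
}

static void cdc_like_handler(const void *msg, size_t len)
{
	printf("handled message of %zu bytes\n", len);
}

int main(void)
{
	uint8_t msg[4] = { 0xfe, 1, 2, 3 };	/* type byte 0xfe */

	rx_register(0xfe, cdc_like_handler);
	rx_demultiplex(msg, sizeof(msg));
	return 0;
}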
+ */ +static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_wr_rx_handler *handler; + struct smc_wr_rx_hdr *wr_rx; + u64 temp_wr_id; + u32 index; + + if (wc->byte_len < sizeof(*wr_rx)) + return; /* short message */ + temp_wr_id = wc->wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { + if (handler->type == wr_rx->type) + handler->handler(wc, wr_rx); + } +} + +static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) +{ + struct smc_link *link; + int i; + + for (i = 0; i < num; i++) { + link = wc[i].qp->qp_context; + if (wc[i].status == IB_WC_SUCCESS) { + smc_wr_rx_demultiplex(&wc[i]); + smc_wr_rx_post(link); /* refill WR RX */ + } else { + struct smc_link_group *lgr; + + /* handle status errors */ + switch (wc[i].status) { + case IB_WC_RETRY_EXC_ERR: + case IB_WC_RNR_RETRY_EXC_ERR: + case IB_WC_WR_FLUSH_ERR: + /* terminate connections of this link group + * abnormally + */ + lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + smc_lgr_terminate(lgr); + break; + default: + smc_wr_rx_post(link); /* refill WR RX */ + break; + } + } + } +} + +static void smc_wr_rx_tasklet_fn(unsigned long data) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)data; + struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; + int polled = 0; + int rc; + +again: + polled++; + do { + memset(&wc, 0, sizeof(wc)); + rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); + if (polled == 1) { + ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK + | IB_CQ_REPORT_MISSED_EVENTS); + } + if (!rc) + break; + smc_wr_rx_process_cqes(&wc[0], rc); + } while (rc > 0); + if (polled == 1) + goto again; +} + +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) +{ + struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; + + tasklet_schedule(&dev->recv_tasklet); +} + +int smc_wr_rx_post_init(struct smc_link *link) +{ + u32 i; + int rc = 0; + + for (i = 0; i < link->wr_rx_cnt; i++) + rc = smc_wr_rx_post(link); + return rc; +} + +/***************************** init, exit, misc ******************************/ + +void smc_wr_remember_qp_attr(struct smc_link *lnk) +{ + struct ib_qp_attr *attr = &lnk->qp_attr; + struct ib_qp_init_attr init_attr; + + memset(attr, 0, sizeof(*attr)); + memset(&init_attr, 0, sizeof(init_attr)); + ib_query_qp(lnk->roce_qp, attr, + IB_QP_STATE | + IB_QP_CUR_STATE | + IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_QKEY | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_RQ_PSN | + IB_QP_ALT_PATH | + IB_QP_MIN_RNR_TIMER | + IB_QP_SQ_PSN | + IB_QP_PATH_MIG_STATE | + IB_QP_CAP | + IB_QP_DEST_QPN, + &init_attr); + + lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->qp_attr.cap.max_send_wr); + lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->qp_attr.cap.max_recv_wr); +} + +static void smc_wr_init_sge(struct smc_link *lnk) +{ + u32 i; + + for (i = 0; i < lnk->wr_tx_cnt; i++) { + lnk->wr_tx_sges[i].addr = + lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; + lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_tx_ibs[i].next = NULL; + lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; + lnk->wr_tx_ibs[i].num_sge = 1; + lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; + lnk->wr_tx_ibs[i].send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED | 
IB_SEND_INLINE; + } + for (i = 0; i < lnk->wr_rx_cnt; i++) { + lnk->wr_rx_sges[i].addr = + lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE; + lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_rx_ibs[i].next = NULL; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; + lnk->wr_rx_ibs[i].num_sge = 1; + } +} + +void smc_wr_free_link(struct smc_link *lnk) +{ + struct ib_device *ibdev; + + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + + if (!lnk->smcibdev) + return; + ibdev = lnk->smcibdev->ibdev; + + if (lnk->wr_rx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; + } + if (lnk->wr_tx_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + lnk->wr_tx_dma_addr = 0; + } +} + +void smc_wr_free_link_mem(struct smc_link *lnk) +{ + kfree(lnk->wr_tx_pends); + lnk->wr_tx_pends = NULL; + kfree(lnk->wr_tx_mask); + lnk->wr_tx_mask = NULL; + kfree(lnk->wr_tx_sges); + lnk->wr_tx_sges = NULL; + kfree(lnk->wr_rx_sges); + lnk->wr_rx_sges = NULL; + kfree(lnk->wr_rx_ibs); + lnk->wr_rx_ibs = NULL; + kfree(lnk->wr_tx_ibs); + lnk->wr_tx_ibs = NULL; + kfree(lnk->wr_tx_bufs); + lnk->wr_tx_bufs = NULL; + kfree(lnk->wr_rx_bufs); + lnk->wr_rx_bufs = NULL; +} + +int smc_wr_alloc_link_mem(struct smc_link *link) +{ + /* allocate link related memory */ + link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + if (!link->wr_tx_bufs) + goto no_mem; + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + GFP_KERNEL); + if (!link->wr_rx_bufs) + goto no_mem_wr_tx_bufs; + link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), + GFP_KERNEL); + if (!link->wr_tx_ibs) + goto no_mem_wr_rx_bufs; + link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_ibs[0]), + GFP_KERNEL); + if (!link->wr_rx_ibs) + goto no_mem_wr_tx_ibs; + link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), + GFP_KERNEL); + if (!link->wr_tx_sges) + goto no_mem_wr_rx_ibs; + link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + sizeof(link->wr_rx_sges[0]), + GFP_KERNEL); + if (!link->wr_rx_sges) + goto no_mem_wr_tx_sges; + link->wr_tx_mask = kzalloc( + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask), + GFP_KERNEL); + if (!link->wr_tx_mask) + goto no_mem_wr_rx_sges; + link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_pends[0]), + GFP_KERNEL); + if (!link->wr_tx_pends) + goto no_mem_wr_tx_mask; + return 0; + +no_mem_wr_tx_mask: + kfree(link->wr_tx_mask); +no_mem_wr_rx_sges: + kfree(link->wr_rx_sges); +no_mem_wr_tx_sges: + kfree(link->wr_tx_sges); +no_mem_wr_rx_ibs: + kfree(link->wr_rx_ibs); +no_mem_wr_tx_ibs: + kfree(link->wr_tx_ibs); +no_mem_wr_rx_bufs: + kfree(link->wr_rx_bufs); +no_mem_wr_tx_bufs: + kfree(link->wr_tx_bufs); +no_mem: + return -ENOMEM; +} + +void smc_wr_remove_dev(struct smc_ib_device *smcibdev) +{ + tasklet_kill(&smcibdev->recv_tasklet); + tasklet_kill(&smcibdev->send_tasklet); +} + +void smc_wr_add_dev(struct smc_ib_device *smcibdev) +{ + tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn, + (unsigned long)smcibdev); + tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn, + (unsigned long)smcibdev); +} + +int smc_wr_create_link(struct smc_link *lnk) +{ + struct ib_device *ibdev = lnk->smcibdev->ibdev; + int rc = 0; + + smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); + lnk->wr_rx_id = 0; + 
lnk->wr_rx_dma_addr = ib_dma_map_single( + ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { + lnk->wr_rx_dma_addr = 0; + rc = -EIO; + goto out; + } + lnk->wr_tx_dma_addr = ib_dma_map_single( + ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) { + rc = -EIO; + goto dma_unmap; + } + smc_wr_init_sge(lnk); + memset(lnk->wr_tx_mask, 0, + BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + return rc; + +dma_unmap: + ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, + SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + DMA_FROM_DEVICE); + lnk->wr_rx_dma_addr = 0; +out: + return rc; +} diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h new file mode 100644 index 000000000000..0b9beeda6053 --- /dev/null +++ b/net/smc/smc_wr.h @@ -0,0 +1,106 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Work Requests exploiting Infiniband API + * + * Copyright IBM Corp. 2016 + * + * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> + */ + +#ifndef SMC_WR_H +#define SMC_WR_H + +#include <linux/atomic.h> +#include <rdma/ib_verbs.h> +#include <asm/div64.h> + +#include "smc.h" +#include "smc_core.h" + +#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */ +#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ + +#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) +#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ) + +#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ + +#define SMC_WR_TX_PEND_PRIV_SIZE 32 + +struct smc_wr_tx_pend_priv { + u8 priv[SMC_WR_TX_PEND_PRIV_SIZE]; +}; + +typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *, + struct smc_link *, + enum ib_wc_status); + +typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *, + unsigned long); + +typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *); + +struct smc_wr_rx_handler { + struct hlist_node list; /* hash table collision resolution */ + void (*handler)(struct ib_wc *, void *); + u8 type; +}; + +/* Only used by RDMA write WRs. 
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly + */ +static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link) +{ + return atomic_long_inc_return(&link->wr_tx_id); +} + +static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) +{ + atomic_long_set(wr_tx_id, val); +} + +/* post a new receive work request to fill a completed old work request entry */ +static inline int smc_wr_rx_post(struct smc_link *link) +{ + struct ib_recv_wr *bad_recv_wr = NULL; + int rc; + u64 wr_id, temp_wr_id; + u32 index; + + wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */ + temp_wr_id = wr_id; + index = do_div(temp_wr_id, link->wr_rx_cnt); + link->wr_rx_ibs[index].wr_id = wr_id; + rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr); + return rc; +} + +int smc_wr_create_link(struct smc_link *lnk); +int smc_wr_alloc_link_mem(struct smc_link *lnk); +void smc_wr_free_link(struct smc_link *lnk); +void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_remember_qp_attr(struct smc_link *lnk); +void smc_wr_remove_dev(struct smc_ib_device *smcibdev); +void smc_wr_add_dev(struct smc_ib_device *smcibdev); + +int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, + struct smc_wr_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_put_slot(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *wr_pend_priv); +void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); +bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, unsigned long data); +void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, + smc_wr_tx_filter filter, + smc_wr_tx_dismisser dismisser, + unsigned long data); + +int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); +int smc_wr_rx_post_init(struct smc_link *link); +void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context); + +#endif /* SMC_WR_H */ diff --git a/net/socket.c b/net/socket.c index 0758e13754e2..b7a63d5bc915 100644 --- a/net/socket.c +++ b/net/socket.c @@ -287,7 +287,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static void init_inodecache(void) { sock_inode_cachep = kmem_cache_create("sock_inode_cache", sizeof(struct socket_alloc), @@ -296,9 +296,7 @@ static int init_inodecache(void) SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); - if (sock_inode_cachep == NULL) - return -ENOMEM; - return 0; + BUG_ON(sock_inode_cachep == NULL); } static const struct super_operations sockfs_ops = { @@ -1948,6 +1946,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, ctl_buf = msg_sys->msg_control; ctl_len = msg_sys->msg_controllen; } else if (ctl_len) { + BUILD_BUG_ON(sizeof(struct cmsghdr) != + CMSG_ALIGN(sizeof(struct cmsghdr))); if (ctl_len > sizeof(ctl)) { ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); if (ctl_buf == NULL) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index aa1babbea385..c35fad3e08e8 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) * and to identified node local sockets * @net: the applicable net namespace * @list: chain of buffers containing message - * Consumes the buffer chain, except when returning -ELINKCONG + * Consumes the buffer chain. 
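smc_wr_rx_post() above stamps each receive WR with an ever-increasing 64-bit wr_id and later recovers the ring slot with do_div(), i.e. wr_id modulo wr_rx_cnt. The mapping in isolation, as a userspace sketch with constants and names that are mine rather than the patch's:

#include <stdio.h>
#include <stdint.h>

#define RX_RING_CNT 48	/* e.g. SMC_WR_BUF_CNT * 3 receive buffers */

/* Map a monotonically increasing 64-bit work-request id back to its slot
 * in a fixed-size ring, as do_div(wr_id, wr_rx_cnt) does in the kernel.
 */
static uint32_t wr_id_to_index(uint64_t wr_id, uint32_t ring_cnt)
{
	return (uint32_t)(wr_id % ring_cnt);
}

int main(void)
{
	uint64_t wr_id = 0;
	int i;

	for (i = 0; i < 5; i++) {
		wr_id++;	/* next receive WR posted */
		printf("wr_id=%llu -> slot %u\n",
		       (unsigned long long)wr_id,
		       wr_id_to_index(wr_id, RX_RING_CNT));
	}
	return 0;
}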
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) @@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) tipc_bcast_unlock(net); /* Don't send to local node if adding to link failed */ - if (unlikely(rc)) { + if (unlikely(rc && (rc != -ELINKCONG))) { __skb_queue_purge(&rcvq); return rc; } @@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) tipc_bcbase_xmit(net, &xmitq); tipc_sk_mcast_rcv(net, &rcvq, &inputq); __skb_queue_purge(list); - return 0; + return rc; } /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link diff --git a/net/tipc/link.c b/net/tipc/link.c index 4e8647aef01c..b0f8646e0631 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) /** * link_schedule_user - schedule a message sender for wakeup after congestion - * @link: congested link - * @list: message that was attempted sent + * @l: congested link + * @hdr: header of message that is being sent * Create pseudo msg to send back to user when congestion abates - * Does not consume buffer list */ -static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) +static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) { - struct tipc_msg *msg = buf_msg(skb_peek(list)); - int imp = msg_importance(msg); - u32 oport = msg_origport(msg); - u32 addr = tipc_own_addr(link->net); + u32 dnode = tipc_own_addr(l->net); + u32 dport = msg_origport(hdr); struct sk_buff *skb; - /* This really cannot happen... */ - if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { - pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - return -ENOBUFS; - } - /* Non-blocking sender: */ - if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) - return -ELINKCONG; - /* Create and schedule wakeup pseudo message */ skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, - addr, addr, oport, 0, 0); + dnode, l->addr, dport, 0, 0); if (!skb) return -ENOBUFS; - TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); - TIPC_SKB_CB(skb)->chain_imp = imp; - skb_queue_tail(&link->wakeupq, skb); - link->stats.link_congs++; + msg_set_dest_droppable(buf_msg(skb), true); + TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); + skb_queue_tail(&l->wakeupq, skb); + l->stats.link_congs++; return -ELINKCONG; } /** * link_prepare_wakeup - prepare users for wakeup after congestion - * @link: congested link - * Move a number of waiting users, as permitted by available space in - * the send queue, from link wait queue to node wait queue for wakeup + * @l: congested link + * Wake up a number of waiting users, as permitted by available space + * in the send queue */ void link_prepare_wakeup(struct tipc_link *l) { - int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,}; - int imp, lim; struct sk_buff *skb, *tmp; + int imp, i = 0; skb_queue_walk_safe(&l->wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; - lim = l->backlog[imp].limit; - pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; - if ((pnd[imp] + l->backlog[imp].len) >= lim) + if (l->backlog[imp].len < l->backlog[imp].limit) { + skb_unlink(skb, &l->wakeupq); + skb_queue_tail(l->inputq, skb); + } else if (i++ > 10) { break; - skb_unlink(skb, &l->wakeupq); - skb_queue_tail(l->inputq, skb); + } } } @@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l) * @list: chain of buffers containing message * @xmitq: returned list of packets to be sent by caller * - * 
Consumes the buffer chain, except when returning -ELINKCONG, - * since the caller then may want to make more send attempts. + * Consumes the buffer chain. * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ @@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, { struct tipc_msg *hdr = buf_msg(skb_peek(list)); unsigned int maxwin = l->window; - unsigned int i, imp = msg_importance(hdr); + int imp = msg_importance(hdr); unsigned int mtu = l->mtu; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; @@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff *skb, *_skb, *bskb; int pkt_cnt = skb_queue_len(list); + int rc = 0; - /* Match msg importance against this and all higher backlog limits: */ - if (!skb_queue_empty(backlogq)) { - for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { - if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) - return link_schedule_user(l, list); - } - } if (unlikely(msg_size(hdr) > mtu)) { skb_queue_purge(list); return -EMSGSIZE; } + /* Allow oversubscription of one data msg per source at congestion */ + if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { + if (imp == TIPC_SYSTEM_IMPORTANCE) { + pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); + return -ENOBUFS; + } + rc = link_schedule_user(l, hdr); + } + if (pkt_cnt > 1) { l->stats.sent_fragmented++; l->stats.sent_fragments += pkt_cnt; @@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; - return 0; + return rc; } void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 2c3dc38abf9c..f07b51e3f6f1 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -98,8 +98,6 @@ struct tipc_skb_cb { u32 bytes_read; struct sk_buff *tail; bool validated; - bool wakeup_pending; - u16 chain_sz; u16 chain_imp; u16 ackers; }; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e190460fe0d3..5a86df1e5fc2 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -608,7 +608,7 @@ not_found: * Returns non-zero if any off-node ports overlap */ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct tipc_plist *dports) + u32 limit, struct list_head *dports) { struct name_seq *seq; struct sub_seq *sseq; @@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, info = sseq->info; list_for_each_entry(publ, &info->node_list, node_list) { if (publ->scope <= limit) - tipc_plist_push(dports, publ->ref); + u32_push(dports, publ->ref); } if (info->cluster_list_size != info->node_list_size) @@ -1022,40 +1022,84 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -void tipc_plist_push(struct tipc_plist *pl, u32 port) +struct u32_item { + struct list_head list; + u32 value; +}; + +bool u32_find(struct list_head *l, u32 value) { - struct tipc_plist *nl; + struct u32_item *item; - if (likely(!pl->port)) { - pl->port = port; - return; + list_for_each_entry(item, l, list) { + if (item->value == value) + return true; } - if (pl->port == port) - return; - list_for_each_entry(nl, &pl->list, list) { - if (nl->port == port) - return; + return false; +} + +bool u32_push(struct list_head *l, u32 value) +{ + struct u32_item 
*item; + + list_for_each_entry(item, l, list) { + if (item->value == value) + return false; + } + item = kmalloc(sizeof(*item), GFP_ATOMIC); + if (unlikely(!item)) + return false; + + item->value = value; + list_add(&item->list, l); + return true; +} + +u32 u32_pop(struct list_head *l) +{ + struct u32_item *item; + u32 value = 0; + + if (list_empty(l)) + return 0; + item = list_first_entry(l, typeof(*item), list); + value = item->value; + list_del(&item->list); + kfree(item); + return value; +} + +bool u32_del(struct list_head *l, u32 value) +{ + struct u32_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, l, list) { + if (item->value != value) + continue; + list_del(&item->list); + kfree(item); + return true; } - nl = kmalloc(sizeof(*nl), GFP_ATOMIC); - if (nl) { - nl->port = port; - list_add(&nl->list, &pl->list); + return false; +} + +void u32_list_purge(struct list_head *l) +{ + struct u32_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, l, list) { + list_del(&item->list); + kfree(item); } } -u32 tipc_plist_pop(struct tipc_plist *pl) +int u32_list_len(struct list_head *l) { - struct tipc_plist *nl; - u32 port = 0; + struct u32_item *item; + int i = 0; - if (likely(list_empty(&pl->list))) { - port = pl->port; - pl->port = 0; - return port; + list_for_each_entry(item, l, list) { + i++; } - nl = list_first_entry(&pl->list, typeof(*nl), list); - port = nl->port; - list_del(&nl->list); - kfree(nl); - return port; + return i; } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 1524a73830f7..c89bb3f5c364 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -99,7 +99,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, - u32 limit, struct tipc_plist *dports); + u32 limit, struct list_head *dports); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key); @@ -116,18 +116,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); -struct tipc_plist { - struct list_head list; - u32 port; -}; - -static inline void tipc_plist_init(struct tipc_plist *pl) -{ - INIT_LIST_HEAD(&pl->list); - pl->port = 0; -} - -void tipc_plist_push(struct tipc_plist *pl, u32 port); -u32 tipc_plist_pop(struct tipc_plist *pl); +bool u32_push(struct list_head *l, u32 value); +u32 u32_pop(struct list_head *l); +bool u32_find(struct list_head *l, u32 value); +bool u32_del(struct list_head *l, u32 value); +void u32_list_purge(struct list_head *l); +int u32_list_len(struct list_head *l); #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 9d2f4c2b08ab..2883f6a0ed98 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1167,7 +1167,7 @@ msg_full: * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning -ELINKCONG + * Consumes the buffer chain. 
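The new u32_item helpers above replace tipc_plist with a generic list of unique u32 values: u32_push() inserts only if the value is absent, u32_pop() removes and returns the head, u32_del() drops one value. A compact userspace model with plain malloc() in place of list_head and GFP_ATOMIC; the behaviour is meant to match, the code is not the kernel's:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>

struct u32_item {
	struct u32_item *next;
	uint32_t value;
};

/* Insert 'value' at the head unless it is already on the list. */
static bool u32_push(struct u32_item **l, uint32_t value)
{
	struct u32_item *item;

	for (item = *l; item; item = item->next)
		if (item->value == value)
			return false;		/* already present */
	item = malloc(sizeof(*item));
	if (!item)
		return false;
	item->value = value;
	item->next = *l;
	*l = item;
	return true;
}

/* Remove and return the first value, or 0 if the list is empty. */
static uint32_t u32_pop(struct u32_item **l)
{
	struct u32_item *item = *l;
	uint32_t value;

	if (!item)
		return 0;
	value = item->value;
	*l = item->next;
	free(item);
	return value;
}

/* Remove one occurrence of 'value'; true if it was found. */
static bool u32_del(struct u32_item **l, uint32_t value)
{
	struct u32_item **p, *item;

	for (p = l; (item = *p); p = &item->next) {
		if (item->value != value)
			continue;
		*p = item->next;
		free(item);
		return true;
	}
	return false;
}

int main(void)
{
	struct u32_item *dports = NULL;
	uint32_t port;

	u32_push(&dports, 4001);
	u32_push(&dports, 4002);
	u32_push(&dports, 4001);		/* duplicate, ignored */
	u32_del(&dports, 4002);
	while ((port = u32_pop(&dports)))
		printf("deliver to port %u\n", port);
	return 0;
}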
* Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, @@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); - if (likely(rc == 0)) - tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); - else if (rc == -ENOBUFS) + if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); + else + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); tipc_node_put(n); @@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. - * TODO: Return real return value, and let callers use - * tipc_wait_for_sendpkt() where applicable */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; - int rc; skb_queue_head_init(&head); __skb_queue_tail(&head, skb); - rc = tipc_node_xmit(net, &head, dnode, selector); - if (rc == -ELINKCONG) - kfree_skb(skb); + tipc_node_xmit(net, &head, dnode, selector); return 0; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 800caaa699a1..d2f353934f82 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -67,12 +67,14 @@ enum { * @max_pkt: maximum packet size "hint" used when building messages sent by port * @portid: unique port identity in TIPC socket hash table * @phdr: preformatted message header used when sending messages + * #cong_links: list of congested links * @publications: list of publications for port + * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @probing_state: * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue - * @link_cong: non-zero if owner must sleep because of link congestion + * @cong_link_cnt: number of congested links * @sent_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm @@ -87,13 +89,13 @@ struct tipc_sock { u32 max_pkt; u32 portid; struct tipc_msg phdr; - struct list_head sock_list; + struct list_head cong_links; struct list_head publications; u32 pub_count; uint conn_timeout; atomic_t dupl_rcvcnt; bool probe_unacked; - bool link_cong; + u16 cong_link_cnt; u16 snt_unacked; u16 snd_win; u16 peer_caps; @@ -110,7 +112,6 @@ static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); static int tipc_release(struct socket *sock); static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags); -static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p); static void tipc_sk_timeout(unsigned long data); static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); @@ -119,8 +120,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); -static int __tipc_send_stream(struct socket *sock, struct msghdr *m, - size_t dsz); +static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int 
__tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); static const struct proto_ops packet_ops; @@ -334,6 +334,49 @@ static int tipc_set_sk_state(struct sock *sk, int state) return res; } +static int tipc_sk_sock_err(struct socket *sock, long *timeout) +{ + struct sock *sk = sock->sk; + int err = sock_error(sk); + int typ = sock->type; + + if (err) + return err; + if (typ == SOCK_STREAM || typ == SOCK_SEQPACKET) { + if (sk->sk_state == TIPC_DISCONNECTING) + return -EPIPE; + else if (!tipc_sk_connected(sk)) + return -ENOTCONN; + } + if (!*timeout) + return -EAGAIN; + if (signal_pending(current)) + return sock_intr_errno(*timeout); + + return 0; +} + +#define tipc_wait_for_cond(sock_, timeout_, condition_) \ +({ \ + int rc_ = 0; \ + int done_ = 0; \ + \ + while (!(condition_) && !done_) { \ + struct sock *sk_ = sock->sk; \ + DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ + \ + rc_ = tipc_sk_sock_err(sock_, timeout_); \ + if (rc_) \ + break; \ + prepare_to_wait(sk_sleep(sk_), &wait_, \ + TASK_INTERRUPTIBLE); \ + done_ = sk_wait_event(sk_, timeout_, \ + (condition_), &wait_); \ + remove_wait_queue(sk_sleep(sk_), &wait_); \ + } \ + rc_; \ +}) + /** * tipc_sk_create - create a TIPC socket * @net: network namespace (must be default network) @@ -382,6 +425,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; INIT_LIST_HEAD(&tsk->publications); + INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; tn = net_generic(sock_net(sk), tipc_net_id); tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, @@ -432,9 +476,14 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); + long timeout = CONN_TIMEOUT_DEFAULT; u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; + /* Avoid that hi-prio shutdown msgs bypass msgs in link wakeup queue */ + tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && + !tsk_conn_cong(tsk))); + /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). 
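tipc_wait_for_cond() above folds the old per-call wait helpers into one macro: bail out on socket errors via tipc_sk_sock_err(), otherwise sleep until the caller's condition holds or the timeout runs out. A rough pthread analog of that wait loop, assuming only POSIX threads; it models the check-wait-recheck structure, not the sk_sleep()/woken_wake_function machinery:

#include <stdio.h>
#include <stdbool.h>
#include <errno.h>
#include <time.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int cong_link_cnt = 1;		/* pretend one link is congested */

/* Sleep until cond_fn() reports true or 'ms' milliseconds elapse:
 * test the condition, wait, re-test.  Returns 0 or -EAGAIN on timeout.
 */
static int wait_for_cond(bool (*cond_fn)(void), long ms)
{
	struct timespec deadline;
	int rc = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += ms / 1000;
	deadline.tv_nsec += (ms % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&lock);
	while (!cond_fn()) {
		rc = pthread_cond_timedwait(&cond, &lock, &deadline);
		if (rc == ETIMEDOUT) {
			rc = -EAGAIN;
			break;
		}
		rc = 0;
	}
	pthread_mutex_unlock(&lock);
	return rc;
}

static bool link_uncongested(void)
{
	return cong_link_cnt == 0;
}

static void *wakeup_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	cong_link_cnt = 0;		/* the "SOCK_WAKEUP" arrived */
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, wakeup_thread, NULL);
	printf("wait_for_cond() -> %d\n",
	       wait_for_cond(link_uncongested, 1000));
	pthread_join(t, NULL);
	return 0;
}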
*/ @@ -505,7 +554,8 @@ static int tipc_release(struct socket *sock) /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - + u32_list_purge(&tsk->cong_links); + tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -648,7 +698,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, switch (sk->sk_state) { case TIPC_ESTABLISHED: - if (!tsk->link_cong && !tsk_conn_cong(tsk)) + if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) mask |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: @@ -657,7 +707,7 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); break; case TIPC_OPEN: - if (!tsk->link_cong) + if (!tsk->cong_link_cnt) mask |= POLLOUT; if (tipc_sk_type_connectionless(sk) && (!skb_queue_empty(&sk->sk_receive_queue))) @@ -676,63 +726,48 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, * @sock: socket structure * @seq: destination address * @msg: message to send - * @dsz: total length of message data - * @timeo: timeout to wait for wakeup + * @dlen: length of data to send + * @timeout: timeout to wait for wakeup * * Called from function tipc_sendmsg(), which has done all sanity checks * Returns the number of bytes sent on success, or errno */ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, - struct msghdr *msg, size_t dsz, long timeo) + struct msghdr *msg, size_t dlen, long timeout) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); - struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head pktchain; - struct iov_iter save = msg->msg_iter; - uint mtu; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; int rc; - if (!timeo && tsk->link_cong) - return -ELINKCONG; - - msg_set_type(mhdr, TIPC_MCAST_MSG); - msg_set_lookup_scope(mhdr, TIPC_CLUSTER_SCOPE); - msg_set_destport(mhdr, 0); - msg_set_destnode(mhdr, 0); - msg_set_nametype(mhdr, seq->type); - msg_set_namelower(mhdr, seq->lower); - msg_set_nameupper(mhdr, seq->upper); - msg_set_hdr_sz(mhdr, MCAST_H_SIZE); - - skb_queue_head_init(&pktchain); + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; -new_mtu: - mtu = tipc_bcast_get_mtu(net); - rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); - if (unlikely(rc < 0)) + msg_set_type(hdr, TIPC_MCAST_MSG); + msg_set_lookup_scope(hdr, TIPC_CLUSTER_SCOPE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_nametype(hdr, seq->type); + msg_set_namelower(hdr, seq->lower); + msg_set_nameupper(hdr, seq->upper); + msg_set_hdr_sz(hdr, MCAST_H_SIZE); + + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, msg, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) return rc; - do { - rc = tipc_bcast_xmit(net, &pktchain); - if (likely(!rc)) - return dsz; - - if (rc == -ELINKCONG) { - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (!rc) - continue; - } - __skb_queue_purge(&pktchain); - if (rc == -EMSGSIZE) { - msg->msg_iter = save; - goto new_mtu; - } - break; - } while (1); - return rc; + rc = tipc_bcast_xmit(net, &pkts); + if (unlikely(rc == -ELINKCONG)) { + tsk->cong_link_cnt = 1; + rc = 0; + } + + return rc ? 
rc : dlen; } /** @@ -746,7 +781,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { struct tipc_msg *msg; - struct tipc_plist dports; + struct list_head dports; u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; struct sk_buff_head tmpq; @@ -754,7 +789,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff *skb, *_skb; __skb_queue_head_init(&tmpq); - tipc_plist_init(&dports); + INIT_LIST_HEAD(&dports); skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { @@ -768,8 +803,8 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, tipc_nametbl_mc_translate(net, msg_nametype(msg), msg_namelower(msg), msg_nameupper(msg), scope, &dports); - portid = tipc_plist_pop(&dports); - for (; portid; portid = tipc_plist_pop(&dports)) { + portid = u32_pop(&dports); + for (; portid; portid = u32_pop(&dports)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -830,31 +865,6 @@ exit: kfree_skb(skb); } -static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - int done; - - do { - int err = sock_error(sk); - if (err) - return err; - if (sk->sk_shutdown & SEND_SHUTDOWN) - return -EPIPE; - if (!*timeo_p) - return -EAGAIN; - if (signal_pending(current)) - return sock_intr_errno(*timeo_p); - - add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, !tsk->link_cong, &wait); - remove_wait_queue(sk_sleep(sk), &wait); - } while (!done); - return 0; -} - /** * tipc_sendmsg - send message in connectionless manner * @sock: socket structure @@ -881,35 +891,38 @@ static int tipc_sendmsg(struct socket *sock, return ret; } -static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) +static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - struct tipc_msg *mhdr = &tsk->phdr; - u32 dnode, dport; - struct sk_buff_head pktchain; - bool is_connectionless = tipc_sk_type_connectionless(sk); - struct sk_buff *skb; + struct tipc_sock *tsk = tipc_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct list_head *clinks = &tsk->cong_links; + bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; - struct iov_iter save; - u32 mtu; - long timeo; - int rc; + struct sk_buff_head pkts; + u32 type, inst, domain; + u32 dnode, dport; + int mtu, rc; - if (dsz > TIPC_MAX_USER_MSG_SIZE) + if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (unlikely(!dest)) { - if (is_connectionless && tsk->peer.family == AF_TIPC) - dest = &tsk->peer; - else + dest = &tsk->peer; + if (!syn || dest->family != AF_TIPC) return -EDESTADDRREQ; - } else if (unlikely(m->msg_namelen < sizeof(*dest)) || - dest->family != AF_TIPC) { - return -EINVAL; } - if (!is_connectionless) { + + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + + if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; if (sk->sk_state != TIPC_OPEN) @@ -921,102 +934,62 @@ static int __tipc_sendmsg(struct socket *sock, struct 
msghdr *m, size_t dsz) tsk->conn_instance = dest->addr.name.name.instance; } } - seq = &dest->addr.nameseq; - timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - if (dest->addrtype == TIPC_ADDR_MCAST) { - return tipc_sendmcast(sock, seq, m, dsz, timeo); - } else if (dest->addrtype == TIPC_ADDR_NAME) { - u32 type = dest->addr.name.name.type; - u32 inst = dest->addr.name.name.instance; - u32 domain = dest->addr.name.domain; + seq = &dest->addr.nameseq; + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_sendmcast(sock, seq, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) { + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = dest->addr.name.domain; dnode = domain; - msg_set_type(mhdr, TIPC_NAMED_MSG); - msg_set_hdr_sz(mhdr, NAMED_H_SIZE); - msg_set_nametype(mhdr, type); - msg_set_nameinst(mhdr, inst); - msg_set_lookup_scope(mhdr, tipc_addr_scope(domain)); + msg_set_type(hdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(hdr, NAMED_H_SIZE); + msg_set_nametype(hdr, type); + msg_set_nameinst(hdr, inst); + msg_set_lookup_scope(hdr, tipc_addr_scope(domain)); dport = tipc_nametbl_translate(net, type, inst, &dnode); - msg_set_destnode(mhdr, dnode); - msg_set_destport(mhdr, dport); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; + } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; - msg_set_type(mhdr, TIPC_DIRECT_MSG); - msg_set_lookup_scope(mhdr, 0); - msg_set_destnode(mhdr, dnode); - msg_set_destport(mhdr, dest->addr.id.ref); - msg_set_hdr_sz(mhdr, BASIC_H_SIZE); + msg_set_type(hdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(hdr, 0); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dest->addr.id.ref); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - skb_queue_head_init(&pktchain); - save = m->msg_iter; -new_mtu: - mtu = tipc_node_get_mtu(net, dnode, tsk->portid); - rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain); - if (rc < 0) + /* Block or return if destination link is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + if (unlikely(rc)) return rc; - do { - skb = skb_peek(&pktchain); - TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); - if (likely(!rc)) { - if (!is_connectionless) - tipc_set_sk_state(sk, TIPC_CONNECTING); - return dsz; - } - if (rc == -ELINKCONG) { - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (!rc) - continue; - } - __skb_queue_purge(&pktchain); - if (rc == -EMSGSIZE) { - m->msg_iter = save; - goto new_mtu; - } - break; - } while (1); - - return rc; -} + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; -static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - int done; + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + u32_push(clinks, dnode); + tsk->cong_link_cnt++; + rc = 0; + } - do { - int err = sock_error(sk); - if (err) - return err; - if (sk->sk_state == TIPC_DISCONNECTING) - return -EPIPE; - else if (!tipc_sk_connected(sk)) - return -ENOTCONN; - if (!*timeo_p) - return -EAGAIN; - if (signal_pending(current)) - return sock_intr_errno(*timeo_p); + if (unlikely(syn && !rc)) + tipc_set_sk_state(sk, TIPC_CONNECTING); - 
add_wait_queue(sk_sleep(sk), &wait); - done = sk_wait_event(sk, timeo_p, - (!tsk->link_cong && - !tsk_conn_cong(tsk)) || - !tipc_sk_connected(sk), &wait); - remove_wait_queue(sk_sleep(sk), &wait); - } while (!done); - return 0; + return rc ? rc : dlen; } /** - * tipc_send_stream - send stream-oriented data + * tipc_sendstream - send stream-oriented data * @sock: socket structure * @m: data to send * @dsz: total length of data to be transmitted @@ -1026,94 +999,69 @@ static int tipc_wait_for_sndpkt(struct socket *sock, long *timeo_p) * Returns the number of bytes sent on success (or partial success), * or errno if no data sent */ -static int tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) +static int tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz) { struct sock *sk = sock->sk; int ret; lock_sock(sk); - ret = __tipc_send_stream(sock, m, dsz); + ret = __tipc_sendstream(sock, m, dsz); release_sock(sk); return ret; } -static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) +static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head pktchain; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); - u32 portid = tsk->portid; - int rc = -EINVAL; - long timeo; - u32 dnode; - uint mtu, send, sent = 0; - struct iov_iter save; - int hlen = MIN_H_SIZE; - - /* Handle implied connection establishment */ - if (unlikely(dest)) { - rc = __tipc_sendmsg(sock, m, dsz); - hlen = msg_hdr_sz(mhdr); - if (dsz && (dsz == rc)) - tsk->snt_unacked = tsk_inc(tsk, dsz + hlen); - return rc; - } - if (dsz > (uint)INT_MAX) - return -EMSGSIZE; - - if (unlikely(!tipc_sk_connected(sk))) { - if (sk->sk_state == TIPC_DISCONNECTING) - return -EPIPE; - else - return -ENOTCONN; - } + long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = &tsk->phdr; + struct net *net = sock_net(sk); + struct sk_buff_head pkts; + u32 dnode = tsk_peer_node(tsk); + int send, sent = 0; + int rc = 0; - timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); - if (!timeo && tsk->link_cong) - return -ELINKCONG; + skb_queue_head_init(&pkts); - dnode = tsk_peer_node(tsk); - skb_queue_head_init(&pktchain); + if (unlikely(dlen > INT_MAX)) + return -EMSGSIZE; -next: - save = m->msg_iter; - mtu = tsk->max_pkt; - send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); - if (unlikely(rc < 0)) + /* Handle implicit connection setup */ + if (unlikely(dest)) { + rc = __tipc_sendmsg(sock, m, dlen); + if (dlen && (dlen == rc)) + tsk->snt_unacked = tsk_inc(tsk, dlen + msg_hdr_sz(hdr)); return rc; + } do { - if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_node_xmit(net, &pktchain, dnode, portid); - if (likely(!rc)) { - tsk->snt_unacked += tsk_inc(tsk, send + hlen); - sent += send; - if (sent == dsz) - return dsz; - goto next; - } - if (rc == -EMSGSIZE) { - __skb_queue_purge(&pktchain); - tsk->max_pkt = tipc_node_get_mtu(net, dnode, - portid); - m->msg_iter = save; - goto next; - } - if (rc != -ELINKCONG) - break; + rc = tipc_wait_for_cond(sock, &timeout, + (!tsk->cong_link_cnt && + !tsk_conn_cong(tsk) && + tipc_sk_connected(sk))); + if (unlikely(rc)) + break; + + send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); + rc = tipc_msg_build(hdr, m, sent, send, tsk->max_pkt, &pkts); + if 
(unlikely(rc != send)) + break; - tsk->link_cong = 1; + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tsk->cong_link_cnt = 1; + rc = 0; + } + if (likely(!rc)) { + tsk->snt_unacked += tsk_inc(tsk, send + MIN_H_SIZE); + sent += send; } - rc = tipc_wait_for_sndpkt(sock, &timeo); - } while (!rc); + } while (sent < dlen && !rc); - __skb_queue_purge(&pktchain); - return sent ? sent : rc; + return rc ? rc : sent; } /** @@ -1131,7 +1079,7 @@ static int tipc_send_packet(struct socket *sock, struct msghdr *m, size_t dsz) if (dsz > TIPC_MAX_USER_MSG_SIZE) return -EMSGSIZE; - return tipc_send_stream(sock, m, dsz); + return tipc_sendstream(sock, m, dsz); } /* tipc_sk_finish_conn - complete the setup of a connection @@ -1698,6 +1646,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, unsigned int limit = rcvbuf_limit(sk, skb); int err = TIPC_OK; int usr = msg_user(hdr); + u32 onode; if (unlikely(msg_user(hdr) == CONN_MANAGER)) { tipc_sk_proto_rcv(tsk, skb, xmitq); @@ -1705,8 +1654,10 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb, } if (unlikely(usr == SOCK_WAKEUP)) { + onode = msg_orignode(hdr); kfree_skb(skb); - tsk->link_cong = 0; + u32_del(&tsk->cong_links, onode); + tsk->cong_link_cnt--; sk->sk_write_space(sk); return false; } @@ -2114,7 +2065,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags) struct msghdr m = {NULL,}; tsk_advance_rx_queue(sk); - __tipc_send_stream(new_sock, &m, 0); + __tipc_sendstream(new_sock, &m, 0); } else { __skb_dequeue(&sk->sk_receive_queue); __skb_queue_head(&new_sk->sk_receive_queue, buf); @@ -2575,7 +2526,7 @@ static const struct proto_ops stream_ops = { .shutdown = tipc_shutdown, .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, - .sendmsg = tipc_send_stream, + .sendmsg = tipc_sendstream, .recvmsg = tipc_recv_stream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage diff --git a/net/wireless/Makefile b/net/wireless/Makefile index 816c9331c8d2..d06e5015751a 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o +cfg80211-$(CONFIG_OF) += of.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o diff --git a/net/wireless/core.c b/net/wireless/core.c index 158c59ecf90a..903fc419217a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1142,6 +1142,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; + INIT_WORK(&wdev->disconnect_wk, cfg80211_autodisconnect_wk); + nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE); break; case NETDEV_GOING_DOWN: @@ -1230,6 +1232,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, #ifdef CONFIG_CFG80211_WEXT kzfree(wdev->wext.keys); #endif + flush_work(&wdev->disconnect_wk); } /* * synchronise (so that we won't find this netdev diff --git a/net/wireless/core.h b/net/wireless/core.h index af6e023020b1..58ca206982fe 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -228,6 +228,7 @@ struct cfg80211_event { size_t resp_ie_len; struct cfg80211_bss *bss; int status; /* -1 = failed; 0..65535 = status code */ + enum nl80211_timeout_reason 
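Taken together, the sendmsg/sendstream changes above move from a single link_cong flag to per-destination accounting: a transmit that returns -ELINKCONG records the destination node in cong_links and bumps cong_link_cnt (the link has already accepted the message under the new one-message oversubscription), and the SOCK_WAKEUP pseudo message later deletes that node and decrements the counter so poll() and blocked senders can proceed. A toy model of just that bookkeeping, with invented names and a stand-in errno value:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ELINKCONG 200		/* stand-in for the TIPC-internal errno */
#define MAX_LINKS 4

static uint32_t cong_links[MAX_LINKS];	/* congested destination nodes */
static int cong_link_cnt;

/* Toy transmit: report congestion for one hard-coded destination. */
static int link_xmit(uint32_t dnode)
{
	return dnode == 0x1001001 ? -ELINKCONG : 0;
}

/* Send path: the link accepted the message even though it reported
 * congestion, so remember the destination and return success.
 */
static int send_to(uint32_t dnode)
{
	int rc = link_xmit(dnode);

	if (rc == -ELINKCONG && cong_link_cnt < MAX_LINKS) {
		cong_links[cong_link_cnt++] = dnode;
		rc = 0;
	}
	return rc;
}

/* Wakeup path: the congested link drained, forget the destination. */
static void wakeup_from(uint32_t dnode)
{
	int i;

	for (i = 0; i < cong_link_cnt; i++) {
		if (cong_links[i] != dnode)
			continue;
		cong_links[i] = cong_links[--cong_link_cnt];
		break;
	}
}

static bool writable(void)
{
	return cong_link_cnt == 0;	/* what the poll path checks for POLLOUT */
}

int main(void)
{
	send_to(0x1001001);
	printf("after congested send: writable=%d\n", writable());
	wakeup_from(0x1001001);
	printf("after wakeup:         writable=%d\n", writable());
	return 0;
}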
timeout_reason; } cr; struct { const u8 *req_ie; @@ -388,7 +389,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss); + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, @@ -400,6 +402,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *resp_ie, size_t resp_ie_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); +void cfg80211_autodisconnect_wk(struct work_struct *work); /* SME implementation */ void cfg80211_conn_work(struct work_struct *work); @@ -430,6 +433,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wdev_events(struct wireless_dev *wdev); +bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, + u32 center_freq_khz, u32 bw_khz); + /** * cfg80211_chandef_dfs_usable - checks if chandef is DFS usable * @wiphy: the wiphy to validate against diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 4646cf5695b9..22b3d9990065 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -48,7 +48,8 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, status_code, - status_code == WLAN_STATUS_SUCCESS, bss); + status_code == WLAN_STATUS_SUCCESS, bss, + NL80211_TIMEOUT_UNSPECIFIED); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); @@ -345,6 +346,11 @@ int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, !ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) return 0; + if (ether_addr_equal(wdev->disconnect_bssid, bssid) || + (wdev->current_bss && + ether_addr_equal(wdev->current_bss->pub.bssid, bssid))) + wdev->conn_owner_nlportid = 0; + return rdev_deauth(rdev, dev, &req); } @@ -657,8 +663,25 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return err; } - if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) - return -EINVAL; + if (!ether_addr_equal(mgmt->sa, wdev_address(wdev))) { + /* Allow random TA to be used with Public Action frames if the + * driver has indicated support for this. Otherwise, only allow + * the local address to be used. 
+ */ + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) + return -EINVAL; + if (!wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) + return -EINVAL; + if (wdev->current_bss && + !wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) + return -EINVAL; + } /* Transmit the Action frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5c1b267e22be..63dfa60a29ef 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -405,6 +405,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_NONCES] = { .len = 2 * FILS_NONCE_LEN }, [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, [NL80211_ATTR_BSSID] = { .len = ETH_ALEN }, + [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, + [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { + .len = sizeof(struct nl80211_bss_select_rssi_adjust) + }, + [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, }; /* policy for the key attributes */ @@ -6790,13 +6795,10 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, /* * If scan plans are not specified, - * %NL80211_ATTR_SCHED_SCAN_INTERVAL must be specified. In this + * %NL80211_ATTR_SCHED_SCAN_INTERVAL will be specified. In this * case one scan plan will be set with the specified scan * interval and infinite number of iterations. */ - if (!attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]) - return -EINVAL; - interval = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_INTERVAL]); if (!interval) return -EINVAL; @@ -6968,6 +6970,12 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (!n_plans || n_plans > wiphy->max_sched_scan_plans) return ERR_PTR(-EINVAL); + if (!wiphy_ext_feature_isset( + wiphy, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI) && + (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] || + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST])) + return ERR_PTR(-EINVAL); + request = kzalloc(sizeof(*request) + sizeof(*request->ssids) * n_ssids + sizeof(*request->match_sets) * n_match_sets @@ -7174,6 +7182,26 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, request->delay = nla_get_u32(attrs[NL80211_ATTR_SCHED_SCAN_DELAY]); + if (attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]) { + request->relative_rssi = nla_get_s8( + attrs[NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI]); + request->relative_rssi_set = true; + } + + if (request->relative_rssi_set && + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]) { + struct nl80211_bss_select_rssi_adjust *rssi_adjust; + + rssi_adjust = nla_data( + attrs[NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST]); + request->rssi_adjust.band = rssi_adjust->band; + request->rssi_adjust.delta = rssi_adjust->delta; + if (!is_band_valid(wiphy, request->rssi_adjust.band)) { + err = -EINVAL; + goto out_free; + } + } + err = nl80211_parse_sched_scan_plans(wiphy, n_plans, request, attrs); if (err) goto out_free; @@ -8068,8 +8096,17 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); + err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, ssid, ssid_len, &req); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = + info->snd_portid; + memcpy(dev->ieee80211_ptr->disconnect_bssid, + bssid, ETH_ALEN); + } 
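The cfg80211_mlme_mgmt_tx() hunk above only lets a frame leave with a source address other than the wdev's own when it is a Public Action frame and the driver advertises the matching randomized-TA capability. The same checks, condensed into a hypothetical helper (the helper name is ours; the feature flags and conditions are taken from the patch):

/* Hypothetical helper mirroring the randomized-TA checks added above. */
static bool mgmt_tx_random_ta_allowed(struct wireless_dev *wdev,
				      struct wiphy *wiphy,
				      const struct ieee80211_mgmt *mgmt)
{
	/* Only Public Action frames may use a randomized TA at all. */
	if (!ieee80211_is_action(mgmt->frame_control) ||
	    mgmt->u.action.category != WLAN_CATEGORY_PUBLIC)
		return false;

	/* Not associated: the plain randomized-TA capability is required. */
	if (!wdev->current_bss)
		return wiphy_ext_feature_isset(wiphy,
				NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA);

	/* Associated: the "connected" variant of the capability is required. */
	return wiphy_ext_feature_isset(wiphy,
			NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED);
}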
+ wdev_unlock(dev->ieee80211_ptr); } @@ -8788,11 +8825,24 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } wdev_lock(dev->ieee80211_ptr); + err = cfg80211_connect(rdev, dev, &connect, connkeys, connect.prev_bssid); - wdev_unlock(dev->ieee80211_ptr); if (err) kzfree(connkeys); + + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + if (connect.bssid) + memcpy(dev->ieee80211_ptr->disconnect_bssid, + connect.bssid, ETH_ALEN); + else + memset(dev->ieee80211_ptr->disconnect_bssid, + 0, ETH_ALEN); + } + + wdev_unlock(dev->ieee80211_ptr); + return err; } @@ -9688,6 +9738,20 @@ static int nl80211_send_wowlan_nd(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_DELAY, req->delay)) return -ENOBUFS; + if (req->relative_rssi_set) { + struct nl80211_bss_select_rssi_adjust rssi_adjust; + + if (nla_put_s8(msg, NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI, + req->relative_rssi)) + return -ENOBUFS; + + rssi_adjust.band = req->rssi_adjust.band; + rssi_adjust.delta = req->rssi_adjust.delta; + if (nla_put(msg, NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST, + sizeof(rssi_adjust), &rssi_adjust)) + return -ENOBUFS; + } + freqs = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!freqs) return -ENOBUFS; @@ -11822,9 +11886,6 @@ static int nl80211_set_multicast_to_unicast(struct sk_buff *skb, const struct nlattr *nla; bool enabled; - if (netif_running(dev)) - return -EBUSY; - if (!rdev->ops->set_multicast_to_unicast) return -EOPNOTSUPP; @@ -12825,7 +12886,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, return -ENOBUFS; } -static int nl80211_send_scan_msg(struct sk_buff *msg, +static int nl80211_prep_scan_msg(struct sk_buff *msg, struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 portid, u32 seq, int flags, @@ -12856,7 +12917,7 @@ static int nl80211_send_scan_msg(struct sk_buff *msg, } static int -nl80211_send_sched_scan_msg(struct sk_buff *msg, +nl80211_prep_sched_scan_msg(struct sk_buff *msg, struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 portid, u32 seq, int flags, u32 cmd) @@ -12888,7 +12949,7 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, + if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0, NL80211_CMD_TRIGGER_SCAN) < 0) { nlmsg_free(msg); return; @@ -12907,7 +12968,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, if (!msg) return NULL; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, + if (nl80211_prep_scan_msg(msg, rdev, wdev, 0, 0, 0, aborted ? 
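The NL80211_ATTR_SOCKET_OWNER handling above records the requesting netlink portid in conn_owner_nlportid, so the connection can be torn down automatically when that socket closes (see the disconnect_wk scheduling later in this diff). A rough userspace sketch of issuing such an owned CONNECT, assuming libnl-3's genl API (function name, error handling and surrounding setup are ours, not part of the patch):

/* Userspace sketch: a CONNECT request tied to this netlink socket. */
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/nl80211.h>

static int connect_owned(struct nl_sock *sk, int ifindex,
			 const void *ssid, int ssid_len)
{
	int family = genl_ctrl_resolve(sk, "nl80211");
	struct nl_msg *msg = nlmsg_alloc();
	int err;

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NL80211_CMD_CONNECT, 0);
	nla_put_u32(msg, NL80211_ATTR_IFINDEX, ifindex);
	nla_put(msg, NL80211_ATTR_SSID, ssid_len, ssid);
	/* Closing this socket later triggers the kernel's autodisconnect. */
	nla_put_flag(msg, NL80211_ATTR_SOCKET_OWNER);

	err = nl_send_auto(sk, msg);
	nlmsg_free(msg);
	return err;
}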
NL80211_CMD_SCAN_ABORTED : NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); @@ -12917,31 +12978,13 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, return msg; } -void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, - struct sk_buff *msg) -{ - if (!msg) - return; - - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); -} - -void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, - struct net_device *netdev) +/* send message created by nl80211_build_scan_msg() */ +void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, - NL80211_CMD_SCHED_SCAN_RESULTS) < 0) { - nlmsg_free(msg); - return; - } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } @@ -12955,7 +12998,7 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, if (!msg) return; - if (nl80211_send_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { + if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { nlmsg_free(msg); return; } @@ -13057,7 +13100,7 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return; @@ -13204,12 +13247,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp) + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp) { struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); if (!msg) return; @@ -13225,7 +13270,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, status < 0 ? 
WLAN_STATUS_UNSPECIFIED_FAILURE : status) || - (status < 0 && nla_put_flag(msg, NL80211_ATTR_TIMED_OUT)) || + (status < 0 && + (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) || + nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) || (req_ie && nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || (resp_ie && @@ -13251,7 +13298,7 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); if (!msg) return; @@ -13288,7 +13335,7 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(100 + ie_len, GFP_KERNEL); if (!msg) return; @@ -13364,7 +13411,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, trace_cfg80211_notify_new_peer_candidate(dev, addr); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + ie_len, gfp); if (!msg) return; @@ -13735,7 +13782,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return -ENOMEM; @@ -13779,7 +13826,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + msg = nlmsg_new(100 + len, gfp); if (!msg) return; @@ -14534,6 +14581,8 @@ static int nl80211_netlink_notify(struct notifier_block * nb, if (wdev->owner_nlportid == notify->portid) schedule_destroy_work = true; + else if (wdev->conn_owner_nlportid == notify->portid) + schedule_work(&wdev->disconnect_wk); } spin_lock_bh(&rdev->beacon_registrations_lock); @@ -14588,7 +14637,7 @@ void cfg80211_ft_event(struct net_device *netdev, if (!ft_event->target_ap) return; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + msg = nlmsg_new(100 + ft_event->ric_ies_len, GFP_KERNEL); if (!msg) return; diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 7e3821d7fcc5..e488dca87423 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -14,12 +14,10 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, bool aborted); -void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, - struct sk_buff *msg); +void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); -void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, - struct net_device *netdev); void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, struct regulatory_request *request); @@ -58,7 +56,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, - int status, gfp_t gfp); + int status, + enum nl80211_timeout_reason timeout_reason, + gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, diff --git a/net/wireless/of.c b/net/wireless/of.c new file mode 100644 index 000000000000..de221f0edca5 --- /dev/null 
+++ b/net/wireless/of.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2017 Rafał Miłecki <rafal@milecki.pl> + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <linux/of.h> +#include <net/cfg80211.h> +#include "core.h" + +static bool wiphy_freq_limits_valid_chan(struct wiphy *wiphy, + struct ieee80211_freq_range *freq_limits, + unsigned int n_freq_limits, + struct ieee80211_channel *chan) +{ + u32 bw = MHZ_TO_KHZ(20); + int i; + + for (i = 0; i < n_freq_limits; i++) { + struct ieee80211_freq_range *limit = &freq_limits[i]; + + if (cfg80211_does_bw_fit_range(limit, + MHZ_TO_KHZ(chan->center_freq), + bw)) + return true; + } + + return false; +} + +static void wiphy_freq_limits_apply(struct wiphy *wiphy, + struct ieee80211_freq_range *freq_limits, + unsigned int n_freq_limits) +{ + enum nl80211_band band; + int i; + + if (WARN_ON(!n_freq_limits)) + return; + + for (band = 0; band < NUM_NL80211_BANDS; band++) { + struct ieee80211_supported_band *sband = wiphy->bands[band]; + + if (!sband) + continue; + + for (i = 0; i < sband->n_channels; i++) { + struct ieee80211_channel *chan = &sband->channels[i]; + + if (chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + if (!wiphy_freq_limits_valid_chan(wiphy, freq_limits, + n_freq_limits, + chan)) { + pr_debug("Disabling freq %d MHz as it's out of OF limits\n", + chan->center_freq); + chan->flags |= IEEE80211_CHAN_DISABLED; + } + } + } +} + +void wiphy_read_of_freq_limits(struct wiphy *wiphy) +{ + struct device *dev = wiphy_dev(wiphy); + struct device_node *np; + struct property *prop; + struct ieee80211_freq_range *freq_limits; + unsigned int n_freq_limits; + const __be32 *p; + int len, i; + int err = 0; + + if (!dev) + return; + np = dev_of_node(dev); + if (!np) + return; + + prop = of_find_property(np, "ieee80211-freq-limit", &len); + if (!prop) + return; + + if (!len || len % sizeof(u32) || len / sizeof(u32) % 2) { + dev_err(dev, "ieee80211-freq-limit wrong format"); + return; + } + n_freq_limits = len / sizeof(u32) / 2; + + freq_limits = kcalloc(n_freq_limits, sizeof(*freq_limits), GFP_KERNEL); + if (!freq_limits) { + err = -ENOMEM; + goto out_kfree; + } + + p = NULL; + for (i = 0; i < n_freq_limits; i++) { + struct ieee80211_freq_range *limit = &freq_limits[i]; + + p = of_prop_next_u32(prop, p, &limit->start_freq_khz); + if (!p) { + err = -EINVAL; + goto out_kfree; + } + + p = of_prop_next_u32(prop, p, &limit->end_freq_khz); + if (!p) { + err = -EINVAL; + goto out_kfree; + } + + if (!limit->start_freq_khz || + !limit->end_freq_khz || + limit->start_freq_khz >= limit->end_freq_khz) { + err = -EINVAL; + goto out_kfree; + } + } + + wiphy_freq_limits_apply(wiphy, freq_limits, n_freq_limits); + +out_kfree: + kfree(freq_limits); + if (err) + dev_err(dev, "Failed to get limits: %d\n", err); +} +EXPORT_SYMBOL(wiphy_read_of_freq_limits); diff --git 
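The new net/wireless/of.c above disables every channel whose 20 MHz bandwidth does not fit one of the device-tree frequency limits. A hedged sketch of how a driver might consume it; the call placement before wiphy_register() and the DT snippet are our reading of the parser, which expects <start_khz end_khz> pairs:

/* Sketch: let DT restrict the usable frequency range before registration.
 * Example property in the node backing this device (kHz pairs):
 *
 *     ieee80211-freq-limit = <2402000 2482000>;
 *
 * Channels that do not fit any pair get IEEE80211_CHAN_DISABLED set.
 */
static int example_register_wiphy(struct wiphy *wiphy)
{
	wiphy_read_of_freq_limits(wiphy);	/* no-op without OF/property */

	return wiphy_register(wiphy);
}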
a/net/wireless/reg.c b/net/wireless/reg.c index 5dbac3749738..753efcd51fa3 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -748,21 +748,6 @@ static bool is_valid_rd(const struct ieee80211_regdomain *rd) return true; } -static bool reg_does_bw_fit(const struct ieee80211_freq_range *freq_range, - u32 center_freq_khz, u32 bw_khz) -{ - u32 start_freq_khz, end_freq_khz; - - start_freq_khz = center_freq_khz - (bw_khz/2); - end_freq_khz = center_freq_khz + (bw_khz/2); - - if (start_freq_khz >= freq_range->start_freq_khz && - end_freq_khz <= freq_range->end_freq_khz) - return true; - - return false; -} - /** * freq_in_rule_band - tells us if a frequency is in a frequency band * @freq_range: frequency rule we want to query @@ -1070,7 +1055,7 @@ freq_reg_info_regd(u32 center_freq, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); - bw_fits = reg_does_bw_fit(fr, center_freq, bw); + bw_fits = cfg80211_does_bw_fit_range(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; @@ -1138,11 +1123,13 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(10))) + if (!cfg80211_does_bw_fit_range(freq_range, + MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), - MHZ_TO_KHZ(20))) + if (!cfg80211_does_bw_fit_range(freq_range, + MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(10)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 35ad69fd0838..21be56b3128e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -227,7 +227,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, ASSERT_RTNL(); if (rdev->scan_msg) { - nl80211_send_scan_result(rdev, rdev->scan_msg); + nl80211_send_scan_msg(rdev, rdev->scan_msg); rdev->scan_msg = NULL; return; } @@ -273,7 +273,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (!send_message) rdev->scan_msg = msg; else - nl80211_send_scan_result(rdev, msg); + nl80211_send_scan_msg(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) @@ -321,7 +321,8 @@ void __cfg80211_sched_scan_results(struct work_struct *wk) spin_unlock_bh(&rdev->bss_lock); request->scan_start = jiffies; } - nl80211_send_sched_scan_results(rdev, request->dev); + nl80211_send_sched_scan(rdev, request->dev, + NL80211_CMD_SCHED_SCAN_RESULTS); } rtnl_unlock(); @@ -1147,7 +1148,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, else rcu_assign_pointer(tmp.pub.beacon_ies, ies); rcu_assign_pointer(tmp.pub.ies, ies); - + memcpy(tmp.pub.bssid, mgmt->bssid, ETH_ALEN); tmp.pub.channel = channel; tmp.pub.scan_width = data->scan_width; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 5e0d19380302..b347e63d7aaa 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -34,10 +34,11 @@ struct cfg80211_conn { CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, - CFG80211_CONN_AUTH_FAILED, + CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, + CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, @@ -140,7 +141,8 @@ static int 
cfg80211_conn_scan(struct wireless_dev *wdev) return err; } -static int cfg80211_conn_do_work(struct wireless_dev *wdev) +static int cfg80211_conn_do_work(struct wireless_dev *wdev, + enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; @@ -171,7 +173,8 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) NULL, 0, params->key, params->key_len, params->key_idx, NULL, 0); - case CFG80211_CONN_AUTH_FAILED: + case CFG80211_CONN_AUTH_FAILED_TIMEOUT: + *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) @@ -198,6 +201,9 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) WLAN_REASON_DEAUTH_LEAVING, false); return err; + case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: + *treason = NL80211_TIMEOUT_ASSOC; + /* fall through */ case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, @@ -223,6 +229,7 @@ void cfg80211_conn_work(struct work_struct *work) container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; + enum nl80211_timeout_reason treason; rtnl_lock(); @@ -244,10 +251,12 @@ void cfg80211_conn_work(struct work_struct *work) memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } - if (cfg80211_conn_do_work(wdev)) { + treason = NL80211_TIMEOUT_UNSPECIFIED; + if (cfg80211_conn_do_work(wdev, &treason)) { __cfg80211_connect_result( wdev->netdev, bssid, - NULL, 0, NULL, 0, -1, false, NULL); + NULL, 0, NULL, 0, -1, false, NULL, + treason); } wdev_unlock(wdev); } @@ -352,7 +361,8 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) } else if (status_code != WLAN_STATUS_SUCCESS) { __cfg80211_connect_result(wdev->netdev, mgmt->bssid, NULL, 0, NULL, 0, - status_code, false, NULL); + status_code, false, NULL, + NL80211_TIMEOUT_UNSPECIFIED); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); @@ -400,7 +410,7 @@ void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_AUTH_FAILED; + wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -422,7 +432,7 @@ void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) if (!wdev->conn) return; - wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; + wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } @@ -564,7 +574,9 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, /* we're good if we have a matching bss struct */ if (bss) { - err = cfg80211_conn_do_work(wdev); + enum nl80211_timeout_reason treason; + + err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ @@ -661,7 +673,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, int status, bool wextev, - struct cfg80211_bss *bss) + struct cfg80211_bss *bss, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; const u8 *country_ie; @@ -680,7 +693,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, bssid, req_ie, req_ie_len, resp_ie, 
resp_ie_len, - status, GFP_KERNEL); + status, timeout_reason, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (wextev) { @@ -727,6 +740,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; if (bss) { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wdev->wiphy, bss); @@ -770,7 +784,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, struct cfg80211_bss *bss, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp) + size_t resp_ie_len, int status, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -810,6 +825,7 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, cfg80211_hold_bss(bss_from_pub(bss)); ev->cr.bss = bss; ev->cr.status = status; + ev->cr.timeout_reason = timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); @@ -955,6 +971,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, wdev->current_bss = NULL; wdev->ssid_len = 0; + wdev->conn_owner_nlportid = 0; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); @@ -1098,6 +1115,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, kzfree(wdev->connect_keys); wdev->connect_keys = NULL; + wdev->conn_owner_nlportid = 0; + if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) @@ -1107,3 +1126,32 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, return err; } + +/* + * Used to clean up after the connection / connection attempt owner socket + * disconnects + */ +void cfg80211_autodisconnect_wk(struct work_struct *work) +{ + struct wireless_dev *wdev = + container_of(work, struct wireless_dev, disconnect_wk); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + + wdev_lock(wdev); + + if (wdev->conn_owner_nlportid) { + /* + * Use disconnect_bssid if still connecting and ops->disconnect + * not implemented. Otherwise we can use cfg80211_disconnect. 
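With the extra timeout_reason parameter on cfg80211_connect_bss() above, a driver that knows why its connection attempt timed out can propagate that to userspace instead of a bare failure. A hedged sketch using the signature from this patch (status -1 means "timed out", which makes nl80211 emit NL80211_ATTR_TIMED_OUT plus the new NL80211_ATTR_TIMEOUT_REASON attribute; the function name is ours):

/* Sketch: report an authentication timeout rather than a plain failure. */
static void example_report_auth_timeout(struct net_device *dev,
					const u8 *bssid)
{
	cfg80211_connect_bss(dev, bssid, NULL,	/* no BSS entry held */
			     NULL, 0,		/* no association request IEs */
			     NULL, 0,		/* no association response IEs */
			     -1, GFP_KERNEL,
			     NL80211_TIMEOUT_AUTH);
}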
+ */ + if (rdev->ops->disconnect || wdev->current_bss) + cfg80211_disconnect(rdev, wdev->netdev, + WLAN_REASON_DEAUTH_LEAVING, true); + else + cfg80211_mlme_deauth(rdev, wdev->netdev, + wdev->disconnect_bssid, NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, false); + } + + wdev_unlock(wdev); +} diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 14b3f007826d..16b6b5988be9 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -39,9 +39,11 @@ SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, - char *buf) { + char *buf) +{ struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; - return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); + + return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); diff --git a/net/wireless/util.c b/net/wireless/util.c index e9d040d29846..1b9296882dcd 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -114,8 +114,7 @@ int ieee80211_frequency_to_channel(int freq) } EXPORT_SYMBOL(ieee80211_frequency_to_channel); -struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, - int freq) +struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -135,14 +134,13 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, return NULL; } -EXPORT_SYMBOL(__ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel); -static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, - enum nl80211_band band) +static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { int i, want; - switch (band) { + switch (sband->band) { case NL80211_BAND_5GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { @@ -192,6 +190,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; case NUM_NL80211_BANDS: + default: WARN_ON(1); break; } @@ -203,7 +202,7 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy) for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) - set_mandatory_flags_band(wiphy->bands[band], band); + set_mandatory_flags_band(wiphy->bands[band]); } bool cfg80211_supported_cipher_suite(struct wiphy *wiphy, u32 cipher) @@ -952,7 +951,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->cr.resp_ie, ev->cr.resp_ie_len, ev->cr.status, ev->cr.status == WLAN_STATUS_SUCCESS, - ev->cr.bss); + ev->cr.bss, ev->cr.timeout_reason); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, @@ -1848,6 +1847,21 @@ void cfg80211_free_nan_func(struct cfg80211_nan_func *f) } EXPORT_SYMBOL(cfg80211_free_nan_func); +bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, + u32 center_freq_khz, u32 bw_khz) +{ + u32 start_freq_khz, end_freq_khz; + + start_freq_khz = center_freq_khz - (bw_khz / 2); + end_freq_khz = center_freq_khz + (bw_khz / 2); + + if (start_freq_khz >= freq_range->start_freq_khz && + end_freq_khz <= freq_range->end_freq_khz) + return true; + + return false; +} + /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ const unsigned char rfc1042_header[] __aligned(2) = diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 6250b1cfcde5..1a4db6790e20 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -1119,3 +1119,70 @@ int compat_wext_handle_ioctl(struct net *net, 
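cfg80211_does_bw_fit_range(), moved here so that both the regulatory code and of.c can share it, is a plain interval check: the bandwidth centered on the given frequency must lie entirely inside the rule's range. A worked example, usable anywhere core.h is visible inside cfg80211 (the rule values are illustrative):

/* 20 MHz around 2412 MHz spans 2402..2422 MHz, which lies inside a
 * 2400..2483.5 MHz rule, so the check returns true.
 */
static const struct ieee80211_freq_range example_rule = {
	.start_freq_khz = 2400000,
	.end_freq_khz	= 2483500,
};

static bool example_chan1_fits(void)
{
	return cfg80211_does_bw_fit_range(&example_rule,
					  MHZ_TO_KHZ(2412),
					  MHZ_TO_KHZ(20));
}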
unsigned int cmd, return ret; } #endif + +char *iwe_stream_add_event(struct iw_request_info *info, char *stream, + char *ends, struct iw_event *iwe, int event_len) +{ + int lcp_len = iwe_stream_lcp_len(info); + + event_len = iwe_stream_event_len_adjust(info, event_len); + + /* Check if it's possible */ + if (likely((stream + event_len) < ends)) { + iwe->len = event_len; + /* Beware of alignement issues on 64 bits */ + memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); + memcpy(stream + lcp_len, &iwe->u, + event_len - lcp_len); + stream += event_len; + } + + return stream; +} +EXPORT_SYMBOL(iwe_stream_add_event); + +char *iwe_stream_add_point(struct iw_request_info *info, char *stream, + char *ends, struct iw_event *iwe, char *extra) +{ + int event_len = iwe_stream_point_len(info) + iwe->u.data.length; + int point_len = iwe_stream_point_len(info); + int lcp_len = iwe_stream_lcp_len(info); + + /* Check if it's possible */ + if (likely((stream + event_len) < ends)) { + iwe->len = event_len; + memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); + memcpy(stream + lcp_len, + ((char *) &iwe->u) + IW_EV_POINT_OFF, + IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); + if (iwe->u.data.length && extra) + memcpy(stream + point_len, extra, iwe->u.data.length); + stream += event_len; + } + + return stream; +} +EXPORT_SYMBOL(iwe_stream_add_point); + +char *iwe_stream_add_value(struct iw_request_info *info, char *event, + char *value, char *ends, struct iw_event *iwe, + int event_len) +{ + int lcp_len = iwe_stream_lcp_len(info); + + /* Don't duplicate LCP */ + event_len -= IW_EV_LCP_LEN; + + /* Check if it's possible */ + if (likely((value + event_len) < ends)) { + /* Add new value */ + memcpy(value, &iwe->u, event_len); + value += event_len; + /* Patch LCP */ + iwe->len = value - event; + memcpy(event, (char *) iwe, lcp_len); + } + + return value; +} +EXPORT_SYMBOL(iwe_stream_add_value); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 995163830a61..c434f193f39a 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -105,30 +105,7 @@ int cfg80211_mgd_wext_siwfreq(struct net_device *dev, goto out; } - wdev->wext.connect.channel = chan; - - /* - * SSID is not set, we just want to switch monitor channel, - * this is really just backward compatibility, if the SSID - * is set then we use the channel to select the BSS to use - * to connect to instead. If we were connected on another - * channel we disconnected above and reconnect below. - */ - if (chan && !wdev->wext.connect.ssid_len) { - struct cfg80211_chan_def chandef = { - .width = NL80211_CHAN_WIDTH_20_NOHT, - .center_freq1 = freq, - }; - - chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (chandef.chan) - err = cfg80211_set_monitor_channel(rdev, &chandef); - else - err = -EINVAL; - goto out; - } - err = cfg80211_mgd_wext_connect(rdev, wdev); out: wdev_unlock(wdev); |
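The iwe_stream_add_*() helpers uninlined above are what wireless-extensions scan handlers use to append events to the user-supplied buffer. A hedged sketch of the usual pattern for one AP-address entry (the wrapper name and locals are ours; the helper signature is the one exported here):

/* Sketch: append one AP-address event to a wext scan result stream. */
static char *example_add_bssid(struct iw_request_info *info,
			       char *current_ev, char *end_buf,
			       const u8 *bssid)
{
	struct iw_event iwe;

	memset(&iwe, 0, sizeof(iwe));
	iwe.cmd = SIOCGIWAP;
	iwe.u.ap_addr.sa_family = ARPHRD_ETHER;
	memcpy(iwe.u.ap_addr.sa_data, bssid, ETH_ALEN);

	/* Returns the advanced cursor; left unchanged if the buffer is full. */
	return iwe_stream_add_event(info, current_ev, end_buf,
				    &iwe, IW_EV_ADDR_LEN);
}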