From ba171d3f0850003216fd1a85190d17b1feddb961 Mon Sep 17 00:00:00 2001 From: Cedric Neveux Date: Mon, 4 Mar 2019 08:54:23 +0100 Subject: driver: tee: Handle NULL pointer indication from client The TEE client introduces a new capability "TEE_GEN_CAP_MEMREF_NULL" to support shared memory buffers with a NULL pointer. This capability depends on the TEE's capabilities and on driver support; the driver and the TEE exchange capabilities at driver initialization. Signed-off-by: Michael Whitfield Signed-off-by: Cedric Neveux Reviewed-by: Joakim Bech Tested-by: Joakim Bech (QEMU) Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index b619f37ee03e..d67cadf221fc 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -51,6 +51,9 @@ #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ +#define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ + +#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ /* * TEE Implementation ID @@ -200,6 +203,16 @@ struct tee_ioctl_buf_data { * a part of a shared memory by specifying an offset (@a) and size (@b) of * the object. To supply the entire shared memory object set the offset * (@a) to 0 and size (@b) to the previously returned size of the object. + * + * A client may need to present a NULL pointer in the argument + * passed to a trusted application in the TEE. + * This is also a requirement in GlobalPlatform Client API v1.0c + * (section 3.2.5 memory references), which can be found at + * http://www.globalplatform.org/specificationsdevice.asp + * + * If a NULL pointer is passed to a TA in the TEE, the (@c) + * IOCTL parameters value must be set to TEE_MEMREF_NULL indicating a NULL + * memory reference. */ struct tee_ioctl_param { __u64 attr; -- cgit v1.2.3 From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implements the bpf_link callback functions show_fdinfo and fill_link_info to support the link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each target can register show_fdinfo and fill_link_info callbacks to print/fill more target-specific information. For example, below is the fdinfo result for a bpf task iterator:
$ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- include/linux/bpf.h | 6 +++++ include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/bpf_iter.c | 58 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++ 4 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b7185a6b37..529e9b183eeb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1218,12 +1218,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b6715964b685..aeec7e174188 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -377,10 +377,68 @@ out_unlock: return ret; } +static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + bpf_iter_show_fdinfo_t show_fdinfo; + + seq_printf(seq, + "target_name:\t%s\n", + iter_link->tinfo->reg_info->target); + + show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; + if (show_fdinfo) + show_fdinfo(&iter_link->aux, seq); +} + +static int bpf_iter_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + char __user *ubuf = u64_to_user_ptr(info->iter.target_name); + bpf_iter_fill_link_info_t fill_link_info; + u32 ulen = info->iter.target_name_len; + const char *target_name; + u32 target_len; + + if (!ulen ^ !ubuf) + return -EINVAL; + + target_name = iter_link->tinfo->reg_info->target; + target_len = strlen(target_name); + info->iter.target_name_len = target_len + 1; + + if (ubuf) { + if (ulen >= target_len + 1) { + if (copy_to_user(ubuf, target_name, target_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(ubuf, target_name, ulen - 1)) + return -EFAULT; + if (put_user(zero, ubuf + ulen - 1)) + return -EFAULT; + return -ENOSPC; + } + } + + fill_link_info = iter_link->tinfo->reg_info->fill_link_info; + if (fill_link_info) + return 
fill_link_info(&iter_link->aux, info); + + return 0; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, + .show_fdinfo = bpf_iter_link_show_fdinfo, + .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; -- cgit v1.2.3 From eee049c0ef5b5b433f36841801e34c21c9f82a23 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Sat, 22 Aug 2020 15:59:08 +0100 Subject: l2tp: remove tunnel and session debug flags field The l2tp subsystem now uses standard kernel logging APIs for informational and warning messages, and tracepoints for debug information. Now that the tunnel and session debug flags are unused, remove the field from the core structures. Various system calls (in the case of l2tp_ppp) and netlink messages handle the getting and setting of debug flags. To avoid userspace breakage don't modify the API of these calls; simply ignore set requests, and send dummy data for get requests. Signed-off-by: Tom Parkin Signed-off-by: David S. Miller --- include/uapi/linux/if_pppol2tp.h | 2 +- include/uapi/linux/l2tp.h | 6 ++++-- net/l2tp/l2tp_core.c | 8 -------- net/l2tp/l2tp_core.h | 4 ---- net/l2tp/l2tp_debugfs.c | 4 ++-- net/l2tp/l2tp_netlink.c | 16 ++-------------- net/l2tp/l2tp_ppp.c | 15 ++++++++------- 7 files changed, 17 insertions(+), 38 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h index 060b4d1f3129..a91044328bc9 100644 --- a/include/uapi/linux/if_pppol2tp.h +++ b/include/uapi/linux/if_pppol2tp.h @@ -75,7 +75,7 @@ struct pppol2tpv3in6_addr { }; /* Socket options: - * DEBUG - bitmask of debug message categories + * DEBUG - bitmask of debug message categories (not used) * SENDSEQ - 0 => don't send packets with sequence numbers * 1 => send packets with sequence numbers * RECVSEQ - 0 => receive packet sequence numbers are optional diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 61158f5a1a5b..88a0d32b8c07 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -108,7 +108,7 @@ enum { L2TP_ATTR_VLAN_ID, /* u16 (not used) */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ - L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags (not used) */ L2TP_ATTR_RECV_SEQ, /* u8 */ L2TP_ATTR_SEND_SEQ, /* u8 */ L2TP_ATTR_LNS_MODE, /* u8 */ @@ -177,7 +177,9 @@ enum l2tp_seqmode { }; /** - * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions. + * + * Unused. 
* * @L2TP_MSG_DEBUG: verbose debug (if compiled in) * @L2TP_MSG_CONTROL: userspace - kernel interface diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index d8435b6f6fee..560c687f5457 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1401,16 +1401,12 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->version = version; tunnel->tunnel_id = tunnel_id; tunnel->peer_tunnel_id = peer_tunnel_id; - tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS; tunnel->magic = L2TP_TUNNEL_MAGIC; sprintf(&tunnel->name[0], "tunl %u", tunnel_id); rwlock_init(&tunnel->hlist_lock); tunnel->acpt_newsess = true; - if (cfg) - tunnel->debug = cfg->debug; - tunnel->encap = encap; refcount_set(&tunnel->ref_count, 1); @@ -1608,12 +1604,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn INIT_HLIST_NODE(&session->hlist); INIT_HLIST_NODE(&session->global_hlist); - /* Inherit debug options from tunnel */ - session->debug = tunnel->debug; - if (cfg) { session->pwtype = cfg->pw_type; - session->debug = cfg->debug; session->send_seq = cfg->send_seq; session->recv_seq = cfg->recv_seq; session->lns_mode = cfg->lns_mode; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 7a06ac135a9b..07249c5f22ef 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -51,7 +51,6 @@ struct l2tp_session_cfg { unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ u16 l2specific_type; /* Layer 2 specific type */ u8 cookie[8]; /* optional cookie */ int cookie_len; /* 0, 4 or 8 bytes */ @@ -98,7 +97,6 @@ struct l2tp_session { unsigned int lns_mode:1; /* behave as LNS? * LAC enables sequence numbers under LNS control. */ - int debug; /* bitmask of debug message categories */ int reorder_timeout; /* configured reorder timeout (in jiffies) */ int reorder_skip; /* set if skip to next nr */ enum l2tp_pwtype pwtype; @@ -132,7 +130,6 @@ struct l2tp_session { /* L2TP tunnel configuration */ struct l2tp_tunnel_cfg { - int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; /* Used only for kernel-created sockets */ @@ -173,7 +170,6 @@ struct l2tp_tunnel { int version; /* 2=>L2TPv2, 3=>L2TPv3 */ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */ - int debug; /* bitmask of debug message categories */ enum l2tp_encap_type encap; struct l2tp_stats stats; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 96cb9601c21b..bca75bef8282 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -167,7 +167,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0, refcount_read(&tunnel->ref_count)); seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -192,7 +192,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? 
"LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " offset 0 l2specific %hu/%hu\n", session->l2specific_type, l2tp_get_l2specific_len(session)); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index def78eebca4c..31a1e27eab20 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -229,9 +229,6 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info goto out; } - if (attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(attrs[L2TP_ATTR_DEBUG]); - ret = -EINVAL; switch (cfg.encap) { case L2TP_ENCAPTYPE_UDP: @@ -307,9 +304,6 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); @@ -400,7 +394,7 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) || nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap)) goto nla_put_failure; @@ -605,9 +599,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]); } - if (info->attrs[L2TP_ATTR_DEBUG]) - cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -689,9 +680,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf goto out; } - if (info->attrs[L2TP_ATTR_DEBUG]) - session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]); - if (info->attrs[L2TP_ATTR_RECV_SEQ]) session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]); @@ -730,7 +718,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) || nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) || nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) || - nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) || + nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) || nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype)) goto nla_put_failure; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index bd6bb17dfadb..450637ffa557 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -702,7 +702,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, - .debug = 0, }; /* Prevent l2tp_tunnel_register() from trying to set up @@ -1147,7 +1146,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - tunnel->debug = val; + /* Tunnel debug flags option is deprecated */ break; default: @@ -1199,7 +1198,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk, break; case PPPOL2TP_SO_DEBUG: - session->debug = val; + /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: @@ -1271,7 +1270,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk, switch (optname) { case PPPOL2TP_SO_DEBUG: - *val = tunnel->debug; + /* Tunnel debug flags option is deprecated */ + *val = 0; break; default: @@ 
-1304,7 +1304,8 @@ static int pppol2tp_session_getsockopt(struct sock *sk, break; case PPPOL2TP_SO_DEBUG: - *val = session->debug; + /* Session debug flags option is deprecated */ + *val = 0; break; case PPPOL2TP_SO_REORDERTO: @@ -1496,7 +1497,7 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", - tunnel->debug, + 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), @@ -1542,7 +1543,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", - session->debug, + 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, -- cgit v1.2.3 From 2b8ee4f05d4f6a6c427ad30dd6c1bb49eb2efd3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:21 -0700 Subject: tcp: bpf: Add TCP_BPF_DELACK_MAX setsockopt This change is mostly from an internal patch and adapts it from sysctl config to the bpf_setsockopt setup. The bpf_prog can set the max delay ack by using bpf_setsockopt(TCP_BPF_DELACK_MAX). This max delay ack can be communicated to its peer through bpf header option. The receiving peer can then use this max delay ack and set a potentially lower rto by using bpf_setsockopt(TCP_BPF_RTO_MIN) which will be introduced in the next patch. Another later selftest patch will also use it like the above to show how to write and parse bpf tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190021.2884000-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/uapi/linux/bpf.h | 1 + net/core/filter.c | 8 ++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_output.c | 2 ++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index aa8893c68c50..da7264a1ebfc 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index c847b1285acd..80fe7420f609 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4459,6 +4459,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, } else { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; if (optlen != sizeof(int)) return -EINVAL; @@ -4480,6 +4481,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, tp->snd_ssthresh = 
val; } break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87d3036d8bd8..44c353a39ad4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,7 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -2685,6 +2686,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; tp->snd_cwnd_cnt = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 85ff417bda7f..44ffa4891beb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3741,6 +3741,8 @@ void tcp_send_delayed_ack(struct sock *sk) ato = min(ato, max_ato); } + ato = min_t(u32, ato, inet_csk(sk)->icsk_delack_max); + /* Stay within the limit we were given */ timeout = jiffies + ato; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From ca584ba070864c606f3a54faaafe774726d5b4a1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:27 -0700 Subject: tcp: bpf: Add TCP_BPF_RTO_MIN for bpf_setsockopt This patch adds bpf_setsockopt(TCP_BPF_RTO_MIN) to allow bpf prog to set the min rto of a connection. It could be used together with the earlier patch which has added bpf_setsockopt(TCP_BPF_DELACK_MAX). A later selftest patch will communicate the max delay ack in a bpf tcp header option and then the receiving side can use bpf_setsockopt(TCP_BPF_RTO_MIN) to set a shorter rto. 
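
For illustration only (this snippet is not part of the patch; the callback choice, the SOL_TCP define, and the microsecond values are assumptions for the sketch), a sock_ops program could set both knobs on a newly established connection:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  #define SOL_TCP 6 /* == IPPROTO_TCP; avoids pulling in netinet headers */

  SEC("sockops")
  int set_bpf_rto_min(struct bpf_sock_ops *skops)
  {
          int delack_max_us = 10000; /* example: 10 ms max delayed ACK */
          int rto_min_us = 8000;     /* example: 8 ms min RTO */

          if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) {
                  /* Each call fails with -EINVAL if the value, after
                   * usecs_to_jiffies(), falls outside
                   * [TCP_TIMEOUT_MIN, TCP_DELACK_MAX] or
                   * [TCP_TIMEOUT_MIN, TCP_RTO_MIN] respectively.
                   */
                  bpf_setsockopt(skops, SOL_TCP, TCP_BPF_DELACK_MAX,
                                 &delack_max_us, sizeof(delack_max_us));
                  bpf_setsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
                                 &rto_min_us, sizeof(rto_min_us));
          }
          return 1;
  }
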
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190027.2884170-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/net/tcp.h | 2 +- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ net/ipv4/tcp.c | 2 ++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index da7264a1ebfc..c738abeb3265 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_rto_min; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; diff --git a/include/net/tcp.h b/include/net/tcp.h index eab6c7510b5b..dda778c782fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -699,7 +699,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index 80fe7420f609..075ab71b985c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4488,6 +4488,13 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; inet_csk(sk)->icsk_delack_max = timeout; break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; case TCP_SAVE_SYN: if (val < 0 || val > 1) ret = -EINVAL; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44c353a39ad4..6075cb091a20 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -418,6 +418,7 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); @@ -2686,6 +2687,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd = TCP_INIT_CWND; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min 
delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 00d211a4ea6f48e8e3b758813fe23ad28193d3bf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:46 -0700 Subject: bpf: tcp: Add bpf_skops_parse_hdr() The patch adds a function bpf_skops_parse_hdr(). It will call the bpf prog to parse the TCP header received at a tcp_sock that has at least reached the ESTABLISHED state. For the packets received during the 3WHS (SYN, SYNACK and ACK), the received skb will be available to the bpf prog during the callback in bpf_skops_established() introduced in the previous patch and in the bpf_skops_write_hdr_opt() that will be added in the next patch. Calling bpf prog to parse header is controlled by two new flags in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG and BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG. When BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG is set, the bpf prog will only be called when there is unknown option in the TCP header. When BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG is set, the bpf prog will be called on all received TCP header. This function is half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190046.2885054-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 +++- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 4 +++- 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7b0faa2bfe32..b520450170d1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -139,6 +139,36 @@ EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif #ifdef CONFIG_CGROUP_BPF +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ + bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && + BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); + bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + + if (likely(!unknown_opt && !parse_all_opt)) + return; + + /* The skb will be handled in the + * bpf_skops_established() or + * bpf_skops_write_hdr_opt(). + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_SYN_SENT: + case TCP_LISTEN: + return; + } + + /* BPF prog will have access to the sk and skb. + * + * The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. 
+ */ +} + static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -155,6 +185,10 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } #else +static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) +{ +} + static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -5623,6 +5657,8 @@ syn_challenge: goto discard; } + bpf_skops_parse_hdr(sk, skb); + return true; discard: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 331fca4315efa3bbd258fbdf8209d59d253c0480 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:52 -0700 Subject: bpf: tcp: Add bpf_skops_hdr_opt_len() and bpf_skops_write_hdr_opt() The bpf prog needs to parse the SYN header to learn what options have been sent by the peer's bpf-prog before writing its options into SYNACK. This patch adds a "syn_skb" arg to tcp_make_synack() and send_synack(). This syn_skb will eventually be made available (as read-only) to the bpf prog. This will be the only SYN packet available to the bpf prog during syncookie. For other regular cases, the bpf prog can also use the saved_syn. When writing options, the bpf prog will first be called to tell the kernel its required number of bytes. It is done by the new bpf_skops_hdr_opt_len(). The bpf prog will only be called when the new BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set in tp->bpf_sock_ops_cb_flags. When the bpf prog returns, the kernel will know how many bytes are needed and then update the "*remaining" arg accordingly. 4 byte alignment will be included in the "*remaining" before this function returns. The 4 byte aligned number of bytes will also be stored into the opts->bpf_opt_len. "bpf_opt_len" is a newly added member to the struct tcp_out_options. Then the new bpf_skops_write_hdr_opt() will call the bpf prog to write the header options. The bpf prog is only called if it has reserved spaces before (opts->bpf_opt_len > 0). The bpf prog is the last one getting a chance to reserve header space and writing the header option. These two functions are half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. 
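
For illustration, the reserve-side accounting described above can be sketched like this (a rough sketch only: charge_bpf_opt_len() is a made-up name, the real logic arrives with the other bpf pieces in a later patch, and ALIGN() is the kernel macro from <linux/align.h>):

  /* Round the length requested by the bpf prog up to a 4-byte
   * boundary (TCP option space is counted in 32-bit words via
   * th->doff) and charge it against the remaining option space.
   */
  static unsigned int charge_bpf_opt_len(unsigned int bpf_opt_len,
                                         unsigned int *remaining)
  {
          bpf_opt_len = ALIGN(bpf_opt_len, 4);
          if (bpf_opt_len > *remaining)
                  return 0;      /* no room: reserve nothing */
          *remaining -= bpf_opt_len;
          return bpf_opt_len;    /* stored in opts->bpf_opt_len */
  }

The write side then only runs when opts->bpf_opt_len is non-zero, and, as the diff notes, the bpf prog is the last one writing header options.
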
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190052.2885316-1-kafai@fb.com --- include/net/tcp.h | 6 ++- include/uapi/linux/bpf.h | 3 +- net/ipv4/tcp_input.c | 5 +- net/ipv4/tcp_ipv4.c | 5 +- net/ipv4/tcp_output.c | 105 ++++++++++++++++++++++++++++++++++++----- net/ipv6/tcp_ipv6.c | 5 +- tools/include/uapi/linux/bpf.h | 3 +- 7 files changed, 109 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index c186dbf731e1..3e768a6b8264 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -455,7 +455,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -2035,7 +2036,8 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b520450170d1..8c9da4b65dae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6824,7 +6824,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, TCP_SYNACK_FASTOPEN); + &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { reqsk_fastopen_remove(fastopen_sk, req, false); @@ -6842,7 +6842,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_timeout_init((struct sock *)req)); af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? 
TCP_SYNACK_NORMAL : - TCP_SYNACK_COOKIE); + TCP_SYNACK_COOKIE, + skb); if (want_cookie) { reqsk_free(req); return 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5084333b5ab6..631a5ee0dd4e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -965,7 +965,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -976,7 +977,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 44ffa4891beb..673db6879e46 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -438,6 +438,7 @@ struct tcp_out_options { u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ u8 hash_size; /* bytes in hash_location */ + u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ @@ -452,6 +453,59 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) #endif } +#ifdef CONFIG_CGROUP_BPF +/* req, syn_skb and synack_type are used when writing synack */ +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || + !*remaining) + return; + + /* The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. + */ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ + if (likely(!opts->bpf_opt_len)) + return; + + /* The bpf running context preparation and the actual bpf prog + * calling will be implemented in a later PATCH together with + * other bpf pieces. + */ +} +#else +static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts, + unsigned int *remaining) +{ +} + +static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct sk_buff *syn_skb, + enum tcp_synack_type synack_type, + struct tcp_out_options *opts) +{ +} +#endif + /* Write previously computed TCP options to the packet. 
* * Beware: Something in the Internet is very sensitive to the ordering of @@ -691,6 +745,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -701,7 +757,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, struct tcp_out_options *opts, const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; @@ -758,6 +815,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, + synack_type, opts, &remaining); + return MAX_TCP_OPTION_SPACE - remaining; } @@ -826,6 +886,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); + + size = MAX_TCP_OPTION_SPACE - remaining; + } + return size; } @@ -1213,6 +1282,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif + /* BPF prog is the last one writing header option */ + bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, tcp_v6_send_check, tcp_v4_send_check, sk, skb); @@ -3336,20 +3408,20 @@ int tcp_send_synack(struct sock *sk) } /** - * tcp_make_synack - Prepare a SYN-ACK. - * sk: listener socket - * dst: dst entry attached to the SYNACK - * req: request_sock pointer - * foc: cookie for tcp fast open - * synack_type: Type of synback to prepare - * - * Allocate one skb and build a SYNACK packet. - * @dst is consumed : Caller should not use it again. + * tcp_make_synack - Allocate one skb and build a SYNACK packet. + * @sk: listener socket + * @dst: dst entry attached to the SYNACK. It is consumed and caller + * should not use it again. + * @req: request_sock pointer + * @foc: cookie for tcp fast open + * @synack_type: Type of synack to prepare + * @syn_skb: SYN packet just received. It could be NULL for rtx case. 
*/ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -3408,8 +3480,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + /* bpf program will be interested in the tcp_flags */ + TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, - foc, synack_type) + sizeof(*th); + foc, synack_type, + syn_skb) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -3441,6 +3516,9 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_unlock(); #endif + bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, + synack_type, &opts); + skb->skb_mstamp_ns = now; tcp_add_tx_delay(skb, tp); @@ -3936,7 +4014,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + NULL); if (!res) { __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 305870a72352..87a633e1fbef 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -501,7 +501,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type) + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = tcp_inet6_sk(sk); @@ -515,7 +516,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc, synack_type); + skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 0813a841566f0962a5551be7749b43c45f0022a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:04 -0700 Subject: bpf: tcp: Allow bpf prog to write and parse TCP header option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Note: The TCP changes here is mainly to implement the bpf pieces into the bpf_skops_*() functions introduced in the earlier patches. ] The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. 
It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to a production environment.

The same flexibility can be extended to writing TCP header options. It is not uncommon that people want to test new TCP header options to improve TCP performance. Another use case is data centers that have a more controlled environment and more flexibility in putting header options for internal-only use. For example, we want to test the idea of putting the maximum delay ACK in a TCP header option, which is similar to a draft RFC proposal [1].

This patch introduces the necessary BPF API and uses it in the TCP stack to allow BPF_PROG_TYPE_SOCK_OPS programs to parse and write TCP header options. It currently supports most TCP packets, except RST.

Supported TCP header options:
─────────────────────────────
This patch allows the bpf-prog to write any option kind. Different bpf-progs can write their own options by calling the new helper bpf_store_hdr_opt(); the helper will ensure there is no duplicated option in the header. Allowing the bpf-prog to write any option kind gives it a lot of flexibility, and could also let the bpf-prog support a recently standardized option on an older kernel.

Sockops Callback Flags:
───────────────────────
The bpf program will only be called to parse/write tcp header options if the following newly added callback flags are enabled in tp->bpf_sock_ops_cb_flags:

BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG
BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG

A few words on the PARSE CB flags: when they are turned on, the bpf-prog will be called on packets received at a sk that has at least reached the ESTABLISHED state. The parsing of the SYN-SYNACK-ACK is discussed in the "3 Way HandShake" section.

The default is off for all of the above new CB flags, i.e. the bpf prog will not be called to parse or write bpf hdr options. There are detailed comments on these new cb flags in the UAPI bpf.h.

sock_ops->skb_data and bpf_load_hdr_opt()
─────────────────────────────────────────
sock_ops->skb_data and sock_ops->skb_data_end cover the whole TCP header and its options. They are read-only. The new bpf_load_hdr_opt() helps to read a particular option "kind" from the skb_data. Please refer to the comment in the UAPI bpf.h; it has details on what skb_data contains under different sock_ops->op.

3 Way HandShake
───────────────
The bpf-prog can learn if it is sending a SYN or a SYNACK by reading sock_ops->skb_tcp_flags.

* Passive side

When writing a SYNACK (i.e. sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB), the received SYN skb will be available to the bpf prog. The bpf prog can use the SYN skb (which may carry the header options sent from the remote bpf prog) to decide what bpf header options should be written to the outgoing SYNACK skb. The SYN packet can be obtained by getsockopt(TCP_BPF_SYN*); more on this later. Also, the bpf prog can learn if it is in syncookie mode by checking sock_ops->args[0] == BPF_WRITE_HDR_TCP_SYNACK_COOKIE.

The bpf prog can store the received SYN pkt by using the existing bpf_setsockopt(TCP_SAVE_SYN). An example in a later patch does this. [ Note that the fullsock here is a listen sk; bpf_sk_storage is not very useful here since the listen sk will be shared by many concurrent connection requests.
Extending bpf_sk_storage support to request_sock will add weight to the minisock, and it is not necessarily better than storing the whole ~100-byte SYN pkt. ]

When the connection is established, the bpf prog will be called in the existing PASSIVE_ESTABLISHED_CB callback. At that time, the bpf prog can get the header options from the saved syn and then apply the needed operations to the newly established socket. As an example, a later patch will use the max delay ack specified in the SYN header to set the RTO of the newly established connection.

The received ACK (that concludes the 3WHS) will also be available to the bpf prog during PASSIVE_ESTABLISHED_CB through sock_ops->skb_data. It could be useful in the syncookie scenario; more on this later.

There is an existing getsockopt "TCP_SAVED_SYN" to return the whole saved syn pkt, which includes the IP[46] header and the TCP header. A few "TCP_BPF_SYN*" getsockopt options have been added to allow specifying where to start getting from, e.g. starting from the TCP header, or from the IP[46] header.

The new getsockopt(TCP_BPF_SYN*) will also know where it can get the SYN packet from:
- (a) the just-received syn (available when the bpf prog is writing the SYNACK), which is the only way to get the SYN during syncookie mode, or
- (b) the saved syn (available in PASSIVE_ESTABLISHED_CB and also other existing CBs).

The bpf prog does not need to know where the SYN pkt is coming from; getsockopt(TCP_BPF_SYN*) hides these details.

Similarly, a flag "BPF_LOAD_HDR_OPT_TCP_SYN" is also added to bpf_load_hdr_opt() to read a particular header option from the SYN packet.

* Fastopen

Fastopen should work the same as the regular non-fastopen case. This is tested in a later patch.

* Syncookie

For syncookie, the later example patch asks the active side's bpf prog to resend the header options in the ACK. The server can use bpf_load_hdr_opt() to look at the options in this received ACK during PASSIVE_ESTABLISHED_CB.

* Active side

The bpf prog will get a chance to write the bpf header options in the SYN packet during WRITE_HDR_OPT_CB. The received SYNACK pkt will also be available to the bpf prog during the existing ACTIVE_ESTABLISHED_CB callback, through sock_ops->skb_data and bpf_load_hdr_opt().

* Turn off header CB flags after 3WHS

If the bpf prog does not need to write/parse header options beyond the 3WHS, it can clear the bpf_sock_ops_cb_flags to avoid being called for header options. Or the bpf-prog can choose to leave the UNKNOWN_HDR_OPT_CB_FLAG on so that the kernel will only call it when there is an option that the kernel cannot handle.
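
Putting the pieces together, a minimal sock_ops sketch (illustrative only, not the later selftest; the experimental kind 254 with the 2-byte magic 0xeB9F follows the RFC6994-style search format that bpf_load_hdr_opt() expects, and the places where the cb flags are set are simplified):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("sockops")
  int hdr_opt_demo(struct bpf_sock_ops *skops)
  {
          /* kind 254, kind-len 4, 2-byte magic 0xeB9F */
          __u8 opt[4] = { 254, 4, 0xeb, 0x9f };

          switch (skops->op) {
          case BPF_SOCK_OPS_TCP_CONNECT_CB:
          case BPF_SOCK_OPS_TCP_LISTEN_CB:
                  /* Opt in to the header option callbacks; a listen sk's
                   * flags are inherited by its children (see
                   * bpf_skops_init_child() below).
                   */
                  bpf_sock_ops_cb_flags_set(skops,
                          BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
                          BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
                  break;
          case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
                  /* Reserve space; the kernel 4-byte aligns the total. */
                  bpf_reserve_hdr_opt(skops, sizeof(opt), 0);
                  break;
          case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
                  /* Write the whole option: kind, kind-len, data. */
                  bpf_store_hdr_opt(skops, opt, sizeof(opt), 0);
                  break;
          case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
                  /* Search for the same option in the received header;
                   * on success it is copied back into opt[].
                   */
                  bpf_load_hdr_opt(skops, opt, sizeof(opt), 0);
                  break;
          }
          return 1;
  }

A real program would check the helpers' return values and, as noted above, clear the write/parse cb flags once they are no longer needed after the 3WHS.
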
[1]: draft-wang-tcpm-low-latency-opt-00 https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190104.2885895-1-kafai@fb.com --- include/linux/bpf-cgroup.h | 25 +++ include/linux/filter.h | 4 + include/net/tcp.h | 49 ++++++ include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++- net/core/filter.c | 365 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 20 ++- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 104 +++++++++++- tools/include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++- 9 files changed, 1150 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/filter.h b/include/linux/filter.h index c427dfa5f908..995625950cc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1241,8 +1241,12 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; u8 op; u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e768a6b8264..1f967b4e22f6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2235,6 +2235,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#ifdef CONFIG_CGROUP_BPF +/* Copy the listen sk's HDR_OPT_CB flags to its child. + * + * During 3-Way-HandShake, the synack is usually sent from + * the listen sk with the HDR_OPT_CB flags set so that + * bpf-prog will be called to write the BPF hdr option. + * + * In fastopen, the child sk is used to send synack instead + * of the listen sk. Thus, inheriting the HDR_OPT_CB flags + * from the listen sk gives the bpf-prog a chance to write + * BPF hdr option in the synack pkt during fastopen. 
+ * + * Both fastopen and non-fastopen child will inherit the + * HDR_OPT_CB flags to keep the bpf-prog having a consistent + * behavior when deciding to clear this cb flags (or not) + * during the PASSIVE_ESTABLISHED_CB. + * + * In the future, other cb flags could be inherited here also. + */ +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ + tcp_sk(child)->bpf_sock_ops_cb_flags = + tcp_sk(sk)->bpf_sock_ops_cb_flags & + (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. 
+ * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. 
+ */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. 
SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { diff --git a/net/core/filter.c b/net/core/filter.c index 1608f4b3987f..ab5603d5b62a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4669,9 +4669,82 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, + int optname, const u8 **start) +{ + struct sk_buff *syn_skb = bpf_sock->syn_skb; + const u8 *hdr_start; + int ret; + + if (syn_skb) { + /* sk is a request_sock here */ + + if (optname == TCP_BPF_SYN) { + hdr_start = syn_skb->data; + ret = tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_IP */ + hdr_start = skb_network_header(syn_skb); + ret = skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); + } + } else { + struct sock *sk = bpf_sock->sk; + struct saved_syn *saved_syn; + + if (sk->sk_state == TCP_NEW_SYN_RECV) + /* synack retransmit. bpf_sock->syn_skb will + * not be available. It has to resort to + * saved_syn (if it is saved). 
+ */ + saved_syn = inet_reqsk(sk)->saved_syn; + else + saved_syn = tcp_sk(sk)->saved_syn; + + if (!saved_syn) + return -ENOENT; + + if (optname == TCP_BPF_SYN) { + hdr_start = saved_syn->data + + saved_syn->network_hdrlen; + ret = saved_syn->tcp_hdrlen; + } else { + /* optname == TCP_BPF_SYN_IP */ + hdr_start = saved_syn->data; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; + } + } + + *start = hdr_start; + return ret; +} + BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_IP) { + int ret, copy_len = 0; + const u8 *start; + + ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start); + if (ret > 0) { + copy_len = ret; + if (optlen < copy_len) { + copy_len = optlen; + ret = -ENOSPC; + } + + memcpy(optval, start, copy_len); + } + + /* Zero out unused buffer at the end */ + memset(optval + copy_len, 0, optlen - copy_len); + + return ret; + } + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6165,6 +6238,232 @@ static const struct bpf_func_proto bpf_sk_assign_proto = { .arg3_type = ARG_ANYTHING, }; +static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend, + u8 search_kind, const u8 *magic, + u8 magic_len, bool *eol) +{ + u8 kind, kind_len; + + *eol = false; + + while (op < opend) { + kind = op[0]; + + if (kind == TCPOPT_EOL) { + *eol = true; + return ERR_PTR(-ENOMSG); + } else if (kind == TCPOPT_NOP) { + op++; + continue; + } + + if (opend - op < 2 || opend - op < op[1] || op[1] < 2) + /* Something is wrong in the received header. + * Follow the TCP stack's tcp_parse_options() + * and just bail here. + */ + return ERR_PTR(-EFAULT); + + kind_len = op[1]; + if (search_kind == kind) { + if (!magic_len) + return op; + + if (magic_len > kind_len - 2) + return ERR_PTR(-ENOMSG); + + if (!memcmp(&op[2], magic, magic_len)) + return op; + } + + op += kind_len; + } + + return ERR_PTR(-ENOMSG); +} + +BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + void *, search_res, u32, len, u64, flags) +{ + bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN; + const u8 *op, *opend, *magic, *search = search_res; + u8 search_kind, search_len, copy_len, magic_len; + int ret; + + /* 2 byte is the minimal option len except TCPOPT_NOP and + * TCPOPT_EOL which are useless for the bpf prog to learn + * and this helper disallow loading them also. + */ + if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN) + return -EINVAL; + + search_kind = search[0]; + search_len = search[1]; + + if (search_len > len || search_kind == TCPOPT_NOP || + search_kind == TCPOPT_EOL) + return -EINVAL; + + if (search_kind == TCPOPT_EXP || search_kind == 253) { + /* 16 or 32 bit magic. 
+2 for kind and kind length */ + if (search_len != 4 && search_len != 6) + return -EINVAL; + magic = &search[2]; + magic_len = search_len - 2; + } else { + if (search_len) + return -EINVAL; + magic = NULL; + magic_len = 0; + } + + if (load_syn) { + ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op); + if (ret < 0) + return ret; + + opend = op + ret; + op += sizeof(struct tcphdr); + } else { + if (!bpf_sock->skb || + bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB) + /* This bpf_sock->op cannot call this helper */ + return -EPERM; + + opend = bpf_sock->skb_data_end; + op = bpf_sock->skb->data + sizeof(struct tcphdr); + } + + op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len, + &eol); + if (IS_ERR(op)) + return PTR_ERR(op); + + copy_len = op[1]; + ret = copy_len; + if (copy_len > len) { + ret = -ENOSPC; + copy_len = len; + } + + memcpy(search_res, op, copy_len); + return ret; +} + +static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { + .func = bpf_sock_ops_load_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + const void *, from, u32, len, u64, flags) +{ + u8 new_kind, new_kind_len, magic_len = 0, *opend; + const u8 *op, *new_op, *magic = NULL; + struct sk_buff *skb; + bool eol; + + if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB) + return -EPERM; + + if (len < 2 || flags) + return -EINVAL; + + new_op = from; + new_kind = new_op[0]; + new_kind_len = new_op[1]; + + if (new_kind_len > len || new_kind == TCPOPT_NOP || + new_kind == TCPOPT_EOL) + return -EINVAL; + + if (new_kind_len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + /* 253 is another experimental kind */ + if (new_kind == TCPOPT_EXP || new_kind == 253) { + if (new_kind_len < 4) + return -EINVAL; + /* Match for the 2 byte magic also. + * RFC 6994: the magic could be 2 or 4 bytes. + * Hence, matching by 2 byte only is on the + * conservative side but it is the right + * thing to do for the 'search-for-duplication' + * purpose. + */ + magic = &new_op[2]; + magic_len = 2; + } + + /* Check for duplication */ + skb = bpf_sock->skb; + op = skb->data + sizeof(struct tcphdr); + opend = bpf_sock->skb_data_end; + + op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len, + &eol); + if (!IS_ERR(op)) + return -EEXIST; + + if (PTR_ERR(op) != -ENOMSG) + return PTR_ERR(op); + + if (eol) + /* The option has been ended. Treat it as no more + * header option can be written. + */ + return -ENOSPC; + + /* No duplication found. Store the header option. 
*/ + memcpy(opend, from, new_kind_len); + + bpf_sock->remaining_opt_len -= new_kind_len; + bpf_sock->skb_data_end += new_kind_len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { + .func = bpf_sock_ops_store_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, + u32, len, u64, flags) +{ + if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB) + return -EPERM; + + if (flags || len < 2) + return -EINVAL; + + if (len > bpf_sock->remaining_opt_len) + return -ENOSPC; + + bpf_sock->remaining_opt_len -= len; + + return 0; +} + +static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { + .func = bpf_sock_ops_reserve_hdr_opt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -6193,6 +6492,9 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || +#endif +#ifdef CONFIG_INET + func == bpf_sock_ops_store_hdr_opt || #endif func == bpf_lwt_in_push_encap || func == bpf_lwt_xmit_push_encap) @@ -6565,6 +6867,12 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; #ifdef CONFIG_INET + case BPF_FUNC_load_hdr_opt: + return &bpf_sock_ops_load_hdr_opt_proto; + case BPF_FUNC_store_hdr_opt: + return &bpf_sock_ops_store_hdr_opt_proto; + case BPF_FUNC_reserve_hdr_opt: + return &bpf_sock_ops_reserve_hdr_opt_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ @@ -7364,6 +7672,20 @@ static bool sock_ops_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET_OR_NULL; break; + case offsetof(struct bpf_sock_ops, skb_data): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sock_ops, skb_data_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, + size_default); default: if (size != size_default) return false; @@ -8701,6 +9023,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, case offsetof(struct bpf_sock_ops, sk): SOCK_OPS_GET_SK(); break; + case offsetof(struct bpf_sock_ops, skb_data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb_data_end), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb_data_end)); + break; + case offsetof(struct bpf_sock_ops, skb_data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, data)); + break; + case offsetof(struct bpf_sock_ops, skb_len): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 
1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + si->dst_reg, si->dst_reg, + offsetof(struct sk_buff, len)); + break; + case offsetof(struct bpf_sock_ops, skb_tcp_flags): + off = offsetof(struct sk_buff, cb); + off += offsetof(struct tcp_skb_cb, tcp_flags); + *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb, + tcp_flags), + si->dst_reg, si->dst_reg, off); + break; } return insn - insn_buf; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8c9da4b65dae..319cc7fd5117 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -146,6 +146,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); + struct bpf_sock_ops_kern sock_ops; if (likely(!unknown_opt && !parse_all_opt)) return; @@ -161,12 +162,15 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) return; } - /* BPF prog will have access to the sk and skb. - * - * The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + sock_owned_by_me(sk); + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); + + BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } static void bpf_skops_established(struct sock *sk, int bpf_op, @@ -180,7 +184,9 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; sock_ops.sk = sk; - /* skb will be passed to the bpf prog in a later patch. 
*/ + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 495dda2449fe..56c306e3cd2f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -548,6 +548,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + bpf_skops_init_child(sk, newsk); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 673db6879e46..ab79d36ed07f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -454,6 +454,18 @@ static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) } #ifdef CONFIG_CGROUP_BPF +static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb, + enum tcp_synack_type synack_type) +{ + if (unlikely(!skb)) + return BPF_WRITE_HDR_TCP_CURRENT_MSS; + + if (unlikely(synack_type == TCP_SYNACK_COOKIE)) + return BPF_WRITE_HDR_TCP_SYNACK_COOKIE; + + return 0; +} + /* req, syn_skb and synack_type are used when writing synack */ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -462,15 +474,60 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, unsigned int *remaining) { + struct bpf_sock_ops_kern sock_ops; + int err; + if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) || !*remaining) return; - /* The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */ + + /* init sock_ops */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB; + + if (req) { + /* The listen "sk" cannot be passed here because + * it is not locked. It would not make too much + * sense to do bpf_setsockopt(listen_sk) based + * on individual connection request also. + * + * Thus, "req" is passed here and the cgroup-bpf-progs + * of the listen "sk" will be run. + * + * "req" is also used here for fastopen even the "sk" here is + * a fullsock "child" sk. It is to keep the behavior + * consistent between fastopen and non-fastopen on + * the bpf programming side. 
+ */ + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = *remaining; + /* tcp_current_mss() does not pass a skb */ + if (skb) + bpf_skops_init_skb(&sock_ops, skb, 0); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err || sock_ops.remaining_opt_len == *remaining) + return; + + opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len; + /* round up to 4 bytes */ + opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3; + + *remaining -= opts->bpf_opt_len; } static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, @@ -479,13 +536,42 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, enum tcp_synack_type synack_type, struct tcp_out_options *opts) { - if (likely(!opts->bpf_opt_len)) + u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len; + struct bpf_sock_ops_kern sock_ops; + int err; + + if (likely(!max_opt_len)) return; - /* The bpf running context preparation and the actual bpf prog - * calling will be implemented in a later PATCH together with - * other bpf pieces. - */ + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + + sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB; + + if (req) { + sock_ops.sk = (struct sock *)req; + sock_ops.syn_skb = syn_skb; + } else { + sock_owned_by_me(sk); + + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + } + + sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type); + sock_ops.remaining_opt_len = max_opt_len; + first_opt_off = tcp_hdrlen(skb) - max_opt_len; + bpf_skops_init_skb(&sock_ops, skb, first_opt_off); + + err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk); + + if (err) + nr_written = 0; + else + nr_written = max_opt_len - sock_ops.remaining_opt_len; + + if (nr_written < max_opt_len) + memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP, + max_opt_len - nr_written); } #else static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. 
+ * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. 
+ * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. 
+ * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 267cf9fa43d1c9d525d5d818a8651f2900e3aa9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:23 -0700 Subject: tcp: bpf: Optionally store mac header in TCP_SAVE_SYN This patch is adapted from Eric's patch in an earlier discussion [1]. 
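To make the new option concrete before the details, below is a minimal
sketch (illustrative only, not part of this patch) of a sock_ops program
that reads back the saved SYN, mac header included, through the
bpf_getsockopt(TCP_BPF_SYN_MAC) support added below. It assumes the
listener already had TCP_SAVE_SYN set to 2 (the new optval) before the
SYN arrived; otherwise the helper returns -ENOENT. The buffer size,
section name, and local #defines are illustrative:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #ifndef SOL_TCP
    #define SOL_TCP 6                /* IPPROTO_TCP */
    #endif
    #define TCP_BPF_SYN_MAC 1007     /* value added by this patch */

    SEC("sockops")
    int read_saved_syn(struct bpf_sock_ops *skops)
    {
            /* mac + IP[46] + TCP header, generously sized */
            char hdrs[160];
            int ret;

            if (skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) {
                    /* >0: bytes copied; -ENOENT: the mac header was
                     * not saved (i.e. TCP_SAVE_SYN was not set to 2).
                     */
                    ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_MAC,
                                         hdrs, sizeof(hdrs));
                    if (ret > 0) {
                            /* hdrs[0..ret) now holds the saved headers */
                    }
            }
            return 1;
    }

    char _license[] SEC("license") = "GPL";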
TCP_SAVE_SYN currently stores only the network header and the TCP
header. This patch allows it to optionally store the mac header as
well, when the setsockopt() optval is 2.

It requires one more bit for the "save_syn" bit field in tcp_sock.
This patch achieves that by moving the syn_smc bit next to is_mptcp.
syn_smc is currently used with the TCP experimental option. Since
syn_smc is only used when CONFIG_SMC is enabled, this patch also
guards it with IS_ENABLED(CONFIG_SMC), as is_mptcp is guarded with
IS_ENABLED(CONFIG_MPTCP).

The mac_hdrlen is also stored in "struct saved_syn" so that a bpf prog
can quickly compute the offset at which to start reading, whether from
the network header or from the TCP header.

[1]: https://lore.kernel.org/netdev/CANn89iLJNWh6bkH7DNhy_kmcAexuUCccqERqe7z2QsvPhGrYPQ@mail.gmail.com/

Suggested-by: Eric Dumazet
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/bpf/20200820190123.2886935-1-kafai@fb.com
---
 include/linux/tcp.h            | 13 ++++++++-----
 include/net/request_sock.h     |  1 +
 include/uapi/linux/bpf.h       |  1 +
 net/core/filter.c              | 27 ++++++++++++++++++++++-----
 net/ipv4/tcp.c                 |  3 ++-
 net/ipv4/tcp_input.c           | 14 +++++++++++++-
 tools/include/uapi/linux/bpf.h |  1 +
 7 files changed, 48 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 29d166263ae7..56ff2952edaf 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -239,14 +239,13 @@ struct tcp_sock {
 		repair  : 1,
 		frto    : 1;/* F-RTO (RFC5682) activated in CA_Loss */
 	u8	repair_queue;
-	u8	syn_data:1,	/* SYN includes data */
+	u8	save_syn:2,	/* Save headers of SYN packet */
+		syn_data:1,	/* SYN includes data */
 		syn_fastopen:1,	/* SYN includes Fast Open option */
 		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
 		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
 		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
-		save_syn:1,	/* Save headers of SYN packet */
-		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
-		syn_smc:1;	/* SYN includes SMC */
+		is_cwnd_limited:1;/* forward progress limited by snd_cwnd?
*/ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -393,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -488,7 +490,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { - return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 7d9ed99a77bd..29e41ff3ec93 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -42,6 +42,7 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { + u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f67ec5d9e57d..544b89a64918 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4540,6 +4540,7 @@ enum { */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ab5603d5b62a..47eef9a0be6a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4682,11 +4682,16 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, if (optname == TCP_BPF_SYN) { hdr_start = syn_skb->data; ret = tcp_hdrlen(syn_skb); - } else { - /* optname == TCP_BPF_SYN_IP */ + } else if (optname == TCP_BPF_SYN_IP) { hdr_start = skb_network_header(syn_skb); ret = skb_network_header_len(syn_skb) + tcp_hdrlen(syn_skb); + } else { + /* optname == TCP_BPF_SYN_MAC */ + hdr_start = skb_mac_header(syn_skb); + ret = skb_mac_header_len(syn_skb) + + skb_network_header_len(syn_skb) + + tcp_hdrlen(syn_skb); } } else { struct sock *sk = bpf_sock->sk; @@ -4706,12 +4711,24 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, if (optname == TCP_BPF_SYN) { hdr_start = saved_syn->data + + saved_syn->mac_hdrlen + saved_syn->network_hdrlen; ret = saved_syn->tcp_hdrlen; + } else if (optname == TCP_BPF_SYN_IP) { + hdr_start = saved_syn->data + + saved_syn->mac_hdrlen; + ret = saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } else { - /* optname == TCP_BPF_SYN_IP */ + /* optname == TCP_BPF_SYN_MAC */ + + /* TCP_SAVE_SYN may not have saved the mac hdr */ + if (!saved_syn->mac_hdrlen) + return -ENOENT; + hdr_start = saved_syn->data; - ret = saved_syn->network_hdrlen + + ret = saved_syn->mac_hdrlen + + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } } @@ -4724,7 +4741,7 @@ BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && - optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_IP) { + optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; const u8 *start; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6075cb091a20..57a568875539 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3211,7 +3211,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; case 
TCP_SAVE_SYN:
-		if (val < 0 || val > 1)
+		/* 0: disable, 1: enable, 2: start from ether_header */
+		if (val < 0 || val > 2)
 			err = -EINVAL;
 		else
 			tp->save_syn = val;

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 319cc7fd5117..4337841faeff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6676,13 +6676,25 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
 	if (tcp_sk(sk)->save_syn) {
 		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
 		struct saved_syn *saved_syn;
+		u32 mac_hdrlen;
+		void *base;
+
+		if (tcp_sk(sk)->save_syn == 2) {  /* Save full header. */
+			base = skb_mac_header(skb);
+			mac_hdrlen = skb_mac_header_len(skb);
+			len += mac_hdrlen;
+		} else {
+			base = skb_network_header(skb);
+			mac_hdrlen = 0;
+		}

 		saved_syn = kmalloc(struct_size(saved_syn, data, len),
 				    GFP_ATOMIC);
 		if (saved_syn) {
+			saved_syn->mac_hdrlen = mac_hdrlen;
 			saved_syn->network_hdrlen = skb_network_header_len(skb);
 			saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
-			memcpy(saved_syn->data, skb_network_header(skb), len);
+			memcpy(saved_syn->data, base, len);
 			req->saved_syn = saved_syn;
 		}
 	}

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f67ec5d9e57d..544b89a64918 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4540,6 +4540,7 @@ enum {
 	 */
 	TCP_BPF_SYN		= 1005, /* Copy the TCP header */
 	TCP_BPF_SYN_IP		= 1006, /* Copy the IP[46] and TCP header */
+	TCP_BPF_SYN_MAC		= 1007, /* Copy the MAC, IP[46], and TCP header */
 };

 enum {
--
cgit v1.2.3


From f836a56e84ffc9f1a1cd73f77e10404ca46a4616 Mon Sep 17 00:00:00 2001
From: KP Singh
Date: Tue, 25 Aug 2020 20:29:15 +0200
Subject: bpf: Generalize bpf_sk_storage

Refactor the functionality in bpf_sk_storage.c so that the concept of
storage linked to kernel objects can be extended to other objects like
inode, task_struct etc.

Each new local storage will still be a separate map and provide its own
set of helpers. This allows for future object-specific extensions while
still sharing a lot of the underlying implementation.
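To make the refactor concrete, a new owner type only has to supply the
three new callbacks. The sketch below is not part of this patch;
"struct foo" and the foo_* names are hypothetical, showing the minimum
for an owner that does no memory accounting. Note that charging is
optional: mem_charge() in the diff below treats a missing
map_local_storage_charge callback as "charge nothing".

    #include <linux/bpf.h>
    #include <net/bpf_sk_storage.h>

    /* A hypothetical kernel object that can own local storage. */
    struct foo {
            struct bpf_local_storage __rcu *bpf_storage;
    };

    static int foo_storage_charge(struct bpf_local_storage_map *smap,
                                  void *owner, u32 size)
    {
            return 0;       /* no owner-specific memory accounting */
    }

    static void foo_storage_uncharge(struct bpf_local_storage_map *smap,
                                     void *owner, u32 size)
    {
    }

    static struct bpf_local_storage __rcu **foo_storage_ptr(void *owner)
    {
            return &((struct foo *)owner)->bpf_storage;
    }

    const struct bpf_map_ops foo_storage_map_ops = {
            /* ... the usual alloc/free/lookup/update ops ... */
            .map_local_storage_charge   = foo_storage_charge,
            .map_local_storage_uncharge = foo_storage_uncharge,
            .map_owner_storage_ptr      = foo_storage_ptr,
    };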
This includes the changes suggested by Martin in: https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/ adding new map operations to support bpf_local_storage maps: * storages for different kernel objects to optionally have different memory charging strategy (map_local_storage_charge, map_local_storage_uncharge) * Functionality to extract the storage pointer from a pointer to the owning object (map_owner_storage_ptr) Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-4-kpsingh@chromium.org --- include/linux/bpf.h | 8 ++ include/net/bpf_sk_storage.h | 52 +++++++++ include/uapi/linux/bpf.h | 8 +- net/core/bpf_sk_storage.c | 238 +++++++++++++++++++++++++++-------------- tools/include/uapi/linux/bpf.h | 8 +- 5 files changed, 228 insertions(+), 86 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81f38e2fda78..8c443b93ac11 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,12 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 950c5aaba15e..9e631b5466e3 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,8 +3,15 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include +#include +#include #include #include +#include +#include +#include +#include struct sock; @@ -13,6 +20,7 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; @@ -34,6 +42,50 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx); +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + 
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); struct bpf_sk_storage_diag * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC__storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index ec61ee7c7ee4..cd8b7017913b 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -84,7 +84,7 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; struct hlist_head list; /* List of bpf_local_storage_elem */ - struct sock *owner; /* The object that owns the above "list" of + void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. 
*/ struct rcu_head rcu; @@ -110,6 +110,33 @@ static int omem_charge(struct sock *sk, unsigned int size) return -ENOMEM; } +static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) +{ + struct bpf_map *map = &smap->map; + + if (!map->ops->map_local_storage_charge) + return 0; + + return map->ops->map_local_storage_charge(smap, owner, size); +} + +static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner, + u32 size) +{ + struct bpf_map *map = &smap->map; + + if (map->ops->map_local_storage_uncharge) + map->ops->map_local_storage_uncharge(smap, owner, size); +} + +static struct bpf_local_storage __rcu ** +owner_storage(struct bpf_local_storage_map *smap, void *owner) +{ + struct bpf_map *map = &smap->map; + + return map->ops->map_owner_storage_ptr(owner); +} + static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem) { return !hlist_unhashed(&selem->snode); @@ -120,13 +147,13 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) return !hlist_unhashed(&selem->map_node); } -static struct bpf_local_storage_elem * -bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, - void *value, bool charge_omem) +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, + void *value, bool charge_mem) { struct bpf_local_storage_elem *selem; - if (charge_omem && omem_charge(sk, smap->elem_size)) + if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); @@ -136,8 +163,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, return selem; } - if (charge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + if (charge_mem) + mem_uncharge(smap, owner, smap->elem_size); return NULL; } @@ -146,32 +173,32 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, struct sock *sk, * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -static bool -bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem) +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem) { struct bpf_local_storage_map *smap; bool free_local_storage; - struct sock *sk; + void *owner; smap = rcu_dereference(SDATA(selem)->smap); - sk = local_storage->owner; + owner = local_storage->owner; /* All uncharging on the owner must be done first. * The owner may be freed once the last selem is unlinked * from local_storage. */ - if (uncharge_omem) - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + if (uncharge_mem) + mem_uncharge(smap, owner, smap->elem_size); free_local_storage = hlist_is_singular_node(&selem->snode, &local_storage->list); if (free_local_storage) { - atomic_sub(sizeof(struct bpf_local_storage), &sk->sk_omem_alloc); + mem_uncharge(smap, owner, sizeof(struct bpf_local_storage)); local_storage->owner = NULL; - /* After this RCU_INIT, sk may be freed and cannot be used */ - RCU_INIT_POINTER(sk->sk_bpf_storage, NULL); + + /* After this RCU_INIT, owner may be freed and cannot be used */ + RCU_INIT_POINTER(*owner_storage(smap, owner), NULL); /* local_storage is not freed now. 
local_storage->lock is * still held and raw_spin_unlock_bh(&local_storage->lock) @@ -209,23 +236,22 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) local_storage = rcu_dereference(selem->local_storage); raw_spin_lock_bh(&local_storage->lock); if (likely(selem_linked_to_storage(selem))) - free_local_storage = - bpf_selem_unlink_storage_nolock(local_storage, selem, true); + free_local_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, true); raw_spin_unlock_bh(&local_storage->lock); if (free_local_storage) kfree_rcu(local_storage, rcu); } -static void -bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem) +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem) { RCU_INIT_POINTER(selem->local_storage, local_storage); hlist_add_head(&selem->snode, &local_storage->list); } -static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; @@ -242,8 +268,8 @@ static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) raw_spin_unlock_bh(&b->lock); } -static void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem) +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); @@ -253,7 +279,7 @@ static void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_bh(&b->lock); } -static void bpf_selem_unlink(struct bpf_local_storage_elem *selem) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from @@ -263,7 +289,7 @@ static void bpf_selem_unlink(struct bpf_local_storage_elem *selem) __bpf_selem_unlink_storage(selem); } -static struct bpf_local_storage_data * +struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit) @@ -329,40 +355,45 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, return 0; } -static int sk_storage_alloc(struct sock *sk, +int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *first_selem) { - struct bpf_local_storage *prev_sk_storage, *sk_storage; + struct bpf_local_storage *prev_storage, *storage; + struct bpf_local_storage **owner_storage_ptr; int err; - err = omem_charge(sk, sizeof(*sk_storage)); + err = mem_charge(smap, owner, sizeof(*storage)); if (err) return err; - sk_storage = kzalloc(sizeof(*sk_storage), GFP_ATOMIC | __GFP_NOWARN); - if (!sk_storage) { + storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + if (!storage) { err = -ENOMEM; goto uncharge; } - INIT_HLIST_HEAD(&sk_storage->list); - raw_spin_lock_init(&sk_storage->lock); - sk_storage->owner = sk; - bpf_selem_link_storage_nolock(sk_storage, first_selem); + INIT_HLIST_HEAD(&storage->list); + raw_spin_lock_init(&storage->lock); + storage->owner = owner; + + bpf_selem_link_storage_nolock(storage, first_selem); bpf_selem_link_map(smap, first_selem); - /* Publish sk_storage to sk. sk->sk_lock cannot be acquired. 
- * Hence, atomic ops is used to set sk->sk_bpf_storage - * from NULL to the newly allocated sk_storage ptr. + + owner_storage_ptr = + (struct bpf_local_storage **)owner_storage(smap, owner); + /* Publish storage to the owner. + * Instead of using any lock of the kernel object (i.e. owner), + * cmpxchg will work with any kernel object regardless what + * the running context is, bh, irq...etc. * - * From now on, the sk->sk_bpf_storage pointer is protected - * by the sk_storage->lock. Hence, when freeing - * the sk->sk_bpf_storage, the sk_storage->lock must - * be held before setting sk->sk_bpf_storage to NULL. + * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage) + * is protected by the storage->lock. Hence, when freeing + * the owner->storage, the storage->lock must be held before + * setting owner->storage ptr to NULL. */ - prev_sk_storage = cmpxchg((struct bpf_local_storage **)&sk->sk_bpf_storage, - NULL, sk_storage); - if (unlikely(prev_sk_storage)) { + prev_storage = cmpxchg(owner_storage_ptr, NULL, storage); + if (unlikely(prev_storage)) { bpf_selem_unlink_map(first_selem); err = -EAGAIN; goto uncharge; @@ -380,8 +411,8 @@ static int sk_storage_alloc(struct sock *sk, return 0; uncharge: - kfree(sk_storage); - atomic_sub(sizeof(*sk_storage), &sk->sk_omem_alloc); + kfree(storage); + mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -390,38 +421,37 @@ uncharge: * Otherwise, it will become a leak (and other memory issues * during map destruction). */ -static struct bpf_local_storage_data * -bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, - u64 map_flags) +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags) { struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; - struct bpf_local_storage_map *smap; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ - unlikely((map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); - smap = (struct bpf_local_storage_map *)map; - local_storage = rcu_dereference(sk->sk_bpf_storage); + local_storage = rcu_dereference(*owner_storage(smap, owner)); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, sk, value, true); + selem = bpf_selem_alloc(smap, owner, value, true); if (!selem) return ERR_PTR(-ENOMEM); - err = sk_storage_alloc(sk, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem); if (err) { kfree(selem); - atomic_sub(smap->elem_size, &sk->sk_omem_alloc); + mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -439,7 +469,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, if (err) return ERR_PTR(err); if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) { - copy_map_value_locked(map, old_sdata->data, + copy_map_value_locked(&smap->map, old_sdata->data, value, false); return old_sdata; } @@ -464,7 +494,8 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, goto unlock_err; if (old_sdata && (map_flags & BPF_F_LOCK)) { - copy_map_value_locked(map, old_sdata->data, value, false); + 
copy_map_value_locked(&smap->map, old_sdata->data, value, + false); selem = SELEM(old_sdata); goto unlock; } @@ -478,7 +509,7 @@ bpf_local_storage_update(struct sock *sk, struct bpf_map *map, void *value, * old_sdata will not be uncharged later during * bpf_selem_unlink_storage_nolock(). */ - selem = bpf_selem_alloc(smap, sk, value, !old_sdata); + selem = bpf_selem_alloc(smap, owner, value, !old_sdata); if (!selem) { err = -ENOMEM; goto unlock_err; @@ -591,17 +622,12 @@ void bpf_sk_storage_free(struct sock *sk) kfree_rcu(sk_storage, rcu); } -static void bpf_local_storage_map_free(struct bpf_map *map) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) { struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; unsigned int i; - smap = (struct bpf_local_storage_map *)map; - - bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - /* Note that this map might be concurrently cloned from * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone * RCU read section to finish before proceeding. New RCU @@ -646,7 +672,16 @@ static void bpf_local_storage_map_free(struct bpf_map *map) synchronize_rcu(); kvfree(smap->buckets); - kfree(map); + kfree(smap); +} + +static void sk_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); } /* U16_MAX is much more than enough for sk local storage @@ -658,7 +693,7 @@ static void bpf_local_storage_map_free(struct bpf_map *map) sizeof(struct bpf_local_storage_elem)), \ (U16_MAX - sizeof(struct bpf_local_storage_elem))) -static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) +int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || !(attr->map_flags & BPF_F_NO_PREALLOC) || @@ -677,7 +712,7 @@ static int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; @@ -717,8 +752,19 @@ static struct bpf_map *bpf_local_storage_map_alloc(union bpf_attr *attr) smap->elem_size = sizeof(struct bpf_local_storage_elem) + attr->value_size; - smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); + return smap; +} + +static struct bpf_map *sk_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); return &smap->map; } @@ -728,10 +774,10 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static int bpf_local_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type) +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) { u32 int_data; @@ -772,8 +818,9 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, fd = *(int *)key; sock = sockfd_lookup(fd, &err); if (sock) { - sdata = bpf_local_storage_update(sock->sk, map, value, - map_flags); + sdata = bpf_local_storage_update( + sock->sk, (struct 
bpf_local_storage_map *)map, value, + map_flags); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -862,7 +909,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = sk_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -906,7 +953,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, * destruction). */ refcount_inc_not_zero(&sk->sk_refcnt)) { - sdata = bpf_local_storage_update(sk, map, value, BPF_NOEXIST); + sdata = bpf_local_storage_update( + sk, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -931,11 +980,33 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) return -ENOENT; } +static int sk_storage_charge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + return omem_charge(owner, size); +} + +static void sk_storage_uncharge(struct bpf_local_storage_map *smap, + void *owner, u32 size) +{ + struct sock *sk = owner; + + atomic_sub(size, &sk->sk_omem_alloc); +} + +static struct bpf_local_storage __rcu ** +sk_storage_ptr(void *owner) +{ + struct sock *sk = owner; + + return &sk->sk_bpf_storage; +} + static int sk_storage_map_btf_id; const struct bpf_map_ops sk_storage_map_ops = { .map_alloc_check = bpf_local_storage_map_alloc_check, - .map_alloc = bpf_local_storage_map_alloc, - .map_free = bpf_local_storage_map_free, + .map_alloc = sk_storage_map_alloc, + .map_free = sk_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_fd_sk_storage_lookup_elem, .map_update_elem = bpf_fd_sk_storage_update_elem, @@ -943,6 +1014,9 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_check_btf = bpf_local_storage_map_check_btf, .map_btf_name = "bpf_local_storage_map", .map_btf_id = &sk_storage_map_btf_id, + .map_local_storage_charge = sk_storage_charge, + .map_local_storage_uncharge = sk_storage_uncharge, + .map_owner_storage_ptr = sk_storage_ptr, }; const struct bpf_func_proto bpf_sk_storage_get_proto = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC_<kernel_obj>_storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of the storage is managed with the life-cycle of the inode, i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob, which is now stackable and can co-exist with other LSMs.
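As an illustrative sketch only (it assumes libbpf's SEC()/BPF_PROG() conventions from bpf_helpers.h and bpf_tracing.h; the map and program names are hypothetical and not part of this patch), an LSM program could keep a per-inode counter like this:

    struct {
            __uint(type, BPF_MAP_TYPE_INODE_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, __u64);
    } inode_storage_map SEC(".maps");

    SEC("lsm/inode_unlink")
    int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
    {
            __u64 *value;

            /* Create the storage for this inode on first use. */
            value = bpf_inode_storage_get(&inode_storage_map,
                                          victim->d_inode, 0,
                                          BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (value)
                    __sync_fetch_and_add(value, 1);
            return 0;
    }

The helper returns NULL when the inode has no security blob or the storage could not be allocated, so the result must always be checked.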
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 29 +++ include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 40 +++- kernel/bpf/Makefile | 1 + kernel/bpf/bpf_inode_storage.c | 273 ++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 + security/bpf/hooks.c | 6 + tools/bpf/bpftool/Documentation/bpftool-map.rst | 2 +- tools/bpf/bpftool/bash-completion/bpftool | 3 +- tools/bpf/bpftool/map.c | 3 +- tools/include/uapi/linux/bpf.h | 40 +++- tools/lib/bpf/libbpf_probes.c | 5 +- 13 files changed, 410 insertions(+), 8 deletions(-) create mode 100644 kernel/bpf/bpf_inode_storage.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include <linux/lsm_hook_defs.h> #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**.
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6961ff400cba..bdc8cd1b6767 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -5,6 +5,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c new file mode 100644 index 000000000000..f3a44e929447 --- /dev/null +++ b/kernel/bpf/bpf_inode_storage.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include <linux/rculist.h> +#include <linux/list.h> +#include <linux/hash.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <net/sock.h> +#include <uapi/linux/sock_diag.h> +#include <uapi/linux/btf.h> +#include <linux/bpf_lsm.h> +#include <linux/btf_ids.h> +#include <linux/fdtable.h> + +DEFINE_BPF_STORAGE_CACHE(inode_cache); + +static struct bpf_local_storage __rcu ** +inode_storage_ptr(void *owner) +{ + struct inode *inode = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, + struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *inode_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_inode(inode); + if (!bsb) + return NULL; + + inode_storage = rcu_dereference(bsb->storage); + if (!inode_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit); +} + +void bpf_inode_storage_free(struct inode *inode) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_inode_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_inode(inode); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_inode_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_inode_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_inode_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return NULL; + + sdata = inode_storage_lookup(f->f_inode, map, true); + fput(f); + return sdata ?
sdata->data : NULL; +} + +static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct file *f; + int fd; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f || !inode_storage_ptr(f->f_inode)) + return -EBADF; + + sdata = bpf_local_storage_update(f->f_inode, + (struct bpf_local_storage_map *)map, + value, map_flags); + fput(f); + return PTR_ERR_OR_ZERO(sdata); +} + +static int inode_storage_delete(struct inode *inode, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = inode_storage_lookup(inode, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct file *f; + int fd, err; + + fd = *(int *)key; + f = fget_raw(fd); + if (!f) + return -EBADF; + + err = inode_storage_delete(f->f_inode, map); + fput(f); + return err; +} + +BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the inode_storage_ptr is not + * NULL as inode_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!inode_storage_ptr(inode)) + return (unsigned long)NULL; + + sdata = inode_storage_lookup(inode, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from where the inode is guaranteed + * to have a refcount and cannot be freed. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + inode, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_inode_storage_delete, + struct bpf_map *, map, struct inode *, inode) +{ + /* This helper must only be called from where the inode is guaranteed + * to have a refcount and cannot be freed.
+ */ + return inode_storage_delete(inode, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); + return &smap->map; +} + +static void inode_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int inode_storage_map_btf_id; +const struct bpf_map_ops inode_storage_map_ops = { + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = inode_storage_map_alloc, + .map_free = inode_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_fd_inode_storage_lookup_elem, + .map_update_elem = bpf_fd_inode_storage_update_elem, + .map_delete_elem = bpf_fd_inode_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &inode_storage_map_btf_id, + .map_owner_storage_ptr = inode_storage_ptr, +}; + +BTF_ID_LIST(bpf_inode_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, inode) + +const struct bpf_func_proto bpf_inode_storage_get_proto = { + .func = bpf_inode_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = bpf_inode_storage_btf_ids, +}; + +const struct bpf_func_proto bpf_inode_storage_delete_proto = { + .func = bpf_inode_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = bpf_inode_storage_btf_ids, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b46e973faee9..5443cea86cef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -769,7 +769,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE) + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd24503ab3d3..38748794518e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4311,6 +4311,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sk_storage_delete) goto error; break; + case BPF_MAP_TYPE_INODE_STORAGE: + if (func_id != BPF_FUNC_inode_storage_get && + func_id != BPF_FUNC_inode_storage_delete) + goto error; + break; default: break; } @@ -4384,6 +4389,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) goto error; break; + case BPF_FUNC_inode_storage_get: + case BPF_FUNC_inode_storage_delete: + if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 32d32d485451..788667d582ae 100644 --- a/security/bpf/hooks.c +++ 
b/security/bpf/hooks.c @@ -11,6 +11,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(NAME, bpf_lsm_##NAME), #include <linux/lsm_hook_defs.h> #undef LSM_HOOK + LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), }; static int __init bpf_lsm_init(void) @@ -20,7 +21,12 @@ static int __init bpf_lsm_init(void) return 0; } +struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { + .lbs_inode = sizeof(struct bpf_storage_blob), +}; + DEFINE_LSM(bpf) = { .name = "bpf", .init = bpf_lsm_init, + .blobs = &bpf_lsm_blob_sizes }; diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 41e2a74252d0..083db6c2fc67 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -49,7 +49,7 @@ MAP COMMANDS | | **lru_percpu_hash** | **lpm_trie** | **array_of_maps** | **hash_of_maps** | | **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash** | | **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage** -| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** } +| | **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage** } DESCRIPTION =========== diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index f53ed2f1a4aa..7b68e3c0a5fb 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -704,7 +704,8 @@ _bpftool() lru_percpu_hash lpm_trie array_of_maps \ hash_of_maps devmap devmap_hash sockmap cpumap \ xskmap sockhash cgroup_storage reuseport_sockarray \ - percpu_cgroup_storage queue stack' -- \ + percpu_cgroup_storage queue stack sk_storage \ + struct_ops inode_storage' -- \ "$cur" ) ) return 0 ;; diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 3a27d31a1856..bc0071228f88 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -50,6 +50,7 @@ const char * const map_type_name[] = { [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", }; const size_t map_type_name_size = ARRAY_SIZE(map_type_name); @@ -1442,7 +1443,7 @@ static int do_help(int argc, char **argv) " lru_percpu_hash | lpm_trie | array_of_maps | hash_of_maps |\n" " devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n" " cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n" - " queue | stack | sk_storage | struct_ops | ringbuf }\n" + " queue | stack | sk_storage | struct_ops | ringbuf | inode_storage }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, argv[-2]); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**.
From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 010c9a76fd2b..5482a9b7ae2d 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -170,7 +170,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, return btf_fd; } -static int load_sk_storage_btf(void) +static int load_local_storage_btf(void) { const char strs[] = "\0bpf_spin_lock\0val\0cnt\0l"; /* struct bpf_spin_lock { @@ -229,12 +229,13 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) key_size = 0; break; case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_INODE_STORAGE: btf_key_type_id = 1; btf_value_type_id = 3; value_size = 8; max_entries = 0; map_flags = BPF_F_NO_PREALLOC; - btf_fd = load_sk_storage_btf(); + btf_fd = load_local_storage_btf(); if (btf_fd < 0) return false; break; -- cgit v1.2.3 From 30897832d8b97e93833fb52c0a02951db3692ed2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:18 +0200 Subject: bpf: Allow local storage to be used from LSM programs Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used in LSM programs. These helpers are not used for tracing programs (currently) as their usage is tied to the life-cycle of the object and should only be used where the owning object won't be freed (when the owning object is passed as an argument to the LSM hook). Thus, they are safer to use in LSM hooks than in tracing. Usage of local storage in tracing programs will probably follow a per-function based whitelist approach. Since the UAPI helper signature for bpf_sk_storage expects a bpf_sock, which leads to a compilation warning for LSM programs, it is also updated to accept a void * pointer instead.
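A minimal sketch of what the relaxed signature enables (again assuming libbpf's SEC()/BPF_PROG() conventions; the names below are illustrative, not taken from this patch): the kernel struct sock an LSM hook receives can now be handed straight to the helper:

    struct {
            __uint(type, BPF_MAP_TYPE_SK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, __u64);
    } sk_storage_map SEC(".maps");

    SEC("lsm/socket_bind")
    int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
                 int addrlen)
    {
            __u64 *value;

            /* sock->sk is a kernel 'struct sock *'; with the void *
             * signature this no longer triggers a compilation warning
             * in LSM programs.
             */
            value = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
                                       BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (value)
                    *value += 1;
            return 0;
    }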
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-7-kpsingh@chromium.org --- include/net/bpf_sk_storage.h | 2 ++ include/uapi/linux/bpf.h | 7 +++++-- kernel/bpf/bpf_lsm.c | 21 ++++++++++++++++++++- net/core/bpf_sk_storage.c | 25 +++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++-- 5 files changed, 57 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..119f4c9c3a9c 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto sk_storage_get_btf_proto; +extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM programs. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index fb278144e9fd..9cd1428c7199 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -11,6 +11,8 @@ #include <linux/bpf_lsm.h> #include <linux/kallsyms.h> #include <linux/bpf_verifier.h> +#include <net/bpf_sk_storage.h> +#include <linux/bpf_local_storage.h> /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached.
@@ -45,10 +47,27 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +static const struct bpf_func_proto * +bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_inode_storage_get: + return &bpf_inode_storage_get_proto; + case BPF_FUNC_inode_storage_delete: + return &bpf_inode_storage_delete_proto; + case BPF_FUNC_sk_storage_get: + return &sk_storage_get_btf_proto; + case BPF_FUNC_sk_storage_delete: + return &sk_storage_delete_btf_proto; + default: + return tracing_prog_func_proto(func_id, prog); + } +} + const struct bpf_prog_ops lsm_prog_ops = { }; const struct bpf_verifier_ops lsm_verifier_ops = { - .get_func_proto = tracing_prog_func_proto, + .get_func_proto = bpf_lsm_func_proto, .is_valid_access = btf_ctx_access, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f29d9a9b4ea4..55fae03b4cc3 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -12,6 +12,7 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -377,6 +378,30 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_SOCKET, }; +BTF_ID_LIST(sk_storage_btf_ids) +BTF_ID_UNUSED +BTF_ID(struct, sock) + +const struct bpf_func_proto sk_storage_get_btf_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .btf_id = sk_storage_btf_ids, +}; + +const struct bpf_func_proto sk_storage_delete_btf_proto = { + .func = bpf_sk_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .btf_id = sk_storage_btf_ids, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM programs. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return -- cgit v1.2.3 From 6e22ab9da79343532cd3cde39df25e5a5478c692 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:20 +0200 Subject: bpf: Add d_path helper Add a d_path helper function that returns the full path for a given 'struct path' object, which needs to be the kernel BTF 'path' object.
The path is returned in the provided buffer 'buf' of size 'sz' and is zero terminated. bpf_d_path(&file->f_path, buf, size); The helper calls the d_path() function directly, so there is only a limited set of functions it can be called from. Add just a very modest set for the start. Also update the bpf.h tools uapi header and add 'path' to the bpf_helpers_doc.py script. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200825192124.710397-11-jolsa@kernel.org --- include/uapi/linux/bpf.h | 14 ++++++++++++ kernel/trace/bpf_trace.c | 48 ++++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 14 ++++++++++++ 4 files changed, 78 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a8d4f253ed77..d973d891f2e2 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1098,6 +1098,52 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_d_path, struct path *, path, char *, buf, u32, sz) +{ + long len; + char *p; + + if (!sz) + return 0; + + p = d_path(path, buf, sz); + if (IS_ERR(p)) { + len = PTR_ERR(p); + } else { + len = buf + sz - p; + memmove(buf, p, len); + } + + return len; +} + +BTF_SET_START(btf_allowlist_d_path) +BTF_ID(func, vfs_truncate) +BTF_ID(func, vfs_fallocate) +BTF_ID(func, dentry_open) +BTF_ID(func, vfs_getattr) +BTF_ID(func, filp_close) +BTF_SET_END(btf_allowlist_d_path) + +static bool bpf_d_path_allowed(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); +} + +BTF_ID_LIST(bpf_d_path_btf_ids) +BTF_ID(struct, path) + +static const struct bpf_func_proto bpf_d_path_proto = { + .func = bpf_d_path, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_d_path_btf_ids, + .allowed = bpf_d_path_allowed, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1579,6 +1625,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_d_path: + return &bpf_d_path_proto; default: return raw_tp_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 5bfa448b4704..08388173973f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -432,6 +432,7 @@ class PrinterHelpers(Printer): 'struct __sk_buff', 'struct sk_msg_md', 'struct xdp_md', + 'struct path', ] known_types = { '...', @@ -472,6 +473,7 @@ class PrinterHelpers(Printer): 'struct tcp_request_sock', 'struct udp6_sock', 'struct task_struct', + 'struct path', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 493a0ebd804c986e6bd207603c5e1ca748470d3d Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 13 Apr 2020 09:20:53 -0700 Subject: nl80211: fix PORT_AUTHORIZED wording to reflect behavior The CMD_PORT_AUTHORIZED event was described as an event which indicated a successfully completed 4-way handshake. But the behavior was not as advertised. The only driver which uses this is brcmfmac, and this driver only sends the event after a successful 802.1X-FT roam. This prevents userspace applications from knowing if the 4-way completed on: 1. Normal 802.1X connects 2. Normal PSK connections 3. FT-PSK roams wpa_supplicant handles this incorrect behavior by just completing the connection after association, before the 4-way has completed. If the 4-way ends up failing, it disconnects at that point. Since this behavior appears to be expected (wpa_s handles it this way) I have changed the wording in the API description to reflect the actual behavior. Signed-off-by: James Prestwood Link: https://lore.kernel.org/r/20200413162053.3711-1-prestwoj@gmail.com [fix spelling of 802.1X throughout ...] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 631f3a997b3c..8cc2b825e4e4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -647,13 +647,9 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * When establishing a security association, drivers that support 4 way - * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when - * the 4 way handshake is completed successfully. * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. - * When a security association was established with the new AP (e.g. if - * the FT protocol was used for roaming or the driver completed the 4 way - * handshake), this event should be followed by an + * When a security association was established on an 802.1X network using + * fast transition, this event should be followed by an + * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other @@ -1067,13 +1063,11 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * %NL80211_ATTR_MAC. - * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way - * handshake was completed successfully by the driver. The BSSID is - * specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake - * offload should send this event after indicating 802.11 association with - * %NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed - * %NL80211_CMD_DISCONNECT should be indicated instead. - * + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was + * completed successfully.
Drivers that support 4 way handshake offload + * should send this event after indicating 802.1X FT association with + * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT + * should be indicated instead. * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit * a control port frame and as a notification that a control port frame -- cgit v1.2.3 From eb89a6a6b7a1af2d9c8d83ee44fa67700d6337e7 Mon Sep 17 00:00:00 2001 From: Miles Hu Date: Tue, 4 Aug 2020 10:16:29 +0200 Subject: nl80211: add support for setting fixed HE rate/gi/ltf This patch adds the nl80211 structs, definitions, policies and parsing code required to pass fixed HE rate, GI and LTF settings. Signed-off-by: Miles Hu Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200804081630.2013619-1-john@phrozen.org [fix comment] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 28 +++++++++ net/wireless/nl80211.c | 137 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 160 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d9e6b9fbd95b..c9bce9bba511 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -678,7 +678,10 @@ struct cfg80211_bitrate_mask { u32 legacy; u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; + u16 he_mcs[NL80211_HE_NSS_MAX]; enum nl80211_txrate_gi gi; + enum nl80211_he_gi he_gi; + enum nl80211_he_ltf he_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8cc2b825e4e4..1a4b922f489f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3180,6 +3180,18 @@ enum nl80211_he_gi { NL80211_RATE_INFO_HE_GI_3_2, }; +/** + * enum nl80211_he_ltf - HE long training field + * @NL80211_RATE_INFO_HE_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_HE_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_HE_4XLTF: 12.8 usec + */ +enum nl80211_he_ltf { + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_2XLTF, + NL80211_RATE_INFO_HE_4XLTF, +}; + /** * enum nl80211_he_ru_alloc - HE RU allocation values * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation @@ -4735,6 +4747,10 @@ enum nl80211_key_attributes { * @NL80211_TXRATE_VHT: VHT rates allowed for TX rate selection, * see &struct nl80211_txrate_vht * @NL80211_TXRATE_GI: configure GI, see &enum nl80211_txrate_gi + * @NL80211_TXRATE_HE: HE rates allowed for TX rate selection, * see &struct nl80211_txrate_he + * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. + * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -4744,6 +4760,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HT, NL80211_TXRATE_VHT, NL80211_TXRATE_GI, + NL80211_TXRATE_HE, + NL80211_TXRATE_HE_GI, + NL80211_TXRATE_HE_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -4761,6 +4780,15 @@ struct nl80211_txrate_vht { __u16 mcs[NL80211_VHT_NSS_MAX]; }; +#define NL80211_HE_NSS_MAX 8 +/** + * struct nl80211_txrate_he - HE MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.)
+ */ +struct nl80211_txrate_he { + __u16 mcs[NL80211_HE_NSS_MAX]; +}; + enum nl80211_txrate_gi { NL80211_TXRATE_DEFAULT_GI, NL80211_TXRATE_FORCE_SGI, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6ee3bc48d776..da0f33c2d2d8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -336,6 +336,13 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { .len = NL80211_MAX_SUPP_HT_RATES }, [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), [NL80211_TXRATE_GI] = { .type = NLA_U8 }, + [NL80211_TXRATE_HE] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_he)), + [NL80211_TXRATE_HE_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_GI_0_8, + NL80211_RATE_INFO_HE_GI_3_2), + [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_4XLTF), }; static const struct nla_policy @@ -4430,21 +4437,106 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } +static u16 he_mcs_map_to_mcs_mask(u8 he_mcs_map) +{ + switch (he_mcs_map) { + case IEEE80211_HE_MCS_NOT_SUPPORTED: + return 0; + case IEEE80211_HE_MCS_SUPPORT_0_7: + return 0x00FF; + case IEEE80211_HE_MCS_SUPPORT_0_9: + return 0x03FF; + case IEEE80211_HE_MCS_SUPPORT_0_11: + return 0xFFF; + default: + break; + } + return 0; +} + +static void he_build_mcs_mask(u16 he_mcs_map, + u16 he_mcs_mask[NL80211_HE_NSS_MAX]) +{ + u8 nss; + + for (nss = 0; nss < NL80211_HE_NSS_MAX; nss++) { + he_mcs_mask[nss] = he_mcs_map_to_mcs_mask(he_mcs_map & 0x03); + he_mcs_map >>= 2; + } +} + +static u16 he_get_txmcsmap(struct genl_info *info, + const struct ieee80211_sta_he_cap *he_cap) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + __le16 tx_mcs; + + switch (wdev->chandef.width) { + case NL80211_CHAN_WIDTH_80P80: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80p80; + break; + case NL80211_CHAN_WIDTH_160: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_160; + break; + default: + tx_mcs = he_cap->he_mcs_nss_supp.tx_mcs_80; + break; + } + return le16_to_cpu(tx_mcs); +} + +static bool he_set_mcs_mask(struct genl_info *info, + struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_he *txrate, + u16 mcs[NL80211_HE_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + u16 tx_mcs_mask[NL80211_HE_NSS_MAX] = {}; + u16 tx_mcs_map = 0; + u8 i; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + memset(mcs, 0, sizeof(u16) * NL80211_HE_NSS_MAX); + + tx_mcs_map = he_get_txmcsmap(info, he_cap); + + /* Build he_mcs_mask from HE capabilities */ + he_build_mcs_mask(tx_mcs_map, tx_mcs_mask); + + for (i = 0; i < NL80211_HE_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, - struct cfg80211_bitrate_mask *mask) + struct cfg80211_bitrate_mask *mask, + struct net_device *dev) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int rem, i; struct nlattr *tx_rates; struct ieee80211_supported_band *sband; - u16 vht_tx_mcs_map; + u16 vht_tx_mcs_map, he_tx_mcs_map; memset(mask, 0, sizeof(*mask)); /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { + const struct 
ieee80211_sta_he_cap *he_cap; + sband = rdev->wiphy.bands[i]; if (!sband) @@ -4460,6 +4552,16 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, vht_tx_mcs_map = le16_to_cpu(sband->vht_cap.vht_mcs.tx_mcs_map); vht_build_mcs_mask(vht_tx_mcs_map, mask->control[i].vht_mcs); + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + continue; + + he_tx_mcs_map = he_get_txmcsmap(info, he_cap); + he_build_mcs_mask(he_tx_mcs_map, mask->control[i].he_mcs); + + mask->control[i].he_gi = 0xFF; + mask->control[i].he_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -4515,13 +4617,25 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].gi > NL80211_TXRATE_FORCE_LGI) return -EINVAL; } + if (tb[NL80211_TXRATE_HE] && + !he_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_HE]), + mask->control[band].he_mcs)) + return -EINVAL; + if (tb[NL80211_TXRATE_HE_GI]) + mask->control[band].he_gi = + nla_get_u8(tb[NL80211_TXRATE_HE_GI]); + if (tb[NL80211_TXRATE_HE_LTF]) + mask->control[band].he_ltf = + nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT or VHT + /* don't allow empty legacy rates if HT, VHT or HE * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || - rdev->wiphy.bands[band]->vht_cap.vht_supported)) + rdev->wiphy.bands[band]->vht_cap.vht_supported || + ieee80211_get_he_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -4532,6 +4646,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].vht_mcs[i]) goto out; + for (i = 0; i < NL80211_HE_NSS_MAX; i++) + if (mask->control[band].he_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -4976,7 +5094,8 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - &params.beacon_rate); + &params.beacon_rate, + dev); if (err) return err; @@ -10780,7 +10899,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, return -EOPNOTSUPP; err = nl80211_parse_tx_bitrate_mask(info, info->attrs, - NL80211_ATTR_TX_RATES, &mask); + NL80211_ATTR_TX_RATES, &mask, + dev); if (err) return err; @@ -11388,7 +11508,8 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - &setup.beacon_rate); + &setup.beacon_rate, + dev); if (err) return err; @@ -14168,7 +14289,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { attr = NL80211_TID_CONFIG_ATTR_TX_RATE; err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, - &tid_conf->txrate_mask); + &tid_conf->txrate_mask, dev); if (err) return err; -- cgit v1.2.3 From 00c207edfb2bff9cf03a8f21e57c9c752a1d9f16 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 11 Aug 2020 10:01:03 +0200 Subject: nl80211: rename csa counter attributes to countdown counters We want to reuse the attributes for other counters such as BSS color change. Rename them to more generic names.
Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200811080107.3615705-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 ++++++++------ net/wireless/nl80211.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1a4b922f489f..ec96d5fe0e05 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2076,10 +2076,10 @@ enum nl80211_commands { * operation). * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information * for the time while performing a channel switch. - * @NL80211_ATTR_CSA_C_OFF_BEACON: An array of offsets (u16) to the channel - * switch counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). - * @NL80211_ATTR_CSA_C_OFF_PRESP: An array of offsets (u16) to the channel - * switch counters in the probe response (%NL80211_ATTR_PROBE_RESP). + * @NL80211_ATTR_CNTDWN_OFFS_BEACON: An array of offsets (u16) to the channel + * switch or color change counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CNTDWN_OFFS_PRESP: An array of offsets (u16) to the channel + * switch or color change counters in the probe response (%NL80211_ATTR_PROBE_RESP). * * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. * As specified in the &enum nl80211_rxmgmt_flags. @@ -2815,8 +2815,8 @@ enum nl80211_attrs { NL80211_ATTR_CH_SWITCH_COUNT, NL80211_ATTR_CH_SWITCH_BLOCK_TX, NL80211_ATTR_CSA_IES, - NL80211_ATTR_CSA_C_OFF_BEACON, - NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_CNTDWN_OFFS_BEACON, + NL80211_ATTR_CNTDWN_OFFS_PRESP, NL80211_ATTR_RXMGMT_FLAGS, @@ -3003,6 +3003,8 @@ enum nl80211_attrs { #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER #define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA +#define NL80211_ATTR_CSA_C_OFF_BEACON NL80211_ATTR_CNTDWN_OFFS_BEACON +#define NL80211_ATTR_CSA_C_OFF_PRESP NL80211_ATTR_CNTDWN_OFFS_PRESP /* * Allow user space programs to use #ifdef on new attributes by defining them diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index da0f33c2d2d8..e640e65f3255 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -578,8 +578,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CH_SWITCH_COUNT] = { .type = NLA_U32 }, [NL80211_ATTR_CH_SWITCH_BLOCK_TX] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_IES] = { .type = NLA_NESTED }, - [NL80211_ATTR_CSA_C_OFF_BEACON] = { .type = NLA_BINARY }, - [NL80211_ATTR_CSA_C_OFF_PRESP] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_BEACON] = { .type = NLA_BINARY }, + [NL80211_ATTR_CNTDWN_OFFS_PRESP] = { .type = NLA_BINARY }, [NL80211_ATTR_STA_SUPPORTED_CHANNELS] = NLA_POLICY_MIN_LEN(2), /* * The value of the Length field of the Supported Operating @@ -8891,10 +8891,10 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - if (!csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]) + if (!csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]) return -EINVAL; - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8905,7 +8905,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_beacon = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_BEACON]); + 
nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_BEACON]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_beacon; i++) { @@ -8918,8 +8918,8 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]) { - len = nla_len(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + if (csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + len = nla_len(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); if (!len || (len % sizeof(u16))) return -EINVAL; @@ -8930,7 +8930,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) return -EINVAL; params.counter_offsets_presp = - nla_data(csa_attrs[NL80211_ATTR_CSA_C_OFF_PRESP]); + nla_data(csa_attrs[NL80211_ATTR_CNTDWN_OFFS_PRESP]); /* sanity checks - counters should fit and be the same */ for (i = 0; i < params.n_counter_offsets_presp; i++) { -- cgit v1.2.3 From 2831a631022eed6e3f800f08892132c6edde652c Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Mon, 17 Aug 2020 02:33:15 -0500 Subject: nl80211: support SAE authentication offload in AP mode Let drivers advertise support for AP-mode SAE authentication offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag. Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200817073316.33402-4-stanley.hsu@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 +++++++++++--- net/wireless/nl80211.c | 9 ++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ec96d5fe0e05..0584e0d349f0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -252,9 +252,13 @@ * DOC: SAE authentication offload * * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they - * support offloading SAE authentication for WPA3-Personal networks. In - * %NL80211_CMD_CONNECT the password for SAE should be specified using - * %NL80211_ATTR_SAE_PASSWORD. + * support offloading SAE authentication for WPA3-Personal networks in station + * mode. Similarly @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag can be set by + * drivers indicating the offload support in AP mode. + * + * The password for SAE should be specified using %NL80211_ATTR_SAE_PASSWORD in + * %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP for station and AP mode + * respectively. */ /** @@ -5845,6 +5849,9 @@ enum nl80211_feature_flags { * handshake with PSK in AP mode (PSK is passed as part of the start AP * command). * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication + * in AP mode (SAE password is passed as part of the start AP command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5902,6 +5909,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e640e65f3255..201d029687cc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4960,8 +4960,9 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_START_AP: - /* SAE not supported yet */ - if (auth_type == NL80211_AUTHTYPE_SAE) + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP) && + auth_type == NL80211_AUTHTYPE_SAE) return false; /* FILS not supported yet */ if (auth_type == NL80211_AUTHTYPE_FILS_SK || @@ -9552,7 +9553,9 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, if (info->attrs[NL80211_ATTR_SAE_PASSWORD]) { if (!wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_SAE_OFFLOAD)) + NL80211_EXT_FEATURE_SAE_OFFLOAD) && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP)) return -EINVAL; settings->sae_pwd = nla_data(info->attrs[NL80211_ATTR_SAE_PASSWORD]); -- cgit v1.2.3 From 50aba46c234ea6ab3134cebb5ab27885f33a3e5d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 27 Aug 2020 14:19:23 +0200 Subject: gtp: add notification mechanism Like all other network functions, let's notify gtp context on creation and deletion. Signed-off-by: Nicolas Dichtel Tested-by: Gabriel Ganne Acked-by: Harald Welte Signed-off-by: David S. Miller --- drivers/net/gtp.c | 58 ++++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/gtp.h | 2 ++ 2 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 21640a035d7d..c84a10569388 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -928,8 +928,8 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) } } -static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, - struct genl_info *info) +static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, + struct genl_info *info) { struct pdp_ctx *pctx, *pctx_tid = NULL; struct net_device *dev = gtp->dev; @@ -956,12 +956,12 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, if (found) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - return -EEXIST; + return ERR_PTR(-EEXIST); if (info->nlhdr->nlmsg_flags & NLM_F_REPLACE) - return -EOPNOTSUPP; + return ERR_PTR(-EOPNOTSUPP); if (pctx && pctx_tid) - return -EEXIST; + return ERR_PTR(-EEXIST); if (!pctx) pctx = pctx_tid; @@ -974,13 +974,13 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, netdev_dbg(dev, "GTPv1-U: update tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); - return 0; + return pctx; } pctx = kmalloc(sizeof(*pctx), GFP_ATOMIC); if (pctx == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sock_hold(sk); pctx->sk = sk; @@ -1018,7 +1018,7 @@ static int gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk, break; } - return 0; + return pctx; } static void pdp_context_free(struct rcu_head *head) @@ -1036,9 +1036,12 @@ static void pdp_context_delete(struct pdp_ctx *pctx) call_rcu(&pctx->rcu_head, pdp_context_free); } +static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd); + static int gtp_genl_new_pdp(struct sk_buff 
*skb, struct genl_info *info) { unsigned int version; + struct pdp_ctx *pctx; struct gtp_dev *gtp; struct sock *sk; int err; @@ -1088,7 +1091,13 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } - err = gtp_pdp_add(gtp, sk, info); + pctx = gtp_pdp_add(gtp, sk, info); + if (IS_ERR(pctx)) { + err = PTR_ERR(pctx); + } else { + gtp_tunnel_notify(pctx, GTP_CMD_NEWPDP); + err = 0; + } out_unlock: rcu_read_unlock(); @@ -1159,6 +1168,7 @@ static int gtp_genl_del_pdp(struct sk_buff *skb, struct genl_info *info) netdev_dbg(pctx->dev, "GTPv1-U: deleting tunnel id = %x/%x (pdp %p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, pctx); + gtp_tunnel_notify(pctx, GTP_CMD_DELPDP); pdp_context_delete(pctx); out_unlock: @@ -1168,6 +1178,14 @@ out_unlock: static struct genl_family gtp_genl_family; +enum gtp_multicast_groups { + GTP_GENL_MCGRP, +}; + +static const struct genl_multicast_group gtp_genl_mcgrps[] = { + [GTP_GENL_MCGRP] = { .name = GTP_GENL_MCGRP_NAME }, +}; + static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, int flags, u32 type, struct pdp_ctx *pctx) { @@ -1204,6 +1222,26 @@ nla_put_failure: return -EMSGSIZE; } +static int gtp_tunnel_notify(struct pdp_ctx *pctx, u8 cmd) +{ + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = gtp_genl_fill_info(msg, 0, 0, 0, cmd, pctx); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + ret = genlmsg_multicast_netns(&gtp_genl_family, dev_net(pctx->dev), msg, + 0, GTP_GENL_MCGRP, GFP_ATOMIC); + return ret; +} + static int gtp_genl_get_pdp(struct sk_buff *skb, struct genl_info *info) { struct pdp_ctx *pctx = NULL; @@ -1334,6 +1372,8 @@ static struct genl_family gtp_genl_family __ro_after_init = { .module = THIS_MODULE, .ops = gtp_genl_ops, .n_ops = ARRAY_SIZE(gtp_genl_ops), + .mcgrps = gtp_genl_mcgrps, + .n_mcgrps = ARRAY_SIZE(gtp_genl_mcgrps), }; static int __net_init gtp_net_init(struct net *net) diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index c7d66755d212..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#define GTP_GENL_MCGRP_NAME "gtp" + enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, -- cgit v1.2.3 From dab741e0e02bd3c4f5e2e97be74b39df2523fc6e Mon Sep 17 00:00:00 2001 From: Mattias Nissler Date: Thu, 27 Aug 2020 11:09:46 -0600 Subject: Add a "nosymfollow" mount option. For mounts that have the new "nosymfollow" option, don't follow symlinks when resolving paths. The new option is similar in spirit to the existing "nodev", "noexec", and "nosuid" options, as well as to the LOOKUP_NO_SYMLINKS resolve flag in the openat2(2) syscall. Various BSD variants have been supporting the "nosymfollow" mount option for a long time with equivalent implementations. Note that symlinks may still be created on file systems mounted with the "nosymfollow" option present. readlink() remains functional, so user space code that is aware of symlinks can still choose to follow them explicitly. Setting the "nosymfollow" mount option helps prevent privileged writers from modifying files unintentionally in case there is an unexpected link along the accessed path. The "nosymfollow" option is thus useful as a defensive measure for systems that need to deal with untrusted file systems in privileged contexts.
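A rough user-space sketch of requesting the option (the device, mount point, and filesystem type are hypothetical; the fallback define mirrors the uapi value added below):

  #include <stdio.h>
  #include <sys/mount.h>

  #ifndef MS_NOSYMFOLLOW
  #define MS_NOSYMFOLLOW 256 /* from include/uapi/linux/mount.h */
  #endif

  int main(void)
  {
          if (mount("/dev/sdb1", "/mnt/untrusted", "ext4",
                    MS_NOSYMFOLLOW, NULL) != 0)
                  perror("mount");
          /* Path resolution through any symlink under /mnt/untrusted
           * now fails with ELOOP; readlink() still works.
           */
          return 0;
  }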
More information on the history and motivation for this patch can be found here: https://sites.google.com/a/chromium.org/dev/chromium-os/chromiumos-design-docs/hardening-against-malicious-stateful-data#TOC-Restricting-symlink-traversal Signed-off-by: Mattias Nissler Signed-off-by: Ross Zwisler Reviewed-by: Aleksa Sarai Signed-off-by: Al Viro --- fs/namei.c | 3 ++- fs/namespace.c | 2 ++ fs/proc_namespace.c | 1 + fs/statfs.c | 2 ++ include/linux/mount.h | 3 ++- include/linux/statfs.h | 1 + include/uapi/linux/mount.h | 1 + 7 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/namei.c b/fs/namei.c index e99e2a9da0f7..33e8c79bc761 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1626,7 +1626,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, return ERR_PTR(error); } - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || + unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..6408788a649e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3160,6 +3160,8 @@ int path_mount(const char *dev_name, struct path *path, mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 3059a9394c2d..e59d4bb3a89e 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -70,6 +70,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_NOSYMFOLLOW, ",nosymfollow" }, { 0, NULL } }; const struct proc_fs_opts *fs_infop; diff --git a/fs/statfs.c b/fs/statfs.c index 2616424012ea..59f33752c131 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -29,6 +29,8 @@ static int flags_by_mnt(int mnt_flags) flags |= ST_NODIRATIME; if (mnt_flags & MNT_RELATIME) flags |= ST_RELATIME; + if (mnt_flags & MNT_NOSYMFOLLOW) + flags |= ST_NOSYMFOLLOW; return flags; } diff --git a/include/linux/mount.h b/include/linux/mount.h index de657bd211fa..aaf343b38671 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -30,6 +30,7 @@ struct fs_context; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? 
*/ +#define MNT_NOSYMFOLLOW 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_WRITE_HOLD 0x200 @@ -46,7 +47,7 @@ struct fs_context; #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ - | MNT_READONLY) + | MNT_READONLY | MNT_NOSYMFOLLOW) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 9bc69edb8f18..fac4356ea1bf 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -40,6 +40,7 @@ struct kstatfs { #define ST_NOATIME 0x0400 /* do not update access times */ #define ST_NODIRATIME 0x0800 /* do not update directory access times */ #define ST_RELATIME 0x1000 /* update atime relative to mtime/ctime */ +#define ST_NOSYMFOLLOW 0x2000 /* do not follow symlinks */ struct dentry; extern int vfs_get_fsid(struct dentry *dentry, __kernel_fsid_t *fsid); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 96a0240f23fe..dd8306ea336c 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -16,6 +16,7 @@ #define MS_REMOUNT 32 /* Alter flags of a mounted FS */ #define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ #define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */ #define MS_NOATIME 1024 /* Do not update access times. */ #define MS_NODIRATIME 2048 /* Do not update directory access times */ #define MS_BIND 4096 -- cgit v1.2.3 From b0c9eb37817943840a1a82dbc998c491609a0afd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Aug 2020 22:19:22 -0700 Subject: bpf: Make bpf_link_info.iter similar to bpf_iter_link_info bpf_link_info.iter is used by link_query to return bpf_iter_link_info to user space. Fields may be different, e.g., map_fd vs. map_id, so we cannot reuse the exact structure. But make them similar, e.g., struct bpf_link_info { /* common fields */ union { struct { ... } raw_tracepoint; struct { ... } tracing; ... struct { /* common fields for iter */ union { struct { __u32 map_id; } map; /* other structs for other targets */ }; }; }; }; so the structure is extensible the same way as bpf_iter_link_info. 
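A minimal user-space sketch of consuming this layout (assuming libbpf's bpf_obj_get_info_by_fd() and an iterator link fd obtained elsewhere; error handling trimmed):

  #include <stdio.h>
  #include <string.h>
  #include <linux/bpf.h>
  #include <bpf/bpf.h>

  static void print_iter_link(int link_fd)
  {
          struct bpf_link_info info;
          __u32 len = sizeof(info);
          char name[32];

          memset(&info, 0, sizeof(info));
          info.iter.target_name = (__u64)(unsigned long)name;
          info.iter.target_name_len = sizeof(name);
          if (bpf_obj_get_info_by_fd(link_fd, &info, &len))
                  return;
          if (info.type == BPF_LINK_TYPE_ITER)
                  /* map.map_id is only meaningful for map iterators */
                  printf("target %s map_id %u\n", name, info.iter.map.map_id);
  }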
Fixes: 6b0a249a301e ("bpf: Implement link_query for bpf iterators") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200828051922.758950-1-yhs@fb.com --- include/uapi/linux/bpf.h | 6 ++++-- tools/include/uapi/linux/bpf.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3 From 7a81575b806e5dab214025e6757362c62d946405 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Thu, 20 Aug 2020 10:19:01 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_table Enables storing userdata for nft_table. Field udata points to user data and udlen stores its length. Adds a new attribute flag NFTA_TABLE_USERDATA. Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 22 +++++++++++++++++++++- 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bf9491b77d16..97a7e147a59a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1080,6 +1080,8 @@ struct nft_table { flags:8, genmask:2; char *name; + u16 udlen; + u8 *udata; }; void nft_register_chain_type(const struct nft_chain_type *); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 42f351c1f5c5..aeb88cbd303e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -172,6 +172,7 @@ enum nft_table_flags { * @NFTA_TABLE_NAME: name of the table (NLA_STRING) * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) + * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, @@ -180,6 +181,7 @@ enum nft_table_attributes { NFTA_TABLE_USE, NFTA_TABLE_HANDLE, NFTA_TABLE_PAD, + NFTA_TABLE_USERDATA, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd814e514f94..6ccce2a2e715 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -650,6 +650,8 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { .len = NFT_TABLE_MAXNAMELEN - 1 }, [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, [NFTA_TABLE_HANDLE] = { .type = NLA_U64 }, + [NFTA_TABLE_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN } };
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, @@ -676,6 +678,11 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, NFTA_TABLE_PAD)) goto nla_put_failure; + if (table->udata) { + if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata)) + goto nla_put_failure; + } + nlmsg_end(skb, nlh); return 0; @@ -977,8 +984,9 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, int family = nfmsg->nfgen_family; const struct nlattr *attr; struct nft_table *table; - u32 flags = 0; struct nft_ctx ctx; + u32 flags = 0; + u16 udlen = 0; int err; lockdep_assert_held(&net->nft.commit_mutex); @@ -1014,6 +1022,16 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->name == NULL) goto err_strdup; + if (nla[NFTA_TABLE_USERDATA]) { + udlen = nla_len(nla[NFTA_TABLE_USERDATA]); + table->udata = kzalloc(udlen, GFP_KERNEL); + if (table->udata == NULL) + goto err_table_udata; + + nla_memcpy(table->udata, nla[NFTA_TABLE_USERDATA], udlen); + table->udlen = udlen; + } + err = rhltable_init(&table->chains_ht, &nft_chain_ht_params); if (err) goto err_chain_ht; @@ -1036,6 +1054,8 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, err_trans: rhltable_destroy(&table->chains_ht); err_chain_ht: + kfree(table->udata); +err_table_udata: kfree(table->name); err_strdup: kfree(table); -- cgit v1.2.3 From 4afc41dfa5a716e9e7a90c22972583f337c0bcbf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:43 +0200 Subject: netfilter: conntrack: remove ignore stats This counter increments when nf_conntrack_in sees a packet that already has a conntrack attached or when the packet is marked as UNTRACKED. Neither is an error. The former is normal for loopback traffic. The second happens for certain ICMPv6 packets or when nftables/ip(6)tables rules are in place. In case someone needs to count UNTRACKED packets, or packets that are marked as untracked before conntrack_in this can be done with both nftables and ip(6)tables rules. 
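For example (assuming an existing inet "filter" table with an "input" chain; the iptables variant uses the conntrack match):

  nft add rule inet filter input ct state untracked counter
  iptables -A INPUT -m conntrack --ctstate UNTRACKED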
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 - include/uapi/linux/netfilter/nfnetlink_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 4 +--- net/netfilter/nf_conntrack_netlink.c | 1 - net/netfilter/nf_conntrack_standalone.c | 2 +- 5 files changed, 3 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 1db83c931d9c..96b90d7e361f 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -8,7 +8,6 @@ struct ip_conntrack_stat { unsigned int found; unsigned int invalid; - unsigned int ignore; unsigned int insert; unsigned int insert_failed; unsigned int drop; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 262881792671..3e471558da82 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -247,7 +247,7 @@ enum ctattr_stats_cpu { CTA_STATS_FOUND, CTA_STATS_NEW, /* no longer used */ CTA_STATS_INVALID, - CTA_STATS_IGNORE, + CTA_STATS_IGNORE, /* no longer used */ CTA_STATS_DELETE, /* no longer used */ CTA_STATS_DELETE_LIST, /* no longer used */ CTA_STATS_INSERT, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3cfbafdff941..a111bcf1b93c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1800,10 +1800,8 @@ nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ if ((tmpl && !nf_ct_is_template(tmpl)) || - ctinfo == IP_CT_UNTRACKED) { - NF_CT_STAT_INC_ATOMIC(state->net, ignore); + ctinfo == IP_CT_UNTRACKED) return NF_ACCEPT; - } skb->_nfct = 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 832eabecfbdd..c64f23a8f373 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2509,7 +2509,6 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || - nla_put_be32(skb, CTA_STATS_IGNORE, htonl(st->ignore)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || nla_put_be32(skb, CTA_STATS_INSERT_FAILED, htonl(st->insert_failed)) || diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a604f43e3e6b..b673a03624d2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -439,7 +439,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->found, 0, st->invalid, - st->ignore, + 0, 0, 0, st->insert, -- cgit v1.2.3 From bc92470413f3af152db0d8f90ef3eb13f8cc417a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:44 +0200 Subject: netfilter: conntrack: add clash resolution stat counter There is a misconception about what "insert_failed" means. We increment this even when a clash got resolved, so it might not indicate a problem. Add a dedicated counter for clash resolution and only increment insert_failed if a clash cannot be resolved. For the old /proc interface, export this in place of an older stat that got removed a while back. For ctnetlink, export this with a new attribute. 
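As an illustration of where the counter lands in the legacy interface, a hypothetical reader of /proc/net/stat/nf_conntrack (the second hex column, formerly "searched", now carries clash_resolve):

  #include <stdio.h>

  int main(void)
  {
          unsigned int entries, clash;
          char line[512];
          FILE *f = fopen("/proc/net/stat/nf_conntrack", "r");

          if (!f)
                  return 1;
          if (fgets(line, sizeof(line), f))               /* skip the header row */
                  while (fgets(line, sizeof(line), f))    /* one row per CPU */
                          if (sscanf(line, "%x %x", &entries, &clash) == 2)
                                  printf("clash_resolve=%u\n", clash);
          fclose(f);
          return 0;
  }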
Also correct an outdated comment that implies we add a duplicate tuple -- we only add the (unique) reply direction. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 9 +++++---- net/netfilter/nf_conntrack_netlink.c | 4 +++- net/netfilter/nf_conntrack_standalone.c | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 96b90d7e361f..0c7d8d1e945d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -10,6 +10,7 @@ struct ip_conntrack_stat { unsigned int invalid; unsigned int insert; unsigned int insert_failed; + unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 3e471558da82..d8484be72fdc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -256,6 +256,7 @@ enum ctattr_stats_cpu { CTA_STATS_EARLY_DROP, CTA_STATS_ERROR, CTA_STATS_SEARCH_RESTART, + CTA_STATS_CLASH_RESOLVE, __CTA_STATS_MAX, }; #define CTA_STATS_MAX (__CTA_STATS_MAX - 1) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a111bcf1b93c..93e77ca0efad 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -859,7 +859,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) out: nf_conntrack_double_unlock(hash, reply_hash); - NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); return -EEXIST; } @@ -934,7 +933,7 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, nf_conntrack_put(&loser_ct->ct_general); nf_ct_set(skb, ct, ctinfo); - NF_CT_STAT_INC(net, insert_failed); + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -998,6 +997,8 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + + NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; } @@ -1027,10 +1028,10 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) * * Failing that, the new, unconfirmed conntrack is still added to the table * provided that the collision only occurs in the ORIGINAL direction. - * The new entry will be added after the existing one in the hash list, + * The new entry will be added only in the non-clashing REPLY direction, * so packets in the ORIGINAL direction will continue to match the existing * entry. The new entry will also have a fixed timeout so it expires -- - * due to the collision, it will not see bidirectional traffic. + * due to the collision, it will only see reply traffic. * * Returns NF_DROP if the clash could not be resolved. 
*/ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c64f23a8f373..89d99f6dfd0a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2516,7 +2516,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_EARLY_DROP, htonl(st->early_drop)) || nla_put_be32(skb, CTA_STATS_ERROR, htonl(st->error)) || nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, - htonl(st->search_restart))) + htonl(st->search_restart)) || + nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, + htonl(st->clash_resolve))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b673a03624d2..0ff39740797d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -435,7 +435,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, - 0, + st->clash_resolve, /* was: searched */ st->found, 0, st->invalid, -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request this property for themselves via the BPF_F_SLEEPABLE flag at program load time. In that case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable, and only when they are attached to kernel functions that are known to allow sleeping. The non-sleepable programs rely on implicit rcu_read_lock() and migrate_disable() to protect the lifetime of programs, the maps that they use, and the per-cpu kernel structures used to pass info between bpf programs and the kernel. The sleepable programs cannot be enclosed in rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() either. Therefore rcu_read_lock_trace is used to protect the lifetime of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure, and the kernel code uses rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. None of these cases are touched. Instead, sleepable bpf programs are allowed with bpf trampoline only. The program pointers are hard-coded into the generated assembly of the bpf trampoline, and synchronize_rcu_tasks_trace() is used to protect the lifetime of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from a bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps. These map types wait for programs to complete via synchronize_rcu_tasks_trace(). Updates to a trampoline now have to do synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for the trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too.
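A bare-bones loader sketch showing where the new flag goes (raw bpf(2) syscall; the instruction buffer and BTF id are placeholders that a real loader, e.g. libbpf, would fill in):

  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  static int load_sleepable(const struct bpf_insn *insns, __u32 insn_cnt,
                            __u32 attach_btf_id)
  {
          union bpf_attr attr = {};

          attr.prog_type = BPF_PROG_TYPE_TRACING;
          attr.expected_attach_type = BPF_TRACE_FENTRY;
          attr.attach_btf_id = attach_btf_id;
          attr.prog_flags = BPF_F_SLEEPABLE; /* request sleepable semantics */
          attr.insns = (__u64)(unsigned long)insns;
          attr.insn_cnt = insn_cnt;
          attr.license = (__u64)(unsigned long)"GPL";

          return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
  }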
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 32 +++++++++++------ include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 8 +++++ init/Kconfig | 1 + kernel/bpf/arraymap.c | 1 + kernel/bpf/hashtab.c | 12 +++---- kernel/bpf/syscall.c | 13 +++++-- kernel/bpf/trampoline.c | 28 +++++++++++++-- kernel/bpf/verifier.c | 81 ++++++++++++++++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 8 +++++ 10 files changed, 162 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 42b6709e6dc7..7d9ea7b41c71 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1379,10 +1379,15 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + return -EINVAL; + } else { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + } /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1402,13 +1407,18 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; + if (p->aux->sleepable) { + if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + return -EINVAL; + } else { + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, + (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } *pprog = prog; return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbba82a80087..4dd7e927621d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -734,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. 
Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * diff --git a/init/Kconfig b/init/Kconfig index fc10f7ede5f6..6ecc00e130ff 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1691,6 +1691,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK + select TASKS_TRACE_RCU default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d851ebbcf302..e046fb7d17cd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "map_in_map.h" diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ad80f45774e7..fe0e06284d33 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "percpu_freelist.h" #include "bpf_lru_list.h" #include "map_in_map.h" @@ -577,8 +578,7 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) struct htab_elem *l; u32 hash, key_size; - /* Must be called with rcu_read_lock. */ - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -941,7 +941,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1032,7 +1032,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1220,7 +1220,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1252,7 +1252,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) u32 hash, key_size; int ret = -ENOENT; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b86b1155b748..4108ef3b828b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -29,6 +29,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -1731,10 +1732,14 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); - if (deferred) - call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); - else + if (deferred) { + if (prog->aux->sleepable) + call_rcu_tasks_trace(&prog->aux->rcu, __bpf_prog_put_rcu); + else + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); + } else { __bpf_prog_put_rcu(&prog->aux->rcu); + } } static void __bpf_prog_put(struct bpf_prog *prog, bool 
do_idr_lock) @@ -2104,6 +2109,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | + BPF_F_SLEEPABLE | BPF_F_TEST_RND_HI32)) return -EINVAL; @@ -2159,6 +2165,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } prog->aux->offload_requested = !!attr->prog_ifindex; + prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 9be85aa4ec5f..c2b76545153c 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { @@ -210,9 +212,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) * updates to trampoline would change the code from underneath the * preempted task. Hence wait for tasks to voluntarily schedule or go * to userspace. + * The same trampoline can hold both sleepable and non-sleepable progs. + * synchronize_rcu_tasks_trace() is needed to make sure all sleepable + * programs finish executing. + * Wait for these two grace periods together. */ - - synchronize_rcu_tasks(); + synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, &tr->func.model, flags, tprogs, @@ -344,7 +349,14 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; bpf_image_ksym_del(&tr->ksym); - /* wait for tasks to get out of trampoline before freeing it */ + /* This code will be executed when all bpf progs (both sleepable and + * non-sleepable) went through + * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). + * Hence no need for another synchronize_rcu_tasks_trace() here, + * but synchronize_rcu_tasks() is still needed, since trampoline + * may not have had any sleepable programs and we need to wait + * for tasks to get out of trampoline code before freeing it. 
+ */ synchronize_rcu_tasks(); bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); @@ -394,6 +406,16 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) rcu_read_unlock(); } +void notrace __bpf_prog_enter_sleepable(void) +{ + rcu_read_lock_trace(); +} + +void notrace __bpf_prog_exit_sleepable(void) +{ + rcu_read_unlock_trace(); +} + int __weak arch_prepare_bpf_trampoline(void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6f5a9f51cc03..3ebfdb7bd427 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "disasm.h" @@ -9367,6 +9368,23 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } + if (prog->aux->sleepable) + switch (map->map_type) { + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_ARRAY: + if (!is_preallocated_map(map)) { + verbose(env, + "Sleepable programs can only use preallocated hash maps\n"); + return -EINVAL; + } + break; + default: + verbose(env, + "Sleepable programs can only use array and hash maps\n"); + return -EINVAL; + } + return 0; } @@ -10985,6 +11003,36 @@ static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) return -EINVAL; } +/* non exhaustive list of sleepable bpf_lsm_*() functions */ +BTF_SET_START(btf_sleepable_lsm_hooks) +#ifdef CONFIG_BPF_LSM +BTF_ID(func, bpf_lsm_file_mprotect) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +#endif +BTF_SET_END(btf_sleepable_lsm_hooks) + +static int check_sleepable_lsm_hook(u32 btf_id) +{ + return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); +} + +/* list of non-sleepable functions that are otherwise on + * ALLOW_ERROR_INJECTION list + */ +BTF_SET_START(btf_non_sleepable_error_inject) +/* Three functions below can be called from sleepable and non-sleepable context. + * Assume non-sleepable from bpf safety point of view. + */ +BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, should_fail_alloc_page) +BTF_ID(func, should_failslab) +BTF_SET_END(btf_non_sleepable_error_inject) + +static int check_non_sleepable_error_inject(u32 btf_id) +{ + return btf_id_set_contains(&btf_non_sleepable_error_inject, btf_id); +} + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -11002,6 +11050,12 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) long addr; u64 key; + if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING && + prog->type != BPF_PROG_TYPE_LSM) { + verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n"); + return -EINVAL; + } + if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) return check_struct_ops_btf_id(env); @@ -11210,13 +11264,36 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) } } - if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + if (prog->aux->sleepable) { + ret = -EINVAL; + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + /* fentry/fexit/fmod_ret progs can be sleepable only if they are + * attached to ALLOW_ERROR_INJECTION and are not in denylist. + */ + if (!check_non_sleepable_error_inject(btf_id) && + within_error_injection_list(addr)) + ret = 0; + break; + case BPF_PROG_TYPE_LSM: + /* LSM progs check that they are attached to bpf_lsm_*() funcs. + * Only some of them are sleepable. 
+ */ + if (check_sleepable_lsm_hook(btf_id)) + ret = 0; + break; + default: + break; + } + if (ret) + verbose(env, "%s is not sleepable\n", + prog->aux->attach_func_name); + } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { ret = check_attach_modify_return(prog, addr); if (ret) verbose(env, "%s() is not modifiable\n", prog->aux->attach_func_name); } - if (ret) goto out; tr->func.addr = (void *)addr; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/helpers.c | 22 ++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4dd7e927621d..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1784,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index be43ab3e619f..5cc7425ee476 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -601,6 +601,28 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, + const void __user *, user_ptr) +{ + int ret = copy_from_user(dst, user_ptr, size); + + if (unlikely(ret)) { + memset(dst, 0, size); + ret = -EFAULT; + } + + return ret; +} + +const struct bpf_func_proto bpf_copy_from_user_proto = { + .func = bpf_copy_from_user, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d973d891f2e2..b2a5380eb187 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1228,6 +1228,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: return &bpf_get_task_stack_proto; + case BPF_FUNC_copy_from_user: + return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 67407a406db337acdaabecd3747d160d89a929e4 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Sat, 29 Aug 2020 08:19:15 +0200 Subject: netfilter: nft_socket: add wildcard support Add NFT_SOCKET_WILDCARD to match wildcard socket listeners.
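For instance, assuming the matching nftables userspace support, a rule counting packets delivered to zero-bound listeners might look like:

  nft add rule inet filter prerouting socket wildcard 1 counter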
Signed-off-by: Balazs Scheidler Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_socket.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index aeb88cbd303e..543dc697b796 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1010,10 +1010,12 @@ enum nft_socket_attributes { * * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark + * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0) */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, + NFT_SOCKET_WILDCARD, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 637ce3e8c575..a28aca5124ce 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -14,6 +14,25 @@ struct nft_socket { }; }; +static void nft_socket_wildcard(const struct nft_pktinfo *pkt, + struct nft_regs *regs, struct sock *sk, + u32 *dest) +{ + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + nft_reg_store8(dest, inet_sk(sk)->inet_rcv_saddr == 0); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nft_reg_store8(dest, ipv6_addr_any(&sk->sk_v6_rcv_saddr)); + break; +#endif + default: + regs->verdict.code = NFT_BREAK; + return; + } +} + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -59,6 +78,13 @@ static void nft_socket_eval(const struct nft_expr *expr, return; } break; + case NFT_SOCKET_WILDCARD: + if (!sk_fullsock(sk)) { + regs->verdict.code = NFT_BREAK; + return; + } + nft_socket_wildcard(pkt, regs, sk, dest); + break; default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +123,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); switch(priv->key) { case NFT_SOCKET_TRANSPARENT: + case NFT_SOCKET_WILDCARD: len = sizeof(u8); break; case NFT_SOCKET_MARK: -- cgit v1.2.3 From c1077616142907bb6ee987ecd136d6857ffd8787 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 1 Sep 2020 15:10:08 -0700 Subject: ip: expose inet sockopts through inet_diag Expose all existing inet sockopt bits through inet_diag for debugging purposes. Corresponding changes in iproute2 ss will be submitted to output all these values. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller
--- include/linux/inet_diag.h | 2 ++ include/uapi/linux/inet_diag.h | 18 ++++++++++++++++++ net/ipv4/inet_diag.c | 17 +++++++++++++++++ 3 files changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0ef2d800fda7..84abb30a3fbb 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -75,6 +75,8 @@ static inline size_t inet_diag_msg_attrs_size(void) #ifdef CONFIG_SOCK_CGROUP_DATA + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ #endif + + nla_total_size(sizeof(struct inet_diag_sockopt)) + /* INET_DIAG_SOCKOPT */ ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5ba122c1949a..20ee93f0f876 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -160,6 +160,7 @@ enum { INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, INET_DIAG_CGROUP_ID, + INET_DIAG_SOCKOPT, __INET_DIAG_MAX, }; @@ -183,6 +184,23 @@ struct inet_diag_meminfo { __u32 idiag_tmem; }; +/* INET_DIAG_SOCKOPT */ + +struct inet_diag_sockopt { + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1, + transparent:1, + mc_all:1, + nodefrag:1; + __u8 bind_address_no_port:1, + recverr_rfc4884:1, + defer_connect:1, + unused:5; +}; + /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 4a98dd736270..93816d47e55a 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -125,6 +125,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); + struct inet_diag_sockopt inet_sockopt; if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -180,6 +181,22 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); + memset(&inet_sockopt, 0, sizeof(inet_sockopt)); + inet_sockopt.recverr = inet->recverr; + inet_sockopt.is_icsk = inet->is_icsk; + inet_sockopt.freebind = inet->freebind; + inet_sockopt.hdrincl = inet->hdrincl; + inet_sockopt.mc_loop = inet->mc_loop; + inet_sockopt.transparent = inet->transparent; + inet_sockopt.mc_all = inet->mc_all; + inet_sockopt.nodefrag = inet->nodefrag; + inet_sockopt.bind_address_no_port = inet->bind_address_no_port; + inet_sockopt.recverr_rfc4884 = inet->recverr_rfc4884; + inet_sockopt.defer_connect = inet->defer_connect; + if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), + &inet_sockopt)) + goto errout; + return 0; errout: return 1; -- cgit v1.2.3 From 16270a92355722e387e9ca19627c5a4d7bae1354 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 18 Aug 2020 17:27:46 +0800 Subject: PCI: designware-ep: Fix the Header Type check The current check makes a multi-function device fail to initialize. Fix the check by masking out the multi-function bit.
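Bit 7 of the Header Type register (offset 0x0e) is only the multi-function flag, so just bits 6:0 encode the layout type; a small illustration of why the mask matters:

  #include <stdio.h>

  #define PCI_HEADER_TYPE_MASK   0x7f
  #define PCI_HEADER_TYPE_NORMAL 0

  int main(void)
  {
          unsigned char hdr_type = 0x80; /* multi-function type-0 endpoint */

          /* old check: compares 0x80 against 0 and wrongly fails */
          printf("unmasked ok: %d\n", hdr_type == PCI_HEADER_TYPE_NORMAL);
          /* fixed check: strip the multi-function bit first */
          printf("masked ok:   %d\n",
                 (hdr_type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_NORMAL);
          return 0;
  }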
Link: https://lore.kernel.org/r/20200818092746.24366-1-Zhiqiang.Hou@nxp.com Fixes: 0b24134f7888 ("PCI: dwc: Add validation that PCIe core is set to correct mode") Signed-off-by: Hou Zhiqiang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- drivers/pci/controller/dwc/pcie-designware-ep.c | 3 ++- include/uapi/linux/pci_regs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 305bfec2424d..29f5c616c3bc 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -505,7 +505,8 @@ int dw_pcie_ep_init_complete(struct dw_pcie_ep *ep) u32 reg; int i; - hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE); + hdr_type = dw_pcie_readb_dbi(pci, PCI_HEADER_TYPE) & + PCI_HEADER_TYPE_MASK; if (hdr_type != PCI_HEADER_TYPE_NORMAL) { dev_err(pci->dev, "PCIe controller is not set to EP mode (hdr_type:0x%x)!\n", diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..57a222014cd2 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -76,6 +76,7 @@ #define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ #define PCI_LATENCY_TIMER 0x0d /* 8 bits */ #define PCI_HEADER_TYPE 0x0e /* 8 bits */ +#define PCI_HEADER_TYPE_MASK 0x7f #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 -- cgit v1.2.3 From 938c3efd9e650ca343d04e70d11a17c64119e17c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:53 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers Fix a formatting error in the description of bpf_load_hdr_opt() (rst2man complains about a wrong indentation, but what is missing is actually a blank line before the bullet list). Fix and harmonise the formatting for other helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 87 +++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. 
* Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. 
- * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failure to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ -- cgit v1.2.3 From 5205e919c9f0c5b48678f2c787871c96f665ca1b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 7 Sep 2020 12:56:08 +0300 Subject: net: bridge: mcast: add support for src list and filter mode dumping Support per port group src list (address and timer) and filter mode dumping. Protected by either multicast_lock or rcu.
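A minimal sketch (not part of this patch) of how a kernel-side consumer might walk the nested source-list layout introduced here; the MDBA_* names come from this patch, while the function itself and its caller are hypothetical:

    static void walk_mdb_src_list(const struct nlattr *src_list)
    {
    	struct nlattr *ent;
    	int rem;

    	/* src_list is an MDBA_MDB_EATTR_SRC_LIST attribute */
    	nla_for_each_nested(ent, src_list, rem) {
    		struct nlattr *tb[MDBA_MDB_SRCATTR_MAX + 1];

    		if (nla_type(ent) != MDBA_MDB_SRCLIST_ENTRY)
    			continue;
    		if (nla_parse_nested(tb, MDBA_MDB_SRCATTR_MAX, ent,
    				     NULL, NULL) < 0)
    			continue;
    		/* tb[MDBA_MDB_SRCATTR_ADDRESS] carries the source address
    		 * (4 bytes for IPv4, 16 for IPv6);
    		 * tb[MDBA_MDB_SRCATTR_TIMER] carries the remaining timer
    		 * value as produced by br_timer_value().
    		 */
    	}
    }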
v3: add IPv6 support v2: require RCU or multicast_lock to traverse src groups Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 21 +++++++++++ net/bridge/br_mdb.c | 85 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c1227aecd38f..75a2ac479247 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -455,10 +455,31 @@ enum { enum { MDBA_MDB_EATTR_UNSPEC, MDBA_MDB_EATTR_TIMER, + MDBA_MDB_EATTR_SRC_LIST, + MDBA_MDB_EATTR_GROUP_MODE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBA_MDB_SRCLIST_UNSPEC, + MDBA_MDB_SRCLIST_ENTRY, + __MDBA_MDB_SRCLIST_MAX +}; +#define MDBA_MDB_SRCLIST_MAX (__MDBA_MDB_SRCLIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBA_MDB_SRCLIST_ENTRY + */ +enum { + MDBA_MDB_SRCATTR_UNSPEC, + MDBA_MDB_SRCATTR_ADDRESS, + MDBA_MDB_SRCATTR_TIMER, + __MDBA_MDB_SRCATTR_MAX +}; +#define MDBA_MDB_SRCATTR_MAX (__MDBA_MDB_SRCATTR_MAX - 1) + /* multicast router types */ enum { MDB_RTR_TYPE_DISABLED, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 559bdc256a1e..9dc12ce61018 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -77,10 +77,67 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) #endif } +static int __mdb_fill_srcs(struct sk_buff *skb, + struct net_bridge_port_group *p) +{ + struct net_bridge_group_src *ent; + struct nlattr *nest, *nest_ent; + + if (hlist_empty(&p->src_list)) + return 0; + + nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST); + if (!nest) + return -EMSGSIZE; + + hlist_for_each_entry_rcu(ent, &p->src_list, node, + lockdep_is_held(&p->port->br->multicast_lock)) { + nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY); + if (!nest_ent) + goto out_cancel_err; + switch (ent->addr.proto) { + case htons(ETH_P_IP): + if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + ent->addr.u.ip4)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS, + &ent->addr.u.ip6)) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + break; +#endif + default: + nla_nest_cancel(skb, nest_ent); + continue; + } + if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER, + br_timer_value(&ent->timer))) { + nla_nest_cancel(skb, nest_ent); + goto out_cancel_err; + } + nla_nest_end(skb, nest_ent); + } + + nla_nest_end(skb, nest); + + return 0; + +out_cancel_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int __mdb_fill_info(struct sk_buff *skb, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *p) { + bool dump_srcs_mode = false; struct timer_list *mtimer; struct nlattr *nest_ent; struct br_mdb_entry e; @@ -119,6 +176,23 @@ static int __mdb_fill_info(struct sk_buff *skb, nla_nest_cancel(skb, nest_ent); return -EMSGSIZE; } + switch (mp->addr.proto) { + case htons(ETH_P_IP): + dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + break; +#endif + } + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { + nla_nest_cancel(skb, nest_ent); + return 
-EMSGSIZE; + } + + nla_nest_end(skb, nest_ent); return 0; @@ -127,7 +201,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { - int idx = 0, s_idx = cb->args[1], err = 0; + int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2]; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mdb_entry *mp; struct nlattr *nest, *nest2; @@ -152,7 +226,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, break; } - if (mp->host_joined) { + if (!s_pidx && mp->host_joined) { err = __mdb_fill_info(skb, mp, NULL); if (err) { nla_nest_cancel(skb, nest2); @@ -164,13 +238,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, pp = &p->next) { if (!p->port) continue; + if (pidx < s_pidx) + goto skip_pg; err = __mdb_fill_info(skb, mp, p); if (err) { nla_nest_cancel(skb, nest2); goto out; } +skip_pg: + pidx++; } + pidx = 0; + s_pidx = 0; nla_nest_end(skb, nest2); skip: idx++; @@ -178,6 +258,7 @@ skip: out: cb->args[1] = idx; + cb->args[2] = pidx; nla_nest_end(skb, nest); return err; } -- cgit v1.2.3 From 0db0c34cfbc9838c1a14cb04dd880602abd699a7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Sep 2020 16:14:31 -0700 Subject: net: tighten the definition of interface statistics This patch is born out of an investigation into which IEEE statistics correspond to which struct rtnl_link_stats64 members. Turns out that there seems to be reasonable consensus on the matter among many drivers. To save others the time (and it took more time than I'm comfortable admitting) I'm adding comments referring to IEEE attributes to struct rtnl_link_stats64. Up until now we had two forms of documentation for stats - in Documentation/ABI/testing/sysfs-class-net-statistics and the comments on struct rtnl_link_stats64 itself. While the former is very cautious in defining the expected behavior, the latter feels quite dated and may not be easy to understand for a modern-day driver author (e.g. rx_over_errors). At the same time modern systems are far more complex and once-obvious definitions have lost their clarity. For example - does rx_packets count at the MAC layer (aFramesReceivedOK)? packets processed correctly by hardware? received by the driver? or maybe received by the stack? I tried to clarify the expectations; further clarifications from others are very welcome. The part hardest to untangle is rx_over_errors vs rx_fifo_errors vs rx_missed_errors. After much deliberation I concluded that for modern HW only two of the counters will make sense. The distinction between internal FIFO overflow and packets dropped due to back-pressure from the host is likely too implementation (driver and device) specific to expose in the standard stats. Now - which two of those counters we select to use is anyone's pick: sysfs documentation suggests rx_over_errors counts packets which did not fit into buffers due to MTU being too small, which I reused. There don't seem to be many modern drivers using it (well, CAN drivers seem to love this statistic). Of the remaining two I picked rx_missed_errors to report device drops. bnxt reports it and it's folded into "drop"s in procfs (while rx_fifo_errors is an error, and modern devices usually receive the frame OK, they just can't admit it into the pipeline). Of the drivers I looked at only AMD Lance-like and NS8390-like use all three of these counters.
rx_missed_errors counts missed frames, rx_over_errors counts overflow events, and rx_fifo_errors counts frames which were truncated because they didn't fit into buffers. This suggests that rx_fifo_errors may be the correct stat for truncated packets, but I'd think a FIFO stat counting truncated packets would be very confusing to a modern reader. v2: - add driver developer notes about ethtool stat count and reset - replace Ethernet with IEEE 802.3 to better indicate source of attrs - mention byte counters don't count FCS - clarify RX counter is from device to host - drop "sightly" from sysfs paragraph - add examples of ethtool stats - s/incoming/received/ s/incoming/transmitted/ Signed-off-by: Jakub Kicinski --- Documentation/networking/index.rst | 1 + Documentation/networking/statistics.rst | 132 +++++++++++++++++++++ include/uapi/linux/if_link.h | 204 +++++++++++++++++++++++++++++--- 3 files changed, 320 insertions(+), 17 deletions(-) create mode 100644 Documentation/networking/statistics.rst (limited to 'include/uapi/linux') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c29496fff81c..4167acc5c076 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -93,6 +93,7 @@ Contents: sctp secid seg6-sysctl + statistics strparser switchdev tc-actions-env-rules diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst new file mode 100644 index 000000000000..d490b535cd14 --- /dev/null +++ b/Documentation/networking/statistics.rst @@ -0,0 +1,132 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Interface statistics +==================== + +This document is a guide to Linux network interface statistics. + +There are two main sources of interface statistics in Linux: + + - standard interface statistics based on + :c:type:`struct rtnl_link_stats64 `; and + - driver-defined statistics available via ethtool. + +There are multiple interfaces to reach the former. Most commonly used +is the `ip` command from `iproute2`:: + + $ ip -s -s link show dev ens4u1u1 + 6: ens4u1u1: mtu 1500 qdisc fq_codel state UP mode DEFAULT group default qlen 1000 + link/ether 48:2a:e3:4c:b1:d1 brd ff:ff:ff:ff:ff:ff + RX: bytes packets errors dropped overrun mcast + 74327665117 69016965 0 0 0 0 + RX errors: length crc frame fifo missed + 0 0 0 0 0 + TX: bytes packets errors dropped carrier collsns + 21405556176 44608960 0 0 0 0 + TX errors: aborted fifo window heartbeat transns + 0 0 0 0 128 + altname enp58s0u1u1 + +Note that `-s` has been specified twice to see all members of +:c:type:`struct rtnl_link_stats64 `. +If `-s` is specified once the detailed errors won't be shown. + +`ip` supports JSON formatting via the `-j` option. + +Ethtool statistics can be dumped using `ethtool -S $ifc`, e.g.:: + + $ ethtool -S ens4u1u1 + NIC statistics: + tx_single_collisions: 0 + tx_multi_collisions: 0 + +uAPIs +===== + +procfs +------ + +The historical `/proc/net/dev` text interface gives access to the list +of interfaces as well as their statistics. + +Note that even though this interface is using +:c:type:`struct rtnl_link_stats64 ` +internally it combines some of the fields. + +sysfs +----- + +Each device directory in sysfs contains a `statistics` directory (e.g. +`/sys/class/net/lo/statistics/`) with files corresponding to +members of :c:type:`struct rtnl_link_stats64 `. + +This simple interface is convenient especially in constrained/embedded +environments without access to tools. 
However, it's inefficient when +reading multiple stats as it internally performs a full dump of +:c:type:`struct rtnl_link_stats64 ` +and reports only the stat corresponding to the accessed file. + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-class-net-statistics`. + + +netlink +------- + +`rtnetlink` (`NETLINK_ROUTE`) is the preferred method of accessing +:c:type:`struct rtnl_link_stats64 ` stats. + +Statistics are reported both in the responses to link information +requests (`RTM_GETLINK`) and statistic requests (`RTM_GETSTATS`, +when `IFLA_STATS_LINK_64` bit is set in the `.filter_mask` of the request). + +ethtool +------- + +The ethtool IOCTL interface allows drivers to report implementation +specific statistics. Historically it has also been used to report +statistics for which other APIs did not exist, like per-device-queue +statistics, or standard-based statistics (e.g. RFC 2863). + +Statistics and their string identifiers are retrieved separately. +Identifiers via `ETHTOOL_GSTRINGS` with `string_set` set to `ETH_SS_STATS`, +and values via `ETHTOOL_GSTATS`. User space should use `ETHTOOL_GDRVINFO` +to retrieve the number of statistics (`.n_stats`). + +debugfs +------- + +Some drivers expose extra statistics via `debugfs`. + +struct rtnl_link_stats64 +======================== + +.. kernel-doc:: include/uapi/linux/if_link.h + :identifiers: rtnl_link_stats64 + +Notes for driver authors +======================== + +Drivers should report all statistics which have a matching member in +:c:type:`struct rtnl_link_stats64 ` exclusively +via `.ndo_get_stats64`. Reporting such standard stats via ethtool +or debugfs will not be accepted. + +Drivers must ensure best possible compliance with +:c:type:`struct rtnl_link_stats64 `. +Please note for example that detailed error statistics must be +added into the general `rx_error` / `tx_error` counters. + +The `.ndo_get_stats64` callback cannot sleep because of accesses +via `/proc/net/dev`. If the driver may sleep when retrieving the statistics +from the device, it should do so periodically and asynchronously and only return +a recent copy from `.ndo_get_stats64`. The ethtool interrupt coalescing interface +allows setting the frequency of refreshing statistics, if needed. + +Retrieving ethtool statistics is a multi-syscall process; drivers are advised +to keep the number of statistics constant to avoid race conditions with +user space trying to read them. + +Statistics must persist across routine operations like bringing the interface +down and up. diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7fba4de511de..bf4667403cab 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -40,26 +40,197 @@ struct rtnl_link_stats { __u32 rx_nohandler; /* dropped, no handler found */ }; -/* The main device statistics structure */ +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which the host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which the host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that the device + * acknowledged it copied them out of host memory.
+ * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices this counter should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices this counter should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counted by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. + * For hardware interfaces this counter should not include packets + * dropped by the device which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is: the + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or the next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistic was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets.
+ * + * This statistic was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts the number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or the host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counter is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counter is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond).
+ */ struct rtnl_link_stats64 { - __u64 rx_packets; /* total packets received */ - __u64 tx_packets; /* total packets transmitted */ - __u64 rx_bytes; /* total bytes received */ - __u64 tx_bytes; /* total bytes transmitted */ - __u64 rx_errors; /* bad packets received */ - __u64 tx_errors; /* packet transmit problems */ - __u64 rx_dropped; /* no space in linux buffers */ - __u64 tx_dropped; /* no space available in linux */ - __u64 multicast; /* multicast packets received */ + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; __u64 collisions; /* detailed rx_errors: */ __u64 rx_length_errors; - __u64 rx_over_errors; /* receiver ring buff overflow */ - __u64 rx_crc_errors; /* recved pkt with crc error */ - __u64 rx_frame_errors; /* recv'd frame alignment error */ - __u64 rx_fifo_errors; /* recv'r fifo overrun */ - __u64 rx_missed_errors; /* receiver missed packet */ + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; /* detailed tx_errors */ __u64 tx_aborted_errors; @@ -71,8 +242,7 @@ struct rtnl_link_stats64 { /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; - - __u64 rx_nohandler; /* dropped, no handler found */ + __u64 rx_nohandler; }; /* The struct should be in sync with struct ifmap */ -- cgit v1.2.3 From b131c96496b369c7b14125e7c50e89ac7cec8051 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Tue, 8 Sep 2020 13:01:41 +0200 Subject: netfilter: nf_tables: add userdata support for nft_object Enables storing userdata for nft_object. Initially this will store an optional comment but can be extended in the future as needed. Adds new attribute NFTA_OBJ_USERDATA to nft_object. Signed-off-by: Jose M. 
Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 35 ++++++++++++++++++++++++-------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 97a7e147a59a..99c1b3188b1e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1123,6 +1123,8 @@ struct nft_object { u32 genmask:2, use:30; u64 handle; + u16 udlen; + u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 543dc697b796..2a6e09dea1a0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1559,6 +1559,7 @@ enum nft_ct_expectation_attributes { * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) * @NFTA_OBJ_HANDLE: object handle (NLA_U64) + * @NFTA_OBJ_USERDATA: user data (NLA_BINARY) */ enum nft_object_attributes { NFTA_OBJ_UNSPEC, @@ -1569,6 +1570,7 @@ enum nft_object_attributes { NFTA_OBJ_USE, NFTA_OBJ_HANDLE, NFTA_OBJ_PAD, + NFTA_OBJ_USERDATA, __NFTA_OBJ_MAX }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6ccce2a2e715..e9b4848e9dd0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5755,6 +5755,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_TYPE] = { .type = NLA_U32 }, [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, [NFTA_OBJ_HANDLE] = { .type = NLA_U64}, + [NFTA_OBJ_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, @@ -5902,6 +5904,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, struct nft_object *obj; struct nft_ctx ctx; u32 objtype; + u16 udlen; int err; if (!nla[NFTA_OBJ_TYPE] || @@ -5946,7 +5949,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); - goto err1; + goto err_init; } obj->key.table = table; obj->handle = nf_tables_alloc_handle(table); @@ -5954,32 +5957,44 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); if (!obj->key.name) { err = -ENOMEM; - goto err2; + goto err_strdup; + } + + if (nla[NFTA_OBJ_USERDATA]) { + udlen = nla_len(nla[NFTA_OBJ_USERDATA]); + obj->udata = kzalloc(udlen, GFP_KERNEL); + if (obj->udata == NULL) + goto err_userdata; + + nla_memcpy(obj->udata, nla[NFTA_OBJ_USERDATA], udlen); + obj->udlen = udlen; } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err3; + goto err_trans; err = rhltable_insert(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params); if (err < 0) - goto err4; + goto err_obj_ht; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; -err4: +err_obj_ht: /* queued in transaction log */ INIT_LIST_HEAD(&obj->list); return err; -err3: +err_trans: kfree(obj->key.name); -err2: +err_userdata: + kfree(obj->udata); +err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); kfree(obj); -err1: +err_init: module_put(type->owner); return err; } @@ -6011,6 +6026,10 @@ static int 
nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, NFTA_OBJ_PAD)) goto nla_put_failure; + if (obj->udata && + nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From cd80ec795156346236e9b1cd9f5cbff5a9bbd212 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:33:56 -0700 Subject: Input: allocate keycodes for notification-center, pickup-phone and hangup-phone New Lenovo Thinkpad models, e.g. the X1 Carbon 8th gen and the new T14 gen1 models, have 3 new symbols / shortcuts on their F9-F11 keys (and the thinkpad_acpi driver receives 3 new "scancodes" for these): F9: Has a symbol resembling a rectangular speech balloon, the manual says the hotkey function shows or hides the notification center F10: Has a symbol of a telephone horn which has been picked up from the receiver, the manual says: "Answer incoming calls" F11: Has a symbol of a telephone horn which is resting on the receiver, the manual says: "Decline incoming calls" We have no existing keycodes which are a good match for these, so add 3 new keycodes for these. I noticed that we have a hole in our keycodes between 0x1ba and 0x1c0 which does not seem to be reserved for any specific purpose, so these 3 new codes use 0x1bc - 0x1be, instead of starting at 0x27b. Acked-by: Henrique de Moraes Holschuh Acked-by: Andy Shevchenko Signed-off-by: Hans de Goede Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 0c2e27d28e0a..b74821d09145 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -515,6 +515,9 @@ #define KEY_10CHANNELSUP 0x1b8 /* 10 channels up (10+) */ #define KEY_10CHANNELSDOWN 0x1b9 /* 10 channels down (10-) */ #define KEY_IMAGES 0x1ba /* AL Image Browser */ +#define KEY_NOTIFICATION_CENTER 0x1bc /* Show/hide the notification center */ +#define KEY_PICKUP_PHONE 0x1bd /* Answer incoming call */ +#define KEY_HANGUP_PHONE 0x1be /* Decline incoming call */ #define KEY_DEL_EOL 0x1c0 #define KEY_DEL_EOS 0x1c1 -- cgit v1.2.3 From bba013e1ca5e7150b42a1a1a1e852010d772edad Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 8 Sep 2020 09:58:09 -0700 Subject: Input: allocate keycode for Fn + right shift The last 2 generations of Lenovo Thinkpads send an acpi_thinkpad event when Fn + right shift is pressed. This is intended for use with "Lenovo Quick Clean" software, which disables the touchpad + kbd for 2 minutes on this key-combo so that healthcare workers can disinfect it. But there is no silkscreen print on the keyboard to indicate this, so add a KEY_FN_RIGHT_SHIFT keycode define to use for this key-combo.
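A minimal sketch, not from this patch, of how a platform driver might emit the new keycode after decoding the hotkey event; idev is a hypothetical, already-registered struct input_dev whose capability was declared with input_set_capability(idev, EV_KEY, KEY_FN_RIGHT_SHIFT):

    input_report_key(idev, KEY_FN_RIGHT_SHIFT, 1);	/* key press */
    input_sync(idev);
    input_report_key(idev, KEY_FN_RIGHT_SHIFT, 0);	/* key release */
    input_sync(idev);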
Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20200908135147.4044-3-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index b74821d09145..ee93428ced9a 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -545,6 +545,7 @@ #define KEY_FN_F 0x1e2 #define KEY_FN_S 0x1e3 #define KEY_FN_B 0x1e4 +#define KEY_FN_RIGHT_SHIFT 0x1e5 #define KEY_BRL_DOT1 0x1f1 #define KEY_BRL_DOT2 0x1f2 -- cgit v1.2.3 From 05b595e9c44acaca94192c6db430a489c1b212a7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:36 +0300 Subject: devlink: Introduce external controller flag A devlink eswitch port may represent PCI PF/VF ports of a controller. A controller is either located on the same system where the eswitch resides, or it is an external controller located in a host where such a NIC is plugged in. Add the ability for a driver to specify whether a port is for an external controller. Use such a flag in the mlx5_core driver. An example of an external controller where VF1 of PF0 belongs to controller 1: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 6 ++++-- include/net/devlink.h | 8 ++++++-- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 12 ++++++++++-- 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index e13e5d1b3eae..5b3599caa007 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1215,11 +1215,13 @@ static int register_devlink_port(struct mlx5_core_dev *dev, struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; unsigned int dl_port_index = 0; + bool external; u16 pfnum; if (!is_devlink_port_supported(dev, rpriv)) return 0; + external = mlx5_core_is_ecpf_esw_manager(dev); mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, rep->vport); pfnum = PCI_FUNC(dev->pdev->devfn); @@ -1232,12 +1234,12 @@ static int register_devlink_port(struct mlx5_core_dev *dev, } else if (rep->vport == MLX5_VPORT_PF) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum); + devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum, external); } else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport)) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(&rpriv->dl_port, - pfnum, rep->vport - 1); + pfnum, rep->vport - 1, external); } return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index); } diff --git a/include/net/devlink.h
b/include/net/devlink.h index efff9274d248..2dad8c9151f4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -60,19 +60,23 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { u16 pf; + u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF of the PCI PF for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { u16 pf; u16 vf; + u8 external:1; }; /** @@ -1215,9 +1219,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf); +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf); + u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cfef4245ea5a..40823ed7e05a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -458,6 +458,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_LANES, /* u32 */ DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ + DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 49e911c19881..6f5f85372721 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -526,6 +526,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) + return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, @@ -533,6 +535,8 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; + if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) + return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PHYSICAL: case DEVLINK_PORT_FLAVOUR_CPU: @@ -7716,8 +7720,9 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @pf: associated PF for the devlink port instance + * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7728,6 +7733,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf) return; attrs->pci_pf.pf = pf; + attrs->pci_pf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); @@ -7737,9 +7743,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes * * @devlink_port: devlink port * @pf: associated PF for the
devlink port instance * @vf: associated VF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf) + u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7750,6 +7757,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, return; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; + attrs->pci_vf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); -- cgit v1.2.3 From 3a2d9588c4f79adae6a0e986b64ebdd5b38085c6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:37 +0300 Subject: devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri 
Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 9 +++++++-- include/net/devlink.h | 9 +++++++-- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 23 ++++++++++++++--------- 4 files changed, 29 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5b3599caa007..135ee26881c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1210,11 +1210,13 @@ is_devlink_port_supported(const struct mlx5_core_dev *dev, static int register_devlink_port(struct mlx5_core_dev *dev, struct mlx5e_rep_priv *rpriv) { + struct mlx5_esw_offload *offloads = &dev->priv.eswitch->offloads; struct devlink *devlink = priv_to_devlink(dev); struct mlx5_eswitch_rep *rep = rpriv->rep; struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; unsigned int dl_port_index = 0; + u32 controller_num = 0; bool external; u16 pfnum; @@ -1222,6 +1224,8 @@ static int register_devlink_port(struct mlx5_core_dev *dev, return 0; external = mlx5_core_is_ecpf_esw_manager(dev); + if (external) + controller_num = offloads->host_number + 1; mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid); dl_port_index = mlx5_esw_vport_to_devlink_port_index(dev, rep->vport); pfnum = PCI_FUNC(dev->pdev->devfn); @@ -1234,11 +1238,12 @@ static int register_devlink_port(struct mlx5_core_dev *dev, } else if (rep->vport == MLX5_VPORT_PF) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_pf_set(&rpriv->dl_port, pfnum, external); + devlink_port_attrs_pci_pf_set(&rpriv->dl_port, controller_num, + pfnum, external); } else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport)) { memcpy(rpriv->dl_port.attrs.switch_id.id, &ppid.id[0], ppid.id_len); rpriv->dl_port.attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_vf_set(&rpriv->dl_port, + devlink_port_attrs_pci_vf_set(&rpriv->dl_port, controller_num, pfnum, rep->vport - 1, external); } return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index); diff --git a/include/net/devlink.h b/include/net/devlink.h index 2dad8c9151f4..eaec0a8cc5ef 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -59,21 +59,25 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { + u32 controller; u16 pf; u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF of the PCI PF for this port.
* @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { + u32 controller; u16 pf; u16 vf; u8 external:1; @@ -1219,8 +1223,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external); +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40823ed7e05a..40d35145c879 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -459,6 +459,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ + DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 6f5f85372721..9cf5b118253b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -523,17 +523,18 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return -EMSGSIZE; switch (devlink_port->attrs.flavour) { case DEVLINK_PORT_FLAVOUR_PCI_PF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_pf.pf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_pf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external)) return -EMSGSIZE; break; case DEVLINK_PORT_FLAVOUR_PCI_VF: - if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, - attrs->pci_vf.pf) || - nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, - attrs->pci_vf.vf)) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, + attrs->pci_vf.controller) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) || + nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf)) return -EMSGSIZE; if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external)) return -EMSGSIZE; @@ -7719,10 +7720,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external) +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -7731,7 +7734,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bo DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) return; - + attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; } @@ -7741,11 +7744,12 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * devlink_port_attrs_pci_vf_set - 
Set PCI VF port attributes * * @devlink_port: devlink port + * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @vf: associated VF of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; @@ -7755,6 +7759,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) return; + attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; attrs->pci_vf.external = external; -- cgit v1.2.3 From 22f3787e9d95e72d1f09795f294fb010e2998f43 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 19 Aug 2020 18:19:46 -0400 Subject: virtiofs: set up virtio_fs dax_device Setup a dax device. Use the shm capability to find the cache entry and map it. The DAX window is accessed by the fs/dax.c infrastructure and must have struct pages (at least on x86). Use devm_memremap_pages() to map the DAX window PCI BAR and allocate struct page. Signed-off-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Vivek Goyal Signed-off-by: Sebastien Boeuf Signed-off-by: Liu Bo Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 138 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_fs.h | 3 + 2 files changed, 141 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 47ecdc15f25d..f31a59f74475 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -5,12 +5,16 @@ */ #include +#include +#include +#include #include #include #include #include #include #include +#include #include "fuse_i.h" /* List of virtio-fs device instances and a lock for the list. Also provides @@ -49,6 +53,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -686,6 +696,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? 
max_nr_pages : nr_pages; +} + +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .copy_from_iter = virtio_fs_copy_from_iter, + .copy_to_iter = virtio_fs_copy_to_iter, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct resource from scratch (only the start + * and end fields will be used). + */ + pgmap->res = (struct resource){ + .name = "virtio-fs dax window", + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -707,6 +841,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. 
*/ diff --git a/include/uapi/linux/virtio_fs.h b/include/uapi/linux/virtio_fs.h index 3056b6e9f8ce..bea38291421b 100644 --- a/include/uapi/linux/virtio_fs.h +++ b/include/uapi/linux/virtio_fs.h @@ -16,4 +16,7 @@ struct virtio_fs_config { __le32 num_request_queues; } __attribute__((packed)); +/* For the id field in virtio_pci_shm_cap */ +#define VIRTIO_FS_SHMCAP_ID_CACHE 0 + #endif /* _UAPI_LINUX_VIRTIO_FS_H */ -- cgit v1.2.3 From fd1a1dc6f5aa7361e3562790336e116935f8fcfa Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 19 Aug 2020 18:19:49 -0400 Subject: virtiofs: implement FUSE_INIT map_alignment field The device communicates FUSE_SETUPMAPPING/FUSE_REMOVEMAPPING alignment constraints via the FUSE_INIT map_alignment field. Parse this field and ensure our DAX mappings meet the alignment constraints. We don't actually align anything differently since our mappings are already 2MB aligned. Just check the value when the connection is established. If it becomes necessary to honor arbitrary alignments in the future we'll have to adjust how mappings are sized. The upshot of this commit is that we can be confident that mappings will work even when emulating x86 on Power and similar combinations where the host page sizes are different. Signed-off-by: Stefan Hajnoczi Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 15 ++++++++++++++- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 17 ++++++++++++++++- include/uapi/linux/fuse.h | 4 +++- 4 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 031106020f75..fec8a2bd75b3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -9,7 +9,10 @@ #include #include -/* Default memory range size, 2MB */ +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB.
Signed-off-by: Stefan Hajnoczi Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 15 ++++++++++++++- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 17 ++++++++++++++++- include/uapi/linux/fuse.h | 4 +++- 4 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux')

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 031106020f75..fec8a2bd75b3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c
@@ -9,7 +9,10 @@ #include #include
-/* Default memory range size, 2MB */
+/*
+ * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
+ * map_alignment values 4KB and 64KB.
+ */
#define FUSE_DAX_SHIFT 21 #define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) #define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
@@ -123,3 +126,13 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) fc->dax = fcd; return 0; }
+
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
+{
+	if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
+		pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
+			map_alignment, FUSE_DAX_SZ);
+		return false;
+	}
+	return true;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 97af7952373a..2f3f04aa64c7 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h
@@ -1106,5 +1106,6 @@ void fuse_free_conn(struct fuse_conn *fc); int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc);
+bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment);
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1780dfe063ab..67e99cee5a4f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c
@@ -908,9 +908,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, { struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out;
+	bool ok = true;
 if (error || arg->major != FUSE_KERNEL_VERSION)
-		fc->conn_error = 1;
+		ok = false;
 else { unsigned long ra_pages;
@@ -973,6 +974,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, min_t(unsigned int, FUSE_MAX_MAX_PAGES, max_t(unsigned int, arg->max_pages, 1)); }
+			if (IS_ENABLED(CONFIG_FUSE_DAX) &&
+			    arg->flags & FUSE_MAP_ALIGNMENT &&
+			    !fuse_dax_check_alignment(fc, arg->map_alignment)) {
+				ok = false;
+			}
 } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1;
@@ -988,6 +994,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia);
+	if (!ok) {
+		fc->conn_init = 0;
+		fc->conn_error = 1;
+	}
+
 fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); }
@@ -1011,6 +1022,10 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
+#ifdef CONFIG_FUSE_DAX
+	if (fc->dax)
+		ia->in.flags |= FUSE_MAP_ALIGNMENT;
+#endif
 ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 373cada89815..5b85819e045f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h
@@ -313,7 +313,9 @@ struct fuse_file_lock { * FUSE_CACHE_SYMLINKS: cache READLINK responses * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir * FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
- * FUSE_MAP_ALIGNMENT: map_alignment field is valid
+ * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for
+ *		       foffset and moffset fields in struct
+ *		       fuse_setupmapping_out and fuse_removemapping_one.
 */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1)
-- cgit v1.2.3

From ceec02d4354a317cacce4b053a580ea3c7fc6cdc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Aug 2020 18:19:50 -0400 Subject: virtiofs: introduce setupmapping/removemapping commands

Introduce two new fuse commands to set up/remove memory mappings. These will be used to set up/tear down file mappings in the DAX window.
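As an illustration of how a server might act on these requests, here is a hedged userspace sketch: it maps the file region into the memory window with mmap(). The window_base and fd arguments are assumptions for the example, not part of the protocol:

#include <errno.h>
#include <sys/mman.h>
#include <linux/fuse.h>

/* Hypothetical daemon-side FUSE_SETUPMAPPING handler */
static int handle_setupmapping(void *window_base, int fd,
			       const struct fuse_setupmapping_in *in)
{
	int prot = PROT_READ;

	if (in->flags & FUSE_SETUPMAPPING_FLAG_WRITE)
		prot |= PROT_WRITE;

	if (mmap((char *)window_base + in->moffset, in->len, prot,
		 MAP_SHARED | MAP_FIXED, fd, in->foffset) == MAP_FAILED)
		return -errno;
	return 0;
}

A FUSE_REMOVEMAPPING request would analogously be answered by unmapping the listed ranges, or remapping them to an anonymous placeholder.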
Signed-off-by: Vivek Goyal Signed-off-by: Peng Tao Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 5b85819e045f..60a7bfc787ce 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h
@@ -894,4 +894,33 @@ struct fuse_copy_file_range_in { uint64_t flags; };
+#define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0)
+struct fuse_setupmapping_in {
+	/* An already open handle */
+	uint64_t fh;
+	/* Offset into the file to start the mapping */
+	uint64_t foffset;
+	/* Length of mapping required */
+	uint64_t len;
+	/* Flags, FUSE_SETUPMAPPING_FLAG_* */
+	uint64_t flags;
+	/* Offset in Memory Window */
+	uint64_t moffset;
+};
+
+struct fuse_removemapping_in {
+	/* number of fuse_removemapping_one follows */
+	uint32_t count;
+};
+
+struct fuse_removemapping_one {
+	/* Offset into the dax window to start the unmapping */
+	uint64_t moffset;
+	/* Length of mapping to remove */
+	uint64_t len;
+};
+
+#define FUSE_REMOVEMAPPING_MAX_ENTRY \
+	(PAGE_SIZE / sizeof(struct fuse_removemapping_one))
+
#endif /* _LINUX_FUSE_H */
-- cgit v1.2.3

From c2d0ad00d948de73c78f05d2b3e5bdfa605035cc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 19 Aug 2020 18:19:51 -0400 Subject: virtiofs: implement dax read/write operations

This patch implements basic DAX support. mmap() is not implemented yet and will come in later patches. This patch looks into implementing read/write.

We make use of an interval tree to keep track of per-inode dax mappings.

Do not use dax for file-extending writes; instead just send a WRITE message to the daemon (like we do for the direct I/O path). This will keep the write and i_size change atomic w.r.t. crash.
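In outline, the read/write path below resolves a file offset to a DAX window chunk through the per-inode interval tree; on a miss it grabs a free range and asks the daemon to map it. A simplified paraphrase only (the real code below also handles the shared/exclusive semaphore upgrade and races):

/* Sketch: lookup_or_map_chunk() and setup_new_chunk() are hypothetical
 * wrappers around the interval tree lookup plus fuse_setup_one_mapping()
 * introduced by this patch.
 */
static struct fuse_dax_mapping *lookup_or_map_chunk(struct fuse_inode *fi,
						    struct inode *inode,
						    loff_t pos, bool writable)
{
	unsigned long idx = pos >> FUSE_DAX_SHIFT;
	struct interval_tree_node *node;

	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
	if (node)
		return node_to_dmap(node);	/* chunk already mapped */
	/* miss: take a free range, send FUSE_SETUPMAPPING to the daemon */
	return setup_new_chunk(inode, idx, writable);	/* hypothetical */
}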
Signed-off-by: Stefan Hajnoczi Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Vivek Goyal Signed-off-by: Liu Bo Signed-off-by: Peng Tao Cc: Dave Chinner Signed-off-by: Miklos Szeredi --- fs/fuse/Kconfig | 1 + fs/fuse/dax.c | 565 ++++++++++++++++++++++++++++++++++++++++++++++ fs/fuse/file.c | 15 +- fs/fuse/fuse_i.h | 15 ++ fs/fuse/inode.c | 21 +- include/uapi/linux/fuse.h | 1 + 6 files changed, 612 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux')

diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index fddd40630077..40ce9a1c12e5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig
@@ -42,6 +42,7 @@ config VIRTIO_FS config FUSE_DAX bool "Virtio Filesystem Direct Host Memory Access support" default y
+	select INTERVAL_TREE
 depends on VIRTIO_FS depends on FS_DAX depends on DAX_DRIVER
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index fec8a2bd75b3..a8d311b2db8e 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c
@@ -7,7 +7,10 @@ #include "fuse_i.h" #include +#include #include +#include +#include
/* * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
@@ -22,22 +25,556 @@ struct fuse_dax_mapping { /* Will connect in fcd->free_ranges to keep track of free memory */ struct list_head list;
+	/* For interval tree in file/inode */
+	struct interval_tree_node itn;
+
 /** Position in DAX window */ u64 window_offset; /** Length of mapping, in bytes */ loff_t length;
+
+	/* Is this mapping read-only or read-write */
+	bool writable;
+};
+
+/* Per-inode dax map */
+struct fuse_inode_dax {
+	/* Semaphore to protect modifications to the dmap tree */
+	struct rw_semaphore sem;
+
+	/* Sorted rb tree of struct fuse_dax_mapping elements */
+	struct rb_root_cached tree;
+	unsigned long nr;
};
struct fuse_conn_dax { /* DAX device */ struct dax_device *dev;
+	/* Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
 /* DAX Window Free Ranges */ long nr_free_ranges; struct list_head free_ranges; };
+static inline struct fuse_dax_mapping *
+node_to_dmap(struct interval_tree_node *node)
+{
+	if (!node)
+		return NULL;
+
+	return container_of(node, struct fuse_dax_mapping, itn);
+}
+
+static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
+{
+	struct fuse_dax_mapping *dmap;
+
+	spin_lock(&fcd->lock);
+	dmap = list_first_entry_or_null(&fcd->free_ranges,
+					struct fuse_dax_mapping, list);
+	if (dmap) {
+		list_del_init(&dmap->list);
+		WARN_ON(fcd->nr_free_ranges <= 0);
+		fcd->nr_free_ranges--;
+	}
+	spin_unlock(&fcd->lock);
+	return dmap;
+}
+
+/* This assumes fcd->lock is held */
+static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+				    struct fuse_dax_mapping *dmap)
+{
+	list_add_tail(&dmap->list, &fcd->free_ranges);
+	fcd->nr_free_ranges++;
+}
+
+static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
+				  struct fuse_dax_mapping *dmap)
+{
+	/* Return fuse_dax_mapping to free list */
+	spin_lock(&fcd->lock);
+	__dmap_add_to_free_pool(fcd, dmap);
+	spin_unlock(&fcd->lock);
+}
+
+static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
+				  struct fuse_dax_mapping *dmap, bool writable,
+				  bool upgrade)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_conn_dax *fcd = fc->dax;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_setupmapping_in inarg;
+	loff_t offset = start_idx << FUSE_DAX_SHIFT;
+	FUSE_ARGS(args);
+	ssize_t err;
+
+	WARN_ON(fcd->nr_free_ranges < 0);
+
+	/* Ask fuse daemon to set up mapping */
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.foffset = offset;
+	inarg.fh = -1;
+	inarg.moffset = dmap->window_offset;
+	inarg.len = FUSE_DAX_SZ;
+	inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
+	if (writable)
+		inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
+	args.opcode = FUSE_SETUPMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 1;
+	args.in_args[0].size = sizeof(inarg);
+	args.in_args[0].value = &inarg;
+	err = fuse_simple_request(fc, &args);
+	if (err < 0)
+		return err;
+	dmap->writable = writable;
+	if (!upgrade) {
+		dmap->itn.start = dmap->itn.last = start_idx;
+		/* Protected by fi->dax->sem */
+		interval_tree_insert(&dmap->itn, &fi->dax->tree);
+		fi->dax->nr++;
+	}
+	return 0;
+}
+
+static int fuse_send_removemapping(struct inode *inode,
+				   struct fuse_removemapping_in *inargp,
+				   struct fuse_removemapping_one *remove_one)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	FUSE_ARGS(args);
+
+	args.opcode = FUSE_REMOVEMAPPING;
+	args.nodeid = fi->nodeid;
+	args.in_numargs = 2;
+	args.in_args[0].size = sizeof(*inargp);
+	args.in_args[0].value = inargp;
+
args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fc, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. 
+ */
+	inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
+	WARN_ON(fi->dax->nr);
+}
+
+static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
+{
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->length = length;
+	iomap->type = IOMAP_HOLE;
+}
+
+static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
+			    struct iomap *iomap, struct fuse_dax_mapping *dmap,
+			    unsigned int flags)
+{
+	loff_t offset, len;
+	loff_t i_size = i_size_read(inode);
+
+	offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
+	len = min(length, dmap->length - offset);
+
+	/* If length is beyond end of file, truncate further */
+	if (pos + len > i_size)
+		len = i_size - pos;
+
+	if (len > 0) {
+		iomap->addr = dmap->window_offset + offset;
+		iomap->length = len;
+		if (flags & IOMAP_FAULT)
+			iomap->length = ALIGN(len, PAGE_SIZE);
+		iomap->type = IOMAP_MAPPED;
+	} else {
+		/* Mapping beyond end of file is a hole */
+		fuse_fill_iomap_hole(iomap, length);
+	}
+}
+
+static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
+				      loff_t length, unsigned int flags,
+				      struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_conn_dax *fcd = fc->dax;
+	struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
+	int ret;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	alloc_dmap = alloc_dax_mapping(fcd);
+	if (!alloc_dmap)
+		return -EIO;
+
+	/*
+	 * Take the write lock so that only one caller can try to set up a
+	 * mapping and the others wait.
+	 */
+	down_write(&fi->dax->sem);
+	/*
+	 * We dropped the lock. Check again if somebody else has set up the
+	 * mapping already.
+	 */
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return 0;
+	}
+
+	/* Set up one mapping */
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
+				     writable, false);
+	if (ret < 0) {
+		dmap_add_to_free_pool(fcd, alloc_dmap);
+		up_write(&fi->dax->sem);
+		return ret;
+	}
+	fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
+	up_write(&fi->dax->sem);
+	return 0;
+}
+
+static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
+				    loff_t length, unsigned int flags,
+				    struct iomap *iomap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_dax_mapping *dmap;
+	int ret;
+	unsigned long idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/*
+	 * Take the exclusive lock so that only one caller can try to set up
+	 * the mapping and others wait.
+	 */
+	down_write(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
+
+	/* We are holding either the inode lock or i_mmap_sem, and that should
+	 * ensure that the dmap can't be reclaimed or truncated and that it
+	 * should still be there in the tree despite the fact that we dropped
+	 * and re-acquired the lock.
+	 */
+	ret = -EIO;
+	if (WARN_ON(!node))
+		goto out_err;
+
+	dmap = node_to_dmap(node);
+
+	/* Maybe another thread already upgraded the mapping while we were not
+	 * holding the lock.
+	 */
+	if (dmap->writable) {
+		ret = 0;
+		goto out_fill_iomap;
+	}
+
+	ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
+				     true);
+	if (ret < 0)
+		goto out_err;
+out_fill_iomap:
+	fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+out_err:
+	up_write(&fi->dax->sem);
+	return ret;
+}
+
+/* This is just for DAX and the mapping is ephemeral, do not use it for other
+ * purposes since there is no block device with a permanent mapping.
+ */
+static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+			    unsigned int flags, struct iomap *iomap,
+			    struct iomap *srcmap)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_dax_mapping *dmap;
+	bool writable = flags & IOMAP_WRITE;
+	unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
+	struct interval_tree_node *node;
+
+	/* We don't support FIEMAP */
+	if (WARN_ON(flags & IOMAP_REPORT))
+		return -EIO;
+
+	iomap->offset = pos;
+	iomap->flags = 0;
+	iomap->bdev = NULL;
+	iomap->dax_dev = fc->dax->dev;
+
+	/*
+	 * Both the read/write and mmap paths can race here. So we need
+	 * something to make sure that, if we are setting up a mapping, then
+	 * the other path waits.
+	 *
+	 * For now, use a semaphore for this. It probably needs to be
+	 * optimized later.
+	 */
+	down_read(&fi->dax->sem);
+	node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
+	if (node) {
+		dmap = node_to_dmap(node);
+		if (writable && !dmap->writable) {
+			/* Upgrade read-only mapping to read-write. This will
+			 * require the exclusive fi->dax->sem lock as we don't
+			 * want two threads trying to do this simultaneously
+			 * for the same dmap. So drop the shared lock and
+			 * acquire the exclusive lock.
+			 */
+			up_read(&fi->dax->sem);
+			pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
+				 __func__, pos, length);
+			return fuse_upgrade_dax_mapping(inode, pos, length,
+							flags, iomap);
+		} else {
+			fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
+			up_read(&fi->dax->sem);
+			return 0;
+		}
+	} else {
+		up_read(&fi->dax->sem);
+		pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
+			 __func__, pos, length);
+		if (pos >= i_size_read(inode))
+			goto iomap_hole;
+
+		return fuse_setup_new_dax_mapping(inode, pos, length, flags,
+						  iomap);
+	}
+
+	/*
+	 * If a read beyond end of file happens, fs code seems to return
+	 * it as a hole
+	 */
+iomap_hole:
+	fuse_fill_iomap_hole(iomap, length);
+	pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
+		 __func__, pos, length, iomap->length);
+	return 0;
+}
+
+static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned int flags,
+			  struct iomap *iomap)
+{
+	/* DAX writes beyond end-of-file aren't handled using iomap, so the
+	 * file size is unchanged and there is nothing to do here.
+ */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (ret < 0) + return ret; + + fuse_invalidate_attr(inode); + fuse_write_update_size(inode, iocb->ki_pos); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. 
+ */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + static void fuse_free_dax_mem_ranges(struct list_head *mem_list) { struct fuse_dax_mapping *range, *temp; @@ -116,6 +653,7 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) if (!fcd) return -ENOMEM; + spin_lock_init(&fcd->lock); fcd->dev = dax_dev; err = fuse_dax_mem_range_init(fcd); if (err) { @@ -127,6 +665,33 @@ int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) return 0; } +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +void fuse_dax_inode_init(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->dax) + return; + + inode->i_flags |= S_DAX; +} + bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) { if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6611ef3269a8..6c586bc97b64 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1539,10 +1539,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1553,10 +1557,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -3440,4 +3448,7 @@ void fuse_init_file_inode(struct inode *inode) fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f3f04aa64c7..2d2bdd596194 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -148,6 +148,13 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -1104,8 +1111,16 @@ void fuse_free_conn(struct fuse_conn *fc); /* dax.c */ +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode 
*inode); +void fuse_dax_inode_cleanup(struct inode *inode); bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 67e99cee5a4f..cab4239bd78a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -87,12 +87,19 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +108,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -112,6 +122,9 @@ static void fuse_evict_inode(struct inode *inode) clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); fi->forget = NULL; } diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 60a7bfc787ce..8899e4862309 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -895,6 +895,7 @@ struct fuse_copy_file_range_in { }; #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) +#define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { /* An already open handle */ uint64_t fh; -- cgit v1.2.3 From 501cb008906631a019f3ab2104a17ef8b2651ed0 Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:06 +1200 Subject: ipmr: Add route table ID to netlink cache reports Insert the multicast route table ID as a Netlink attribute to Netlink cache report notifications. When multiple route tables are in use it is necessary to have a way to determine which route table a given cache report belongs to when receiving the cache report. Signed-off-by: Paul Davey Signed-off-by: David S. 
Miller --- include/uapi/linux/mroute.h | 1 + net/ipv4/ipmr.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 11c8c1fc1124..918f1ef32ffe 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h
@@ -169,6 +169,7 @@ enum { IPMRA_CREPORT_SRC_ADDR, IPMRA_CREPORT_DST_ADDR, IPMRA_CREPORT_PKT,
+	IPMRA_CREPORT_TABLE,
 __IPMRA_CREPORT_MAX }; #define IPMRA_CREPORT_MAX (__IPMRA_CREPORT_MAX - 1)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 876fd6ff1ff9..19b2f586319b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c
@@ -2396,6 +2396,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen) + nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */ + nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */ + nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+		+ nla_total_size(4)	/* IPMRA_CREPORT_TABLE */
 /* IPMRA_CREPORT_PKT */ + nla_total_size(payloadlen) ;
@@ -2431,7 +2432,8 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR, msg->im_src.s_addr) || nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
-			     msg->im_dst.s_addr))
+			     msg->im_dst.s_addr) ||
+	    nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
 goto nla_put_failure; nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
-- cgit v1.2.3

From c8715a8e9f38906e73d6d78764216742db13ba0e Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:07 +1200 Subject: ipmr: Add high byte of VIF ID to igmpmsg

Use the unused3 byte in struct igmpmsg to hold the high 8 bits of the VIF ID.

If using more than 255 IPv4 multicast interfaces it is necessary to have access to a VIF ID for cache reports that is wider than 8 bits; the VIF ID present in the igmpmsg reports sent to mroute_sk was only 8 bits wide in the igmpmsg header. Adding the high 8 bits of the 16 bit VIF ID in the unused byte allows use of more than 255 IPv4 multicast interfaces.
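For reference, user space receiving these reports can rebuild the full VIF ID from the two fields; a minimal sketch using the uapi names (the helper name is ours):

#include <linux/mroute.h>

/* Reassemble the 16-bit VIF ID from the split igmpmsg bytes */
static unsigned int igmpmsg_vif(const struct igmpmsg *msg)
{
	return msg->im_vif | ((unsigned int)msg->im_vif_hi << 8);
}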
Signed-off-by: Paul Davey Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 4 ++-- net/ipv4/ipmr.c | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 918f1ef32ffe..1a42f5f9b31b 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h
@@ -113,8 +113,8 @@ struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ unsigned char im_mbz; /* Must be zero */
-	unsigned char im_vif;	/* Interface (this ought to be a vifi_t!) */
-	unsigned char unused3;
+	unsigned char im_vif;	/* Low 8 bits of Interface */
+	unsigned char im_vif_hi;	/* High 8 bits of Interface */
 struct in_addr im_src,im_dst; };
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 19b2f586319b..4809318f591b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c
@@ -1038,10 +1038,13 @@ static int ipmr_cache_report(struct mr_table *mrt, memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr)); msg->im_msgtype = assert; msg->im_mbz = 0;
-		if (assert == IGMPMSG_WRVIFWHOLE)
+		if (assert == IGMPMSG_WRVIFWHOLE) {
 			msg->im_vif = vifi;
-		else
+			msg->im_vif_hi = vifi >> 8;
+		} else {
 			msg->im_vif = mrt->mroute_reg_vif_num;
+			msg->im_vif_hi = mrt->mroute_reg_vif_num >> 8;
+		}
 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2; ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) + sizeof(struct iphdr));
@@ -1054,6 +1057,7 @@ static int ipmr_cache_report(struct mr_table *mrt, ip_hdr(skb)->protocol = 0; msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi;
+		msg->im_vif_hi = vifi >> 8;
 skb_dst_set(skb, dst_clone(skb_dst(pkt))); /* Add our header */ igmp = skb_put(skb, sizeof(struct igmphdr));
-- cgit v1.2.3

From 1aef5b4391f0c75c0a1523706a7b0311846ee12f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup()

This should be "current" not "skb".

Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h
@@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: *
- * 		* 0, if the *skb* task belongs to the cgroup2.
- * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* 0, if current task belongs to the cgroup2.
+ * 		* 1, if current task does not belong to the cgroup2.
 * 		* A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h
@@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: *
- * 		* 0, if the *skb* task belongs to the cgroup2.
- * 		* 1, if the *skb* task does not belong to the cgroup2.
+ * 		* 0, if current task belongs to the cgroup2.
+ * 		* 1, if current task does not belong to the cgroup2.
 * 		* A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
-- cgit v1.2.3

From e47168f3d1b14af5281cf50c59561d59d28201f9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 31 Aug 2020 08:30:44 +0000 Subject: powerpc/8xx: Support 16k hugepages with 4k pages

The 8xx has 4 page sizes: 4k, 16k, 512k and 8M. 4k and 16k can be selected at build time as standard page sizes, and 512k and 8M are hugepages.

When 4k standard pages are selected, 16k pages are not available. Allow 16k pages as hugepages when 4k pages are used.

To allow that, implement arch_make_huge_pte() which receives the necessary arguments to allow setting the PTE in accordance with the page size: - 512k pages must have _PAGE_HUGE and _PAGE_SPS. They are set by pte_mkhuge(). arch_make_huge_pte() does nothing. - 16k pages must have only _PAGE_SPS. arch_make_huge_pte() clears _PAGE_HUGE.
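With the new MAP_HUGE_16KB encoding added below, user space could request a 16k hugepage mapping along these lines; a sketch that assumes a kernel configured as above with 16k hugepages reserved in the pool:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_16KB,
		       -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");	/* e.g. no 16k hugepages available */
		return 1;
	}
	munmap(p, len);
	return 0;
}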
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/a518abc29266a708dfbccc8fce9ae6694fe4c2c6.1598862623.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 14 ++++++++++++++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/powerpc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 4 ---- arch/powerpc/mm/ptdump/8xx.c | 5 +++++ include/uapi/asm-generic/hugetlb_encode.h | 1 + include/uapi/linux/mman.h | 1 + 7 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux')

diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index e752a5807a59..39be9aea86db 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h
@@ -65,4 +65,18 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep, clr, set, 1); }
+#ifdef CONFIG_PPC_4K_PAGES
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+				       struct page *page, int writable)
+{
+	size_t size = huge_page_size(hstate_vma(vma));
+
+	if (size == SZ_16K)
+		return __pte(pte_val(entry) & ~_PAGE_HUGE);
+	else
+		return entry;
+}
+#define arch_make_huge_pte arch_make_huge_pte
+#endif
+
#endif /* _ASM_POWERPC_NOHASH_32_HUGETLB_8XX_H */
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 80bbc21b87f0..ee2243ba96cf 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -235,6 +235,8 @@ static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) return PAGE_SIZE / SZ_4K; else if (hugepd_ok(*((hugepd_t *)pmd))) return 1;
+	else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE))
+		return SZ_16K / SZ_4K;
 else return SZ_512K / SZ_4K; }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index e7ae2a2c4545..36c3800769fb 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c
@@ -180,7 +180,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz if (!hpdp) return NULL;
-	if (IS_ENABLED(CONFIG_PPC_8xx) && sz == SZ_512K)
+	if (IS_ENABLED(CONFIG_PPC_8xx) && pshift < PMD_SHIFT)
 return pte_alloc_map(mm, (pmd_t *)hpdp, addr); BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 14514585db98..5872f69141d5 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c
@@ -83,16 +83,12 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { }; #elif defined(CONFIG_PPC_8xx) struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
-	/* we only manage 4k and 16k pages as normal pages */
-#ifdef CONFIG_PPC_4K_PAGES
 [MMU_PAGE_4K] = { .shift = 12, },
-#else
 [MMU_PAGE_16K] = { .shift = 14, },
-#endif
 [MMU_PAGE_512K] = { .shift = 19, },
diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 8a797dcbf475..86da2a669680 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c
@@ -11,8 +11,13 @@ static const struct flag_info
flag_array[] = { {
+#ifdef CONFIG_PPC_16K_PAGES
 .mask = _PAGE_HUGE, .val = _PAGE_HUGE,
+#else
+	.mask	= _PAGE_SPS,
+	.val	= _PAGE_SPS,
+#endif
 .set = "huge", .clear = " ", }, {
diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index b0f8e87235bd..4f3d5aaa11f5 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h
@@ -20,6 +20,7 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f
+#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) #define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT)
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index 923cc162609c..f55bc680b5b0 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h
@@ -27,6 +27,7 @@ #define MAP_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT #define MAP_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK
+#define MAP_HUGE_16KB HUGETLB_FLAG_ENCODE_16KB
#define MAP_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB #define MAP_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB #define MAP_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB
-- cgit v1.2.3

From 9a27a33027f22a716ce362be48d70ae0eb012ab7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Sep 2020 17:11:52 -0700 Subject: ethtool: add standard pause stats

Currently drivers have to report their pause frame statistics via ethtool -S, and there is a wide variety of names used for these statistics.

Add the two statistics defined in IEEE 802.3x to the standard API. Create a new ethtool request header flag for including statistics in the response to GET commands.

Always create the ETHTOOL_A_PAUSE_STATS nest in replies when the flag is set. Testing whether the driver declares the op is not a reliable way of checking whether any stats will actually be included, and therefore we don't want to give the impression that the presence of ETHTOOL_A_PAUSE_STATS indicates driver support.

Note that this patch does not include PFC counters, which may fit better in dcbnl? But mostly I don't need them/have a setup to test them so I haven't looked deeply into exposing them :)

v3: - add a helper for "uninitializing" stats, rather than a cryptic memset() (Andrew)
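To sketch the driver side (the mydrv_ names are assumptions, not a real driver): report only the counters the hardware has, and leave the rest untouched so they stay at ETHTOOL_STAT_NOT_SET:

/* Hypothetical driver callback for the new op */
static void mydrv_get_pause_stats(struct net_device *dev,
				  struct ethtool_pause_stats *pause_stats)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	pause_stats->tx_pause_frames = priv->hw_stats.tx_pause;
	/* no RX pause counter in this hardware: do not zero the field */
}

static const struct ethtool_ops mydrv_ethtool_ops = {
	.get_pause_stats = mydrv_get_pause_stats,
	/* .get_pauseparam / .set_pauseparam etc. as before */
};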
Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 11 +++++ include/linux/ethtool.h | 26 ++++++++++++ include/uapi/linux/ethtool_netlink.h | 18 +++++++- net/ethtool/pause.c | 63 +++++++++++++++++++++++++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d53bcb31645a..2c8e0ddf548e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst
@@ -68,6 +68,7 @@ the flags may not apply to requests. Recognized flags are:
 ================================= =================================== ``ETHTOOL_FLAG_COMPACT_BITSETS`` use compact format bitsets in reply ``ETHTOOL_FLAG_OMIT_REPLY`` omit optional reply (_SET and _ACT)
+  ``ETHTOOL_FLAG_STATS``            include optional device statistics
 ================================= ===================================
New request flags should follow the general idea that if the flag is not set,
@@ -989,8 +990,18 @@ Kernel response contents:
 ``ETHTOOL_A_PAUSE_AUTONEG`` bool pause autonegotiation ``ETHTOOL_A_PAUSE_RX`` bool receive pause frames ``ETHTOOL_A_PAUSE_TX`` bool transmit pause frames
+  ``ETHTOOL_A_PAUSE_STATS``             nested  pause statistics
 ===================================== ====== ==========================
+``ETHTOOL_A_PAUSE_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set
+in ``ETHTOOL_A_HEADER_FLAGS``.
+It will be empty if the driver did not report any statistics. Drivers fill in
+the statistics in the following structure:
+
+.. kernel-doc:: include/linux/ethtool.h
+    :identifiers: ethtool_pause_stats
+
+Each member has a corresponding attribute defined.
PAUSE_SET ============
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 969a80211df6..060b20f0b20f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h
@@ -241,6 +241,27 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL)
+#define ETHTOOL_STAT_NOT_SET	(~0ULL)
+
+/**
+ * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames
+ * @tx_pause_frames: transmitted pause frame count. Reported to user space
+ *	as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES.
+ *
+ *	Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted`
+ *	from the standard.
+ *
+ * @rx_pause_frames: received pause frame count. Reported to user space
+ *	as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES.
+ *
+ *	Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived`
+ *	from the standard.
+ */
+struct ethtool_pause_stats {
+	u64 tx_pause_frames;
+	u64 rx_pause_frames;
+};
+
/** * struct ethtool_ops - optional netdev operations * @supported_coalesce_params: supported types of interrupt coalescing.
@@ -282,6 +303,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero.
+ * @get_pause_stats: Report pause frame statistics. Drivers must not zero
+ *	statistics which they don't report. The stats structure is initialized
+ *	to ETHTOOL_STAT_NOT_SET indicating the driver does not report statistics.
 * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero.
@@ -418,6 +442,8 @@ struct ethtool_ops { struct ethtool_ringparam *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); + void (*get_pause_stats)(struct net_device *dev, + struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5dcd24cb33ea..9cee6df01a10 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -91,9 +91,12 @@ enum { #define ETHTOOL_FLAG_COMPACT_BITSETS (1 << 0) /* provide optional reply for SET or ACT requests */ #define ETHTOOL_FLAG_OMIT_REPLY (1 << 1) +/* request statistics, if supported by the driver */ +#define ETHTOOL_FLAG_STATS (1 << 2) #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ - ETHTOOL_FLAG_OMIT_REPLY) + ETHTOOL_FLAG_OMIT_REPLY | \ + ETHTOOL_FLAG_STATS) enum { ETHTOOL_A_HEADER_UNSPEC, @@ -376,12 +379,25 @@ enum { ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ ETHTOOL_A_PAUSE_RX, /* u8 */ ETHTOOL_A_PAUSE_TX, /* u8 */ + ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ /* add new constants above here */ __ETHTOOL_A_PAUSE_CNT, ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) }; +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + /* add new constants above here */ + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + /* EEE */ enum { diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 7aea35d1e8a5..1980aa7eb2b6 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -10,6 +10,7 @@ struct pause_req_info { struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; + struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ @@ -22,8 +23,15 @@ pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT }, + [ETHTOOL_A_PAUSE_STATS] = { .type = NLA_REJECT }, }; +static void ethtool_stats_init(u64 *stats, unsigned int n) +{ + while (n--) + stats[n] = ETHTOOL_STAT_NOT_SET; +} + static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,10 +42,17 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_pause_stats) { + ethtool_stats_init((u64 *)&data->pausestat, + sizeof(data->pausestat) / 8); + dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); + } ethnl_ops_complete(dev); return 0; @@ -46,9 +61,50 @@ static int pause_prepare_data(const struct ethnl_req_info *req_base, static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ + int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ + + if (req_base->flags & ETHTOOL_FLAG_STATS) + n += nla_total_size(0) + /* _PAUSE_STATS */ + nla_total_size_64bit(sizeof(u64)) * + (ETHTOOL_A_PAUSE_STAT_MAX - 2); + return n; +} 
+
+static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype,
+			    u16 padtype)
+{
+	if (val == ETHTOOL_STAT_NOT_SET)
+		return 0;
+	if (nla_put_u64_64bit(skb, attrtype, val, padtype))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int pause_put_stats(struct sk_buff *skb,
+			   const struct ethtool_pause_stats *pause_stats)
+{
+	const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD;
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (ethtool_put_stat(skb, pause_stats->tx_pause_frames,
+			     ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) ||
+	    ethtool_put_stat(skb, pause_stats->rx_pause_frames,
+			     ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
}
static int pause_fill_reply(struct sk_buff *skb,
@@ -63,6 +119,10 @@ static int pause_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE;
+	if (req_base->flags & ETHTOOL_FLAG_STATS &&
+	    pause_put_stats(skb, &data->pausestat))
+		return -EMSGSIZE;
+
 return 0; }
@@ -89,6 +149,7 @@ pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = { [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+	[ETHTOOL_A_PAUSE_STATS]		= { .type = NLA_REJECT },
};
int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info)
-- cgit v1.2.3

From e2ce94dc1d89e0f76ddd202cea72e0f505083d0a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 15 Sep 2020 11:40:57 +0300 Subject: devlink: introduce the health reporter test command

Introduce a test command for health reporters. Users can use this command to trigger a test event on a reporter if the reporter supports it.
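A driver opting in would look roughly like this (the mydrv_ names and the event-injection helper are assumptions for the sketch):

/* Hypothetical reporter op invoked by DEVLINK_CMD_HEALTH_REPORTER_TEST */
static int mydrv_reporter_test(struct devlink_health_reporter *reporter,
			       struct netlink_ext_ack *extack)
{
	struct mydrv *dev = devlink_health_reporter_priv(reporter);

	return mydrv_inject_test_event(dev);
}

static const struct devlink_health_reporter_ops mydrv_reporter_ops = {
	.name = "fw",
	.test = mydrv_reporter_test,
	/* .recover / .dump / .diagnose as supported */
};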
Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+) (limited to 'include/uapi/linux')

diff --git a/include/net/devlink.h b/include/net/devlink.h index eaec0a8cc5ef..48b1c1ef1ebd 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h
@@ -566,6 +566,7 @@ enum devlink_health_reporter_state { * @dump: callback to dump an object * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status
+ * @test: callback to trigger a test event
 */
struct devlink_health_reporter_ops {
@@ -578,6 +579,8 @@ struct devlink_health_reporter_ops { int (*diagnose)(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack);
+	int (*test)(struct devlink_health_reporter *reporter,
+		    struct netlink_ext_ack *extack);
};
/**
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40d35145c879..631f5bdf1707 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h
@@ -122,6 +122,8 @@ enum devlink_command { DEVLINK_CMD_TRAP_POLICER_NEW, DEVLINK_CMD_TRAP_POLICER_DEL,
+	DEVLINK_CMD_HEALTH_REPORTER_TEST,
+
 /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c index 19037f114307..e5b71f3c2d4d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c
@@ -6096,6 +6096,28 @@ devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, return 0; }
+static int devlink_nl_cmd_health_reporter_test_doit(struct sk_buff *skb,
+						    struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	int err;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->test) {
+		devlink_health_reporter_put(reporter);
+		return -EOPNOTSUPP;
+	}
+
+	err = reporter->ops->test(reporter, info->extack);
+
+	devlink_health_reporter_put(reporter);
+	return err;
+}
+
struct devlink_stats { u64 rx_bytes; u64 rx_packets;
@@ -7316,6 +7338,14 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT | DEVLINK_NL_FLAG_NO_LOCK, },
+	{
+		.cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit = devlink_nl_cmd_health_reporter_test_doit,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 { .cmd = DEVLINK_CMD_FLASH_UPDATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
-- cgit v1.2.3

From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: bpf: Add BPF_PROG_BIND_MAP syscall

This syscall binds a map to a program. Returns success if the map is already bound to the program.
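From user space the new command is a thin wrapper around the bpf() syscall; a minimal sketch (assumes prog_fd and map_fd refer to an already loaded program and map):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_prog_bind_map_fd(int prog_fd, int map_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_bind_map.prog_fd = prog_fd;
	attr.prog_bind_map.map_fd = map_fd;
	/* flags must currently be zero */

	return syscall(__NR_bpf, BPF_PROG_BIND_MAP, &attr, sizeof(attr));
}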
Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com --- include/uapi/linux/bpf.h | 7 +++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 +++++ 3 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a67b8c6746be..2ce32cad5c8e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4161,6 +4161,66 @@ static int bpf_iter_create(union bpf_attr *attr) return err; } +#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags + +static int bpf_prog_bind_map(union bpf_attr *attr) +{ + struct bpf_prog *prog; + struct bpf_map *map; + struct bpf_map **used_maps_old, **used_maps_new; + int i, ret = 0; + + if (CHECK_ATTR(BPF_PROG_BIND_MAP)) + return -EINVAL; + + if (attr->prog_bind_map.flags) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_bind_map.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + map = bpf_map_get(attr->prog_bind_map.map_fd); + if (IS_ERR(map)) { + ret = PTR_ERR(map); + goto out_prog_put; + } + + mutex_lock(&prog->aux->used_maps_mutex); + + used_maps_old = prog->aux->used_maps; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (used_maps_old[i] == map) + goto out_unlock; + + used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1, + sizeof(used_maps_new[0]), + GFP_KERNEL); + if (!used_maps_new) { + ret = -ENOMEM; + goto out_unlock; + } + + memcpy(used_maps_new, used_maps_old, + sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); + used_maps_new[prog->aux->used_map_cnt] = map; + + prog->aux->used_map_cnt++; + prog->aux->used_maps = used_maps_new; + + kfree(used_maps_old); + +out_unlock: + mutex_unlock(&prog->aux->used_maps_mutex); + + if (ret) + bpf_map_put(map); +out_prog_put: + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4294,6 +4354,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_LINK_DETACH: err = link_detach(&attr); break; + case BPF_PROG_BIND_MAP: + err = bpf_prog_bind_map(&attr); + break; default: err = -EINVAL; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at 
providing documentation to eBPF
-- cgit v1.2.3

From 78a3ea5557137b0811f3c5a020afaafa7b61d6aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Sep 2020 10:51:32 -0700 Subject: net: remove comments on struct rtnl_link_stats

We removed the misleading comments from struct rtnl_link_stats64 when we added proper kdoc. struct rtnl_link_stats has the same inline comments, so remove them, too.

Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bf4667403cab..c4b23f06f69e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h
@@ -7,24 +7,23 @@
/* This struct should be in sync with struct rtnl_link_stats64 */
struct rtnl_link_stats {
-	__u32	rx_packets;		/* total packets received	*/
-	__u32	tx_packets;		/* total packets transmitted	*/
-	__u32	rx_bytes;		/* total bytes received 	*/
-	__u32	tx_bytes;		/* total bytes transmitted	*/
-	__u32	rx_errors;		/* bad packets received		*/
-	__u32	tx_errors;		/* packet transmit problems	*/
-	__u32	rx_dropped;		/* no space in linux buffers	*/
-	__u32	tx_dropped;		/* no space available in linux	*/
-	__u32	multicast;		/* multicast packets received	*/
+	__u32	rx_packets;
+	__u32	tx_packets;
+	__u32	rx_bytes;
+	__u32	tx_bytes;
+	__u32	rx_errors;
+	__u32	tx_errors;
+	__u32	rx_dropped;
+	__u32	tx_dropped;
+	__u32	multicast;
 __u32 collisions;
-	/* detailed rx_errors: */
 __u32 rx_length_errors;
-	__u32	rx_over_errors;		/* receiver ring buff overflow	*/
-	__u32	rx_crc_errors;		/* recved pkt with crc error	*/
-	__u32	rx_frame_errors;	/* recv'd frame alignment error	*/
-	__u32	rx_fifo_errors;		/* recv'r fifo overrun		*/
-	__u32	rx_missed_errors;	/* receiver missed packet	*/
+	__u32	rx_over_errors;
+	__u32	rx_crc_errors;
+	__u32	rx_frame_errors;
+	__u32	rx_fifo_errors;
+	__u32	rx_missed_errors;
 /* detailed tx_errors */ __u32 tx_aborted_errors;
@@ -37,7 +36,7 @@ struct rtnl_link_stats { __u32 rx_compressed; __u32 tx_compressed;
-	__u32	rx_nohandler;		/* dropped, no handler found	*/
+	__u32	rx_nohandler;
};
/**
-- cgit v1.2.3

From d65a977087f94f3bb97f351798d864556063109a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Tue, 8 Sep 2020 12:03:03 -0700 Subject: nl80211: advertise supported channel width in S1G

S1G supports 5 channel widths: 1, 2, 4, 8, and 16 MHz.

One channel width is allowed per frequency in each operating class, so it makes more sense to advertise the specific channel width allowed.
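For example, a driver populating an S1G channel that only permits 1 MHz operation might set the new flag like this (a sketch; the surrounding channel setup is assumed):

/* Hypothetical channel initialization in a driver */
static void mydrv_init_s1g_chan(struct ieee80211_channel *chan)
{
	chan->flags |= IEEE80211_CHAN_1MHZ;	/* only 1 MHz allowed here */
}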
+ * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted + * on this channel. * */ enum ieee80211_channel_flags { @@ -113,6 +123,11 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = 1<<11, IEEE80211_CHAN_NO_10MHZ = 1<<12, IEEE80211_CHAN_NO_HE = 1<<13, + IEEE80211_CHAN_1MHZ = 1<<14, + IEEE80211_CHAN_2MHZ = 1<<15, + IEEE80211_CHAN_4MHZ = 1<<16, + IEEE80211_CHAN_8MHZ = 1<<17, + IEEE80211_CHAN_16MHZ = 1<<18, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0584e0d349f0..4e119c6afa31 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3737,6 +3737,16 @@ enum nl80211_wmm_rule { * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz + * @NL80211_FREQUENCY_ATTR_1MHZ: 1 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_2MHZ: 2 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_4MHZ: 4 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_8MHZ: 8 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed + * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3768,6 +3778,11 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, NL80211_FREQUENCY_ATTR_OFFSET, + NL80211_FREQUENCY_ATTR_1MHZ, + NL80211_FREQUENCY_ATTR_2MHZ, + NL80211_FREQUENCY_ATTR_4MHZ, + NL80211_FREQUENCY_ATTR_8MHZ, + NL80211_FREQUENCY_ATTR_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 52a35e788547..7da4d84bcc1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1010,6 +1010,21 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_1MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_2MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, -- cgit v1.2.3 From 291c49ded2fda1fd0d7bd6056de99fe47d2332e6 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:05:29 +0000 Subject: nl80211: Add FILS discovery support FILS discovery attribute, NL80211_ATTR_FILS_DISCOVERY, is nested which supports following parameters as given in IEEE Std 802.11ai-2016, Annex C.3 MIB detail: (1) NL80211_FILS_DISCOVERY_ATTR_INT_MIN - Minimum packet interval (2) NL80211_FILS_DISCOVERY_ATTR_INT_MAX - Maximum packet 
interval (3) NL80211_FILS_DISCOVERY_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20200805011838.28166-2-alokad@codeaurora.org [fix attribute and other names, use NLA_RANGE(), use policy only once] Link: https://lore.kernel.org/r/010101747a7b38a8-306f06b2-9061-4baf-81c1-054a42a18e22-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 19 +++++++++++++++++ include/uapi/linux/nl80211.h | 44 +++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 44db9f80e495..c90700727945 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1082,6 +1082,23 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; +/** + * struct cfg80211_fils_discovery - FILS discovery parameters from + * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @min_interval: Minimum packet interval in TUs (0 - 10000) + * @max_interval: Maximum packet interval in TUs (0 - 10000) + * @tmpl_len: Template length + * @tmpl: Template data for FILS discovery frame including the action + * frame headers. + */ +struct cfg80211_fils_discovery { + u32 min_interval; + u32 max_interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1129,6 +1146,7 @@ enum cfg80211_ap_settings_flags { * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) + * @fils_discovery: FILS discovery transmission parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1159,6 +1177,7 @@ struct cfg80211_ap_settings { u32 flags; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; + struct cfg80211_fils_discovery fils_discovery; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e119c6afa31..ad2bea3b07e3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2513,6 +2513,10 @@ enum nl80211_commands { * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from * association request when used with NL80211_CMD_NEW_STATION). * + * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS + * discovery. It is a nested attribute, see + * &enum nl80211_fils_discovery_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2995,6 +2999,8 @@ enum nl80211_attrs { NL80211_ATTR_HE_6GHZ_CAPABILITY, + NL80211_ATTR_FILS_DISCOVERY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5867,6 +5873,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication * in AP mode (SAE password is passed as part of the start AP command). * + * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery + * frames transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -5925,6 +5934,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, + NL80211_EXT_FEATURE_FILS_DISCOVERY, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7019,4 +7029,38 @@ enum nl80211_iftype_akm_attributes { NL80211_IFTYPE_AKM_ATTR_MAX = __NL80211_IFTYPE_AKM_ATTR_LAST - 1, }; +/** + * enum nl80211_fils_discovery_attributes - FILS discovery configuration + * from IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @__NL80211_FILS_DISCOVERY_ATTR_INVALID: Invalid + * + * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action + * frame including the headers. + * + * @__NL80211_FILS_DISCOVERY_ATTR_LAST: Internal + * @NL80211_FILS_DISCOVERY_ATTR_MAX: highest attribute + */ +enum nl80211_fils_discovery_attributes { + __NL80211_FILS_DISCOVERY_ATTR_INVALID, + + NL80211_FILS_DISCOVERY_ATTR_INT_MIN, + NL80211_FILS_DISCOVERY_ATTR_INT_MAX, + NL80211_FILS_DISCOVERY_ATTR_TMPL, + + /* keep last */ + __NL80211_FILS_DISCOVERY_ATTR_LAST, + NL80211_FILS_DISCOVERY_ATTR_MAX = __NL80211_FILS_DISCOVERY_ATTR_LAST - 1 +}; + +/* + * FILS discovery template minimum length with action frame headers and + * mandatory fields. + */ +#define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5d9d51cfc653..afe782887ca9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -376,6 +376,15 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_NESTED(nl80211_txattr_policy), }; +static const struct nla_policy +nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { + [NL80211_FILS_DISCOVERY_ATTR_INT_MIN] = NLA_POLICY_MAX(NLA_U32, 10000), + [NL80211_FILS_DISCOVERY_ATTR_INT_MAX] = NLA_POLICY_MAX(NLA_U32, 10000), + NLA_POLICY_RANGE(NLA_BINARY, + NL80211_FILS_DISCOVERY_TMPL_MIN_LEN, + IEEE80211_MAX_DATA_LEN), +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -684,6 +693,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, [NL80211_ATTR_HE_6GHZ_CAPABILITY] = NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)), + [NL80211_ATTR_FILS_DISCOVERY] = + NLA_POLICY_NESTED(nl80211_fils_discovery_policy), }; /* policy for the key attributes */ @@ -4874,6 +4885,36 @@ static int nl80211_parse_he_bss_color(struct nlattr *attrs, return 0; } +static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_FILS_DISCOVERY_ATTR_MAX + 1]; + int ret; + struct cfg80211_fils_discovery *fd = &params->fils_discovery; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_DISCOVERY)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_FILS_DISCOVERY_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN] || + !tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX] || + !tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]) + return -EINVAL; + + fd->tmpl_len = nla_len(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->tmpl = nla_data(tb[NL80211_FILS_DISCOVERY_ATTR_TMPL]); + fd->min_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MIN]); + fd->max_interval = nla_get_u32(tb[NL80211_FILS_DISCOVERY_ATTR_INT_MAX]); + + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -5182,6 +5223,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) { + err = nl80211_parse_fils_discovery(rdev, + info->attrs[NL80211_ATTR_FILS_DISCOVERY], + &params); + if (err) + goto out; + } + nl80211_calculate_ap_params(&params); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From 7443dcd1f1718a355e9c4ebeb7e95c3f9f27bb5f Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:33:00 +0000 Subject: nl80211: Unsolicited broadcast probe response support This patch adds new attributes to support unsolicited broadcast probe response transmission used for in-band discovery in the 6GHz band (IEEE P802.11ax/D6.0 26.17.2.3.2, AP behavior for fast passive scanning). The new attribute, NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, is a nested attribute that supports the following parameters: (1) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT - Packet interval (2) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/010101747a946698-aac263ae-2ed3-4dab-9590-0bc7131214e1-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 +++++++++++++++++ include/uapi/linux/nl80211.h | 36 ++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c90700727945..93d666a571da 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1099,6 +1099,22 @@ struct cfg80211_fils_discovery { const u8 *tmpl; }; +/** + * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe + * response parameters in 6GHz. + * + * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned + * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive + * scanning + * @tmpl_len: Template length + * @tmpl: Template data for probe response + */ +struct cfg80211_unsol_bcast_probe_resp { + u32 interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1147,6 +1163,7 @@ enum cfg80211_ap_settings_flags { * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1178,6 +1195,7 @@ struct cfg80211_ap_settings { struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct cfg80211_fils_discovery fils_discovery; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad2bea3b07e3..bdc90b8dfd24 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2517,6 +2517,10 @@ enum nl80211_commands { * discovery. It is a nested attribute, see * &enum nl80211_fils_discovery_attributes.
* + * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure + * unsolicited broadcast probe response. It is a nested attribute, see + * &enum nl80211_unsol_bcast_probe_resp_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3001,6 +3005,8 @@ enum nl80211_attrs { NL80211_ATTR_FILS_DISCOVERY, + NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5876,6 +5882,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery * frames transmission * + * @NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP: Driver/device supports + * unsolicited broadcast probe response transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5935,6 +5944,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, NL80211_EXT_FEATURE_FILS_DISCOVERY, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7063,4 +7073,30 @@ enum nl80211_fils_discovery_attributes { */ #define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 +/** + * enum nl80211_unsol_bcast_probe_resp_attributes - Unsolicited broadcast probe + * response configuration. Applicable only in 6GHz. + * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID: Invalid + * + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU). + * Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0 + * 26.17.2.3.2 (AP behavior for fast passive scanning). + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response + * frame template (binary). 
+ * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST: Internal + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX: highest attribute + */ +enum nl80211_unsol_bcast_probe_resp_attributes { + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID, + + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL, + + /* keep last */ + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX = + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST - 1 +}; #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index afe782887ca9..1a212db7a300 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -385,6 +385,13 @@ nl80211_fils_discovery_policy[NL80211_FILS_DISCOVERY_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_unsol_bcast_probe_resp_policy[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1] = { + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] = NLA_POLICY_MAX(NLA_U32, 20), + [NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN } +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -695,6 +702,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_EXACT_LEN(sizeof(struct ieee80211_he_6ghz_capa)), [NL80211_ATTR_FILS_DISCOVERY] = NLA_POLICY_NESTED(nl80211_fils_discovery_policy), + [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] = + NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy), }; /* policy for the key attributes */ @@ -4915,6 +4924,35 @@ static int nl80211_parse_fils_discovery(struct cfg80211_registered_device *rdev, return 0; } +static int +nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_ap_settings *params) +{ + struct nlattr *tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX + 1]; + int ret; + struct cfg80211_unsol_bcast_probe_resp *presp = + &params->unsol_bcast_probe_resp; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP)) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX, + attrs, NULL, NULL); + if (ret) + return ret; + + if (!tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT] || + !tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]) + return -EINVAL; + + presp->tmpl = nla_data(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->tmpl_len = nla_len(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL]); + presp->interval = nla_get_u32(tb[NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT]); + return 0; +} + static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, const u8 *rates) { @@ -5231,6 +5269,14 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + &params); + if (err) + return err; + } + nl80211_calculate_ap_params(&params); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) -- cgit v1.2.3 From c6ff213fe5b8696c9539a1b34ff03de9306dfff9 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 8 Sep 2020 18:01:48 +0200 Subject: fuse: add submount support to <uapi> - Add fuse_attr.flags - Add FUSE_ATTR_SUBMOUNT This is a flag for fuse_attr.flags that indicates that the given entry resides on a different filesystem than the parent, and as such should have a different st_dev.
- Add FUSE_SUBMOUNTS The client sets this flag if it supports automounting directories. Signed-off-by: Max Reitz Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 8899e4862309..7233502ea991 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -172,6 +172,9 @@ * - add FUSE_WRITE_KILL_PRIV flag * - add FUSE_SETUPMAPPING and FUSE_REMOVEMAPPING * - add map_alignment to fuse_init_out, add FUSE_MAP_ALIGNMENT flag + * + * 7.32 + * - add flags to fuse_attr, add FUSE_ATTR_SUBMOUNT, add FUSE_SUBMOUNTS */ #ifndef _LINUX_FUSE_H @@ -207,7 +210,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 31 +#define FUSE_KERNEL_MINOR_VERSION 32 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -231,7 +234,7 @@ struct fuse_attr { uint32_t gid; uint32_t rdev; uint32_t blksize; - uint32_t padding; + uint32_t flags; }; struct fuse_kstatfs { @@ -316,6 +319,7 @@ struct fuse_file_lock { * FUSE_MAP_ALIGNMENT: init_out.map_alignment contains log2(byte alignment) for * foffset and moffset fields in struct * fuse_setupmapping_out and fuse_removemapping_one. + * FUSE_SUBMOUNTS: kernel supports auto-mounting directory submounts */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -344,6 +348,7 @@ struct fuse_file_lock { #define FUSE_NO_OPENDIR_SUPPORT (1 << 24) #define FUSE_EXPLICIT_INVAL_DATA (1 << 25) #define FUSE_MAP_ALIGNMENT (1 << 26) +#define FUSE_SUBMOUNTS (1 << 27) /** * CUSE INIT request/reply flags @@ -419,6 +424,13 @@ struct fuse_file_lock { */ #define FUSE_FSYNC_FDATASYNC (1 << 0) +/** + * fuse_attr flags + * + * FUSE_ATTR_SUBMOUNT: Object is a submount root + */ +#define FUSE_ATTR_SUBMOUNT (1 << 0) + enum fuse_opcode { FUSE_LOOKUP = 1, FUSE_FORGET = 2, /* no reply */ -- cgit v1.2.3 From f92970c694b36a4dbac2b650b173c78c0f0954cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 17 Sep 2020 18:13:23 -0700 Subject: devlink: add timeout information to status_notify Add a timeout element to the DEVLINK_CMD_FLASH_UPDATE_STATUS netlink message for use by a userland utility to show that a particular firmware flash activity may take a long but bounded time to finish. Also add a handy helper for drivers to make use of the new timeout value. UI usage hints: - if non-zero, add timeout display to the end of the status line [component] status_msg ( Xm Ys : Am Bs ) using the timeout value for Am Bs and updating the Xm Ys every second - if the timeout expires while awaiting the next update, display something like [component] status_msg ( timeout reached : Am Bs ) - if new status notify messages are received, remove the timeout and start over Signed-off-by: Shannon Nelson Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. 
Miller --- include/net/devlink.h | 4 ++++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 29 +++++++++++++++++++++++------ 3 files changed, 30 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 48b1c1ef1ebd..be132c17fbcc 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1403,6 +1403,10 @@ void devlink_flash_update_status_notify(struct devlink *devlink, const char *component, unsigned long done, unsigned long total); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout); int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 631f5bdf1707..a2ecc8b00611 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -462,6 +462,9 @@ enum devlink_attr { DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ + + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index e5b71f3c2d4d..a32e15851119 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3024,7 +3024,9 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, enum devlink_command cmd, const char *status_msg, const char *component, - unsigned long done, unsigned long total) + unsigned long done, + unsigned long total, + unsigned long timeout) { void *hdr; @@ -3052,6 +3054,9 @@ static int devlink_nl_flash_update_fill(struct sk_buff *msg, if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL, total, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, + timeout, DEVLINK_ATTR_PAD)) + goto nla_put_failure; out: genlmsg_end(msg, hdr); @@ -3067,7 +3072,8 @@ static void __devlink_flash_update_notify(struct devlink *devlink, const char *status_msg, const char *component, unsigned long done, - unsigned long total) + unsigned long total, + unsigned long timeout) { struct sk_buff *msg; int err; @@ -3081,7 +3087,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, return; err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg, - component, done, total); + component, done, total, timeout); if (err) goto out_free_msg; @@ -3097,7 +3103,7 @@ void devlink_flash_update_begin_notify(struct devlink *devlink) { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, - NULL, NULL, 0, 0); + NULL, NULL, 0, 0, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify); @@ -3105,7 +3111,7 @@ void devlink_flash_update_end_notify(struct devlink *devlink) { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, - NULL, NULL, 0, 0); + NULL, NULL, 0, 0, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify); @@ -3117,10 +3123,21 @@ void devlink_flash_update_status_notify(struct devlink *devlink, { __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS, - status_msg, component, done, total); + status_msg, component, done, total, 0); } EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout) +{ + __devlink_flash_update_notify(devlink, + DEVLINK_CMD_FLASH_UPDATE_STATUS, + status_msg, component, 
0, 0, timeout); +} +EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { -- cgit v1.2.3 From daef1ee3798b25e8464b8eb618eaa74b8f423ac7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:27 +0700 Subject: tipc: introduce encryption master key In addition to the supported cluster & per-node encryption keys for the en/decryption of TIPC messages, we now introduce an option for the user to set a cluster key as a 'master key', which is simply a symmetric key like the former but with a longer life cycle. It has two purposes: - Authentication of new member nodes in the cluster. New nodes, having no knowledge of the current session keys in the cluster, will still be able to join the cluster as long as they know the master key. This is because all neighbor discovery (LINK_CONFIG) messages must be encrypted with this key. - Encryption of session encryption keys during automatic exchange and update of those. This is a feature we will introduce in a later commit in this series. We insert the new key into the currently unused slot 0 in the key array and start using it immediately once the user has set it. After joining, a node knowing only the master key should be fully able to communicate with existing nodes in the cluster, although those nodes may have their own session keys activated (i.e. not the master one). To support this, we define a 'grace period', starting from the time a node itself reports having no RX keys, so that the existing nodes will use the master key for encryption instead. The grace period can be extended but will automatically stop after e.g. 5 seconds without a new report. This is also the basis for the later key exchange feature, as the new node would be unable to decrypt anything without support from the master key. For the user to set a master key, we define a new netlink flag - 'TIPC_NLA_NODE_KEY_MASTER' - so it can be added to the current 'set key' netlink command to specify that the key being set is a master key. Above all, the traditional cluster/per-node key mechanism is guaranteed to work when the user chooses not to use this master key option. This is also compatible with legacy nodes that do not support the feature. Even this master key can be updated without any interruption of cluster connectivity, but if this is needed, it has to be coordinated and set by the user. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/crypto.c | 210 ++++++++++++++++++++++++++++---------- net/tipc/crypto.h | 15 ++- net/tipc/msg.h | 4 +- net/tipc/netlink.c | 1 + net/tipc/node.c | 6 +- 6 files changed, 175 insertions(+), 62 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index dc0d23a50e69..d484baa9d365 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -165,6 +165,7 @@ enum { TIPC_NLA_NODE_UP, /* flag */ TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ + TIPC_NLA_NODE_KEY_MASTER, /* flag */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 45a8f4d9d9de..2510b82d3cc1 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -38,6 +38,7 @@ #include #include "crypto.h" +#define TIPC_TX_GRACE_PERIOD msecs_to_jiffies(5000) /* 5s */ #define TIPC_TX_LASTING_TIME msecs_to_jiffies(10000) /* 10s */ #define TIPC_RX_ACTIVE_LIM msecs_to_jiffies(3000) /* 3s */ #define TIPC_RX_PASSIVE_LIM msecs_to_jiffies(15000) /* 15s */ @@ -49,9 +50,9 @@ * TIPC Key ids */ enum { - KEY_UNUSED = 0, - KEY_MIN, - KEY_1 = KEY_MIN, + KEY_MASTER = 0, + KEY_MIN = KEY_MASTER, + KEY_1 = 1, KEY_2, KEY_3, KEY_MAX = KEY_3, @@ -166,27 +167,36 @@ struct tipc_crypto_stats { * @aead: array of pointers to AEAD keys for encryption/decryption * @peer_rx_active: replicated peer RX active key index * @key: the key states - * @working: the crypto is working or not * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) * @timer1: general timer 1 (jiffies) * @timer2: general timer 2 (jiffies) + * @working: the crypto is working or not + * @key_master: flag indicates if master key exists + * @legacy_user: flag indicates if a peer joins w/o master key (for bwd comp.) 
* @lock: tipc_key lock */ struct tipc_crypto { struct net *net; struct tipc_node *node; - struct tipc_aead __rcu *aead[KEY_MAX + 1]; /* key[0] is UNUSED */ + struct tipc_aead __rcu *aead[KEY_MAX + 1]; atomic_t peer_rx_active; struct tipc_key key; - u8 working:1; struct tipc_crypto_stats __percpu *stats; char name[48]; atomic64_t sndnxt ____cacheline_aligned; unsigned long timer1; unsigned long timer2; + union { + struct { + u8 working:1; + u8 key_master:1; + u8 legacy_user:1; + }; + u8 flags; + }; spinlock_t lock; /* crypto lock */ } ____cacheline_aligned; @@ -236,13 +246,19 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, u8 new_active, u8 new_pending); static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos); + struct tipc_aead *aead, u8 pos, + bool master_key); static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending); static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb); + struct sk_buff *skb, + u8 tx_key); static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb); static int tipc_crypto_key_revoke(struct net *net, u8 tx_key); +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type); static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, struct tipc_bearer *b, struct sk_buff **skb, int err); @@ -943,8 +959,6 @@ bool tipc_ehdr_validate(struct sk_buff *skb) return false; if (unlikely(skb->len <= ehsz + TIPC_AES_GCM_TAG_SIZE)) return false; - if (unlikely(!ehdr->tx_key)) - return false; return true; } @@ -997,6 +1011,8 @@ static int tipc_ehdr_build(struct net *net, struct tipc_aead *aead, ehdr->tx_key = tx_key; ehdr->destined = (__rx) ? 1 : 0; ehdr->rx_key_active = (__rx) ? __rx->key.active : 0; + ehdr->rx_nokey = (__rx) ? !__rx->key.keys : 0; + ehdr->master_key = aead->crypto->key_master; ehdr->reserved_1 = 0; ehdr->reserved_2 = 0; @@ -1039,6 +1055,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * @c: TIPC crypto to which new key is attached * @ukey: the user key * @mode: the key mode (CLUSTER_KEY or PER_NODE_KEY) + * @master_key: specify this is a cluster master key * * A new TIPC AEAD key will be allocated and initiated with the specified user * key, then attached to the TIPC crypto. @@ -1046,7 +1063,7 @@ static inline void tipc_crypto_key_set_state(struct tipc_crypto *c, * Return: new key id in case of success, otherwise: < 0 */ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode) + u8 mode, bool master_key) { struct tipc_aead *aead = NULL; int rc = 0; @@ -1056,7 +1073,7 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, /* Attach it to the crypto */ if (likely(!rc)) { - rc = tipc_crypto_key_attach(c, aead, 0); + rc = tipc_crypto_key_attach(c, aead, 0, master_key); if (rc < 0) tipc_aead_free(&aead->rcu); } @@ -1069,11 +1086,13 @@ int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, * @c: TIPC crypto to which the new AEAD key is attached * @aead: the new AEAD key pointer * @pos: desired slot in the crypto key array, = 0 if any! 
+ * @master_key: specify this is a cluster master key * * Return: new key id in case of success, otherwise: -EBUSY */ static int tipc_crypto_key_attach(struct tipc_crypto *c, - struct tipc_aead *aead, u8 pos) + struct tipc_aead *aead, u8 pos, + bool master_key) { struct tipc_key key; int rc = -EBUSY; @@ -1081,6 +1100,10 @@ static int tipc_crypto_key_attach(struct tipc_crypto *c, spin_lock_bh(&c->lock); key = c->key; + if (master_key) { + new_key = KEY_MASTER; + goto attach; + } if (key.active && key.passive) goto exit; if (key.pending) { @@ -1112,8 +1135,7 @@ attach: tipc_crypto_key_set_state(c, key.passive, key.active, key.pending); c->working = 1; - c->timer1 = jiffies; - c->timer2 = jiffies; + c->key_master |= master_key; rc = new_key; exit: @@ -1126,7 +1148,7 @@ void tipc_crypto_key_flush(struct tipc_crypto *c) int k; spin_lock_bh(&c->lock); - c->working = 0; + c->flags = 0; tipc_crypto_key_set_state(c, 0, 0, 0); for (k = KEY_MIN; k <= KEY_MAX; k++) tipc_crypto_key_detach(c->aead[k], &c->lock); @@ -1202,6 +1224,7 @@ exit: * @tx: TX crypto handle * @rx: RX crypto handle (can be NULL) * @skb: the message skb which will be decrypted later + * @tx_key: peer TX key id * * This function looks up the existing TX keys and pick one which is suitable * for the message decryption, that must be a cluster key and not used before @@ -1211,7 +1234,8 @@ exit: */ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, struct tipc_crypto *rx, - struct sk_buff *skb) + struct sk_buff *skb, + u8 tx_key) { struct tipc_skb_cb *skb_cb = TIPC_SKB_CB(skb); struct tipc_aead *aead = NULL; @@ -1230,6 +1254,10 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, /* Pick one TX key */ spin_lock(&tx->lock); + if (tx_key == KEY_MASTER) { + aead = tipc_aead_rcu_ptr(tx->aead[KEY_MASTER], &tx->lock); + goto done; + } do { k = (i == 0) ? key.pending : ((i == 1) ? key.active : key.passive); @@ -1249,9 +1277,12 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, skb->next = skb_clone(skb, GFP_ATOMIC); if (unlikely(!skb->next)) pr_warn("Failed to clone skb for next round if any\n"); - WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); break; } while (++i < 3); + +done: + if (likely(aead)) + WARN_ON(!refcount_inc_not_zero(&aead->refcnt)); spin_unlock(&tx->lock); return aead; @@ -1266,6 +1297,9 @@ static struct tipc_aead *tipc_crypto_key_pick_tx(struct tipc_crypto *tx, * has changed, so the number of TX keys' users on this node are increased and * decreased correspondingly. * + * It also considers if peer has no key, then we need to make own master key + * (if any) taking over i.e. starting grace period. + * * The "per-peer" sndnxt is also reset when the peer key has switched. */ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) @@ -1276,11 +1310,23 @@ static void tipc_crypto_key_synch(struct tipc_crypto *rx, struct sk_buff *skb) u32 self = tipc_own_addr(rx->net); u8 cur, new; - /* Ensure this message is destined to us first */ + /* Update RX 'key_master' flag according to peer, also mark "legacy" if + * a peer has no master key. 
+ */ + rx->key_master = ehdr->master_key; + if (!rx->key_master) + tx->legacy_user = 1; + + /* For later cases, apply only if message is destined to this node */ if (!ehdr->destined || msg_short(hdr) || msg_destnode(hdr) != self) return; - /* Peer RX active key has changed, let's update own TX users */ + /* Case 1: Peer has no keys, let's make master key take over */ + if (ehdr->rx_nokey) + /* Set or extend grace period */ + tx->timer2 = jiffies; + + /* Case 2: Peer RX active key has changed, let's update own TX users */ cur = atomic_read(&rx->peer_rx_active); new = ehdr->rx_key_active; if (tx->key.keys && @@ -1338,7 +1384,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, return -ENOMEM; } - c->working = 0; + c->flags = 0; c->net = net; c->node = node; tipc_crypto_key_set_state(c, 0, 0, 0); @@ -1473,6 +1519,12 @@ s4: s5: spin_unlock(&rx->lock); + /* Relax it here, the flag will be set again if it really is, but only + * when we are not in grace period for safety! + */ + if (time_after(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) + tx->legacy_user = 0; + /* Limit max_tfms & do debug commands if needed */ if (likely(sysctl_tipc_max_tfms <= TIPC_MAX_TFMS_LIM)) return; @@ -1482,6 +1534,22 @@ s5: tipc_crypto_do_cmd(rx->net, cmd); } +static inline void tipc_crypto_clone_msg(struct net *net, struct sk_buff *_skb, + struct tipc_bearer *b, + struct tipc_media_addr *dst, + struct tipc_node *__dnode, u8 type) +{ + struct sk_buff *skb; + + skb = skb_clone(_skb, GFP_ATOMIC); + if (skb) { + TIPC_SKB_CB(skb)->xmit_type = type; + tipc_crypto_xmit(net, &skb, b, dst, __dnode); + if (skb) + b->media->send_msg(net, skb, b, dst); + } +} + /** * tipc_crypto_xmit - Build & encrypt TIPC message for xmit * @net: struct net @@ -1491,7 +1559,8 @@ s5: * @__dnode: destination node for reference if any * * First, build an encryption message header on the top of the message, then - * encrypt the original TIPC message by using the active or pending TX key. + * encrypt the original TIPC message by using the pending, master or active + * key with this preference order. * If the encryption is successful, the encrypted skb is returned directly or * via the callback. * Otherwise, the skb is freed! @@ -1514,46 +1583,63 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, struct tipc_msg *hdr = buf_msg(*skb); struct tipc_key key = tx->key; struct tipc_aead *aead = NULL; - struct sk_buff *_skb; - int rc = -ENOKEY; u32 user = msg_user(hdr); - u8 tx_key; + u32 type = msg_type(hdr); + int rc = -ENOKEY; + u8 tx_key = 0; /* No encryption? */ if (!tx->working) return 0; - /* Try with the pending key if available and: - * 1) This is the only choice (i.e. 
no active key) or; - * 2) Peer has switched to this key (unicast only) or; - * 3) It is time to do a pending key probe; - */ + /* Pending key if peer has active on it or probing time */ if (unlikely(key.pending)) { tx_key = key.pending; - if (!key.active) + if (!tx->key_master && !key.active) goto encrypt; if (__rx && atomic_read(&__rx->peer_rx_active) == tx_key) goto encrypt; - if (TIPC_SKB_CB(*skb)->probe) { + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_PROBING) { pr_debug("%s: probing for key[%d]\n", tx->name, key.pending); goto encrypt; } - if (user == LINK_CONFIG || user == LINK_PROTOCOL) { - _skb = skb_clone(*skb, GFP_ATOMIC); - if (_skb) { - TIPC_SKB_CB(_skb)->probe = 1; - tipc_crypto_xmit(net, &_skb, b, dst, __dnode); - if (_skb) - b->media->send_msg(net, _skb, b, dst); + if (user == LINK_CONFIG || user == LINK_PROTOCOL) + tipc_crypto_clone_msg(net, *skb, b, dst, __dnode, + SKB_PROBING); + } + + /* Master key if this is a *vital* message or in grace period */ + if (tx->key_master) { + tx_key = KEY_MASTER; + if (!key.active) + goto encrypt; + if (TIPC_SKB_CB(*skb)->xmit_type == SKB_GRACING) { + pr_debug("%s: gracing for msg (%d %d)\n", tx->name, + user, type); + goto encrypt; + } + if (user == LINK_CONFIG || + (user == LINK_PROTOCOL && type == RESET_MSG) || + time_before(jiffies, tx->timer2 + TIPC_TX_GRACE_PERIOD)) { + if (__rx && __rx->key_master && + !atomic_read(&__rx->peer_rx_active)) + goto encrypt; + if (!__rx) { + if (likely(!tx->legacy_user)) + goto encrypt; + tipc_crypto_clone_msg(net, *skb, b, dst, + __dnode, SKB_GRACING); } } } + /* Else, use the active key if any */ if (likely(key.active)) { tx_key = key.active; goto encrypt; } + goto exit; encrypt: @@ -1619,15 +1705,16 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct tipc_aead *aead = NULL; struct tipc_key key; int rc = -ENOKEY; - u8 tx_key = 0; + u8 tx_key; + + tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* New peer? * Let's try with TX key (i.e. cluster mode) & verify the skb first! */ - if (unlikely(!rx)) + if (unlikely(!rx || tx_key == KEY_MASTER)) goto pick_tx; - tx_key = ((struct tipc_ehdr *)(*skb)->data)->tx_key; /* Pick RX key according to TX key if any */ key = rx->key; if (tx_key == key.active || tx_key == key.pending || @@ -1640,7 +1727,7 @@ int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, pick_tx: /* No key suitable? Try to pick one from TX... */ - aead = tipc_crypto_key_pick_tx(tx, rx, *skb); + aead = tipc_crypto_key_pick_tx(tx, rx, *skb, tx_key); if (aead) goto decrypt; goto exit; @@ -1722,9 +1809,12 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, goto free_skb; } + /* Ignore cloning if it was TX master key */ + if (ehdr->tx_key == KEY_MASTER) + goto rcv; if (tipc_aead_clone(&tmp, aead) < 0) goto rcv; - if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key) < 0) { + if (tipc_crypto_key_attach(rx, tmp, ehdr->tx_key, false) < 0) { tipc_aead_free(&tmp->rcu); goto rcv; } @@ -1740,10 +1830,10 @@ static void tipc_crypto_rcv_complete(struct net *net, struct tipc_aead *aead, /* Set the RX key's user */ tipc_aead_users_set(aead, 1); -rcv: /* Mark this point, RX works */ rx->timer1 = jiffies; +rcv: /* Remove ehdr & auth. 
tag prior to tipc_rcv() */ ehdr = (struct tipc_ehdr *)(*skb)->data; @@ -1865,14 +1955,24 @@ static char *tipc_crypto_key_dump(struct tipc_crypto *c, char *buf) char *s; for (k = KEY_MIN; k <= KEY_MAX; k++) { - if (k == key.passive) - s = "PAS"; - else if (k == key.active) - s = "ACT"; - else if (k == key.pending) - s = "PEN"; - else - s = "-"; + if (k == KEY_MASTER) { + if (is_rx(c)) + continue; + if (time_before(jiffies, + c->timer2 + TIPC_TX_GRACE_PERIOD)) + s = "ACT"; + else + s = "PAS"; + } else { + if (k == key.passive) + s = "PAS"; + else if (k == key.active) + s = "ACT"; + else if (k == key.pending) + s = "PEN"; + else + s = "-"; + } i += scnprintf(buf + i, 200 - i, "\tKey%d: %s", k, s); rcu_read_lock(); @@ -1905,7 +2005,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, /* Output format: "[%s %s %s] -> [%s %s %s]", max len = 32 */ again: i += scnprintf(buf + i, 32 - i, "["); - for (k = KEY_MIN; k <= KEY_MAX; k++) { + for (k = KEY_1; k <= KEY_3; k++) { if (k == key->passive) s = "pas"; else if (k == key->active) @@ -1915,7 +2015,7 @@ again: else s = "-"; i += scnprintf(buf + i, 32 - i, - (k != KEY_MAX) ? "%s " : "%s", s); + (k != KEY_3) ? "%s " : "%s", s); } if (key != &new) { i += scnprintf(buf + i, 32 - i, "] -> "); diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index c387240e03d0..643b55077112 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -74,7 +74,7 @@ extern int sysctl_tipc_max_tfms __read_mostly; * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 * 1 0 9 8 7 6 5 4|3 2 1 0 9 8 7 6|5 4 3 2 1 0 9 8|7 6 5 4 3 2 1 0 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * w0:|Ver=7| User |D|TX |RX |K| Rsvd | + * w0:|Ver=7| User |D|TX |RX |K|M|N| Rsvd | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * w1:| Seqno | * w2:| (8 octets) | @@ -101,6 +101,9 @@ extern int sysctl_tipc_max_tfms __read_mostly; * RX : Currently RX active key corresponding to the destination * node's TX key (when the "D" bit is set) * K : Keep-alive bit (for RPS, LINK_PROTOCOL/STATE_MSG only) + * M : Bit indicates if sender has master key + * N : Bit indicates if sender has no RX keys corresponding to the + * receiver's TX (when the "D" bit is set) * Rsvd : Reserved bit, field * Word1-2: * Seqno : The 64-bit sequence number of the encrypted message, also @@ -117,7 +120,9 @@ struct tipc_ehdr { __u8 destined:1, user:4, version:3; - __u8 reserved_1:3, + __u8 reserved_1:1, + rx_nokey:1, + master_key:1, keepalive:1, rx_key_active:2, tx_key:2; @@ -128,7 +133,9 @@ struct tipc_ehdr { __u8 tx_key:2, rx_key_active:2, keepalive:1, - reserved_1:3; + master_key:1, + rx_nokey:1, + reserved_1:1; #else #error "Please fix " #endif @@ -158,7 +165,7 @@ int tipc_crypto_xmit(struct net *net, struct sk_buff **skb, int tipc_crypto_rcv(struct net *net, struct tipc_crypto *rx, struct sk_buff **skb, struct tipc_bearer *b); int tipc_crypto_key_init(struct tipc_crypto *c, struct tipc_aead_key *ukey, - u8 mode); + u8 mode, bool master_key); void tipc_crypto_key_flush(struct tipc_crypto *c); int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info); bool tipc_ehdr_validate(struct sk_buff *skb); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 1016e96db5c4..25e5c5c8a6ff 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -127,7 +127,9 @@ struct tipc_skb_cb { #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; - u8 probe:1; +#define SKB_PROBING 1 +#define SKB_GRACING 2 + u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; 
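To make the master-key grace period introduced above easier to follow, here is a minimal standalone sketch of the logic implemented by tipc_crypto_key_synch() and tipc_crypto_xmit(). The struct and helper names are invented for illustration, and the jiffies arithmetic is simplified to plain millisecond timestamps:

#include <stdbool.h>

#define GRACE_PERIOD_MS 5000	/* mirrors TIPC_TX_GRACE_PERIOD */

struct tx_sketch {
	unsigned long timer2;	/* time of last "peer has no RX keys" report */
	bool key_master;	/* a master key is attached in slot 0 */
};

/* Called for a received header with rx_nokey set and destined to us */
static void peer_reports_nokey(struct tx_sketch *tx, unsigned long now_ms)
{
	tx->timer2 = now_ms;	/* set or extend the grace period */
}

/* Encrypt the next message with the master key instead of a session key? */
static bool in_grace_period(const struct tx_sketch *tx, unsigned long now_ms)
{
	return tx->key_master && (now_ms - tx->timer2) < GRACE_PERIOD_MS;
}

Each incoming "no RX keys" report restarts the window, so a joining node keeps receiving master-key traffic until it has acquired session keys and stops reporting.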
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index c4aee6247d55..1ec00fcc26ee 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -108,6 +108,7 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { .len = TIPC_NODEID_LEN}, [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, .len = TIPC_AEAD_KEY_SIZE_MAX}, + [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG }, }; /* Properties valid for media, bearer and link */ diff --git a/net/tipc/node.c b/net/tipc/node.c index 70045630e6bb..5da94d1dda77 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2875,6 +2875,7 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; + bool master_key = false; u8 *id, *own_id, mode; int rc = 0; @@ -2905,6 +2906,7 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) switch (rc) { case -ENODATA: mode = CLUSTER_KEY; + master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; @@ -2921,11 +2923,11 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) } /* Initiate the TX/RX key */ - rc = tipc_crypto_key_init(c, ukey, mode); + rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); - if (rc < 0) { + if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } -- cgit v1.2.3 From 23700da29b83e859a8c3727fddd33ba74c4f3a39 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:29 +0700 Subject: tipc: add automatic rekeying for encryption key Rekeying is required for security, since a key becomes less secure when used for a long time. Also, a key will be detached when its nonce value (or seqno ...) is exhausted. We now make the rekeying process automatic and configurable by the user. Basically, TIPC will at a specific interval generate a new key by using the kernel 'Random Number Generator' cipher, then attach it as the node TX key and securely distribute it to the others in the cluster as RX keys (the key exchange). The automatic key switching will then take over and make the new key active shortly. Afterwards, the traffic from this node will be encrypted with the new session key. The same can happen on peer nodes, but not necessarily at the same time. For simplicity, the automatically generated key will be initiated as a per-node key. It is not too hard to also support cluster key rekeying (e.g. a given node would generate a unique cluster key and update it to the others in the cluster...), but that doesn't bring much benefit, while a per-node key is even more secure. We also enable the user to force a rekeying or change the rekeying interval via netlink; the new 'set key' command option 'TIPC_NLA_NODE_REKEYING' is added for these purposes as follows: - A value >= 1 will be set as the rekeying interval (in minutes); - A value of 0 will disable the rekeying; - A value of 'TIPC_REKEYING_NOW' (~0) will force an immediate rekeying. The default rekeying interval is (60 * 24) minutes, i.e. rekeying is done every day. There isn't any restriction on the value, but the user shouldn't set it too small or too large, which would result in an "ineffective" rekeying (that's OK for testing, though). Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S.
Miller --- include/uapi/linux/tipc.h | 2 + include/uapi/linux/tipc_netlink.h | 1 + net/tipc/crypto.c | 113 +++++++++++++++++++++++++++++++++++++- net/tipc/crypto.h | 2 + net/tipc/netlink.c | 1 + net/tipc/node.c | 25 ++++++++- 6 files changed, 141 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index add01db1daef..80ea15e12113 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -254,6 +254,8 @@ static inline int tipc_aead_key_size(struct tipc_aead_key *key) return sizeof(*key) + key->keylen; } +#define TIPC_REKEYING_NOW (~0U) + /* The macros and functions below are deprecated: */ diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d484baa9d365..d847dd671d79 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -166,6 +166,7 @@ enum { TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ TIPC_NLA_NODE_KEY_MASTER, /* flag */ + TIPC_NLA_NODE_REKEYING, /* u32 */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 91d8b268cae0..40c44101fe8e 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -36,6 +36,7 @@ #include #include +#include #include "crypto.h" #include "msg.h" #include "bcast.h" @@ -48,6 +49,8 @@ #define TIPC_MAX_TFMS_DEF 10 #define TIPC_MAX_TFMS_LIM 1000 +#define TIPC_REKEYING_INTV_DEF (60 * 24) /* default: 1 day */ + /** * TIPC Key ids */ @@ -181,6 +184,7 @@ struct tipc_crypto_stats { * @wq: common workqueue on TX crypto * @work: delayed work sched for TX/RX * @key_distr: key distributing state + * @rekeying_intv: rekeying interval (in minutes) * @stats: the crypto statistics * @name: the crypto name * @sndnxt: the per-peer sndnxt (TX) @@ -206,6 +210,7 @@ struct tipc_crypto { #define KEY_DISTR_SCHED 1 #define KEY_DISTR_COMPL 2 atomic_t key_distr; + u32 rekeying_intv; struct tipc_crypto_stats __percpu *stats; char name[48]; @@ -294,7 +299,9 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, static int tipc_crypto_key_xmit(struct net *net, struct tipc_aead_key *skey, u16 gen, u8 mode, u32 dnode); static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr); +static void tipc_crypto_work_tx(struct work_struct *work); static void tipc_crypto_work_rx(struct work_struct *work); +static int tipc_aead_key_generate(struct tipc_aead_key *skey); #define is_tx(crypto) (!(crypto)->node) #define is_rx(crypto) (!is_tx(crypto)) @@ -346,6 +353,27 @@ int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info) return 0; } +/** + * tipc_aead_key_generate - Generate new session key + * @skey: input/output key with new content + * + * Return: 0 in case of success, otherwise < 0 + */ +static int tipc_aead_key_generate(struct tipc_aead_key *skey) +{ + int rc = 0; + + /* Fill the key's content with a random value via RNG cipher */ + rc = crypto_get_default_rng(); + if (likely(!rc)) { + rc = crypto_rng_get_bytes(crypto_default_rng, skey->key, + skey->keylen); + crypto_put_default_rng(); + } + + return rc; +} + static struct tipc_aead *tipc_aead_get(struct tipc_aead __rcu *aead) { struct tipc_aead *tmp; @@ -1471,6 +1499,7 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, atomic64_set(&c->sndnxt, 0); c->timer1 = jiffies; c->timer2 = jiffies; + c->rekeying_intv = TIPC_REKEYING_INTV_DEF; spin_lock_init(&c->lock); scnprintf(c->name, 48, "%s(%s)", (is_rx(c)) ? 
"RX" : "TX", (is_rx(c)) ? tipc_node_get_id_str(c->node) : @@ -1478,6 +1507,8 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, if (is_rx(c)) INIT_DELAYED_WORK(&c->work, tipc_crypto_work_rx); + else + INIT_DELAYED_WORK(&c->work, tipc_crypto_work_tx); *crypto = c; return 0; @@ -1492,8 +1523,11 @@ void tipc_crypto_stop(struct tipc_crypto **crypto) return; /* Flush any queued works & destroy wq */ - if (is_tx(c)) + if (is_tx(c)) { + c->rekeying_intv = 0; + cancel_delayed_work_sync(&c->work); destroy_workqueue(c->wq); + } /* Release AEAD keys */ rcu_read_lock(); @@ -2351,3 +2385,80 @@ static void tipc_crypto_work_rx(struct work_struct *work) tipc_node_put(rx->node); } + +/** + * tipc_crypto_rekeying_sched - (Re)schedule rekeying w/o new interval + * @tx: TX crypto + * @changed: if the rekeying needs to be rescheduled with new interval + * @new_intv: new rekeying interval (when "changed" = true) + */ +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv) +{ + unsigned long delay; + bool now = false; + + if (changed) { + if (new_intv == TIPC_REKEYING_NOW) + now = true; + else + tx->rekeying_intv = new_intv; + cancel_delayed_work_sync(&tx->work); + } + + if (tx->rekeying_intv || now) { + delay = (now) ? 0 : tx->rekeying_intv * 60 * 1000; + queue_delayed_work(tx->wq, &tx->work, msecs_to_jiffies(delay)); + } +} + +/** + * tipc_crypto_work_tx - Scheduled TX works handler + * @work: the struct TX work + * + * The function processes the previous scheduled work, i.e. key rekeying, by + * generating a new session key based on current one, then attaching it to the + * TX crypto and finally distributing it to peers. It also re-schedules the + * rekeying if needed. + */ +static void tipc_crypto_work_tx(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct tipc_crypto *tx = container_of(dwork, struct tipc_crypto, work); + struct tipc_aead_key *skey = NULL; + struct tipc_key key = tx->key; + struct tipc_aead *aead; + int rc = -ENOMEM; + + if (unlikely(key.pending)) + goto resched; + + /* Take current key as a template */ + rcu_read_lock(); + aead = rcu_dereference(tx->aead[key.active ?: KEY_MASTER]); + if (unlikely(!aead)) { + rcu_read_unlock(); + /* At least one key should exist for securing */ + return; + } + + /* Lets duplicate it first */ + skey = kmemdup(aead->key, tipc_aead_key_size(aead->key), GFP_ATOMIC); + rcu_read_unlock(); + + /* Now, generate new key, initiate & distribute it */ + if (likely(skey)) { + rc = tipc_aead_key_generate(skey) ?: + tipc_crypto_key_init(tx, skey, PER_NODE_KEY, false); + if (likely(rc > 0)) + rc = tipc_crypto_key_distr(tx, rc, NULL); + kzfree(skey); + } + + if (unlikely(rc)) + pr_warn_ratelimited("%s: rekeying returns %d\n", tx->name, rc); + +resched: + /* Re-schedule rekeying if any */ + tipc_crypto_rekeying_sched(tx, false, 0); +} diff --git a/net/tipc/crypto.h b/net/tipc/crypto.h index b2a9c9b90684..e71193bd5e36 100644 --- a/net/tipc/crypto.h +++ b/net/tipc/crypto.h @@ -171,6 +171,8 @@ void tipc_crypto_key_flush(struct tipc_crypto *c); int tipc_crypto_key_distr(struct tipc_crypto *tx, u8 key, struct tipc_node *dest); void tipc_crypto_msg_rcv(struct net *net, struct sk_buff *skb); +void tipc_crypto_rekeying_sched(struct tipc_crypto *tx, bool changed, + u32 new_intv); int tipc_aead_key_validate(struct tipc_aead_key *ukey, struct genl_info *info); bool tipc_ehdr_validate(struct sk_buff *skb); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 1ec00fcc26ee..c447cb5f879e 100644 
--- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -109,6 +109,7 @@ const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { [TIPC_NLA_NODE_KEY] = { .type = NLA_BINARY, .len = TIPC_AEAD_KEY_SIZE_MAX}, [TIPC_NLA_NODE_KEY_MASTER] = { .type = NLA_FLAG }, + [TIPC_NLA_NODE_REKEYING] = { .type = NLA_U32 }, }; /* Properties valid for media, bearer and link */ diff --git a/net/tipc/node.c b/net/tipc/node.c index c9b6042e32b5..cf4b239fc569 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -2879,6 +2879,17 @@ static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) return 0; } +static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) +{ + struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; + + if (!attr) + return -ENODATA; + + *intv = nla_get_u32(attr); + return 0; +} + static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; @@ -2886,8 +2897,9 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; - bool master_key = false; + bool rekeying = true, master_key = false; u8 *id, *own_id, mode; + u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) @@ -2905,8 +2917,14 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) return -EPERM; } + rc = tipc_nl_retrieve_rekeying(attrs, &intv); + if (rc == -ENODATA) + rekeying = false; + rc = tipc_nl_retrieve_key(attrs, &ukey); - if (rc) + if (rc == -ENODATA && rekeying) + goto rekeying; + else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); @@ -2945,6 +2963,9 @@ static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); +rekeying: + /* Schedule TX rekeying if needed */ + tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; -- cgit v1.2.3 From 55f13311785cebd60b9bab9ca7fd64205436c462 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 18 Sep 2020 14:14:51 -0500 Subject: ethtool: Add 100base-FX link mode entries Add entries for the 100base-FX full and half duplex supported modes. $ ethtool eth0 Supported ports: [ FIBRE ] Supported link modes: 100baseFX/Half 100baseFX/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: 100baseFX/Half 100baseFX/Full Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 100Mb/s Duplex: Full Auto-negotiation: off Port: MII PHYAD: 1 Transceiver: external Supports Wake-on: gs Wake-on: d SecureOn password: 00:00:00:00:00:00 Current message level: 0x00000000 (0) Link detected: yes Signed-off-by: Dan Murphy Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-core.c | 4 +++- include/uapi/linux/ethtool.h | 2 ++ net/ethtool/common.c | 2 ++ net/ethtool/linkmodes.c | 2 ++ 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index ff8e14b01eeb..de5b869139d7 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -8,7 +8,7 @@ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 90, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 92, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -160,6 +160,8 @@ static const struct phy_setting settings[] = { PHY_SETTING( 100, FULL, 100baseT_Full ), PHY_SETTING( 100, FULL, 100baseT1_Full ), PHY_SETTING( 100, HALF, 100baseT_Half ), + PHY_SETTING( 100, HALF, 100baseFX_Half ), + PHY_SETTING( 100, FULL, 100baseFX_Full ), /* 10M */ PHY_SETTING( 10, FULL, 10baseT_Full ), PHY_SETTING( 10, HALF, 10baseT_Half ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b4f2d134e713..9ca87bc73c44 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1617,6 +1617,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ed19573fccd7..24036e3055a1 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -192,6 +192,8 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_NAME(400000, DR4, Full), __DEFINE_LINK_MODE_NAME(400000, CR4, Full), + __DEFINE_LINK_MODE_NAME(100, FX, Half), + __DEFINE_LINK_MODE_NAME(100, FX, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 7044a2853886..29dcd675b65a 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -272,6 +272,8 @@ static const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full), }; static const struct nla_policy -- cgit v1.2.3 From c12fa88c6d16ed3865072d91154cff6fd1cd9cd4 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Thu, 10 Sep 2020 20:25:08 +0800 Subject: vfio: Fix typo of the device_state A typo fix ("_RUNNNG" => "_RUNNING") in comment block of the uapi header. Signed-off-by: Zenghui Yu Reviewed-by: Cornelia Huck Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..d4bd39e124bf 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -462,7 +462,7 @@ struct vfio_region_gfx_edid { * 5. Resumed * |--------->| * - * 0. Default state of VFIO device is _RUNNNG when the user application starts. + * 0. 
Default state of VFIO device is _RUNNING when the user application starts. * 1. During normal shutdown of the user application, the user application may * optionally change the VFIO device state from _RUNNING to _STOP. This * transition is optional. The vendor driver must support this transition but -- cgit v1.2.3 From 7d6e1329652ed971d1b6e0e7bea66fba5044e271 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Tue, 15 Sep 2020 15:05:18 -0400 Subject: vfio iommu: Add dma available capability Commit 492855939bdb ("vfio/type1: Limit DMA mappings per container") added the ability to limit the number of memory backed DMA mappings. However on s390x, when lazy mapping is in use, we use a very large number of concurrent mappings. Let's provide the current allowable number of DMA mappings to userspace via the IOMMU info chain so that userspace can take appropriate mitigation. Signed-off-by: Matthew Rosato Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 17 +++++++++++++++++ include/uapi/linux/vfio.h | 15 +++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5fbf0c1f7433..15e21dbffb16 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2609,6 +2609,20 @@ static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig)); } +static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, + struct vfio_info_cap *caps) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma_avail; + + cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL; + cap_dma_avail.header.version = 1; + + cap_dma_avail.avail = iommu->dma_avail; + + return vfio_info_add_capability(caps, &cap_dma_avail.header, + sizeof(cap_dma_avail)); +} + static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, unsigned long arg) { @@ -2641,6 +2655,9 @@ static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, ret = vfio_iommu_migration_build_caps(iommu, &caps); + if (!ret) + ret = vfio_iommu_dma_avail_build_caps(iommu, &caps); + if (!ret) ret = vfio_iommu_iova_build_caps(iommu, &caps); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 920470502329..3891e03d3af0 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1039,6 +1039,21 @@ struct vfio_iommu_type1_info_cap_migration { __u64 max_dirty_bitmap_size; /* in bytes */ }; +/* + * The DMA available capability allows to report the current number of + * simultaneously outstanding DMA mappings that are allowed. + * + * The structure below defines version 1 of this capability. + * + * avail: specifies the current number of outstanding DMA mappings allowed. + */ +#define VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL 3 + +struct vfio_iommu_type1_info_dma_avail { + struct vfio_info_cap_header header; + __u32 avail; +}; + #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) /** -- cgit v1.2.3 From 9c4258c78a2a7624c79b797f40ae2dbfd2555e26 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:18 +0300 Subject: net: bridge: mdb: add support to extend add/del commands Since the MDB add/del code expects an exact struct br_mdb_entry we can't really add any extensions, thus add a new nested attribute at the level of MDBA_SET_ENTRY called MDBA_SET_ENTRY_ATTRS which will be used to pass all new options via netlink attributes. 
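For illustration, the sketch below shows how a user-space RTM_NEWMDB request can lay the two attributes out side by side once this lands. It uses libmnl for brevity; build_mdb_add() and its arguments are invented for the example, and this is not iproute2 code. At this point in the series the nest is legal but empty, which simply exercises the new parsing path.

#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/if_bridge.h>
#include <linux/rtnetlink.h>

/* Sketch: build (but do not send) a bridge MDB add request carrying the
 * classic fixed-size MDBA_SET_ENTRY plus the new extensible nest.
 * "buf" must be at least MNL_SOCKET_BUFFER_SIZE; sequence number and
 * portid handling are left to the caller. */
static struct nlmsghdr *build_mdb_add(char *buf, int br_ifindex,
                                      const struct br_mdb_entry *entry)
{
    struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
    struct br_port_msg *bpm;
    struct nlattr *nest;

    nlh->nlmsg_type = RTM_NEWMDB;
    nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK;

    bpm = mnl_nlmsg_put_extra_header(nlh, sizeof(*bpm));
    bpm->family = AF_BRIDGE;
    bpm->ifindex = br_ifindex;          /* the bridge device */

    /* old-style entry: still required, still exactly sizeof(*entry) */
    mnl_attr_put(nlh, MDBA_SET_ENTRY, sizeof(*entry), entry);

    /* new-style options: nothing defined yet, so the nest stays empty */
    nest = mnl_attr_nest_start(nlh, MDBA_SET_ENTRY_ATTRS);
    mnl_attr_nest_end(nlh, nest);

    return nlh;
}
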
This patch doesn't change anything functionally since the new attribute is not used yet, only parsed. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 12 ++++++++++++ net/bridge/br_mdb.c | 22 +++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 75a2ac479247..dc52f8cffa0d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -530,10 +530,22 @@ struct br_mdb_entry { enum { MDBA_SET_ENTRY_UNSPEC, MDBA_SET_ENTRY, + MDBA_SET_ENTRY_ATTRS, __MDBA_SET_ENTRY_MAX, }; #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) +/* [MDBA_SET_ENTRY_ATTRS] = { + * [MDBE_ATTR_xxx] + * ... + * } + */ +enum { + MDBE_ATTR_UNSPEC, + __MDBE_ATTR_MAX, +}; +#define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index a1ff0a372185..907df6d695ec 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -670,9 +670,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return true; } +static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { +}; + static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device **pdev, struct br_mdb_entry **pentry, - struct netlink_ext_ack *extack) + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_mdb_entry *entry; @@ -719,6 +722,17 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; *pentry = entry; + if (tb[MDBA_SET_ENTRY_ATTRS]) { + err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, + tb[MDBA_SET_ENTRY_ATTRS], + br_mdbe_attrs_pol, extack); + if (err) + return err; + } else { + memset(mdb_attrs, 0, + sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); + } + return 0; } @@ -803,6 +817,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -812,7 +827,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry, extack); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; @@ -921,6 +936,7 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; @@ -930,7 +946,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge *br; int err; - err = br_mdb_parse(skb, nlh, &dev, &entry, extack); + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; -- cgit v1.2.3 From 88d4bd180419a7cde3947f191dc4e26fbb19f80b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:19 +0300 Subject: net: bridge: mdb: add support for add/del/dump of entries with source Add new mdb attributes (MDBE_ATTR_SOURCE for setting, MDBA_MDB_EATTR_SOURCE for dumping) to allow add/del and dump of mdb entries with a source address (S,G). 
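Extending the libmnl sketch from the previous commit (still illustrative, not iproute2 code), an (S,G) add now just drops the source into the nest. The payload length must match the entry's protocol: 4 bytes when entry->addr.proto is htons(ETH_P_IP), 16 bytes for ETH_P_IPV6, and the source itself must not be a multicast address, as is_valid_mdb_source() in the diff below enforces.

    /* inside build_mdb_add(): an IPv4 (S,G) example with made-up source
     * 10.0.0.1; needs <netinet/in.h> and <arpa/inet.h> for htonl() */
    struct in_addr src4 = { .s_addr = htonl(0x0a000001) };

    nest = mnl_attr_nest_start(nlh, MDBA_SET_ENTRY_ATTRS);
    mnl_attr_put(nlh, MDBE_ATTR_SOURCE, sizeof(src4), &src4);
    mnl_attr_nest_end(nlh, nest);
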
New S,G entries are created with filter mode of MCAST_INCLUDE. The same attributes are used for IPv4 and IPv6, they're validated and parsed based on their protocol. S,G host joined entries which are added by user are not allowed yet. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 + net/bridge/br_mdb.c | 142 +++++++++++++++++++++++++++++++++-------- net/bridge/br_private.h | 14 ++++ 3 files changed, 130 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index dc52f8cffa0d..3e6377c865eb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -457,6 +457,7 @@ enum { MDBA_MDB_EATTR_TIMER, MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, + MDBA_MDB_EATTR_SOURCE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) @@ -542,6 +543,7 @@ enum { */ enum { MDBE_ATTR_UNSPEC, + MDBE_ATTR_SOURCE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 907df6d695ec..7f9ca5c20120 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -64,17 +64,27 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_FAST_LEAVE; } -static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) +static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, + struct nlattr **mdb_attrs) { memset(ip, 0, sizeof(struct br_ip)); ip->vid = entry->vid; ip->proto = entry->addr.proto; - if (ip->proto == htons(ETH_P_IP)) + switch (ip->proto) { + case htons(ETH_P_IP): ip->dst.ip4 = entry->addr.u.ip4; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #if IS_ENABLED(CONFIG_IPV6) - else + case htons(ETH_P_IPV6): ip->dst.ip6 = entry->addr.u.ip6; + if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE]) + ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]); + break; #endif + } + } static int __mdb_fill_srcs(struct sk_buff *skb, @@ -172,30 +182,41 @@ static int __mdb_fill_info(struct sk_buff *skb, if (nla_put_nohdr(skb, sizeof(e), &e) || nla_put_u32(skb, MDBA_MDB_EATTR_TIMER, - br_timer_value(mtimer))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; - } + br_timer_value(mtimer))) + goto nest_err; switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(p && mp->br->multicast_igmp_version == 3); + dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); + if (mp->addr.src.ip4) { + if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, + mp->addr.src.ip4)) + goto nest_err; + break; + } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(p && mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + if (!ipv6_addr_any(&mp->addr.src.ip6)) { + if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, + &mp->addr.src.ip6)) + goto nest_err; + break; + } break; #endif } - if (dump_srcs_mode && + if (p && dump_srcs_mode && (__mdb_fill_srcs(skb, p) || - nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) { - nla_nest_cancel(skb, nest_ent); - return -EMSGSIZE; - } - + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) + goto nest_err; nla_nest_end(skb, nest_ent); return 0; + +nest_err: + nla_nest_cancel(skb, nest_ent); + return -EMSGSIZE; } static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, @@ -395,12 +416,18 
@@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) switch (pg->addr.proto) { case htons(ETH_P_IP): + /* MDBA_MDB_EATTR_SOURCE */ + if (pg->addr.src.ip4) + nlmsg_size += nla_total_size(sizeof(__be32)); if (pg->port->br->multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): + /* MDBA_MDB_EATTR_SOURCE */ + if (!ipv6_addr_any(&pg->addr.src.ip6)) + nlmsg_size += nla_total_size(sizeof(struct in6_addr)); if (pg->port->br->multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); @@ -670,7 +697,48 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry, return true; } +static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, + struct netlink_ext_ack *extack) +{ + switch (proto) { + case htons(ETH_P_IP): + if (nla_len(attr) != sizeof(struct in_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length"); + return false; + } + if (ipv4_is_multicast(nla_get_in_addr(attr))) { + NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed"); + return false; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): { + struct in6_addr src; + + if (nla_len(attr) != sizeof(struct in6_addr)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length"); + return false; + } + src = nla_get_in6_addr(attr); + if (ipv6_addr_is_multicast(&src)) { + NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed"); + return false; + } + break; + } +#endif + default: + NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address"); + return false; + } + + return true; +} + static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), }; static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -728,6 +796,10 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, br_mdbe_attrs_pol, extack); if (err) return err; + if (mdb_attrs[MDBE_ATTR_SOURCE] && + !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], + entry->addr.proto, extack)) + return -EINVAL; } else { memset(mdb_attrs, 0, sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); @@ -744,8 +816,22 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; unsigned long now = jiffies; + u8 filter_mode; int err; + /* host join errors which can happen before creating the group */ + if (!port) { + /* don't allow any flags for host-joined groups */ + if (entry->state) { + NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(group)) { + NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); + return -EINVAL; + } + } + mp = br_mdb_ip_get(br, group); if (!mp) { mp = br_multicast_new_group(br, group); @@ -756,11 +842,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, /* host join */ if (!port) { - /* don't allow any flags for host-joined groups */ - if (entry->state) { - NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); - return -EINVAL; - } if (mp->host_joined) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; @@ -783,8 +864,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } + filter_mode = br_multicast_is_star_g(group) ? 
MCAST_EXCLUDE : + MCAST_INCLUDE; + p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, - MCAST_EXCLUDE); + filter_mode); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; @@ -800,12 +884,13 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, static int __br_mdb_add(struct net *net, struct net_bridge *br, struct net_bridge_port *p, struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct br_ip ip; int ret; - __mdb_entry_to_br_ip(entry, &ip); + __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); ret = br_mdb_add_group(br, p, &ip, entry, extack); @@ -875,18 +960,19 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, p, entry, extack); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, br, p, entry, extack); + err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); } return err; } -static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) +static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, + struct nlattr **mdb_attrs) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -897,7 +983,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) return -EINVAL; - __mdb_entry_to_br_ip(entry, &ip); + __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); @@ -971,10 +1057,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } } else { - err = __br_mdb_del(br, entry); + err = __br_mdb_del(br, entry, mdb_attrs); } return err; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a23d2bae56e1..0f54a7a7c186 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -873,6 +873,20 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br, } } +static inline bool br_multicast_is_star_g(const struct br_ip *ip) +{ + switch (ip->proto) { + case htons(ETH_P_IP): + return ipv4_is_zeronet(ip->src.ip4); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return ipv6_addr_any(&ip->src.ip6); +#endif + default: + return false; + } +} + static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; -- cgit v1.2.3 From 8f8cb77e0b22d9044d8d57ab3bb18ea8d0474752 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:21 +0300 Subject: net: bridge: mcast: add rt_protocol field to the port group struct We need to be able to differentiate between pg entries created by user-space and the kernel when we start generating S,G entries for IGMPv3/MLDv2's fast path. User-space entries are created by default as RTPROT_STATIC and the kernel entries are RTPROT_KERNEL. Later we can allow user-space to provide the entry rt_protocol so we can differentiate between who added the entries specifically (e.g. clag, admin, frr etc). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 42 ++++++++++++++++++++++++++---------------- net/bridge/br_multicast.c | 7 +++++-- net/bridge/br_private.h | 3 ++- 4 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 3e6377c865eb..1054f151078d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -458,6 +458,7 @@ enum { MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, MDBA_MDB_EATTR_SOURCE, + MDBA_MDB_EATTR_RTPROT, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7f9ca5c20120..b386a5e07698 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -184,6 +184,7 @@ static int __mdb_fill_info(struct sk_buff *skb, MDBA_MDB_EATTR_TIMER, br_timer_value(mtimer))) goto nest_err; + switch (mp->addr.proto) { case htons(ETH_P_IP): dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); @@ -206,10 +207,15 @@ static int __mdb_fill_info(struct sk_buff *skb, break; #endif } - if (p && dump_srcs_mode && - (__mdb_fill_srcs(skb, p) || - nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, p->filter_mode))) - goto nest_err; + if (p) { + if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol)) + goto nest_err; + if (dump_srcs_mode && + (__mdb_fill_srcs(skb, p) || + nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE, + p->filter_mode))) + goto nest_err; + } nla_nest_end(skb, nest_ent); return 0; @@ -414,6 +420,9 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) if (!pg) goto out; + /* MDBA_MDB_EATTR_RTPROT */ + nlmsg_size += nla_total_size(sizeof(u8)); + switch (pg->addr.proto) { case htons(ETH_P_IP): /* MDBA_MDB_EATTR_SOURCE */ @@ -809,16 +818,20 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, struct br_mdb_entry *entry, + struct br_mdb_entry *entry, + struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; unsigned long now = jiffies; + struct br_ip group; u8 filter_mode; int err; + __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ @@ -826,15 +839,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); return -EINVAL; } - if (!br_multicast_is_star_g(group)) { + if (!br_multicast_is_star_g(&group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); return -EINVAL; } } - mp = br_mdb_ip_get(br, group); + mp = br_mdb_ip_get(br, &group); if (!mp) { - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(br, &group); err = PTR_ERR_OR_ZERO(mp); if (err) return err; @@ -864,11 +877,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, break; } - filter_mode = br_multicast_is_star_g(group) ? MCAST_EXCLUDE : - MCAST_INCLUDE; + filter_mode = br_multicast_is_star_g(&group) ? 
MCAST_EXCLUDE : + MCAST_INCLUDE; - p = br_multicast_new_port_group(port, group, *pp, entry->state, NULL, - filter_mode); + p = br_multicast_new_port_group(port, &group, *pp, entry->state, NULL, + filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); return -ENOMEM; @@ -887,13 +900,10 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { - struct br_ip ip; int ret; - __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); - spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry, extack); + ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack); spin_unlock_bh(&br->multicast_lock); return ret; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 4fd690bc848f..b6e7b0ece422 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -795,7 +795,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode) + u8 filter_mode, + u8 rt_protocol) { struct net_bridge_port_group *p; @@ -807,6 +808,7 @@ struct net_bridge_port_group *br_multicast_new_port_group( p->port = port; p->flags = flags; p->filter_mode = filter_mode; + p->rt_protocol = rt_protocol; p->mcast_gc.destroy = br_multicast_destroy_port_group; INIT_HLIST_HEAD(&p->src_list); rcu_assign_pointer(p->next, next); @@ -892,7 +894,8 @@ static int br_multicast_add_group(struct net_bridge *br, break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode); + p = br_multicast_new_port_group(port, group, *pp, 0, src, filter_mode, + RTPROT_KERNEL); if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0f54a7a7c186..dae7e3526fc7 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -246,6 +246,7 @@ struct net_bridge_port_group { unsigned char flags; unsigned char filter_mode; unsigned char grp_query_rexmit_cnt; + unsigned char rt_protocol; struct hlist_head src_list; unsigned int src_ents; @@ -804,7 +805,7 @@ struct net_bridge_port_group * br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, - u8 filter_mode); + u8 filter_mode, u8 rt_protocol); int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, -- cgit v1.2.3 From 8266a0491e92d39dc9af739e8380a0daa9b8836b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:24 +0300 Subject: net: bridge: mcast: handle port group filter modes We need to handle group filter mode transitions and initial state. To change a port group's INCLUDE -> EXCLUDE mode (or when we have added a new port group in EXCLUDE mode) we need to add that port to all of *,G ports' S,G entries for proper replication. When the EXCLUDE state is changed from IGMPv3 report, br_multicast_fwd_filter_exclude() must be called after the source list processing because the assumption is that all of the group's S,G entries will be created before transitioning to EXCLUDE mode, i.e. most importantly its blocked entries will already be added so it will not get automatically added to them. 
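(A concrete example, with made-up addresses, may help here: under IGMPv3, suppose port1 holds (*, 239.1.1.1) in EXCLUDE mode and port2 then reports INCLUDE (10.0.0.1, 239.1.1.1). The new S,G entry is created for port2, and port1 is appended to it automatically with MDB_PG_FLAGS_STAR_EXCL so the EXCLUDE receiver keeps getting traffic from 10.0.0.1. Had port1's source list already held a blocked (10.0.0.1, 239.1.1.1) entry, the S,G,port lookup would find it first and skip the auto-add.)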
The transition EXCLUDE -> INCLUDE happens only when a port group timer expires, it requires us to remove that port from all of *,G ports' S,G entries where it was automatically added previously. Finally when we are adding a new S,G entry we must add all of *,G's EXCLUDE ports to it. In order to distinguish automatically added *,G EXCLUDE ports we have a new port group flag - MDB_PG_FLAGS_STAR_EXCL. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 25 +++++- net/bridge/br_multicast.c | 172 +++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 20 +++++ 4 files changed, 216 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1054f151078d..e4bd30a25f6b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -518,6 +518,7 @@ struct br_mdb_entry { __u8 state; #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 4e3a5cefc626..28cd35a9cf37 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -62,6 +62,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_OFFLOAD; if (flags & MDB_PG_FLAGS_FAST_LEAVE) e->flags |= MDB_FLAGS_FAST_LEAVE; + if (flags & MDB_PG_FLAGS_STAR_EXCL) + e->flags |= MDB_FLAGS_STAR_EXCL; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, @@ -822,11 +824,11 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp; + struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct br_ip group, star_group; unsigned long now = jiffies; - struct br_ip group; u8 filter_mode; int err; @@ -890,6 +892,25 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (entry->state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + /* if we are adding a new EXCLUDE port group (*,G) it needs to be also + * added to all S,G entries for proper replication, if we are adding + * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be + * added to it for proper replication + */ + if (br_multicast_should_handle_mode(br, group.proto)) { + switch (filter_mode) { + case MCAST_EXCLUDE: + br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); + break; + case MCAST_INCLUDE: + star_group = p->key.addr; + memset(&star_group.src, 0, sizeof(star_group.src)); + star_mp = br_mdb_ip_get(br, &star_group); + if (star_mp) + br_multicast_sg_add_exclude_ports(star_mp, p); + break; + } + } return 0; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ece8ac805e98..f39bbd733722 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -73,6 +73,8 @@ __br_multicast_add_group(struct net_bridge *br, const unsigned char *src, u8 filter_mode, bool igmpv2_mldv1); +static void br_multicast_find_del_pg(struct net_bridge *br, + struct net_bridge_port_group *pg); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, @@ -195,8 +197,163 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, 
p->eth_addr); } +static void __fwd_add_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + if (br_sg_port_find(br, &sg_key)) + return; + + src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + MCAST_INCLUDE, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; +} + +static void __fwd_del_star_excl(struct net_bridge_port_group *pg, + struct br_ip *sg_ip) +{ + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *src_pg; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.port = pg->key.port; + sg_key.addr = *sg_ip; + src_pg = br_sg_port_find(br, &sg_key); + if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) || + src_pg->rt_protocol != RTPROT_KERNEL) + return; + + br_multicast_find_del_pg(br, src_pg); +} + +/* When a port group transitions to (or is added as) EXCLUDE we need to add it + * to all other ports' S,G entries which are not blocked by the current group + * for proper replication, the assumption is that any S,G blocked entries + * are already added so the S,G,port lookup should skip them. + * When a port group transitions from EXCLUDE -> INCLUDE mode or is being + * deleted we need to remove it from all ports' S,G entries where it was + * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL). + */ +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode) +{ + struct net_bridge *br = pg->key.port->br; + struct net_bridge_port_group *pg_lst; + struct net_bridge_mdb_entry *mp; + struct br_ip sg_ip; + + if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr))) + return; + + mp = br_mdb_ip_get(br, &pg->key.addr); + if (!mp) + return; + + memset(&sg_ip, 0, sizeof(sg_ip)); + sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); + pg_lst; + pg_lst = mlock_dereference(pg_lst->next, br)) { + struct net_bridge_group_src *src_ent; + + if (pg_lst == pg) + continue; + hlist_for_each_entry(src_ent, &pg_lst->src_list, node) { + if (!(src_ent->flags & BR_SGRP_F_INSTALLED)) + continue; + sg_ip.src = src_ent->addr.src; + switch (filter_mode) { + case MCAST_INCLUDE: + __fwd_del_star_excl(pg, &sg_ip); + break; + case MCAST_EXCLUDE: + __fwd_add_star_excl(pg, &sg_ip); + break; + } + } + } +} + +static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + /* *,G exclude ports are only added to S,G entries */ + if (WARN_ON(br_multicast_is_star_g(&sgmp->addr))) + return; + + /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports + * we should ignore perm entries since they're managed by user-space + */ + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL; + pp = &p->next) + if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL | + MDB_PG_FLAGS_PERMANENT))) + return; + + for (pp = &sgmp->ports; + (p = mlock_dereference(*pp, sgmp->br)) != NULL;) { + if (!(p->flags & MDB_PG_FLAGS_PERMANENT)) + br_multicast_del_pg(sgmp, p, pp); + else + pp = &p->next; + } +} + +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg) +{ + struct net_bridge_port_group_sg_key 
sg_key; + struct net_bridge *br = star_mp->br; + struct net_bridge_port_group *pg; + + if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) + return; + if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr))) + return; + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = sg->key.addr; + /* we need to add all exclude ports to the S,G */ + for (pg = mlock_dereference(star_mp->ports, br); + pg; + pg = mlock_dereference(pg->next, br)) { + struct net_bridge_port_group *src_pg; + + if (pg == sg || pg->filter_mode == MCAST_INCLUDE) + continue; + + sg_key.port = pg->key.port; + if (br_sg_port_find(br, &sg_key)) + continue; + + src_pg = __br_multicast_add_group(br, pg->key.port, + &sg->key.addr, + sg->eth_addr, + MCAST_INCLUDE, false); + if (IS_ERR_OR_NULL(src_pg) || + src_pg->rt_protocol != RTPROT_KERNEL) + continue; + src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL; + } +} + static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { + struct net_bridge_mdb_entry *star_mp; struct net_bridge_port_group *sg; struct br_ip sg_ip; @@ -211,6 +368,7 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; + sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL; /* if it was added by user-space as perm we can skip next steps */ if (sg->rt_protocol != RTPROT_KERNEL && @@ -219,6 +377,11 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) /* the kernel is now responsible for removing this S,G */ del_timer(&sg->timer); + star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); + if (!star_mp) + return; + + br_multicast_sg_add_exclude_ports(star_mp, sg); } static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) @@ -349,6 +512,10 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) br_multicast_del_group_src(ent); br_mdb_notify(br->dev, mp, pg, RTM_DELMDB); + if (!br_multicast_is_star_g(&mp->addr)) + br_multicast_sg_del_exclude_ports(mp); + else + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); @@ -407,6 +574,9 @@ static void br_multicast_port_group_expired(struct timer_list *t) } else if (changed) { struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr); + if (changed && br_multicast_is_star_g(&pg->key.addr)) + br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE); + if (WARN_ON(!mp)) goto out; br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB); @@ -1641,6 +1811,7 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_isexc_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: @@ -1853,6 +2024,7 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, switch (pg->filter_mode) { case MCAST_INCLUDE: __grp_src_toex_incl(pg, srcs, nsrcs, src_size); + br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 93d76b3dfc35..128d2d0417a0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -213,6 +213,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_PERMANENT BIT(0) #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) #define PG_SRC_ENT_LIMIT 32 @@ -833,6 +834,10 @@ void br_mdb_init(void); void 
br_mdb_uninit(void); void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, + u8 filter_mode); +void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, + struct net_bridge_port_group *sg); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) @@ -895,6 +900,21 @@ static inline bool br_multicast_is_star_g(const struct br_ip *ip) } } +static inline bool br_multicast_should_handle_mode(const struct net_bridge *br, + __be16 proto) +{ + switch (proto) { + case htons(ETH_P_IP): + return !!(br->multicast_igmp_version == 3); +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + return !!(br->multicast_mld_version == 2); +#endif + default: + return false; + } +} + static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return BR_INPUT_SKB_CB(skb)->igmp; -- cgit v1.2.3 From 9116ffbf1dd71f953ffda4198d01f82d3ca16df8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:25 +0300 Subject: net: bridge: mcast: add support for blocked port groups When excluding S,G entries we need a way to block a particular S,G,port. The new port group flag is managed based on the source's timer as per RFCs 3376 and 3810. When a source expires and its port group is in EXCLUDE mode, it will be blocked. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 2 ++ net/bridge/br_multicast.c | 49 ++++++++++++++++++++++++++++++++++++------ net/bridge/br_private.h | 1 + 4 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e4bd30a25f6b..4c687686aa8f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -519,6 +519,7 @@ struct br_mdb_entry { #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) #define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 28cd35a9cf37..e15bab19a012 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -64,6 +64,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_FAST_LEAVE; if (flags & MDB_PG_FLAGS_STAR_EXCL) e->flags |= MDB_FLAGS_STAR_EXCL; + if (flags & MDB_PG_FLAGS_BLOCKED) + e->flags |= MDB_FLAGS_BLOCKED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index f39bbd733722..11d224c01914 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -72,7 +72,8 @@ __br_multicast_add_group(struct net_bridge *br, struct br_ip *group, const unsigned char *src, u8 filter_mode, - bool igmpv2_mldv1); + bool igmpv2_mldv1, + bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); @@ -211,7 +212,7 @@ static void __fwd_add_star_excl(struct net_bridge_port_group *pg, return; src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, - MCAST_INCLUDE, false); + MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) return; @@ -343,7 +344,7 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, 
src_pg = __br_multicast_add_group(br, pg->key.port, &sg->key.addr, sg->eth_addr, - MCAST_INCLUDE, false); + MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) continue; @@ -364,7 +365,8 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, - src->pg->eth_addr, MCAST_INCLUDE, false); + src->pg->eth_addr, MCAST_INCLUDE, false, + !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) return; src->flags |= BR_SGRP_F_INSTALLED; @@ -415,9 +417,38 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src) src->flags &= ~BR_SGRP_F_INSTALLED; } +/* install S,G and based on src's timer enable or disable forwarding */ static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src) { + struct net_bridge_port_group_sg_key sg_key; + struct net_bridge_port_group *sg; + u8 old_flags; + br_multicast_fwd_src_add(src); + + memset(&sg_key, 0, sizeof(sg_key)); + sg_key.addr = src->pg->key.addr; + sg_key.addr.src = src->addr.src; + sg_key.port = src->pg->key.port; + + sg = br_sg_port_find(src->br, &sg_key); + if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT)) + return; + + old_flags = sg->flags; + if (timer_pending(&src->timer)) + sg->flags &= ~MDB_PG_FLAGS_BLOCKED; + else + sg->flags |= MDB_PG_FLAGS_BLOCKED; + + if (old_flags != sg->flags) { + struct net_bridge_mdb_entry *sg_mp; + + sg_mp = br_mdb_ip_get(src->br, &sg_key.addr); + if (!sg_mp) + return; + br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB); + } } static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) @@ -995,7 +1026,10 @@ static void br_multicast_group_src_expired(struct timer_list *t) if (!hlist_empty(&pg->src_list)) goto out; br_multicast_find_del_pg(br, pg); + } else { + br_multicast_fwd_src_handle(src); } + out: spin_unlock(&br->multicast_lock); } @@ -1131,7 +1165,8 @@ __br_multicast_add_group(struct net_bridge *br, struct br_ip *group, const unsigned char *src, u8 filter_mode, - bool igmpv2_mldv1) + bool igmpv2_mldv1, + bool blocked) { struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p = NULL; @@ -1167,6 +1202,8 @@ __br_multicast_add_group(struct net_bridge *br, goto out; } rcu_assign_pointer(*pp, p); + if (blocked) + p->flags |= MDB_PG_FLAGS_BLOCKED; br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); found: @@ -1189,7 +1226,7 @@ static int br_multicast_add_group(struct net_bridge *br, spin_lock(&br->multicast_lock); pg = __br_multicast_add_group(br, port, group, src, filter_mode, - igmpv2_mldv1); + igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = IS_ERR(pg) ? PTR_ERR(pg) : 0; spin_unlock(&br->multicast_lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 128d2d0417a0..345118e35c42 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -214,6 +214,7 @@ struct net_bridge_fdb_entry { #define MDB_PG_FLAGS_OFFLOAD BIT(1) #define MDB_PG_FLAGS_FAST_LEAVE BIT(2) #define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) #define PG_SRC_ENT_LIMIT 32 -- cgit v1.2.3 From a5fa25adf03d4b063aece74ba70ccbb3a71af122 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:56 -0700 Subject: bpf: Change bpf_sk_release and bpf_sk_*cgroup_id to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON The previous patch allows the networking bpf prog to use the bpf_skc_to_*() helpers to get a PTR_TO_BTF_ID socket pointer, e.g. 
"struct tcp_sock *". It allows the bpf prog to read all the fields of the tcp_sock. This patch changes the bpf_sk_release() and bpf_sk_*cgroup_id() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. For example, the following will work: sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); if (!sk) return; tp = bpf_skc_to_tcp_sock(sk); if (!tp) { bpf_sk_release(sk); return; } lsndtime = tp->lsndtime; /* Pass tp to bpf_sk_release() will also work */ bpf_sk_release(tp); Since PTR_TO_BTF_ID could be NULL, the helper taking ARG_PTR_TO_BTF_ID_SOCK_COMMON has to check for NULL at runtime. A btf_id of "struct sock" may not always mean a fullsock. Regardless the helper's running context may get a non-fullsock or not, considering fullsock check/handling is pretty cheap, it is better to keep the same verifier expectation on helper that takes ARG_PTR_TO_BTF_ID* will be able to handle the minisock situation. In the bpf_sk_*cgroup_id() case, it will try to get a fullsock by using sk_to_full_sk() as its skb variant bpf_sk"b"_*cgroup_id() has already been doing. bpf_sk_release can already handle minisock, so nothing special has to be done. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000356.3856047-1-kafai@fb.com --- include/uapi/linux/bpf.h | 8 ++++---- net/core/filter.c | 30 ++++++++++++++---------------- tools/include/uapi/linux/bpf.h | 8 ++++---- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a22812561064..c96a56d9c3be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2512,7 +2512,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -3234,11 +3234,11 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. * - * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * *sk* must be a non-**NULL** pointer to a socket, e.g. one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). @@ -3248,7 +3248,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. 
The root cgroup is at diff --git a/net/core/filter.c b/net/core/filter.c index 6d1864f2bd51..06d397eeef2a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4088,18 +4088,17 @@ static inline u64 __bpf_sk_cgroup_id(struct sock *sk) { struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgroup_id(cgrp); } BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_cgroup_id(sk); + return __bpf_sk_cgroup_id(skb->sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4115,6 +4114,10 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, struct cgroup *ancestor; struct cgroup *cgrp; + sk = sk_to_full_sk(sk); + if (!sk || !sk_fullsock(sk)) + return 0; + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4126,12 +4129,7 @@ static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); - - if (!sk || !sk_fullsock(sk)) - return 0; - - return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); + return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level); } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { @@ -4151,7 +4149,7 @@ static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { .func = bpf_sk_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) @@ -4163,7 +4161,7 @@ static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { .func = bpf_sk_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCKET, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, }; #endif @@ -5697,7 +5695,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { BPF_CALL_1(bpf_sk_release, struct sock *, sk) { - if (sk_is_refcounted(sk)) + if (sk && sk_is_refcounted(sk)) sock_gen_put(sk); return 0; } @@ -5706,7 +5704,7 @@ static const struct bpf_func_proto bpf_sk_release_proto = { .func = bpf_sk_release, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a22812561064..c96a56d9c3be 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2512,7 +2512,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -3234,11 +3234,11 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. * - * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * *sk* must be a non-**NULL** pointer to a socket, e.g. 
one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). @@ -3248,7 +3248,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. The root cgroup is at -- cgit v1.2.3 From 592a3498648af000e93dff2d36229ab11cd8c7f6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:02 -0700 Subject: bpf: Change bpf_sk_storage_*() to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_storage_*() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. A micro benchmark has been done on a "cgroup_skb/egress" bpf program which does a bpf_sk_storage_get(). It was driven by netperf doing a 4096 connected UDP_STREAM test with 64bytes packet. The stats from "kernel.bpf_stats_enabled" shows no meaningful difference. The sk_storage_get_btf_proto, sk_storage_delete_btf_proto, btf_sk_storage_get_proto, and btf_sk_storage_delete_proto are no longer needed, so they are removed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000402.3856307-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 -- include/uapi/linux/bpf.h | 1 + kernel/bpf/bpf_lsm.c | 4 ++-- net/core/bpf_sk_storage.c | 29 ++++++----------------------- net/ipv4/bpf_tcp_ca.c | 23 ++--------------------- tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 12 insertions(+), 48 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 119f4c9c3a9c..3c516dd07caf 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,8 +20,6 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; -extern const struct bpf_func_proto sk_storage_get_btf_proto; -extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c96a56d9c3be..0ec6dbeb17a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2861,6 +2861,7 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). 
* * long bpf_send_signal(u32 sig) * Description diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9cd1428c7199..78ea8a7bd27f 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -56,9 +56,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; case BPF_FUNC_sk_storage_get: - return &sk_storage_get_btf_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &sk_storage_delete_btf_proto; + return &bpf_sk_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 838efc682cff..c907f0dc7f87 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -269,7 +269,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, { struct bpf_local_storage_data *sdata; - if (flags > BPF_SK_STORAGE_GET_F_CREATE) + if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; sdata = sk_storage_lookup(sk, map, true); @@ -299,6 +299,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + if (!sk || !sk_fullsock(sk)) + return -EINVAL; + if (refcount_inc_not_zero(&sk->sk_refcnt)) { int err; @@ -355,7 +358,7 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -375,27 +378,7 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_SOCKET, -}; - -const struct bpf_func_proto sk_storage_get_btf_proto = { - .func = bpf_sk_storage_get, - .gpl_only = false, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, - .arg4_type = ARG_ANYTHING, -}; - -const struct bpf_func_proto sk_storage_delete_btf_proto = { - .func = bpf_sk_storage_delete, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; struct bpf_sk_storage_diag { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 74a2ef598c31..618954f82764 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -28,22 +28,6 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; -static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly; -static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly; - -static void convert_sk_func_proto(struct bpf_func_proto *to, const struct bpf_func_proto *from) -{ - int i; - - *to = *from; - for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) { - if (to->arg_type[i] == ARG_PTR_TO_SOCKET) { - to->arg_type[i] = ARG_PTR_TO_BTF_ID; - to->arg_btf_id[i] = &tcp_sock_id; - } - } -} - static int bpf_tcp_ca_init(struct btf *btf) { s32 type_id; @@ -59,9 +43,6 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); - 
convert_sk_func_proto(&btf_sk_storage_get_proto, &bpf_sk_storage_get_proto); - convert_sk_func_proto(&btf_sk_storage_delete_proto, &bpf_sk_storage_delete_proto); - return 0; } @@ -188,9 +169,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_tcp_send_ack: return &bpf_tcp_send_ack_proto; case BPF_FUNC_sk_storage_get: - return &btf_sk_storage_get_proto; + return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: - return &btf_sk_storage_delete_proto; + return &bpf_sk_storage_delete_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c96a56d9c3be..0ec6dbeb17a5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2861,6 +2861,7 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). * * long bpf_send_signal(u32 sig) * Description -- cgit v1.2.3 From c0df236e1394970f3503a8fb103de95d000014ca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:09 -0700 Subject: bpf: Change bpf_tcp_*_syncookie to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_tcp_*_syncookie() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000409.3856725-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 8 ++++---- tools/include/uapi/linux/bpf.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ec6dbeb17a5..69b9e30375bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2692,7 +2692,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2878,7 +2878,7 @@ union bpf_attr { * * **-EAGAIN** if bpf program can try again. * - * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. diff --git a/net/core/filter.c b/net/core/filter.c index 06d397eeef2a..1d88e9b498eb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6086,7 +6086,7 @@ BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len u32 cookie; int ret; - if (unlikely(th_len < sizeof(*th))) + if (unlikely(!sk || th_len < sizeof(*th))) return -EINVAL; /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. 
*/ @@ -6139,7 +6139,7 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .gpl_only = true, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, @@ -6153,7 +6153,7 @@ BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len, u32 cookie; u16 mss; - if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4)) + if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4)) return -EINVAL; if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN) @@ -6208,7 +6208,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */ .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_SOCK_COMMON, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_PTR_TO_MEM, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0ec6dbeb17a5..69b9e30375bc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2692,7 +2692,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2878,7 +2878,7 @@ union bpf_attr { * * **-EAGAIN** if bpf program can try again. * - * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. -- cgit v1.2.3 From 27e5203bd9c5cc6d54dcac48c3027f3f04522b8b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:15 -0700 Subject: bpf: Change bpf_sk_assign to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_assign() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. The bpf_sk_lookup_assign() is taking ARG_PTR_TO_SOCKET_"OR_NULL". Meaning it specifically takes a literal NULL. ARG_PTR_TO_BTF_ID_SOCK_COMMON does not allow a literal NULL, so another ARG type is required for this purpose and another follow-up patch can be used if there is such need. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000415.3857374-1-kafai@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69b9e30375bc..2d6519a2ed77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3107,7 +3107,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. 
 *
- * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
 *	Description
 *		Helper is overloaded depending on BPF program type. This
 *		description applies to **BPF_PROG_TYPE_SCHED_CLS** and
diff --git a/net/core/filter.c b/net/core/filter.c
index 1d88e9b498eb..af88935e24b1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6217,7 +6217,7 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
 
 BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
 {
-	if (flags != 0)
+	if (!sk || flags != 0)
 		return -EINVAL;
 	if (!skb_at_tc_ingress(skb))
 		return -EOPNOTSUPP;
@@ -6241,7 +6241,7 @@ static const struct bpf_func_proto bpf_sk_assign_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_SOCK_COMMON,
+	.arg2_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
 	.arg3_type	= ARG_ANYTHING,
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 69b9e30375bc..2d6519a2ed77 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3107,7 +3107,7 @@ union bpf_attr {
 *	Return
 *		The id is returned or 0 in case the id could not be retrieved.
 *
- * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
 *	Description
 *		Helper is overloaded depending on BPF program type. This
 *		description applies to **BPF_PROG_TYPE_SCHED_CLS** and
--
cgit v1.2.3


From 5d5b4128c4caae34ddcd9b2dc30ac4d6155617a3 Mon Sep 17 00:00:00 2001
From: Jacob Keller
Date: Fri, 25 Sep 2020 13:46:07 -0700
Subject: devlink: introduce flash update overwrite mask

Sections of device flash may contain settings or device identifying
information. When performing a flash update, it is generally expected
that these settings and identifiers are not overwritten. However, it may
sometimes be useful to allow overwriting these fields when performing a
flash update. Some examples include:

1) customizing the initial device config on first programming, such as
overwriting default device identifying information, or

2) reverting a device configuration to a known good state provided in
the new firmware image, or

3) in case it is suspected that current firmware logic for managing the
preservation of fields during an update is broken.

Although some devices are able to completely separate these types of
settings and fields into separate components, this is not true for all
hardware.

To support controlling this behavior, a new
DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK is defined. This is an
nla_bitfield32 which will define what subset of fields in a component
should be overwritten during an update.

If no bits are specified, or if the overwrite mask is not provided, then
an update should not overwrite anything, and should maintain the
settings and identifiers as they are in the previous image.

If the overwrite mask has the DEVLINK_FLASH_OVERWRITE_SETTINGS bit set,
then the device should be configured to overwrite any of the settings in
the requested component with settings found in the provided image.

Similarly, if the DEVLINK_FLASH_OVERWRITE_IDENTIFIERS bit is set, the
device should be configured to overwrite any device identifiers in the
requested component with the identifiers from the image.

Multiple overwrite modes may be combined to indicate that a combination
of fields should be overwritten.

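As an editor's illustration (not part of the patch), a driver that
advertises support might consume the mask in its ->flash_update()
callback roughly as sketched below; my_dev_write_image() is a
hypothetical helper standing in for the device-specific programming
logic:

	#include <net/devlink.h>

	/* Sketch: translate the overwrite mask into overwrite/preserve
	 * decisions for the device-specific flashing routine.
	 */
	static int my_flash_update(struct devlink *devlink,
				   struct devlink_flash_update_params *params,
				   struct netlink_ext_ack *extack)
	{
		bool overwrite_settings = params->overwrite_mask &
					  DEVLINK_FLASH_OVERWRITE_SETTINGS;
		bool overwrite_ids = params->overwrite_mask &
				     DEVLINK_FLASH_OVERWRITE_IDENTIFIERS;

		/* hypothetical device-specific helper */
		return my_dev_write_image(devlink, params->file_name,
					  overwrite_settings, overwrite_ids,
					  extack);
	}

	static const struct devlink_ops my_devlink_ops = {
		.supported_flash_update_params =
			DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK,
		.flash_update = my_flash_update,
	};
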
Drivers which support the new overwrite mask must set the DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK in the supported_flash_update_params field of their devlink_ops. Signed-off-by: Jacob Keller Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-flash.rst | 28 ++++++++++++++++++++++ include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 23 ++++++++++++++++++ net/core/devlink.c | 17 ++++++++++++- 4 files changed, 70 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink/devlink-flash.rst b/Documentation/networking/devlink/devlink-flash.rst index 40a87c0222cb..603e732f00cc 100644 --- a/Documentation/networking/devlink/devlink-flash.rst +++ b/Documentation/networking/devlink/devlink-flash.rst @@ -16,6 +16,34 @@ Note that the file name is a path relative to the firmware loading path (usually ``/lib/firmware/``). Drivers may send status updates to inform user space about the progress of the update operation. +Overwrite Mask +============== + +The ``devlink-flash`` command allows optionally specifying a mask indicating +how the device should handle subsections of flash components when updating. +This mask indicates the set of sections which are allowed to be overwritten. + +.. list-table:: List of overwrite mask bits + :widths: 5 95 + + * - Name + - Description + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` + - Indicates that the device should overwrite settings in the components + being updated with the settings found in the provided image. + * - ``DEVLINK_FLASH_OVERWRITE_IDENTIFIERS`` + - Indicates that the device should overwrite identifiers in the + components being updated with the identifiers found in the provided + image. This includes MAC addresses, serial IDs, and similar device + identifiers. + +Multiple overwrite bits may be combined and requested together. If no bits +are provided, it is expected that the device only update firmware binaries +in the components being updated. Settings and identifiers are expected to be +preserved across the update. A device may not support every combination and +the driver for such a device must reject any combination which cannot be +faithfully implemented. + Firmware Loading ================ diff --git a/include/net/devlink.h b/include/net/devlink.h index 7794e1601772..7339bf9ba6b4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -562,9 +562,11 @@ enum devlink_param_generic_id { struct devlink_flash_update_params { const char *file_name; const char *component; + u32 overwrite_mask; }; -#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; struct devlink_info_req; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a2ecc8b00611..7b0face1bad5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,6 +230,28 @@ enum { DEVLINK_ATTR_STATS_MAX = __DEVLINK_ATTR_STATS_MAX - 1 }; +/* Specify what sections of a flash component can be overwritten when + * performing an update. Overwriting of firmware binary sections is always + * implicitly assumed to be allowed. 
+ * + * Each section must be documented in + * Documentation/networking/devlink/devlink-flash.rst + * + */ +enum { + DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT, + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT, + + __DEVLINK_FLASH_OVERWRITE_MAX_BIT, + DEVLINK_FLASH_OVERWRITE_MAX_BIT = __DEVLINK_FLASH_OVERWRITE_MAX_BIT - 1 +}; + +#define DEVLINK_FLASH_OVERWRITE_SETTINGS _BITUL(DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT) +#define DEVLINK_FLASH_OVERWRITE_IDENTIFIERS _BITUL(DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT) + +#define DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS \ + (_BITUL(__DEVLINK_FLASH_OVERWRITE_MAX_BIT) - 1) + /** * enum devlink_trap_action - Packet trap action. * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not @@ -464,6 +486,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 6766f9ef3152..7a38f9e25922 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3147,9 +3147,9 @@ EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { + struct nlattr *nla_component, *nla_overwrite_mask; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; - struct nlattr *nla_component; u32 supported_params; if (!devlink->ops->flash_update) @@ -3172,6 +3172,19 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb, params.component = nla_data(nla_component); } + nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; + if (nla_overwrite_mask) { + struct nla_bitfield32 sections; + + if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) { + NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask, + "overwrite settings are not supported by this device"); + return -EOPNOTSUPP; + } + sections = nla_get_bitfield32(nla_overwrite_mask); + params.overwrite_mask = sections.value & sections.selector; + } + return devlink->ops->flash_update(devlink, ¶ms, info->extack); } @@ -7093,6 +7106,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = + NLA_POLICY_BITFIELD32(DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS), [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_ACTION] = { .type = NLA_U8 }, [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING }, -- cgit v1.2.3 From c8cb5b854b40f2ce52ccd032fa19750f4181d5fc Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Fri, 18 Sep 2020 11:33:13 +0200 Subject: nl80211/cfg80211: support 6 GHz scanning Support 6 GHz scanning, by * a new scan flag to scan for colocated BSSes advertised by (and found) APs on 2.4 & 5 GHz * doing the necessary reduced neighbor report parsing for this, to find them * adding the ability to split the scan request in case the device by itself cannot support this. Also add some necessary bits in mac80211 to not break with these changes. 
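As a compact editor's sketch (not part of the patch) of one detail the
new matching logic relies on: a "short SSID" is the bitwise NOT of a
CRC-32 computed over the SSID bytes, as done by
cfg80211_calc_short_ssid() and cfg80211_find_ssid_match() in the diff
below:

	#include <linux/crc32.h>
	#include <linux/types.h>

	/* Sketch: derive the 4-byte short SSID used to match reduced
	 * neighbor report entries against the SSIDs of a scan request.
	 */
	static u32 example_short_ssid(const u8 *ssid, size_t ssid_len)
	{
		return ~crc32_le(~0, ssid, ssid_len);
	}
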
Signed-off-by: Tova Mussai Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200918113313.232917c93af9.Ida22f0212f9122f47094d81659e879a50434a6a2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 32 ++- include/uapi/linux/nl80211.h | 3 + net/mac80211/scan.c | 9 +- net/wireless/core.c | 8 +- net/wireless/core.h | 5 +- net/wireless/nl80211.c | 11 +- net/wireless/scan.c | 501 ++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 552 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 10c2cc8f0efc..11eb81676e95 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2095,6 +2095,27 @@ struct cfg80211_scan_info { bool aborted; }; +/** + * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only + * + * @short_bssid: short ssid to scan for + * @bssid: bssid to scan for + * @channel_idx: idx of the channel in the channel array in the scan request + * which the above info relvant to + * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU + * @short_ssid_valid: short_ssid is valid and can be used + * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait + * 20 TUs before starting to send probe requests. + */ +struct cfg80211_scan_6ghz_params { + u32 short_ssid; + u32 channel_idx; + u8 bssid[ETH_ALEN]; + bool unsolicited_probe; + bool short_ssid_valid; + bool psc_no_listen; +}; + /** * struct cfg80211_scan_request - scan request description * @@ -2122,6 +2143,10 @@ struct cfg80211_scan_info { * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr + * @scan_6ghz: relevant for split scan request only, + * true if this is the second scan request + * @n_6ghz_params: number of 6 GHz params + * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) */ struct cfg80211_scan_request { @@ -2149,6 +2174,9 @@ struct cfg80211_scan_request { struct cfg80211_scan_info info; bool notified; bool no_cck; + bool scan_6ghz; + u32 n_6ghz_params; + struct cfg80211_scan_6ghz_params *scan_6ghz_params; /* keep last */ struct ieee80211_channel *channels[]; @@ -4217,6 +4245,8 @@ struct cfg80211_ops { /** * enum wiphy_flags - wiphy capability flags * + * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split + * into two, first for legacy bands and second for UHB. * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4260,7 +4290,7 @@ struct cfg80211_ops { enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ - /* use hole at 2 */ + WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), WIPHY_FLAG_4ADDR_AP = BIT(5), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index bdc90b8dfd24..c74ceaddb909 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6059,6 +6059,8 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. 
+ * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by + * 2.4/5 GHz APs */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -6075,6 +6077,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, + NL80211_SCAN_FLAG_COLOCATED_6GHZ = 1<<14, }; /** diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5ac2785cdc7b..7361e1239bf2 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include @@ -712,6 +712,10 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, req->duration_mandatory; local->hw_scan_band = 0; + local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; + local->hw_scan_req->req.scan_6ghz_params = + req->scan_6ghz_params; + local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must @@ -1124,7 +1128,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { - if (!local->hw.wiphy->bands[band]) + if (!local->hw.wiphy->bands[band] || + band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/wireless/core.c b/net/wireless/core.c index 354b0ccbdc24..9f23923e8d29 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -236,7 +236,9 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; if (rdev->scan_req && rdev->scan_req->wdev == wdev) { - if (WARN_ON(!rdev->scan_req->notified)) + if (WARN_ON(!rdev->scan_req->notified && + (!rdev->int_scan_req || + !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } @@ -1336,7 +1338,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); if (rdev->scan_req && rdev->scan_req->wdev == wdev) { - if (WARN_ON(!rdev->scan_req->notified)) + if (WARN_ON(!rdev->scan_req->notified && + (!rdev->int_scan_req || + !rdev->int_scan_req->notified))) rdev->scan_req->info.aborted = true; ___cfg80211_scan_done(rdev, false); } diff --git a/net/wireless/core.h b/net/wireless/core.h index 2ebc2a66680d..e1ec9ac8e608 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -3,7 +3,7 @@ * Wireless configuration interface internals. 
* * Copyright 2006-2010 Johannes Berg - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #ifndef __NET_WIRELESS_CORE_H #define __NET_WIRELESS_CORE_H @@ -72,6 +72,7 @@ struct cfg80211_registered_device { u32 bss_generation; u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct cfg80211_scan_request *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; @@ -457,6 +458,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev); bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, u32 center_freq_khz, u32 bw_khz); +int cfg80211_scan(struct cfg80211_registered_device *rdev); + extern struct work_struct cfg80211_disconnect_work; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1a212db7a300..d98db166d5e6 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8236,7 +8236,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->scan_start = jiffies; rdev->scan_req = request; - err = rdev_scan(rdev, request); + err = cfg80211_scan(rdev); if (err) goto out_free; @@ -15518,6 +15518,7 @@ static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_scan_request *req = rdev->scan_req; struct nlattr *nest; int i; + struct cfg80211_scan_info *info; if (WARN_ON(!req)) return 0; @@ -15561,11 +15562,13 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) goto nla_put_failure; - if (req->info.scan_start_tsf && + info = rdev->int_scan_req ? &rdev->int_scan_req->info : + &rdev->scan_req->info; + if (info->scan_start_tsf && (nla_put_u64_64bit(msg, NL80211_ATTR_SCAN_START_TIME_TSF, - req->info.scan_start_tsf, NL80211_BSS_PAD) || + info->scan_start_tsf, NL80211_BSS_PAD) || nla_put(msg, NL80211_ATTR_SCAN_START_TIME_TSF_BSSID, ETH_ALEN, - req->info.tsf_bssid))) + info->tsf_bssid))) goto nla_put_failure; return 0; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 84fc8ab16dd2..4fbeb17580d9 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include #include @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include @@ -74,6 +76,43 @@ MODULE_PARM_DESC(bss_entries_limit, #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) +/** + * struct cfg80211_colocated_ap - colocated AP information + * + * @list: linked list to all colocated aPS + * @bssid: BSSID of the reported AP + * @ssid: SSID of the reported AP + * @ssid_len: length of the ssid + * @center_freq: frequency the reported AP is on + * @unsolicited_probe: the reported AP is part of an ESS, where all the APs + * that operate in the same channel as the reported AP and that might be + * detected by a STA receiving this frame, are transmitting unsolicited + * Probe Response frames every 20 TUs + * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP + * @same_ssid: the reported AP has the same SSID as the reporting AP + * @multi_bss: the reported AP is part of a multiple BSSID set + * @transmitted_bssid: the reported AP is the transmitting BSSID + * @colocated_ess: all the APs that share the same ESS as the reported AP are + * colocated and can be discovered via legacy bands. 
+ * @short_ssid_valid: short_ssid is valid and can be used + * @short_ssid: the short SSID for this SSID + */ +struct cfg80211_colocated_ap { + struct list_head list; + u8 bssid[ETH_ALEN]; + u8 ssid[IEEE80211_MAX_SSID_LEN]; + size_t ssid_len; + u32 short_ssid; + u32 center_freq; + u8 unsolicited_probe:1, + oct_recommended:1, + same_ssid:1, + multi_bss:1, + transmitted_bssid:1, + colocated_ess:1, + short_ssid_valid:1; +}; + static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; @@ -448,10 +487,433 @@ static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) return ret; } +static u8 cfg80211_parse_bss_param(u8 data, + struct cfg80211_colocated_ap *coloc_ap) +{ + coloc_ap->oct_recommended = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED); + coloc_ap->same_ssid = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID); + coloc_ap->multi_bss = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID); + coloc_ap->transmitted_bssid = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID); + coloc_ap->unsolicited_probe = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE); + coloc_ap->colocated_ess = + u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS); + + return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP); +} + +static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, + const struct element **elem, u32 *s_ssid) +{ + + *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); + if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + + *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen); + return 0; +} + +static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) +{ + struct cfg80211_colocated_ap *ap, *tmp_ap; + + list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) { + list_del(&ap->list); + kfree(ap); + } +} + +static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, + const u8 *pos, u8 length, + const struct element *ssid_elem, + int s_ssid_tmp) +{ + /* skip the TBTT offset */ + pos++; + + memcpy(entry->bssid, pos, ETH_ALEN); + pos += ETH_ALEN; + + if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) { + memcpy(&entry->short_ssid, pos, + sizeof(entry->short_ssid)); + entry->short_ssid_valid = true; + pos += 4; + } + + /* skip non colocated APs */ + if (!cfg80211_parse_bss_param(*pos, entry)) + return -EINVAL; + pos++; + + if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM) { + /* + * no information about the short ssid. Consider the entry valid + * for now. It would later be dropped in case there are explicit + * SSIDs that need to be matched + */ + if (!entry->same_ssid) + return 0; + } + + if (entry->same_ssid) { + entry->short_ssid = s_ssid_tmp; + entry->short_ssid_valid = true; + + /* + * This is safe because we validate datalen in + * cfg80211_parse_colocated_ap(), before calling this + * function. 
+ */ + memcpy(&entry->ssid, &ssid_elem->data, + ssid_elem->datalen); + entry->ssid_len = ssid_elem->datalen; + } + return 0; +} + +static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, + struct list_head *list) +{ + struct ieee80211_neighbor_ap_info *ap_info; + const struct element *elem, *ssid_elem; + const u8 *pos, *end; + u32 s_ssid_tmp; + int n_coloc = 0, ret; + LIST_HEAD(ap_list); + + elem = cfg80211_find_elem(WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data, + ies->len); + if (!elem || elem->datalen > IEEE80211_MAX_SSID_LEN) + return 0; + + pos = elem->data; + end = pos + elem->datalen; + + ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); + if (ret) + return ret; + + /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ + while (pos + sizeof(*ap_info) <= end) { + enum nl80211_band band; + int freq; + u8 length, i, count; + + ap_info = (void *)pos; + count = u8_get_bits(ap_info->tbtt_info_hdr, + IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; + length = ap_info->tbtt_info_len; + + pos += sizeof(*ap_info); + + if (!ieee80211_operating_class_to_band(ap_info->op_class, + &band)) + break; + + freq = ieee80211_channel_to_frequency(ap_info->channel, band); + + if (end - pos < count * ap_info->tbtt_info_len) + break; + + /* + * TBTT info must include bss param + BSSID + + * (short SSID or same_ssid bit to be set). + * ignore other options, and move to the + * next AP info + */ + if (band != NL80211_BAND_6GHZ || + (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM && + length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) { + pos += count * ap_info->tbtt_info_len; + continue; + } + + for (i = 0; i < count; i++) { + struct cfg80211_colocated_ap *entry; + + entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, + GFP_ATOMIC); + + if (!entry) + break; + + entry->center_freq = freq; + + if (!cfg80211_parse_ap_info(entry, pos, length, + ssid_elem, s_ssid_tmp)) { + n_coloc++; + list_add_tail(&entry->list, &ap_list); + } else { + kfree(entry); + } + + pos += ap_info->tbtt_info_len; + } + } + + if (pos != end) { + cfg80211_free_coloc_ap_list(&ap_list); + return 0; + } + + list_splice_tail(&ap_list, list); + return n_coloc; +} + +static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, + struct ieee80211_channel *chan, + bool add_to_6ghz) +{ + int i; + u32 n_channels = request->n_channels; + struct cfg80211_scan_6ghz_params *params = + &request->scan_6ghz_params[request->n_6ghz_params]; + + for (i = 0; i < n_channels; i++) { + if (request->channels[i] == chan) { + if (add_to_6ghz) + params->channel_idx = i; + return; + } + } + + request->channels[n_channels] = chan; + if (add_to_6ghz) + request->scan_6ghz_params[request->n_6ghz_params].channel_idx = + n_channels; + + request->n_channels++; +} + +static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, + struct cfg80211_scan_request *request) +{ + u8 i; + u32 s_ssid; + + for (i = 0; i < request->n_ssids; i++) { + /* wildcard ssid in the scan request */ + if (!request->ssids[i].ssid_len) + return true; + + if (ap->ssid_len && + ap->ssid_len == request->ssids[i].ssid_len) { + if (!memcmp(request->ssids[i].ssid, ap->ssid, + ap->ssid_len)) + return true; + } else if (ap->short_ssid_valid) { + s_ssid = ~crc32_le(~0, request->ssids[i].ssid, + request->ssids[i].ssid_len); + + if (ap->short_ssid == s_ssid) + return true; + } + } + + return false; +} + +static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) +{ + u8 i; + struct cfg80211_colocated_ap *ap; + int n_channels, count = 
0, err; + struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; + LIST_HEAD(coloc_ap_list); + bool need_scan_psc; + const struct ieee80211_sband_iftype_data *iftd; + + rdev_req->scan_6ghz = true; + + if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) + return -EOPNOTSUPP; + + iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], + rdev_req->wdev->iftype); + if (!iftd || !iftd->he_cap.has_he) + return -EOPNOTSUPP; + + n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; + + if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { + struct cfg80211_internal_bss *intbss; + + spin_lock_bh(&rdev->bss_lock); + list_for_each_entry(intbss, &rdev->bss_list, list) { + struct cfg80211_bss *res = &intbss->pub; + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(res->ies); + count += cfg80211_parse_colocated_ap(ies, + &coloc_ap_list); + } + spin_unlock_bh(&rdev->bss_lock); + } + + request = kzalloc(struct_size(request, channels, n_channels) + + sizeof(*request->scan_6ghz_params) * count, + GFP_KERNEL); + if (!request) { + cfg80211_free_coloc_ap_list(&coloc_ap_list); + return -ENOMEM; + } + + *request = *rdev_req; + request->n_channels = 0; + request->scan_6ghz_params = + (void *)&request->channels[n_channels]; + + /* + * PSC channels should not be scanned if all the reported co-located APs + * are indicating that all APs in the same ESS are co-located + */ + if (count) { + need_scan_psc = false; + + list_for_each_entry(ap, &coloc_ap_list, list) { + if (!ap->colocated_ess) { + need_scan_psc = true; + break; + } + } + } else { + need_scan_psc = true; + } + + /* + * add to the scan request the channels that need to be scanned + * regardless of the collocated APs (PSC channels or all channels + * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) + */ + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && + ((need_scan_psc && + cfg80211_channel_is_psc(rdev_req->channels[i])) || + !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { + cfg80211_scan_req_add_chan(request, + rdev_req->channels[i], + false); + } + } + + if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) + goto skip; + + list_for_each_entry(ap, &coloc_ap_list, list) { + bool found = false; + struct cfg80211_scan_6ghz_params *scan_6ghz_params = + &request->scan_6ghz_params[request->n_6ghz_params]; + struct ieee80211_channel *chan = + ieee80211_get_channel(&rdev->wiphy, ap->center_freq); + + if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) + continue; + + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i] == chan) + found = true; + } + + if (!found) + continue; + + if (request->n_ssids > 0 && + !cfg80211_find_ssid_match(ap, request)) + continue; + + cfg80211_scan_req_add_chan(request, chan, true); + memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); + scan_6ghz_params->short_ssid = ap->short_ssid; + scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; + scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe; + + /* + * If a PSC channel is added to the scan and 'need_scan_psc' is + * set to false, then all the APs that the scan logic is + * interested with on the channel are collocated and thus there + * is no need to perform the initial PSC channel listen. 
+ */ + if (cfg80211_channel_is_psc(chan) && !need_scan_psc) + scan_6ghz_params->psc_no_listen = true; + + request->n_6ghz_params++; + } + +skip: + cfg80211_free_coloc_ap_list(&coloc_ap_list); + + if (request->n_channels) { + struct cfg80211_scan_request *old = rdev->int_scan_req; + + rdev->int_scan_req = request; + + /* + * If this scan follows a previous scan, save the scan start + * info from the first part of the scan + */ + if (old) + rdev->int_scan_req->info = old->info; + + err = rdev_scan(rdev, request); + if (err) { + rdev->int_scan_req = old; + kfree(request); + } else { + kfree(old); + } + + return err; + } + + kfree(request); + return -EINVAL; +} + +int cfg80211_scan(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_scan_request *request; + struct cfg80211_scan_request *rdev_req = rdev->scan_req; + u32 n_channels = 0, idx, i; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) + return rdev_scan(rdev, rdev_req); + + for (i = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + n_channels++; + } + + if (!n_channels) + return cfg80211_scan_6ghz(rdev); + + request = kzalloc(struct_size(request, channels, n_channels), + GFP_KERNEL); + if (!request) + return -ENOMEM; + + *request = *rdev_req; + request->n_channels = n_channels; + + for (i = idx = 0; i < rdev_req->n_channels; i++) { + if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + request->channels[idx++] = rdev_req->channels[i]; + } + + rdev_req->scan_6ghz = false; + rdev->int_scan_req = request; + return rdev_scan(rdev, request); +} + void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { - struct cfg80211_scan_request *request; + struct cfg80211_scan_request *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT @@ -466,11 +928,18 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, return; } - request = rdev->scan_req; - if (!request) + rdev_req = rdev->scan_req; + if (!rdev_req) return; - wdev = request->wdev; + wdev = rdev_req->wdev; + request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; + + if (wdev_running(wdev) && + (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && + !rdev_req->scan_6ghz && !request->info.aborted && + !cfg80211_scan_6ghz(rdev)) + return; /* * This must be before sending the other events! @@ -501,8 +970,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev->netdev) dev_put(wdev->netdev); + kfree(rdev->int_scan_req); + rdev->int_scan_req = NULL; + + kfree(rdev->scan_req); rdev->scan_req = NULL; - kfree(request); if (!send_message) rdev->scan_msg = msg; @@ -525,10 +997,25 @@ void __cfg80211_scan_done(struct work_struct *wk) void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { + struct cfg80211_scan_info old_info = request->info; + trace_cfg80211_scan_done(request, info); - WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req); + WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && + request != wiphy_to_rdev(request->wiphy)->int_scan_req); request->info = *info; + + /* + * In case the scan is split, the scan_start_tsf and tsf_bssid should + * be of the first part. In such a case old_info.scan_start_tsf should + * be non zero. 
+ */ + if (request->scan_6ghz && old_info.scan_start_tsf) { + request->info.scan_start_tsf = old_info.scan_start_tsf; + memcpy(request->info.tsf_bssid, old_info.tsf_bssid, + sizeof(request->info.tsf_bssid)); + } + request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } -- cgit v1.2.3 From d2b7588a47de8322891de38ec14d15105d66cb1e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:04 -0700 Subject: nl80211: support S1G capability overrides in assoc NL80211_ATTR_S1G_CAPABILITY can be passed along with NL80211_ATTR_S1G_CAPABILITY_MASK to NL80211_CMD_ASSOCIATE to indicate S1G capabilities which should override the hardware capabilities in eg. the association request. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-4-thomas@adapt-ip.com [johannes: always require both attributes together, commit message] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 9 +++++++++ net/wireless/nl80211.c | 20 ++++++++++++++++++++ 4 files changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 53fba39d4ba6..f71cffa18176 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,6 +2330,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + #define S1G_CAP0_S1G_LONG BIT(0) #define S1G_CAP0_SGI_1MHZ BIT(1) #define S1G_CAP0_SGI_2MHZ BIT(2) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11eb81676e95..bead4b9afeca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2556,6 +2556,8 @@ enum cfg80211_assoc_req_flags { * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association * Request/Response frame or %NULL if FILS is not used. This field starts * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. + * @s1g_capa: S1G capability override + * @s1g_capa_mask: S1G capability override mask */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2570,6 +2572,7 @@ struct cfg80211_assoc_request { const u8 *fils_kek; size_t fils_kek_len; const u8 *fils_nonces; + struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c74ceaddb909..05db40b4c56f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2521,6 +2521,12 @@ enum nl80211_commands { * unsolicited broadcast probe response. It is a nested attribute, see * &enum nl80211_unsol_bcast_probe_resp_attributes. * + * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * @NL80211_ATTR_S1G_CAPABILITY_MASK: S1G Capability Information element + * override mask. Used with NL80211_ATTR_S1G_CAPABILITY in + * NL80211_CMD_ASSOCIATE or NL80211_CMD_CONNECT. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3007,6 +3013,9 @@ enum nl80211_attrs { NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + NL80211_ATTR_S1G_CAPABILITY, + NL80211_ATTR_S1G_CAPABILITY_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d98db166d5e6..d31451db5407 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -704,6 +704,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED(nl80211_fils_discovery_policy), [NL80211_ATTR_UNSOL_BCAST_PROBE_RESP] = NLA_POLICY_NESTED(nl80211_unsol_bcast_probe_resp_policy), + [NL80211_ATTR_S1G_CAPABILITY] = + NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), + [NL80211_ATTR_S1G_CAPABILITY_MASK] = + NLA_POLICY_EXACT_LEN(IEEE80211_S1G_CAPABILITY_LEN), }; /* policy for the key attributes */ @@ -9792,6 +9796,22 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]); } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) { + if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + return -EINVAL; + memcpy(&req.s1g_capa_mask, + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]), + sizeof(req.s1g_capa_mask)); + } + + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) { + if (!info->attrs[NL80211_ATTR_S1G_CAPABILITY_MASK]) + return -EINVAL; + memcpy(&req.s1g_capa, + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]), + sizeof(req.s1g_capa)); + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); -- cgit v1.2.3 From 1ae099540e8c7f1ee066b3ad45cc91f582bb1ce8 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:16 +0200 Subject: KVM: x86: Allow deflecting unknown MSR accesses to user space MSRs are weird. Some of them are normal control registers, such as EFER. Some however are registers that really are model specific, not very interesting to virtualization workloads, and not performance critical. Others again are really just windows into package configuration. Out of these MSRs, only the first category is necessary to implement in kernel space. Rarely accessed MSRs, MSRs that should be fine tunes against certain CPU models and MSRs that contain information on the package level are much better suited for user space to process. However, over time we have accumulated a lot of MSRs that are not the first category, but still handled by in-kernel KVM code. This patch adds a generic interface to handle WRMSR and RDMSR from user space. With this, any future MSR that is part of the latter categories can be handled in user space. Furthermore, it allows us to replace the existing "ignore_msrs" logic with something that applies per-VM rather than on the full system. That way you can run productive VMs in parallel to experimental ones where you don't care about proper MSR handling. 
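As an editor's sketch (not part of the patch), a user space VMM would
enable the capability on the VM and then service the new exits from its
run loop; vm_fd and the mmap'ed struct kvm_run are assumed to already
exist, and MY_EXAMPLE_MSR is a hypothetical index used only for
illustration:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	#define MY_EXAMPLE_MSR 0xdeadbeef	/* hypothetical MSR index */

	static int enable_msr_exits(int vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_X86_USER_SPACE_MSR,
			.args = { KVM_MSR_EXIT_REASON_UNKNOWN },
		};

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

	static void handle_msr_exit(struct kvm_run *run)
	{
		switch (run->exit_reason) {
		case KVM_EXIT_X86_RDMSR:
			if (run->msr.index == MY_EXAMPLE_MSR) {
				run->msr.data = 0;	/* value the guest reads */
				run->msr.error = 0;
			} else {
				run->msr.error = 1;	/* KVM injects #GP on re-entry */
			}
			break;
		case KVM_EXIT_X86_WRMSR:
			/* run->msr.data holds the value being written */
			run->msr.error = 0;	/* silently accept the write */
			break;
		}
		/* the access completes on the next KVM_RUN */
	}
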
Signed-off-by: Alexander Graf Reviewed-by: Jim Mattson Message-Id: <20200925143422.21718-3-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 85 +++++++++++++++++++++++++--- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/emulate.c | 18 +++++- arch/x86/kvm/x86.c | 120 ++++++++++++++++++++++++++++++++++++++-- include/trace/events/kvm.h | 2 +- include/uapi/linux/kvm.h | 13 +++++ 6 files changed, 226 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9e2a545d8084..4fdba43d83e8 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4872,14 +4872,13 @@ to the byte array. .. note:: - For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding - -operations are complete (and guest state is consistent) only after userspace -has re-entered the kernel with KVM_RUN. The kernel side will first finish -incomplete operations and then check for pending signals. Userspace -can re-enter the guest with an unmasked signal pending to complete -pending operations. + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, + KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + operations are complete (and guest state is consistent) only after userspace + has re-entered the kernel with KVM_RUN. The kernel side will first finish + incomplete operations and then check for pending signals. Userspace + can re-enter the guest with an unmasked signal pending to complete + pending operations. :: @@ -5166,6 +5165,43 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +:: + + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; + +Used on x86 systems. When the VM capability KVM_CAP_X86_USER_SPACE_MSR is +enabled, MSR accesses to registers that would invoke a #GP by KVM kernel code +will instead trigger a KVM_EXIT_X86_RDMSR exit for reads and KVM_EXIT_X86_WRMSR +exit for writes. + +The "reason" field specifies why the MSR trap occurred. User space will only +receive MSR exit traps when a particular reason was requested during through +ENABLE_CAP. Currently valid exit reasons are: + + KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM + KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + +For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest +wants to read. To respond to this request with a successful read, user space +writes the respective data into the "data" field and must continue guest +execution to ensure the read data is transferred into guest register state. + +If the RDMSR request was unsuccessful, user space indicates that with a "1" in +the "error" field. This will inject a #GP into the guest when the VCPU is +executed again. + +For KVM_EXIT_X86_WRMSR, the "index" field tells user space which MSR the guest +wants to write. Once finished processing the event, user space must continue +vCPU execution. If the MSR write was unsuccessful, user space also sets the +"error" field to "1". + :: /* Fix the size of the union. */ @@ -5855,6 +5891,28 @@ controlled by the kvm module parameter halt_poll_ns. 
This capability allows the maximum halt time to specified on a per-VM basis, effectively overriding the module parameter for the target VM. +7.21 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 +:Target: VM +:Parameters: args[0] contains the mask of KVM_MSR_EXIT_REASON_* events to report +:Returns: 0 on success; -1 on error + +This capability enables trapping of #GP invoking RDMSR and WRMSR instructions +into user space. + +When a guest requests to read or write an MSR, KVM may not implement all MSRs +that are relevant to a respective system. It also does not differentiate by +CPU type. + +To allow more fine grained control over MSR handling, user space may enable +this capability. With it enabled, MSR accesses that match the mask specified in +args[0] and trigger a #GP event inside the guest by KVM will instead trigger +KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space +can then handle to implement model specific MSR handling and/or user notifications +to inform a user that an MSR was not handled. + 8. Other capabilities. ====================== @@ -6196,3 +6254,14 @@ distribution...) If this capability is available, then the CPNC and CPVC can be synchronized between KVM and userspace via the sync regs mechanism (KVM_SYNC_DIAG318). + +8.26 KVM_CAP_X86_USER_SPACE_MSR +------------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports deflection of MSR reads and +writes to user space. It can be enabled on a VM level. If enabled, MSR +accesses that would usually trigger a #GP by KVM into the guest will +instead get bounced to user space through the KVM_EXIT_X86_RDMSR and +KVM_EXIT_X86_WRMSR exit notifications. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5d4c39c37390..dd2665504dc0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -961,6 +961,9 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ + u32 user_space_msr_mask; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 85111cd0adcd..0cc0db500f71 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3701,11 +3701,18 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) static int em_wrmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; msr_data = (u32)reg_read(ctxt, VCPU_REGS_RAX) | ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32); - if (ctxt->ops->set_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), msr_data)) + r = ctxt->ops->set_msr(ctxt, msr_index, msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; + + if (r) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -3713,9 +3720,16 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt) static int em_rdmsr(struct x86_emulate_ctxt *ctxt) { + u64 msr_index = reg_read(ctxt, VCPU_REGS_RCX); u64 msr_data; + int r; + + r = ctxt->ops->get_msr(ctxt, msr_index, &msr_data); + + if (r == X86EMUL_IO_NEEDED) + return r; - if (ctxt->ops->get_msr(ctxt, reg_read(ctxt, VCPU_REGS_RCX), &msr_data)) + if (r) return emulate_gp(ctxt, 0); *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43173382f02f..af6d008145cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1590,12 +1590,89 @@ int 
kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case -ENOENT: + return KVM_MSR_EXIT_REASON_UNKNOWN; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; @@ -1613,8 +1690,18 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR write failed? 
Inject a #GP */ + if (r) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -3526,6 +3613,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5046,6 +5134,10 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_USER_SPACE_MSR: + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -6378,13 +6470,33 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr(vcpu, msr_index, pdata); + + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 9417a34aad08..26cfb0fa8e7e 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,7 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV), ERSN(ARM_NISV) + ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7d8eced6f459..31292a3cdfc2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -248,6 +248,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -413,6 +415,16 @@ struct kvm_run { __u64 esr_iss; __u64 fault_ipa; } arm_nisv; + /* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */ + struct { + __u8 error; /* user -> kernel */ + __u8 pad[7]; +#define KVM_MSR_EXIT_REASON_INVAL (1 << 0) +#define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) + __u32 reason; /* kernel -> user */ + __u32 index; /* kernel -> user */ + __u64 data; /* kernel <-> user */ + } msr; /* Fix the size of the union. */ char padding[256]; }; @@ -1037,6 +1049,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_SMALLER_MAXPHYADDR 185 #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 +#define KVM_CAP_X86_USER_SPACE_MSR 188 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 1a155254ff937ac92cf9940d273ea597b2c667a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 25 Sep 2020 16:34:21 +0200 Subject: KVM: x86: Introduce MSR filtering It's not desireable to have all MSRs always handled by KVM kernel space. 
Some MSRs would be useful to handle in user space to either emulate behavior (like uCode updates) or differentiate whether they are valid based on the CPU model. To allow user space to specify which MSRs it wants to see handled by KVM, this patch introduces a new ioctl to push filter rules with bitmaps into KVM. Based on these bitmaps, KVM can then decide whether to reject MSR access. With the addition of KVM_CAP_X86_USER_SPACE_MSR it can also deflect the denied MSR events to user space to operate on. If no filter is populated, MSR handling stays identical to before. Signed-off-by: Alexander Graf Message-Id: <20200925143422.21718-8-graf@amazon.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 108 ++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 14 ++++ arch/x86/include/uapi/asm/kvm.h | 18 +++++ arch/x86/kvm/x86.c | 145 +++++++++++++++++++++++++++++++++++++++- include/uapi/linux/kvm.h | 5 ++ 5 files changed, 289 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4fdba43d83e8..425325ff4434 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4707,6 +4707,99 @@ KVM_PV_VM_VERIFY Verify the integrity of the unpacked image. Only if this succeeds, KVM is allowed to start protected VCPUs. +4.126 KVM_X86_SET_MSR_FILTER +---------------------------- + +:Capability: KVM_CAP_X86_MSR_FILTER +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_msr_filter +:Returns: 0 on success, < 0 on error + +:: + + struct kvm_msr_filter_range { + #define KVM_MSR_FILTER_READ (1 << 0) + #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ + }; + + #define KVM_MSR_FILTER_MAX_RANGES 16 + struct kvm_msr_filter { + #define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) + #define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; + }; + +flags values for struct kvm_msr_filter_range: + +KVM_MSR_FILTER_READ + + Filter read accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a read should immediately fail, while a 1 indicates that + a read for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_WRITE + + Filter write accesses to MSRs using the given bitmap. A 0 in the bitmap + indicates that a write should immediately fail, while a 1 indicates that + a write for a particular MSR should be handled regardless of the default + filter action. + +KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE + + Filter both read and write accesses to MSRs using the given bitmap. A 0 + in the bitmap indicates that both reads and writes should immediately fail, + while a 1 indicates that reads and writes for a particular MSR should be + handled regardless of the default filter action. + +flags values for struct kvm_msr_filter: + +KVM_MSR_FILTER_DEFAULT_ALLOW + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to allowing access to the MSR. + +KVM_MSR_FILTER_DEFAULT_DENY + + If no filter range matches an MSR index that is getting accessed, KVM will + fall back to rejecting access to the MSR. In this mode, all MSRs that should + be processed by KVM need to explicitly be marked as allowed in the bitmaps. 
+ +This ioctl allows user space to define up to 16 bitmaps of MSR ranges to +specify whether a certain MSR access should be allowed or denied. + +If this ioctl has never been invoked, MSR accesses are not guarded and the +old KVM in-kernel emulation behavior is fully preserved. + +As soon as the filtering is in place, every MSR access is processed through +the filtering. If the MSR index is within one of the defined ranges, read and write +accesses are guarded by the bitmap's value for the MSR index. If it is not +defined in any range, whether MSR access is rejected is determined by the flags +field in the kvm_msr_filter struct: KVM_MSR_FILTER_DEFAULT_ALLOW and +KVM_MSR_FILTER_DEFAULT_DENY. + +Calling this ioctl with an empty set of ranges (all nmsrs == 0) disables MSR +filtering. In that mode, KVM_MSR_FILTER_DEFAULT_DENY no longer has any effect. + +Each bitmap range specifies a range of MSRs to potentially allow access to. +The range goes from MSR index [base .. base+nmsrs-1]. The flags field +indicates whether reads, writes or both reads and writes are filtered +by setting a 1 bit in the bitmap for the corresponding MSR index. + +If an MSR access is not permitted through the filtering, it generates a +#GP inside the guest. When combined with KVM_CAP_X86_USER_SPACE_MSR, that +allows user space to deflect and potentially handle various MSR accesses +into user space. + +If a vCPU is in running state while this ioctl is invoked, the vCPU may +experience inconsistent filtering behavior on MSR accesses. + 5. The kvm_run structure ======================== @@ -5187,6 +5280,7 @@ ENABLE_CAP. Currently valid exit reasons are: KVM_MSR_EXIT_REASON_UNKNOWN - access to MSR that is unknown to KVM KVM_MSR_EXIT_REASON_INVAL - access to invalid MSRs or reserved bits + KVM_MSR_EXIT_REASON_FILTER - access blocked by KVM_X86_SET_MSR_FILTER For KVM_EXIT_X86_RDMSR, the "index" field tells user space which MSR the guest wants to read. To respond to this request with a successful read, user space @@ -6265,3 +6359,17 @@ writes to user space. It can be enabled on a VM level. If enabled, MSR accesses that would usually trigger a #GP by KVM into the guest will instead get bounced to user space through the KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications. + +8.25 KVM_X86_SET_MSR_FILTER --------------------------- + +:Architectures: x86 + +This capability indicates that KVM supports rejecting accesses to user-defined +MSRs. With this capability exposed, KVM exports the new VM ioctl +KVM_X86_SET_MSR_FILTER which user space can call to specify bitmaps of MSR +ranges that KVM should reject access to. + +In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to +trap and emulate MSRs that are outside of the scope of KVM as well as +limit the attack surface on KVM's MSR emulation code. 
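As an illustration of how the filter and KVM_CAP_X86_USER_SPACE_MSR fit together, a userspace VMM could install a deny range and then service the resulting exits roughly as follows (a minimal sketch, not part of the patch: vm_fd is assumed to come from KVM_CREATE_VM, the MSR base is an arbitrary example, and KVM_CAP_X86_USER_SPACE_MSR is assumed to have been enabled with KVM_MSR_EXIT_REASON_FILTER set in cap.args[0])::

    #include <linux/kvm.h>
    #include <string.h>
    #include <sys/ioctl.h>

    /* Deny guest access to eight MSRs starting at an example base, allow
     * everything else. The bitmap only needs to stay valid across the
     * ioctl, since kvm_add_msr_filter() duplicates it with memdup_user().
     */
    static int install_msr_filter(int vm_fd)
    {
            __u8 bitmap = 0;        /* all bits 0: deny the ops in .flags */
            struct kvm_msr_filter filter;

            memset(&filter, 0, sizeof(filter));
            filter.flags = KVM_MSR_FILTER_DEFAULT_ALLOW;
            filter.ranges[0].flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE;
            filter.ranges[0].base = 0x1234;  /* example base; ranges[1..15] stay empty */
            filter.ranges[0].nmsrs = 8;      /* bitmap covers base .. base+7 */
            filter.ranges[0].bitmap = &bitmap;

            return ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);
    }

    /* A filtered RDMSR then surfaces as KVM_EXIT_X86_RDMSR instead of a #GP. */
    static void handle_msr_exit(struct kvm_run *run)
    {
            if (run->exit_reason == KVM_EXIT_X86_RDMSR &&
                run->msr.reason == KVM_MSR_EXIT_REASON_FILTER) {
                    run->msr.data = 0;   /* emulate the read as returning 0 */
                    run->msr.error = 0;  /* non-zero would inject #GP on re-entry */
            }
    }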
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4a2443219bc..dc7a58b39faf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -87,6 +87,7 @@ #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) +#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -860,6 +861,13 @@ struct kvm_hv { struct kvm_hv_syndbg hv_syndbg; }; +struct msr_bitmap_range { + u32 flags; + u32 nmsrs; + u32 base; + unsigned long *bitmap; +}; + enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ @@ -964,6 +972,12 @@ struct kvm_arch { /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; + struct { + u8 count; + bool default_allow:1; + struct msr_bitmap_range ranges[16]; + } msr_filter; + struct kvm_pmu_event_filter *pmu_event_filter; struct task_struct *nx_lpage_recovery_thread; }; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c2fd0aa2f587..89e5f3d1bba8 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -192,8 +192,26 @@ struct kvm_msr_list { __u32 indices[0]; }; +/* Maximum size of any access bitmap in bytes */ +#define KVM_MSR_FILTER_MAX_BITMAP_SIZE 0x600 + +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range { #define KVM_MSR_FILTER_READ (1 << 0) #define KVM_MSR_FILTER_WRITE (1 << 1) + __u32 flags; + __u32 nmsrs; /* number of msrs in bitmap */ + __u32 base; /* MSR index the bitmap starts at */ + __u8 *bitmap; /* a 1 bit allows the operations in flags, 0 denies */ +}; + +#define KVM_MSR_FILTER_MAX_RANGES 16 +struct kvm_msr_filter { +#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0) +#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0) + __u32 flags; + struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; struct kvm_cpuid_entry { __u32 function; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60219882fee2..72f91f3640f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1490,7 +1490,35 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { - return true; + struct kvm *kvm = vcpu->kvm; + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + u32 count = kvm->arch.msr_filter.count; + u32 i; + bool r = kvm->arch.msr_filter.default_allow; + int idx; + + /* MSR filtering not set up, allow everything */ + if (!count) + return true; + + /* Prevent collision with set_msr_filter */ + idx = srcu_read_lock(&kvm->srcu); + + for (i = 0; i < count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + r = !!test_bit(index - start, bitmap); + break; + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + return r; } EXPORT_SYMBOL_GPL(kvm_msr_allowed); @@ -1505,6 +1533,9 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return -EPERM; + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: @@ -1561,6 +1592,9 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) 
+ return -EPERM; + msr.index = index; msr.host_initiated = host_initiated; @@ -1624,6 +1658,8 @@ static u64 kvm_msr_reason(int r) switch (r) { case -ENOENT: return KVM_MSR_EXIT_REASON_UNKNOWN; + case -EPERM: + return KVM_MSR_EXIT_REASON_FILTER; default: return KVM_MSR_EXIT_REASON_INVAL; } @@ -3620,6 +3656,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -5151,6 +5188,103 @@ split_irqchip_unlock: return r; } +static void kvm_clear_msr_filter(struct kvm *kvm) +{ + u32 i; + u32 count = kvm->arch.msr_filter.count; + struct msr_bitmap_range ranges[16]; + + mutex_lock(&kvm->lock); + kvm->arch.msr_filter.count = 0; + memcpy(ranges, kvm->arch.msr_filter.ranges, count * sizeof(ranges[0])); + mutex_unlock(&kvm->lock); + synchronize_srcu(&kvm->srcu); + + for (i = 0; i < count; i++) + kfree(ranges[i].bitmap); +} + +static int kvm_add_msr_filter(struct kvm *kvm, struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range *ranges = kvm->arch.msr_filter.ranges; + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier to our global pool */ + ranges[kvm->arch.msr_filter.count] = range; + /* Make sure we filled the array before we tell anyone to walk it */ + smp_wmb(); + kvm->arch.msr_filter.count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +{ + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + bool default_allow; + int r = 0; + u32 i; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + kvm_clear_msr_filter(kvm); + + default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + kvm->arch.msr_filter.default_allow = default_allow; + + /* + * Protect from concurrent calls to this function that could trigger + * a TOCTOU violation on kvm->arch.msr_filter.count. 
+ */ + mutex_lock(&kvm->lock); + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + r = kvm_add_msr_filter(kvm, &filter.ranges[i]); + if (r) + break; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5457,6 +5591,9 @@ set_pit2_out: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: + r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + break; default: r = -ENOTTY; } @@ -8611,6 +8748,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -10163,6 +10302,8 @@ void kvm_arch_pre_destroy_vm(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u32 i; + if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, @@ -10179,6 +10320,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops.vm_destroy) kvm_x86_ops.vm_destroy(kvm); + for (i = 0; i < kvm->arch.msr_filter.count; i++) + kfree(kvm->arch.msr_filter.ranges[i].bitmap); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 31292a3cdfc2..58f43aa1fc21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -421,6 +421,7 @@ struct kvm_run { __u8 pad[7]; #define KVM_MSR_EXIT_REASON_INVAL (1 << 0) #define KVM_MSR_EXIT_REASON_UNKNOWN (1 << 1) +#define KVM_MSR_EXIT_REASON_FILTER (1 << 2) __u32 reason; /* kernel -> user */ __u32 index; /* kernel -> user */ __u64 data; /* kernel <-> user */ @@ -1050,6 +1051,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_DIAG318 186 #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 +#define KVM_CAP_X86_MSR_FILTER 189 #ifdef KVM_CAP_IRQ_ROUTING @@ -1551,6 +1553,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_S390_PROTECTED */ #define KVM_S390_PV_COMMAND _IOWR(KVMIO, 0xc5, struct kvm_pv_cmd) +/* Available with KVM_CAP_X86_MSR_FILTER */ +#define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3 From 58ef7c1b555e0e605da24b76cb2821dd3fcd6bc6 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:16 -0700 Subject: nl80211: include frequency offset in survey info Recently channels gained a potential frequency offset, so include this in the per-channel survey info. 
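For example, a survey dump consumer built on libnl could pick up the new attribute as sketched below (illustrative fragment, not part of the patch; error handling elided):

    #include <netlink/attr.h>
    #include <netlink/genl/genl.h>
    #include <linux/nl80211.h>
    #include <stdio.h>

    /* Callback for an NL80211_CMD_GET_SURVEY dump: print the channel
     * frequency with the new kHz offset, which is only emitted when
     * non-zero. */
    static int survey_cb(struct nl_msg *msg, void *arg)
    {
            struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));
            struct nlattr *tb[NL80211_ATTR_MAX + 1];
            struct nlattr *sinfo[NL80211_SURVEY_INFO_MAX + 1];
            unsigned int freq = 0, offset = 0;

            nla_parse(tb, NL80211_ATTR_MAX, genlmsg_attrdata(gnlh, 0),
                      genlmsg_attrlen(gnlh, 0), NULL);
            if (!tb[NL80211_ATTR_SURVEY_INFO])
                    return NL_SKIP;
            if (nla_parse_nested(sinfo, NL80211_SURVEY_INFO_MAX,
                                 tb[NL80211_ATTR_SURVEY_INFO], NULL))
                    return NL_SKIP;

            if (sinfo[NL80211_SURVEY_INFO_FREQUENCY])
                    freq = nla_get_u32(sinfo[NL80211_SURVEY_INFO_FREQUENCY]);
            if (sinfo[NL80211_SURVEY_INFO_FREQUENCY_OFFSET])
                    offset = nla_get_u32(sinfo[NL80211_SURVEY_INFO_FREQUENCY_OFFSET]);

            printf("freq: %u.%03u MHz\n", freq, offset);
            return NL_SKIP;
    }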
Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-16-thomas@adapt-ip.com [add the offset only if non-zero] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/nl80211.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 05db40b4c56f..1e51445f81cd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4097,6 +4097,7 @@ enum nl80211_user_reg_hint_type { * receiving frames destined to the local BSS * @NL80211_SURVEY_INFO_MAX: highest survey info attribute number * currently defined + * @NL80211_SURVEY_INFO_FREQUENCY_OFFSET: center frequency offset in kHz * @__NL80211_SURVEY_INFO_AFTER_LAST: internal use */ enum nl80211_survey_info { @@ -4112,6 +4113,7 @@ enum nl80211_survey_info { NL80211_SURVEY_INFO_TIME_SCAN, NL80211_SURVEY_INFO_PAD, NL80211_SURVEY_INFO_TIME_BSS_RX, + NL80211_SURVEY_INFO_FREQUENCY_OFFSET, /* keep last */ __NL80211_SURVEY_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d31451db5407..aece2352a349 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9319,6 +9319,11 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, survey->channel->center_freq)) goto nla_put_failure; + if (survey->channel && survey->channel->freq_offset && + nla_put_u32(msg, NL80211_SURVEY_INFO_FREQUENCY_OFFSET, + survey->channel->freq_offset)) + goto nla_put_failure; + if ((survey->filled & SURVEY_INFO_NOISE_DBM) && nla_put_u8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise)) goto nla_put_failure; -- cgit v1.2.3 From f5bec330e3010450daeb5cb6a94a4a7c54afa306 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Mon, 28 Sep 2020 00:28:11 -0700 Subject: nl80211: extend support to config spatial reuse parameter set Allow the user to configure the following Spatial Reuse Parameter Set element fields: * Non-SRG OBSS PD Max Offset * SRG BSS Color Bitmap * SRG Partial BSSID Bitmap Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1601278091-20313-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++-- include/net/cfg80211.h | 10 ++++++++++ include/uapi/linux/nl80211.h | 11 +++++++++++ net/wireless/nl80211.c | 25 +++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f2f56b287aed..770408b2fdaf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2350,8 +2350,11 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) } /* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) /* * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bead4b9afeca..aee47f2b5709 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -269,13 +269,23 @@ struct ieee80211_rate { * struct ieee80211_he_obss_pd - AP settings for spatial reuse * * @enable: is the feature enabled. 
+ * @sr_ctrl: The SR Control field of SRP element. + * @non_srg_max_offset: non-SRG maximum tx power offset * @min_offset: minimal tx power offset an associated station shall use * @max_offset: maximum tx power offset an associated station shall use + * @bss_color_bitmap: bitmap that indicates the BSS color values used by + * members of the SRG + * @partial_bssid_bitmap: bitmap that indicates the partial BSSID values + * used by members of the SRG */ struct ieee80211_he_obss_pd { bool enable; + u8 sr_ctrl; + u8 non_srg_max_offset; u8 min_offset; u8 max_offset; + u8 bss_color_bitmap[8]; + u8 partial_bssid_bitmap[8]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1e51445f81cd..47700a2b9af9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6991,6 +6991,13 @@ enum nl80211_peer_measurement_ftm_resp { * * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET: the non-SRG OBSS PD maximum + * tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP: bitmap that indicates the BSS color + * values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP: bitmap that indicates the partial + * BSSID values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_SR_CTRL: The SR Control field of SRP element. * * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. @@ -7000,6 +7007,10 @@ enum nl80211_obss_pd_attributes { NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP, + NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP, + NL80211_HE_OBSS_PD_ATTR_SR_CTRL, /* keep last */ __NL80211_HE_OBSS_PD_ATTR_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e501bce86436..d76b8bd0e1d1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -329,6 +329,13 @@ he_obss_pd_policy[NL80211_HE_OBSS_PD_ATTR_MAX + 1] = { NLA_POLICY_RANGE(NLA_U8, 1, 20), [NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET] = NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET] = + NLA_POLICY_RANGE(NLA_U8, 1, 20), + [NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP] = + NLA_POLICY_EXACT_LEN(8), + [NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP] = + NLA_POLICY_EXACT_LEN(8), + [NL80211_HE_OBSS_PD_ATTR_SR_CTRL] = { .type = NLA_U8 }, }; static const struct nla_policy @@ -4857,16 +4864,34 @@ static int nl80211_parse_he_obss_pd(struct nlattr *attrs, if (err) return err; + if (!tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL]) + return -EINVAL; + + he_obss_pd->sr_ctrl = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_SR_CTRL]); + if (tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]) he_obss_pd->min_offset = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET]); if (tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]) he_obss_pd->max_offset = nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET]); + if (tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET]) + he_obss_pd->non_srg_max_offset = + nla_get_u8(tb[NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET]); if (he_obss_pd->min_offset > he_obss_pd->max_offset) return -EINVAL; + if (tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP]) + memcpy(he_obss_pd->bss_color_bitmap, + nla_data(tb[NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP]), + sizeof(he_obss_pd->bss_color_bitmap)); + + if (tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP]) + 
memcpy(he_obss_pd->partial_bssid_bitmap, + nla_data(tb[NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP]), + sizeof(he_obss_pd->partial_bssid_bitmap)); + he_obss_pd->enable = true; return 0; -- cgit v1.2.3 From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 25 Sep 2020 13:54:29 -0700 Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint Add .test_run for raw_tracepoint. Also, introduce a new feature that runs the target program on a specific CPU. This is achieved by a new flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set, the program is triggered on the CPU with id bpf_attr.test.cpu. This feature is needed for BPF programs that handle perf_event and other percpu resources, as the program can access these resources locally. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com --- include/linux/bpf.h | 3 ++ include/uapi/linux/bpf.h | 7 ++++ kernel/bpf/syscall.c | 2 +- kernel/trace/bpf_trace.c | 1 + net/bpf/test_run.c | 91 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 ++++ 6 files changed, 110 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79902325bef8..db6dcdee7933 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1396,6 +1396,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2d6519a2ed77..82522f05c021 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,11 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) + /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { /* enabled run_time_ns and run_cnt */ @@ -566,6 +571,8 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2740df19f55e..3bc2ed2e171b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2979,7 +2979,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 36508f46a8db..2834866d379a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1678,6 +1678,7 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { }; const struct bpf_prog_ops raw_tracepoint_prog_ops = { + .test_run = bpf_prog_test_run_raw_tp, }; const struct bpf_verifier_ops tracing_verifier_ops = { diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index a66f211726e7..fde5db93507c 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -11,6 +11,7 @@ #include #include #include +#include 
#define CREATE_TRACE_POINTS #include @@ -204,6 +205,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int b = 2, err = -EFAULT; u32 retval = 0; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: @@ -236,6 +240,87 @@ out: return err; } +struct bpf_raw_tp_test_run_info { + struct bpf_prog *prog; + void *ctx; + u32 retval; +}; + +static void +__bpf_prog_test_run_raw_tp(void *data) +{ + struct bpf_raw_tp_test_run_info *info = data; + + rcu_read_lock(); + migrate_disable(); + info->retval = BPF_PROG_RUN(info->prog, info->ctx); + migrate_enable(); + rcu_read_unlock(); +} + +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + struct bpf_raw_tp_test_run_info info; + int cpu = kattr->test.cpu, err = 0; + + /* doesn't support data_in/out, ctx_out, duration, or repeat */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset) + return -EINVAL; + + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) + return -EINVAL; + + if (ctx_size_in) { + info.ctx = kzalloc(ctx_size_in, GFP_USER); + if (!info.ctx) + return -ENOMEM; + if (copy_from_user(info.ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } else { + info.ctx = NULL; + } + + info.prog = prog; + + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || + cpu == smp_processor_id()) { + __bpf_prog_test_run_raw_tp(&info); + } else { + /* smp_call_function_single() also checks cpu_online() + * after csd_lock(). However, since cpu is from user + * space, let's do an extra quick check to filter out + * invalid value before smp_call_function_single(). 
+ */ + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + err = -ENXIO; + goto out; + } + + err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, + &info, 1); + if (err) + goto out; + } + + if (copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) + err = -EFAULT; + +out: + kfree(info.ctx); + return err; +} + static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); @@ -410,6 +495,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, void *data; int ret; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) @@ -607,6 +695,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) return -EINVAL; + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + if (size < ETH_HLEN) return -EINVAL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d6519a2ed77..82522f05c021 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -424,6 +424,11 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) + /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { /* enabled run_time_ns and run_cnt */ @@ -566,6 +571,8 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit v1.2.3 From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:05 +0100 Subject: bpf: Add bpf_snprintf_btf helper A helper is added to support tracing kernel type information in BPF using the BPF Type Format (BTF). Its signature is long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); struct btf_ptr * specifies - a pointer to the data to be traced - the BTF id of the type of data pointed to - a flags field is provided for future use; these flags are not to be confused with the BTF_F_* flags below that control how the btf_ptr is displayed; the flags member of the struct btf_ptr may be used to disambiguate types in kernel versus module BTF, etc; the main distinction is the flags relate to the type and information needed in identifying it, not how it is displayed. For example, a BPF program with a struct sk_buff *skb could do the following: static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0); Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, } Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. 
- BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 1 + include/linux/btf.h | 9 +++--- include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 4 +++ kernel/trace/bpf_trace.c | 65 ++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 212 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e620a4b1290f..768b533ba48e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1822,6 +1822,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; +extern const struct bpf_func_proto bpf_snprintf_btf_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index d0f5d3c9ec3d..3e5cdc2ba963 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,7 @@ #include #include +#include #define BTF_TYPE_EMIT(type) ((void)(type *)0) @@ -59,10 +60,10 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ -#define BTF_SHOW_COMPACT (1ULL << 0) -#define BTF_SHOW_NONAME (1ULL << 1) -#define BTF_SHOW_PTR_RAW (1ULL << 2) -#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_COMPACT BTF_F_COMPACT +#define BTF_SHOW_NONAME BTF_F_NONAME +#define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW +#define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 82522f05c021..cca9eb1b13e5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3594,6 +3594,42 @@ union bpf_attr { * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. 
+ * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3745,6 +3781,7 @@ union bpf_attr { FN(inode_storage_delete), \ FN(d_path), \ FN(copy_from_user), \ + FN(snprintf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4853,4 +4890,34 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c4811b139caa..403fb2341a86 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2216,6 +2216,7 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5cc7425ee476..e825441781ab 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -683,6 +683,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) if (!perfmon_capable()) return NULL; return bpf_get_trace_printk_proto(); + case BPF_FUNC_snprintf_btf: + if (!perfmon_capable()) + return NULL; + return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; default: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2834866d379a..140e1be9dab6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,9 @@ #include #include +#include +#include + #include #include "trace_probe.h" @@ -1147,6 +1151,65 @@ static const struct bpf_func_proto bpf_d_path_proto = { .allowed = bpf_d_path_allowed, }; +#define BTF_F_ALL (BTF_F_COMPACT | BTF_F_NONAME | \ + BTF_F_PTR_RAW | BTF_F_ZERO) + +static int 
bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id) +{ + const struct btf_type *t; + + if (unlikely(flags & ~(BTF_F_ALL))) + return -EINVAL; + + if (btf_ptr_size != sizeof(struct btf_ptr)) + return -EINVAL; + + *btf = bpf_get_btf_vmlinux(); + + if (IS_ERR_OR_NULL(*btf)) + return PTR_ERR(*btf); + + if (ptr->type_id > 0) + *btf_id = ptr->type_id; + else + return -EINVAL; + + if (*btf_id > 0) + t = btf_type_by_id(*btf, *btf_id); + if (*btf_id <= 0 || !t) + return -ENOENT; + + return 0; +} + +BPF_CALL_5(bpf_snprintf_btf, char *, str, u32, str_size, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_snprintf_show(btf, btf_id, ptr->ptr, str, str_size, + flags); +} + +const struct bpf_func_proto bpf_snprintf_btf_proto = { + .func = bpf_snprintf_btf, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1233,6 +1296,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 08388173973f..7d86fdd190be 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -433,6 +433,7 @@ class PrinterHelpers(Printer): 'struct sk_msg_md', 'struct xdp_md', 'struct path', + 'struct btf_ptr', ] known_types = { '...', @@ -474,6 +475,7 @@ class PrinterHelpers(Printer): 'struct udp6_sock', 'struct task_struct', 'struct path', + 'struct btf_ptr', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 82522f05c021..cca9eb1b13e5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3594,6 +3594,42 @@ union bpf_attr { * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. 
+ * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3745,6 +3781,7 @@ union bpf_attr { FN(inode_storage_delete), \ FN(d_path), \ FN(copy_from_user), \ + FN(snprintf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4853,4 +4890,34 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:09 +0100 Subject: bpf: Add bpf_seq_printf_btf helper A helper is added to allow seq file writing of kernel data structures using vmlinux BTF. Its signature is long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); Flags and struct btf_ptr definitions/use are identical to the bpf_snprintf_btf helper, and the helper returns 0 on success or a negative error value. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/btf.c | 4 ++-- kernel/bpf/core.c | 1 + kernel/trace/bpf_trace.c | 33 +++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 6 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 3e5cdc2ba963..024e16ff7dcc 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -68,6 +68,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. 
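As a usage sketch (not part of the patch), a task iterator could dump each task_struct through the new helper. This assumes a libbpf-style build with a generated vmlinux.h; note that bpf_seq_printf_btf() is gpl_only, so the program must carry a GPL-compatible license, and the type id lookup follows the __builtin_btf_type_id() convention shown in the bpf_snprintf_btf commit above:

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL"; /* bpf_seq_printf_btf() is gpl_only */

    SEC("iter/task")
    int dump_task(struct bpf_iter__task *ctx)
    {
            struct seq_file *seq = ctx->meta->seq;
            struct task_struct *task = ctx->task;
            struct btf_ptr ptr = {};

            if (!task)
                    return 0;

            ptr.ptr = task;
            /* second argument 1 requests a kernel (vmlinux) BTF type id */
            ptr.type_id = __builtin_btf_type_id(struct task_struct, 1);

            /* 0: default flags; BTF_F_COMPACT etc. could be OR'ed in here */
            bpf_seq_printf_btf(seq, &ptr, sizeof(ptr), 0);
            return 0;
    }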
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cca9eb1b13e5..96ddb00b91dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3630,6 +3630,14 @@ union bpf_attr { * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3782,6 +3790,7 @@ union bpf_attr { FN(d_path), \ FN(copy_from_user), \ FN(snprintf_btf), \ + FN(seq_printf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcdd7109aa29..498e5e553825 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5346,8 +5346,8 @@ static void btf_seq_show(struct btf_show *show, const char *fmt, seq_vprintf((struct seq_file *)show->target, fmt, args); } -static int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, - void *obj, struct seq_file *m, u64 flags) +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, + void *obj, struct seq_file *m, u64 flags) { struct btf_show sseq; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 403fb2341a86..c4ba45fa4fe1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2217,6 +2217,7 @@ const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; +const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 140e1be9dab6..e118a83439c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -71,6 +71,10 @@ static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name) u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, + u64 flags, const struct btf **btf, + s32 *btf_id); + /** * trace_call_bpf - invoke BPF program * @call: tracepoint event @@ -776,6 +780,31 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_4(bpf_seq_printf_btf, struct seq_file *, m, struct btf_ptr *, ptr, + u32, btf_ptr_size, u64, flags) +{ + const struct btf *btf; + s32 btf_id; + int ret; + + ret = bpf_btf_printf_prepare(ptr, btf_ptr_size, flags, &btf, &btf_id); + if (ret) + return ret; + + return btf_type_seq_show_flags(btf, btf_id, ptr->ptr, m, flags); +} + +static const struct bpf_func_proto bpf_seq_printf_btf_proto = { + .func = bpf_seq_printf_btf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_seq_file_ids[0], + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, 
u64 *running) @@ -1695,6 +1724,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->expected_attach_type == BPF_TRACE_ITER ? &bpf_seq_write_proto : NULL; + case BPF_FUNC_seq_printf_btf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_btf_proto : + NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; default: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cca9eb1b13e5..96ddb00b91dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3630,6 +3630,14 @@ union bpf_attr { * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3782,6 +3790,7 @@ union bpf_attr { FN(d_path), \ FN(copy_from_user), \ FN(snprintf_btf), \ + FN(seq_printf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3789af9a13e5561738c0f2114e3a5e22c843ca3e Mon Sep 17 00:00:00 2001 From: Krzysztof Wilczyński Date: Thu, 30 Jul 2020 21:08:48 +0000 Subject: PCI/PM: Rename pci_dev.d3_delay to d3hot_delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI devices support two variants of the D3 power state: D3hot (main power present) D3cold (main power removed). Previously struct pci_dev contained: unsigned int d3_delay; /* D3->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ "d3_delay" refers specifically to the D3hot state. Rename it to "d3hot_delay" to avoid ambiguity and align with the ACPI "_DSM for Specifying Device Readiness Durations" in the PCI Firmware spec r3.2, sec 4.6.9. There is no change to the functionality. Link: https://lore.kernel.org/r/20200730210848.1578826-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- Documentation/power/pci.rst | 2 +- arch/x86/pci/fixup.c | 2 +- arch/x86/pci/intel_mid_pci.c | 2 +- drivers/hid/intel-ish-hid/ipc/ipc.c | 2 +- drivers/net/ethernet/marvell/sky2.c | 2 +- drivers/pci/pci-acpi.c | 6 +-- drivers/pci/pci.c | 14 ++--- drivers/pci/pci.h | 4 +- drivers/pci/quirks.c | 68 ++++++++++++------------ drivers/staging/media/atomisp/pci/atomisp_v4l2.c | 2 +- include/linux/pci.h | 2 +- include/uapi/linux/pci_regs.h | 2 +- 12 files changed, 54 insertions(+), 54 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst index 1831e431f725..b04fb18cc4e2 100644 --- a/Documentation/power/pci.rst +++ b/Documentation/power/pci.rst @@ -320,7 +320,7 @@ that these callbacks operate on:: unsigned int d2_support:1; /* Low power state D2 is supported */ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ unsigned int wakeup_prepared:1; /* Device prepared for wake up */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ + unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ ... 
}; diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index b8c9a5b87f37..0a0e168be1cb 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -587,7 +587,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0xa26d, pci_invalid_bar); static void pci_fixup_amd_ehci_pme(struct pci_dev *dev) { dev_info(&dev->dev, "PME# does not work under D3, disabling it\n"); - dev->pme_support &= ~((PCI_PM_CAP_PME_D3 | PCI_PM_CAP_PME_D3cold) + dev->pme_support &= ~((PCI_PM_CAP_PME_D3hot | PCI_PM_CAP_PME_D3cold) >> PCI_PM_CAP_PME_SHIFT); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x7808, pci_fixup_amd_ehci_pme); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 00c62115f39c..979f310b67d4 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -322,7 +322,7 @@ static void pci_d3delay_fixup(struct pci_dev *dev) */ if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) return; - dev->d3_delay = 0; + dev->d3hot_delay = 0; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); diff --git a/drivers/hid/intel-ish-hid/ipc/ipc.c b/drivers/hid/intel-ish-hid/ipc/ipc.c index 8f8dfdf64833..a45ac7fa417b 100644 --- a/drivers/hid/intel-ish-hid/ipc/ipc.c +++ b/drivers/hid/intel-ish-hid/ipc/ipc.c @@ -755,7 +755,7 @@ static int _ish_hw_reset(struct ishtp_device *dev) csr |= PCI_D3hot; pci_write_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, csr); - mdelay(pdev->d3_delay); + mdelay(pdev->d3hot_delay); csr &= ~PCI_PM_CTRL_STATE_MASK; csr |= PCI_D0; diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index cec8124301c7..dd11c06ca7f9 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -5105,7 +5105,7 @@ static int sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&hw->restart_work, sky2_restart); pci_set_drvdata(pdev, hw); - pdev->d3_delay = 300; + pdev->d3hot_delay = 300; return 0; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index d5869a03f748..154db9a47511 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1167,7 +1167,7 @@ static struct acpi_device *acpi_pci_find_companion(struct device *dev) * @pdev: the PCI device whose delay is to be updated * @handle: ACPI handle of this device * - * Update the d3_delay and d3cold_delay of a PCI device from the ACPI _DSM + * Update the d3hot_delay and d3cold_delay of a PCI device from the ACPI _DSM * control method of either the device itself or the PCI host bridge. 
* * Function 8, "Reset Delay," applies to the entire hierarchy below a PCI @@ -1206,8 +1206,8 @@ static void pci_acpi_optimize_delay(struct pci_dev *pdev, } if (elements[3].type == ACPI_TYPE_INTEGER) { value = (int)elements[3].integer.value / 1000; - if (value < PCI_PM_D3_WAIT) - pdev->d3_delay = value; + if (value < PCI_PM_D3HOT_WAIT) + pdev->d3hot_delay = value; } } ACPI_FREE(obj); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index a458c46d7e39..c4a26532a447 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -49,7 +49,7 @@ EXPORT_SYMBOL(isa_dma_bridge_buggy); int pci_pci_problems; EXPORT_SYMBOL(pci_pci_problems); -unsigned int pci_pm_d3_delay; +unsigned int pci_pm_d3hot_delay; static void pci_pme_list_scan(struct work_struct *work); @@ -66,10 +66,10 @@ struct pci_pme_device { static void pci_dev_d3_sleep(struct pci_dev *dev) { - unsigned int delay = dev->d3_delay; + unsigned int delay = dev->d3hot_delay; - if (delay < pci_pm_d3_delay) - delay = pci_pm_d3_delay; + if (delay < pci_pm_d3hot_delay) + delay = pci_pm_d3hot_delay; if (delay) msleep(delay); @@ -3013,7 +3013,7 @@ void pci_pm_init(struct pci_dev *dev) } dev->pm_cap = pm; - dev->d3_delay = PCI_PM_D3_WAIT; + dev->d3hot_delay = PCI_PM_D3HOT_WAIT; dev->d3cold_delay = PCI_PM_D3COLD_WAIT; dev->bridge_d3 = pci_bridge_d3_possible(dev); dev->d3cold_allowed = true; @@ -3038,7 +3038,7 @@ void pci_pm_init(struct pci_dev *dev) (pmc & PCI_PM_CAP_PME_D0) ? " D0" : "", (pmc & PCI_PM_CAP_PME_D1) ? " D1" : "", (pmc & PCI_PM_CAP_PME_D2) ? " D2" : "", - (pmc & PCI_PM_CAP_PME_D3) ? " D3hot" : "", + (pmc & PCI_PM_CAP_PME_D3hot) ? " D3hot" : "", (pmc & PCI_PM_CAP_PME_D3cold) ? " D3cold" : ""); dev->pme_support = pmc >> PCI_PM_CAP_PME_SHIFT; dev->pme_poll = true; @@ -4621,7 +4621,7 @@ static int pci_af_flr(struct pci_dev *dev, int probe) * * NOTE: This causes the caller to sleep for twice the device power transition * cooldown period, which for the D0->D3hot and D3hot->D0 transitions is 10 ms - * by default (i.e. unless the @dev's d3_delay field has a different value). + * by default (i.e. unless the @dev's d3hot_delay field has a different value). * Moreover, only devices in D0 can be reset by this function. 
*/ static int pci_pm_reset(struct pci_dev *dev, int probe) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index fa12f7cbc1a0..8d492669ecfd 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -44,7 +44,7 @@ int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); #define PCI_PM_D2_DELAY 200 -#define PCI_PM_D3_WAIT 10 +#define PCI_PM_D3HOT_WAIT 10 #define PCI_PM_D3COLD_WAIT 100 #define PCI_PM_BUS_WAIT 50 @@ -178,7 +178,7 @@ extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; -extern unsigned int pci_pm_d3_delay; +extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bdf9b52567e0..72b22a35e516 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1846,7 +1846,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PXHV, quirk_pci */ static void quirk_intel_pcie_pm(struct pci_dev *dev) { - pci_pm_d3_delay = 120; + pci_pm_d3hot_delay = 120; dev->no_d1d2 = 1; } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x25e2, quirk_intel_pcie_pm); @@ -1873,12 +1873,12 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x260b, quirk_intel_pcie_pm); static void quirk_d3hot_delay(struct pci_dev *dev, unsigned int delay) { - if (dev->d3_delay >= delay) + if (dev->d3hot_delay >= delay) return; - dev->d3_delay = delay; + dev->d3hot_delay = delay; pci_info(dev, "extending delay after power-on from D3hot to %d msec\n", - dev->d3_delay); + dev->d3hot_delay); } static void quirk_radeon_pm(struct pci_dev *dev) @@ -3387,36 +3387,36 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0152, disable_igfx_irq); * PCI devices which are on Intel chips can skip the 10ms delay * before entering D3 mode. */ -static void quirk_remove_d3_delay(struct pci_dev *dev) -{ - dev->d3_delay = 0; -} -/* C600 Series devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0412, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c00, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c0c, quirk_remove_d3_delay); -/* Lynxpoint-H PCH devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c02, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c18, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c1c, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c20, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c22, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c26, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c2d, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c31, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3a, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3d, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c4e, quirk_remove_d3_delay); -/* Intel Cherrytrail devices do not need 10ms d3_delay */ -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2280, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2298, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x229c, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b0, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b5, quirk_remove_d3_delay); 
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b7, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b8, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22d8, quirk_remove_d3_delay); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22dc, quirk_remove_d3_delay); +static void quirk_remove_d3hot_delay(struct pci_dev *dev) +{ + dev->d3hot_delay = 0; +} +/* C600 Series devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0412, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c00, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0c0c, quirk_remove_d3hot_delay); +/* Lynxpoint-H PCH devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c02, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c18, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c1c, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c20, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c22, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c26, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c2d, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c31, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3a, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c3d, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x8c4e, quirk_remove_d3hot_delay); +/* Intel Cherrytrail devices do not need 10ms d3hot_delay */ +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2280, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x2298, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x229c, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b0, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b5, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b7, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22b8, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22d8, quirk_remove_d3hot_delay); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x22dc, quirk_remove_d3hot_delay); /* * Some devices may pass our check in pci_intx_mask_supported() if diff --git a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c index a000a1e316f7..beba430a197e 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_v4l2.c +++ b/drivers/staging/media/atomisp/pci/atomisp_v4l2.c @@ -1573,7 +1573,7 @@ static int atomisp_pci_probe(struct pci_dev *pdev, const struct pci_device_id *i spin_lock_init(&isp->lock); /* This is not a true PCI device on SoC, so the delay is not needed. 
*/ - pdev->d3_delay = 0; + pdev->d3hot_delay = 0; pci_set_drvdata(pdev, isp); diff --git a/include/linux/pci.h b/include/linux/pci.h index c9e169c4e216..bea1a03faab6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -373,7 +373,7 @@ struct pci_dev { user sysfs */ unsigned int clear_retrain_link:1; /* Need to clear Retrain Link bit manually */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ + unsigned int d3hot_delay; /* D3hot->D0 transition time in ms */ unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */ #ifdef CONFIG_PCIEASPM diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..49f15c37e771 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -246,7 +246,7 @@ #define PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ #define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ #define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ -#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3hot 0x4000 /* PME# from D3 (hot) */ #define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ #define PCI_PM_CAP_PME_SHIFT 11 /* Start of the PME Mask in PMC */ #define PCI_PM_CTRL 4 /* PM control and status register */ -- cgit v1.2.3 From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 29 Sep 2020 14:45:51 +0200 Subject: bpf: Support attaching freplace programs to multiple attach points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables support for attaching freplace programs to multiple attach points. It does this by amending the UAPI for bpf_link_create with a target btf ID that can be used to supply the new attachment point along with the target program fd. The target must be compatible with the target that was supplied at program load time. The implementation reuses the checks that were factored out of check_attach_btf_id() to ensure compatibility between the BTF types of the old and new attachment. If these match, a new bpf_tracing_link will be created for the new attach target, allowing multiple attachments to co-exist simultaneously. The code could theoretically support multiple-attach of other types of tracing programs as well, but since I don't have a use case for any of those, there is no API support for doing so. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 9 ++- kernel/bpf/syscall.c | 132 ++++++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 10 ++++ tools/include/uapi/linux/bpf.h | 9 ++- 5 files changed, 142 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 839dd8670a7a..50e5c4b52bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -768,6 +768,8 @@ struct bpf_prog_aux { struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; + enum bpf_prog_type saved_dst_prog_type; + enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier.
*/ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96ddb00b91dc..2b1d3f16cbd1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -639,8 +639,13 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ - __aligned_u64 iter_info; /* extra bpf_iter_link_info */ - __u32 iter_info_len; /* iter_info length */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e6a0a948e30c..f1528c2a6927 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -2554,12 +2555,15 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { .fill_link_info = bpf_tracing_link_fill_link_info, }; -static int bpf_tracing_prog_attach(struct bpf_prog *prog) +static int bpf_tracing_prog_attach(struct bpf_prog *prog, + int tgt_prog_fd, + u32 btf_id) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; + struct bpf_trampoline *tr = NULL; struct bpf_tracing_link *link; - struct bpf_trampoline *tr; + u64 key = 0; int err; switch (prog->type) { @@ -2588,6 +2592,28 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) goto out_put_prog; } + if (!!tgt_prog_fd != !!btf_id) { + err = -EINVAL; + goto out_put_prog; + } + + if (tgt_prog_fd) { + /* For now we only allow new targets for BPF_PROG_TYPE_EXT */ + if (prog->type != BPF_PROG_TYPE_EXT) { + err = -EINVAL; + goto out_put_prog; + } + + tgt_prog = bpf_prog_get(tgt_prog_fd); + if (IS_ERR(tgt_prog)) { + err = PTR_ERR(tgt_prog); + tgt_prog = NULL; + goto out_put_prog; + } + + key = bpf_trampoline_compute_key(tgt_prog, btf_id); + } + link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -2599,12 +2625,58 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) mutex_lock(&prog->aux->dst_mutex); - if (!prog->aux->dst_trampoline) { + /* There are a few possible cases here: + * + * - if prog->aux->dst_trampoline is set, the program was just loaded + * and not yet attached to anything, so we can use the values stored + * in prog->aux + * + * - if prog->aux->dst_trampoline is NULL, the program has already been + * attached to a target and its initial target was cleared (below) + * + * - if tgt_prog != NULL, the caller specified tgt_prog_fd + + * target_btf_id using the link_create API. + * + * - if tgt_prog == NULL when this function was called using the old + * raw_tracepoint_open API, and we need a target from prog->aux + * + * The combination of no saved target in prog->aux, and no target + * specified on load is illegal, and we reject that here. 
+ */ + if (!prog->aux->dst_trampoline && !tgt_prog) { err = -ENOENT; goto out_unlock; } - tr = prog->aux->dst_trampoline; - tgt_prog = prog->aux->dst_prog; + + if (!prog->aux->dst_trampoline || + (key && key != prog->aux->dst_trampoline->key)) { + /* If there is no saved target, or the specified target is + * different from the destination specified at load time, we + * need a new trampoline and a check for compatibility + */ + struct bpf_attach_target_info tgt_info = {}; + + err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id, + &tgt_info); + if (err) + goto out_unlock; + + tr = bpf_trampoline_get(key, &tgt_info); + if (!tr) { + err = -ENOMEM; + goto out_unlock; + } + } else { + /* The caller didn't specify a target, or the target was the + * same as the destination supplied during program load. This + * means we can reuse the trampoline and reference from program + * load time, and there is no need to allocate a new one. This + * can only happen once for any program, as the saved values in + * prog->aux are cleared below. + */ + tr = prog->aux->dst_trampoline; + tgt_prog = prog->aux->dst_prog; + } err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -2620,15 +2692,31 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog) link->tgt_prog = tgt_prog; link->trampoline = tr; + /* Always clear the trampoline and target prog from prog->aux to make + * sure the original attach destination is not kept alive after a + * program is (re-)attached to another target. + */ + if (prog->aux->dst_prog && + (tgt_prog_fd || tr != prog->aux->dst_trampoline)) + /* got extra prog ref from syscall, or attaching to different prog */ + bpf_prog_put(prog->aux->dst_prog); + if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline) + /* we allocated a new trampoline, so free the old one */ + bpf_trampoline_put(prog->aux->dst_trampoline); + prog->aux->dst_prog = NULL; prog->aux->dst_trampoline = NULL; mutex_unlock(&prog->aux->dst_mutex); return bpf_link_settle(&link_primer); out_unlock: + if (tr && tr != prog->aux->dst_trampoline) + bpf_trampoline_put(tr); mutex_unlock(&prog->aux->dst_mutex); kfree(link); out_put_prog: + if (tgt_prog_fd && tgt_prog) + bpf_prog_put(tgt_prog); bpf_prog_put(prog); return err; } @@ -2742,7 +2830,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog); + return bpf_tracing_prog_attach(prog, 0, 0); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, @@ -3926,10 +4014,15 @@ err_put: static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { - if (attr->link_create.attach_type == BPF_TRACE_ITER && - prog->expected_attach_type == BPF_TRACE_ITER) - return bpf_iter_link_attach(attr, prog); + if (attr->link_create.attach_type != prog->expected_attach_type) + return -EINVAL; + if (prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + else if (prog->type == BPF_PROG_TYPE_EXT) + return bpf_tracing_prog_attach(prog, + attr->link_create.target_fd, + attr->link_create.target_btf_id); return -EINVAL; } @@ -3943,18 +4036,25 @@ static int link_create(union bpf_attr *attr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; - ptype = attach_type_to_prog_type(attr->link_create.attach_type); - if (ptype == BPF_PROG_TYPE_UNSPEC) - return -EINVAL; - - prog = bpf_prog_get_type(attr->link_create.prog_fd, ptype); + prog = bpf_prog_get(attr->link_create.prog_fd); if 
(IS_ERR(prog)) return PTR_ERR(prog); ret = bpf_prog_attach_check_attach_type(prog, attr->link_create.attach_type); if (ret) - goto err_out; + goto out; + + if (prog->type == BPF_PROG_TYPE_EXT) { + ret = tracing_bpf_link_attach(attr, prog); + goto out; + } + + ptype = attach_type_to_prog_type(attr->link_create.attach_type); + if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { + ret = -EINVAL; + goto out; + } switch (ptype) { case BPF_PROG_TYPE_CGROUP_SKB: @@ -3982,7 +4082,7 @@ static int link_create(union bpf_attr *attr) ret = -EINVAL; } -err_out: +out: if (ret < 0) bpf_prog_put(prog); return ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a97a2f2964e3..015a1c074b6b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11404,6 +11404,11 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (!btf_type_is_func_proto(t)) return -EINVAL; + if ((prog->aux->saved_dst_prog_type || prog->aux->saved_dst_attach_type) && + (!tgt_prog || prog->aux->saved_dst_prog_type != tgt_prog->type || + prog->aux->saved_dst_attach_type != tgt_prog->expected_attach_type)) + return -EINVAL; + if (tgt_prog && conservative) t = NULL; @@ -11512,6 +11517,11 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + if (tgt_prog) { + prog->aux->saved_dst_prog_type = tgt_prog->type; + prog->aux->saved_dst_attach_type = tgt_prog->expected_attach_type; + } + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { prog->aux->attach_btf_trace = true; return 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 96ddb00b91dc..2b1d3f16cbd1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -639,8 +639,13 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ - __aligned_u64 iter_info; /* extra bpf_iter_link_info */ - __u32 iter_info_len; /* iter_info length */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ -- cgit v1.2.3 From 3f47cb4c1cf3bceb2438ea962bfffc6665ee4a9f Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 29 Sep 2020 13:35:41 +0100 Subject: l2tp: report rx cookie discards in netlink get When an L2TPv3 session receives a data frame with an incorrect cookie l2tp_core logs a warning message and bumps a stats counter to reflect the fact that the packet has been dropped. However, the stats counter in question is missing from the l2tp_netlink get message for tunnel and session instances. Include the statistic in the netlink get response. Signed-off-by: Tom Parkin Signed-off-by: David S. 
Miller --- include/uapi/linux/l2tp.h | 1 + net/l2tp/l2tp_netlink.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 88a0d32b8c07..30c80d5ba4bf 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -144,6 +144,7 @@ enum { L2TP_ATTR_RX_OOS_PACKETS, /* u64 */ L2TP_ATTR_RX_ERRORS, /* u64 */ L2TP_ATTR_STATS_PAD, + L2TP_ATTR_RX_COOKIE_DISCARDS, /* u64 */ __L2TP_ATTR_STATS_MAX, }; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 83c015f7f20d..5ca5056e9636 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -420,6 +420,9 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&tunnel->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, + atomic_long_read(&tunnel->stats.rx_cookie_discards), + L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&tunnel->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || @@ -760,6 +763,9 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, atomic_long_read(&session->stats.rx_seq_discards), L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS, + atomic_long_read(&session->stats.rx_cookie_discards), + L2TP_ATTR_STATS_PAD) || nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, atomic_long_read(&session->stats.rx_oos_packets), L2TP_ATTR_STATS_PAD) || -- cgit v1.2.3 From 2ec13cbcfadbbeac499f3b63de0f7db490d45a7e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 29 Sep 2020 11:08:59 -0700 Subject: devlink: include for _BITUL Commit 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") added a usage of _BITUL to the UAPI header, but failed to include the header file where it was defined. It happens that this does not break any existing kernel include chains because it gets included through other sources. However, when including the UAPI headers in a userspace application (such as devlink in iproute2), _BITUL is not defined. Fixes: 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 7b0face1bad5..ba467dc07852 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -13,6 +13,8 @@ #ifndef _UAPI_LINUX_DEVLINK_H_ #define _UAPI_LINUX_DEVLINK_H_ +#include + #define DEVLINK_GENL_NAME "devlink" #define DEVLINK_GENL_VERSION 0x1 #define DEVLINK_GENL_MCGRP_CONFIG_NAME "config" -- cgit v1.2.3 From 002f2176532093753cb6ced61e5ea7b8904c6cae Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Mon, 28 Sep 2020 14:27:10 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_chain Enables storing userdata for nft_chain. Field udata points to user data and udlen stores its length. Adds new attribute flag NFTA_CHAIN_USERDATA. Signed-off-by: Jose M. 
Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 33 ++++++++++++++++++++++++-------- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4c526507ddb..0bd2a081ae39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -945,6 +945,8 @@ struct nft_chain { bound:1, genmask:2; char *name; + u16 udlen; + u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule **rules_next; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3c2469b43742..352ee51707a1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -208,6 +208,7 @@ enum nft_chain_flags { * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) + * @NFTA_CHAIN_USERDATA: user data (NLA_BINARY) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -222,6 +223,7 @@ enum nft_chain_attributes { NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, NFTA_CHAIN_ID, + NFTA_CHAIN_USERDATA, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0473316aa392..3cfff31e4818 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1304,6 +1304,8 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = { [NFTA_CHAIN_COUNTERS] = { .type = NLA_NESTED }, [NFTA_CHAIN_FLAGS] = { .type = NLA_U32 }, [NFTA_CHAIN_ID] = { .type = NLA_U32 }, + [NFTA_CHAIN_USERDATA] = { .type = NLA_BINARY, + .len = NFT_USERDATA_MAXLEN }, }; static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = { @@ -1445,6 +1447,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use))) goto nla_put_failure; + if (chain->udata && + nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -1682,9 +1688,11 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) free_percpu(rcu_dereference_raw(basechain->stats)); } kfree(chain->name); + kfree(chain->udata); kfree(basechain); } else { kfree(chain->name); + kfree(chain->udata); kfree(chain); } } @@ -2038,7 +2046,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } else { if (!(flags & NFT_CHAIN_BINDING)) { err = -EINVAL; - goto err1; + goto err_destroy_chain; } snprintf(name, sizeof(name), "__chain%llu", ++chain_id); @@ -2047,13 +2055,22 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (!chain->name) { err = -ENOMEM; - goto err1; + goto err_destroy_chain; + } + + if (nla[NFTA_CHAIN_USERDATA]) { + chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL); + if (chain->udata == NULL) { + err = -ENOMEM; + goto err_destroy_chain; + } + chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]); } rules = nf_tables_chain_alloc_rules(chain, 0); if (!rules) { err = -ENOMEM; - goto err1; + goto err_destroy_chain; } *rules = NULL; @@ -2062,12 +2079,12 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err = nf_tables_register_hook(net, table, chain); if (err < 0) - goto err1; + goto 
err_destroy_chain; trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (IS_ERR(trans)) { err = PTR_ERR(trans); - goto err2; + goto err_unregister_hook; } nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET; @@ -2077,15 +2094,15 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err = nft_chain_add(table, chain); if (err < 0) { nft_trans_destroy(trans); - goto err2; + goto err_unregister_hook; } table->use++; return 0; -err2: +err_unregister_hook: nf_tables_unregister_hook(net, table, chain); -err1: +err_destroy_chain: nf_tables_chain_destroy(ctx); return err; -- cgit v1.2.3 From b426ce83baa7dff947fb354118d3133f2953aac8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:15 +0200 Subject: bpf: Add classid helper only based on skb->sk Similarly to 5a52ae4e32a6 ("bpf: Allow to retrieve cgroup v1 classid from v2 hooks"), add a helper to retrieve cgroup v1 classid solely based on the skb->sk, so it can be used as key as part of BPF map lookups out of tc from host ns, in particular given the skb->sk is retained these days when crossing net ns thanks to 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb."). This is similar to bpf_skb_cgroup_id() which implements the same for v2. Kubernetes ecosystem is still operating on v1 however, hence net_cls needs to be used there until this can be dropped in with the v2 helper of bpf_skb_cgroup_id(). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/ed633cf27a1c620e901c5aa99ebdefb028dce600.1601477936.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 21 +++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 10 ++++++++++ 3 files changed, 41 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b1d3f16cbd1..6116a7f54c8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3643,6 +3643,15 @@ union bpf_attr { * *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3796,6 +3805,7 @@ union bpf_attr { FN(copy_from_user), \ FN(snprintf_btf), \ FN(seq_printf_btf), \ + FN(skb_cgroup_classid), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index af88935e24b1..fa01c697977d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2707,6 +2707,23 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return sock_cgroup_classid(&sk->sk_cgrp_data); +} + +static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = { + .func = bpf_skb_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) @@ -6772,6 +6789,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_skb_cgroup_classid: + return &bpf_skb_cgroup_classid_proto; +#endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2b1d3f16cbd1..6116a7f54c8f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3643,6 +3643,15 @@ union bpf_attr { * *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3796,6 +3805,7 @@ union bpf_attr { FN(copy_from_user), \ FN(snprintf_btf), \ FN(seq_printf_btf), \ + FN(skb_cgroup_classid), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b4ab31414970a7a03a5d55d75083f2c101a30592 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:17 +0200 Subject: bpf: Add redirect_neigh helper as redirect drop-in Add a redirect_neigh() helper as redirect() drop-in replacement for the xmit side. Main idea for the helper is to be very similar in semantics to the latter just that the skb gets injected into the neighboring subsystem in order to let the stack do the work it knows best anyway to populate the L2 addresses of the packet and then hand over to dev_queue_xmit() as redirect() does. This solves two bigger items: i) skbs don't need to go up to the stack on the host facing veth ingress side for traffic egressing the container to achieve the same for populating L2 which also has the huge advantage that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering the IP routing layer on the host stack. 
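As an illustration, a minimal tc BPF program using the new helper could look roughly like the sketch below; the section name, license boilerplate and the ifindex of the physical device are assumptions of the example rather than part of this patch, and it presumes a libbpf with a helper definition for bpf_redirect_neigh():

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Hypothetical ifindex of the host-facing physical device. */
#define PHYS_IFINDEX 2

SEC("classifier")
int redirect_egress(struct __sk_buff *skb)
{
	/* Let the neighboring subsystem fill in the L2 addresses;
	 * the helper returns TC_ACT_REDIRECT on success and
	 * TC_ACT_SHOT on error, so the verdict can be returned
	 * directly.
	 */
	return bpf_redirect_neigh(PHYS_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";
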
Given that skb->sk neither gets orphaned when crossing the netns as per 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb.") the helper can then push the skbs directly to the phys device where FQ scheduler can do its work and TCP stack gets proper backpressure given we hold on to skb->sk as long as skb is still residing in queues. With the helper used in BPF data path to then push the skb to the phys device, I observed a stable/consistent TCP_STREAM improvement on veth devices for traffic going container -> host -> host -> container from ~10Gbps to ~15Gbps for a single stream in my test environment. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Acked-by: Martin KaFai Lau Cc: David Ahern Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net --- include/linux/skbuff.h | 5 + include/uapi/linux/bpf.h | 14 +++ net/core/filter.c | 276 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 14 +++ 4 files changed, 294 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04a18e01b362..3d0cf3722bb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) return skb->mac_header != (typeof(skb->mac_header))~0U; } +static inline void skb_unset_mac_header(struct sk_buff *skb) +{ + skb->mac_header = (typeof(skb->mac_header))~0U; +} + static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6116a7f54c8f..1f17c6752deb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3652,6 +3652,19 @@ union bpf_attr { * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * fills in e.g. MAC addresses based on the L3 information from + * the packet. This helper is supported for IPv4 and IPv6 protocols. + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3806,6 +3819,7 @@ union bpf_attr { FN(snprintf_btf), \ FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ + FN(redirect_neigh), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/filter.c b/net/core/filter.c index a0776e48dcc9..3fb6adad1957 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2163,13 +2163,233 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, return __bpf_redirect_no_mac(skb, dev, flags); } +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + u32 hh_len = LL_RESERVED_SPACE(dev); + const struct in6_addr *nexthop; + struct neighbour *neigh; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + neigh = ip_neigh_gw6(dev, nexthop); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, false); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; + + skb_dst_set(skb, dst); + + err = bpf_out_neigh_v6(net, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_IPV6 */ + +#if IS_ENABLED(CONFIG_INET) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + struct net_device *dev = dst->dev; + u32 hh_len = LL_RESERVED_SPACE(dev); + struct neighbour *neigh; + bool is_v6gw = false; + + if (dev_xmit_recursion()) { + net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); + goto out_drop; + } + + skb->dev = dev; + skb->tstamp = 0; + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, hh_len); + if (unlikely(!skb2)) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + 
skb_set_owner_w(skb2, skb->sk); + consume_skb(skb); + skb = skb2; + } + + rcu_read_lock_bh(); + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (likely(!IS_ERR(neigh))) { + int ret; + + sock_confirm_neigh(skb, neigh); + dev_xmit_recursion_inc(); + ret = neigh_output(neigh, skb, is_v6gw); + dev_xmit_recursion_dec(); + rcu_read_unlock_bh(); + return ret; + } + rcu_read_unlock_bh(); +out_drop: + kfree_skb(skb); + return -ENETDOWN; +} + +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net *net = dev_net(dev); + int err, ret = NET_XMIT_DROP; + struct rtable *rt; + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } + + skb_dst_set(skb, &rt->dst); + + err = bpf_out_neigh_v4(net, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out_xmit; +out_drop: + dev->stats.tx_errors++; + kfree_skb(skb); +out_xmit: + return ret; +} +#else +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +{ + kfree_skb(skb); + return NET_XMIT_DROP; +} +#endif /* CONFIG_INET */ + +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +{ + struct ethhdr *ethh = eth_hdr(skb); + + if (unlikely(skb->mac_header >= skb->network_header)) + goto out; + bpf_push_mac_rcsum(skb); + if (is_multicast_ether_addr(ethh->h_dest)) + goto out; + + skb_pull(skb, sizeof(*ethh)); + skb_unset_mac_header(skb); + skb_reset_network_header(skb); + + if (skb->protocol == htons(ETH_P_IP)) + return __bpf_redirect_neigh_v4(skb, dev); + else if (skb->protocol == htons(ETH_P_IPV6)) + return __bpf_redirect_neigh_v6(skb, dev); +out: + kfree_skb(skb); + return -ENOTSUPP; +} + +/* Internal, non-exposed redirect flags. */ +enum { + BPF_F_NEIGH = (1ULL << 1), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) +}; + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; - if (unlikely(flags & ~(BPF_F_INGRESS))) + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); @@ -2206,23 +2426,11 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); -BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) -{ - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - if (unlikely(flags & ~(BPF_F_INGRESS))) - return TC_ACT_SHOT; - - ri->flags = flags; - ri->tgt_index = ifindex; - - return TC_ACT_REDIRECT; -} - int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; + u32 flags = ri->flags; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); ri->tgt_index = 0; @@ -2231,7 +2439,22 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - return __bpf_redirect(skb, dev, ri->flags); + return flags & BPF_F_NEIGH ? 
+ __bpf_redirect_neigh(skb, dev) : + __bpf_redirect(skb, dev, flags); +} + +BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) + return TC_ACT_SHOT; + + ri->flags = flags; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; } static const struct bpf_func_proto bpf_redirect_proto = { @@ -2242,6 +2465,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_NEIGH; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_neigh_proto = { + .func = bpf_redirect_neigh, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) { msg->apply_bytes = bytes; @@ -6759,6 +7003,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; + case BPF_FUNC_redirect_neigh: + return &bpf_redirect_neigh_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6116a7f54c8f..1f17c6752deb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3652,6 +3652,19 @@ union bpf_attr { * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * fills in e.g. MAC addresses based on the L3 information from + * the packet. This helper is supported for IPv4 and IPv6 protocols. + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3806,6 +3819,7 @@ union bpf_attr { FN(snprintf_btf), \ FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ + FN(redirect_neigh), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 792caccc4526bb489e054f9ab61d7c024b15dea2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 30 Sep 2020 15:49:26 -0700 Subject: bpf: Introduce BPF_F_PRESERVE_ELEMS for perf event array Currently, perf event in perf event array is removed from the array when the map fd used to add the event is closed. This behavior makes it difficult to share perf events with perf event array. Introduce perf event map that keeps the perf event open with a new flag BPF_F_PRESERVE_ELEMS. With this flag set, perf events in the array are not removed when the original map fd is closed. Instead, the perf event will stay in the map until 1) it is explicitly removed from the array; or 2) the array is freed.
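As a rough user-space sketch of how such an array could be created (using the raw bpf(2) syscall directly; the function name and array size are illustrative and not part of this patch):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Create a perf event array whose elements survive this map fd. */
static int create_preserved_perf_array(unsigned int nr_cpus)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
	attr.key_size = sizeof(int);
	attr.value_size = sizeof(int);
	attr.max_entries = nr_cpus;
	/* Keep the perf events in the array after this fd is closed. */
	attr.map_flags = BPF_F_PRESERVE_ELEMS;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}
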
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200930224927.1936644-2-songliubraving@fb.com --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 19 +++++++++++++++++-- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1f17c6752deb..4f556cfcbfbe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -414,6 +414,9 @@ enum { /* Enable memory-mapping BPF map */ BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), }; /* Flags for BPF_PROG_QUERY. */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index e5fd31268ae0..bd777dd6f967 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -15,7 +15,8 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK) + (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ + BPF_F_PRESERVE_ELEMS) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -64,6 +65,10 @@ int array_map_alloc_check(union bpf_attr *attr) attr->map_flags & BPF_F_MMAPABLE) return -EINVAL; + if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && + attr->map_flags & BPF_F_PRESERVE_ELEMS) + return -EINVAL; + if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. @@ -1134,6 +1139,9 @@ static void perf_event_fd_array_release(struct bpf_map *map, struct bpf_event_entry *ee; int i; + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + return; + rcu_read_lock(); for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); @@ -1143,12 +1151,19 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } +static void perf_event_fd_array_map_free(struct bpf_map *map) +{ + if (map->map_flags & BPF_F_PRESERVE_ELEMS) + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + static int perf_event_array_map_btf_id; const struct bpf_map_ops perf_event_array_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, - .map_free = fd_array_map_free, + .map_free = perf_event_fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, .map_delete_elem = fd_array_map_delete_elem, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1f17c6752deb..4f556cfcbfbe 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -414,6 +414,9 @@ enum { /* Enable memory-mapping BPF map */ BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), }; /* Flags for BPF_PROG_QUERY. */ -- cgit v1.2.3 From ba1df797e5bbba68ddd1a29bd658b1c11f9a60b6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Oct 2020 18:59:07 -0400 Subject: NFSACL: Replace PROC() macro with open code Clean up: Follow-up on ten-year-old commit b9081d90f5b9 ("NFS: kill off complicated macro 'PROC'") by performing the same conversion in the NFSACL code. To reduce the chance of error, I copied the original C preprocessor output and then made some minor edits. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs2acl.c | 72 ++++++++++++++++++++++++++++++--------------- fs/nfsd/nfs3acl.c | 49 +++++++++++++++++------------- include/uapi/linux/nfsacl.h | 2 ++ 3 files changed, 80 insertions(+), 43 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index cbab1d2d8a75..8d20e0d74417 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -347,36 +347,62 @@ static void nfsaclsvc_release_access(struct svc_rqst *rqstp) fh_put(&resp->fh); } -#define nfsaclsvc_decode_voidargs NULL -#define nfsaclsvc_release_void NULL -#define nfsd3_fhandleargs nfsd_fhandle -#define nfsd3_attrstatres nfsd_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsacld_proc_##name, \ - .pc_decode = nfsaclsvc_decode_##argt##args, \ - .pc_encode = nfsaclsvc_encode_##rest##res, \ - .pc_release = nfsaclsvc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures2[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(getattr, fhandle, attrstat, attrstat, RC_NOCACHE, ST+AT), - PROC(access, access, access, access, RC_NOCACHE, ST+AT+1), +static const struct svc_procedure nfsd_acl_procedures2[5] = { + [ACLPROC2_NULL] = { + .pc_func = nfsacld_proc_null, + .pc_encode = nfsaclsvc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC2_GETACL] = { + .pc_func = nfsacld_proc_getacl, + .pc_decode = nfsaclsvc_decode_getaclargs, + .pc_encode = nfsaclsvc_encode_getaclres, + .pc_release = nfsaclsvc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC2_SETACL] = { + .pc_func = nfsacld_proc_setacl, + .pc_decode = nfsaclsvc_decode_setaclargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_GETATTR] = { + .pc_func = nfsacld_proc_getattr, + .pc_decode = nfsaclsvc_decode_fhandleargs, + .pc_encode = nfsaclsvc_encode_attrstatres, + .pc_release = nfsaclsvc_release_attrstat, + .pc_argsize = sizeof(struct nfsd_fhandle), + .pc_ressize = sizeof(struct nfsd_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT, + }, + [ACLPROC2_ACCESS] = { + .pc_func = nfsacld_proc_access, + .pc_decode = nfsaclsvc_decode_accessargs, + .pc_encode = nfsaclsvc_encode_accessres, + .pc_release = nfsaclsvc_release_access, + .pc_argsize = sizeof(struct nfsd3_accessargs), + .pc_ressize = sizeof(struct nfsd3_accessres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+AT+1, + }, }; static unsigned int nfsd_acl_count2[ARRAY_SIZE(nfsd_acl_procedures2)]; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 13bca4a2f89d..292acb2e529c 100644 
--- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -235,33 +235,42 @@ static void nfs3svc_release_getacl(struct svc_rqst *rqstp) posix_acl_release(resp->acl_default); } -#define nfs3svc_decode_voidargs NULL -#define nfs3svc_release_void NULL -#define nfsd3_setaclres nfsd3_attrstat -#define nfsd3_voidres nfsd3_voidargs struct nfsd3_voidargs { int dummy; }; -#define PROC(name, argt, rest, relt, cache, respsize) \ -{ \ - .pc_func = nfsd3_proc_##name, \ - .pc_decode = nfs3svc_decode_##argt##args, \ - .pc_encode = nfs3svc_encode_##rest##res, \ - .pc_release = nfs3svc_release_##relt, \ - .pc_argsize = sizeof(struct nfsd3_##argt##args), \ - .pc_ressize = sizeof(struct nfsd3_##rest##res), \ - .pc_cachetype = cache, \ - .pc_xdrressize = respsize, \ -} - #define ST 1 /* status*/ #define AT 21 /* attributes */ #define pAT (1+AT) /* post attributes - conditional */ #define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ -static const struct svc_procedure nfsd_acl_procedures3[] = { - PROC(null, void, void, void, RC_NOCACHE, ST), - PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), - PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +static const struct svc_procedure nfsd_acl_procedures3[3] = { + [ACLPROC3_NULL] = { + .pc_func = nfsd3_proc_null, + .pc_encode = nfs3svc_encode_voidres, + .pc_argsize = sizeof(struct nfsd3_voidargs), + .pc_ressize = sizeof(struct nfsd3_voidargs), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST, + }, + [ACLPROC3_GETACL] = { + .pc_func = nfsd3_proc_getacl, + .pc_decode = nfs3svc_decode_getaclargs, + .pc_encode = nfs3svc_encode_getaclres, + .pc_release = nfs3svc_release_getacl, + .pc_argsize = sizeof(struct nfsd3_getaclargs), + .pc_ressize = sizeof(struct nfsd3_getaclres), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+1+2*(1+ACL), + }, + [ACLPROC3_SETACL] = { + .pc_func = nfsd3_proc_setacl, + .pc_decode = nfs3svc_decode_setaclargs, + .pc_encode = nfs3svc_encode_setaclres, + .pc_release = nfs3svc_release_fhandle, + .pc_argsize = sizeof(struct nfsd3_setaclargs), + .pc_ressize = sizeof(struct nfsd3_attrstat), + .pc_cachetype = RC_NOCACHE, + .pc_xdrressize = ST+pAT, + }, }; static unsigned int nfsd_acl_count3[ARRAY_SIZE(nfsd_acl_procedures3)]; diff --git a/include/uapi/linux/nfsacl.h b/include/uapi/linux/nfsacl.h index ca9a8501ff30..2c2ad204d3b0 100644 --- a/include/uapi/linux/nfsacl.h +++ b/include/uapi/linux/nfsacl.h @@ -9,11 +9,13 @@ #define NFS_ACL_PROGRAM 100227 +#define ACLPROC2_NULL 0 #define ACLPROC2_GETACL 1 #define ACLPROC2_SETACL 2 #define ACLPROC2_GETATTR 3 #define ACLPROC2_ACCESS 4 +#define ACLPROC3_NULL 0 #define ACLPROC3_GETACL 1 #define ACLPROC3_SETACL 2 -- cgit v1.2.3 From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND, which is encoded in btf_vmlinux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg.
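For illustration, once the loader side grows matching support (added later in this series), a BPF program could reference a kernel variable roughly as in the sketch below; the __ksym section convention and the chosen symbol are assumptions of this sketch, not something this patch alone provides:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* The loader emits a BPF_PSEUDO_BTF_ID ldimm64 for this reference,
 * resolved via kallsyms and typed via vmlinux BTF.
 */
extern const void bpf_prog_active __ksym;

__u64 ksym_addr;

SEC("raw_tp/sys_enter")
int handler(const void *ctx)
{
	/* Taking the address exercises the pseudo_btf_id path. */
	ksym_addr = (__u64)&bpf_prog_active;
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
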
Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- include/linux/bpf_verifier.h | 7 +++ include/linux/btf.h | 15 +++++ include/uapi/linux/bpf.h | 36 +++++++++--- kernel/bpf/btf.c | 15 ----- kernel/bpf/verifier.c | 125 ++++++++++++++++++++++++++++++++++++----- tools/include/uapi/linux/bpf.h | 36 +++++++++--- 6 files changed, 188 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 024e16ff7dcc..af1244180588 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,6 +145,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. 
+ */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4d0ee7839fdb..00569afe3d0d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -440,16 +440,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -/* union is only a special case of struct: - * all its offsetof(member) == 0 - */ -static bool btf_type_is_struct(const struct btf_type *t) -{ - u8 kind = BTF_INFO_KIND(t->info); - - return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; -} - static bool __btf_type_is_struct(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; @@ -460,11 +450,6 @@ static bool btf_type_is_array(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; } -static bool btf_type_is_var(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 015a1c074b6b..fe4965079773 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7488,6 +7488,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *dst_reg; struct bpf_map *map; int err; @@ -7504,25 +7505,44 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) if (err) return err; + dst_reg = ®s[insn->dst_reg]; if (insn->src_reg == 0) { u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - regs[insn->dst_reg].type = SCALAR_VALUE; + dst_reg->type = SCALAR_VALUE; __mark_reg_known(®s[insn->dst_reg], imm); return 0; } + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { + mark_reg_known_zero(env, regs, insn->dst_reg); + + dst_reg->type = aux->btf_var.reg_type; + switch (dst_reg->type) { + case PTR_TO_MEM: + dst_reg->mem_size = aux->btf_var.mem_size; + break; + case PTR_TO_BTF_ID: + dst_reg->btf_id = aux->btf_var.btf_id; + break; + default: + verbose(env, "bpf verifier is misconfigured\n"); + return -EFAULT; + } + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); - regs[insn->dst_reg].map_ptr = map; + dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { - regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; - regs[insn->dst_reg].off = aux->map_off; + dst_reg->type = PTR_TO_MAP_VALUE; + dst_reg->off = aux->map_off; if (map_value_has_spin_lock(map)) - regs[insn->dst_reg].id = ++env->id_gen; + dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; @@ -9424,6 +9444,73 @@ process_bpf_exit: return 0; } +/* replace pseudo btf_id with kernel symbol address */ +static int check_pseudo_btf_id(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_insn_aux_data *aux) +{ + u32 type, id = insn->imm; + const struct btf_type *t; + const char *sym_name; + u64 addr; + + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + + if (insn[1].imm != 0) { + verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); + return 
-EINVAL; + } + + t = btf_type_by_id(btf_vmlinux, id); + if (!t) { + verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); + return -ENOENT; + } + + if (!btf_type_is_var(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", + id); + return -EINVAL; + } + + sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + addr = kallsyms_lookup_name(sym_name); + if (!addr) { + verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", + sym_name); + return -ENOENT; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + type = t->type; + t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + if (!btf_type_is_struct(t)) { + const struct btf_type *ret; + const char *tname; + u32 tsize; + + /* resolve the type size of ksym. */ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.mem_size = tsize; + } else { + aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf_id = type; + } + return 0; +} + static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && @@ -9534,10 +9621,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map) map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); } -/* look for pseudo eBPF instructions that access map FDs and - * replace them with actual map pointers +/* find and rewrite pseudo imm in ld_imm64 instructions: + * + * 1. if it accesses map FD, replace it with actual map pointer. + * 2. if it accesses btf_id of a VAR, replace it with pointer to the var. + * + * NOTE: btf_vmlinux is required for converting pseudo btf_id. */ -static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) +static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; @@ -9578,6 +9669,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* valid generic load 64-bit imm */ goto next_insn; + if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) { + aux = &env->insn_aux_data[i]; + err = check_pseudo_btf_id(env, insn, aux); + if (err) + return err; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
*/ @@ -11633,10 +11732,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env->prog); if (ret) @@ -11662,6 +11757,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret) goto skip_full_check; + ret = resolve_pseudo_ldimm64(env); + if (ret < 0) + goto skip_full_check; + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function -- cgit v1.2.3 From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. 
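A usage sketch (illustrative only, not part of this patch; assumes a bpftool-generated vmlinux.h and libbpf's typed ksym support):

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern const struct rq runqueues __ksym;	/* percpu kernel variable */

SEC("raw_tp/sys_enter")
int dump_nr_running(const void *ctx)
{
	u32 cpu = bpf_get_smp_processor_id();
	struct rq *rq;

	/* may return NULL if cpu >= nr_cpu_ids, so the check is mandatory */
	rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
	if (!rq)
		return 0;

	bpf_printk("cpu=%u nr_running=%u", cpu, rq->nr_running);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

The NULL check is exactly what the RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL return type forces the verifier to require.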
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- include/linux/bpf.h | 4 +++ include/linux/btf.h | 11 ++++++++ include/uapi/linux/bpf.h | 18 ++++++++++++ kernel/bpf/btf.c | 10 ------- kernel/bpf/helpers.c | 18 ++++++++++++ kernel/bpf/verifier.c | 64 ++++++++++++++++++++++++++++++++++++++++-- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 18 ++++++++++++ 8 files changed, 132 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..9dde15b2479d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,7 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +407,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1831,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index af1244180588..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -194,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 00569afe3d0d..ed7d02e8bc93 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -188,11 +188,6 @@ i < btf_type_vlen(struct_type); \ i++, member++) -#define for_each_vsi(i, struct_type, member) \ - for (i = 0, member = btf_type_var_secinfo(struct_type); \ - i < btf_type_vlen(struct_type); \ - i++, member++) - #define for_each_vsi_from(i, from, struct_type, member) \ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ i < btf_type_vlen(struct_type); \ @@ -598,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t) return (const struct btf_var *)(t + 1); } -static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) -{ - return (const struct btf_var_secinfo *)(t + 1); -} - static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e825441781ab..14fe3f64fd82 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -623,6 +623,22 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) +{ + if (cpu >= nr_cpu_ids) + return (unsigned long)NULL; + + return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu); +} + +const struct bpf_func_proto bpf_per_cpu_ptr_proto = { + .func = bpf_per_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, + .arg2_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -689,6 +705,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fe4965079773..216b8ece23ce 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,6 +238,8 @@ struct bpf_call_arg_meta { u64 msize_max_value; int 
ref_obj_id; int func_id; + u32 btf_id; + u32 ret_btf_id; }; struct btf *btf_vmlinux; @@ -517,6 +519,7 @@ static const char * const reg_type_str[] = { [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", [PTR_TO_RDONLY_BUF] = "rdonly_buf", @@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) + if (t == PTR_TO_BTF_ID || + t == PTR_TO_BTF_ID_OR_NULL || + t == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: + case PTR_TO_PERCPU_BTF_ID: return true; default: return false; @@ -4017,6 +4023,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4042,6 +4049,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, + [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4205,6 +4213,12 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, false, meta); + } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) { + if (!reg->btf_id) { + verbose(env, "Helper has invalid btf_id in R%d\n", regno); + return -EACCES; + } + meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { if (process_spin_lock(env, regno, true)) @@ -5114,6 +5128,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + const struct btf_type *t; + + mark_reg_known_zero(env, regs, BPF_REG_0); + t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + if (!btf_type_is_struct(t)) { + u32 tsize; + const struct btf_type *ret; + const char *tname; + + /* resolve the type size of ksym. 
*/ + ret = btf_resolve_size(btf_vmlinux, t, &tsize); + if (IS_ERR(ret)) { + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + verbose(env, "unable to resolve the size of type '%s': %ld\n", + tname, PTR_ERR(ret)); + return -EINVAL; + } + regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].mem_size = tsize; + } else { + regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { int ret_btf_id; @@ -7523,6 +7561,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: + case PTR_TO_PERCPU_BTF_ID: dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9449,10 +9488,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 type, id = insn->imm; + u32 datasec_id, type, id = insn->imm; + const struct btf_var_secinfo *vsi; + const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; + bool percpu = false; u64 addr; + int i; if (!btf_vmlinux) { verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); @@ -9484,12 +9527,27 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, return -ENOENT; } + datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", + BTF_KIND_DATASEC); + if (datasec_id > 0) { + datasec = btf_type_by_id(btf_vmlinux, datasec_id); + for_each_vsi(i, datasec, vsi) { + if (vsi->type == id) { + percpu = true; + break; + } + } + } + insn[0].imm = (u32)addr; insn[1].imm = addr >> 32; type = t->type; t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); - if (!btf_type_is_struct(t)) { + if (percpu) { + aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf_id = type; + } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; const char *tname; u32 tsize; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e118a83439c3..364a322e2898 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1327,6 +1327,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_bpf_per_cpu_ptr: + return &bpf_per_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. 
+ * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: bpf: Introduce bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu var on this cpu. This helper always returns a valid pointer, therefore no need to check returned value for NULL. Also note that all programs run with preemption disabled, which means that the returned pointer is stable during all the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 13 +++++++++++++ kernel/bpf/helpers.c | 14 ++++++++++++++ kernel/bpf/verifier.c | 11 ++++++++--- kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 13 +++++++++++++ 6 files changed, 52 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dde15b2479d..dc63eeed4fd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_return_type { RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -1832,6 +1833,7 @@ extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 14fe3f64fd82..25520f5eeaf6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -639,6 +639,18 @@ const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) +{ + return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr); +} + +const struct bpf_func_proto bpf_this_cpu_ptr_proto = { + .func = bpf_this_cpu_ptr, + .gpl_only = false, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -707,6 +719,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_jiffies64_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 216b8ece23ce..d9dbf271ebab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5128,7 +5128,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -5146,10 +5147,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_MEM : PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = + fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? + PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 364a322e2898..a136a6a63a71 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1329,6 +1329,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_snprintf_btf_proto; case BPF_FUNC_bpf_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; + case BPF_FUNC_bpf_this_cpu_ptr: + return &bpf_this_cpu_ptr_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). 
+ * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 50a896cf2d6f34e884a00139d6e6012c9833ace3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 3 Oct 2020 10:44:45 +0200 Subject: genetlink: properly support per-op policy dumping Add support for per-op policy dumping. The data is pretty much as before, except that now the assumption that the policy with index 0 is "the" policy no longer holds - you now need to look at the new CTRL_ATTR_OP_POLICY attribute which is a nested attr (indexed by op) containing attributes for do and dump policies. When a single op is requested, the CTRL_ATTR_OP_POLICY will be added in the same way, since do and dump policies may differ. v2: - conditionally advertise per-command policies only if there actually is a policy being used for the do/dump and it's present at all Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 10 ++++ net/netlink/genetlink.c | 102 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 102 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 9c0636ec2286..bc9c98e84828 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -64,6 +64,7 @@ enum { CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, + CTRL_ATTR_OP_POLICY, __CTRL_ATTR_MAX, }; @@ -85,6 +86,15 @@ enum { __CTRL_ATTR_MCAST_GRP_MAX, }; +enum { + CTRL_ATTR_POLICY_UNSPEC, + CTRL_ATTR_POLICY_DO, + CTRL_ATTR_POLICY_DUMP, + + __CTRL_ATTR_POLICY_DUMP_MAX, + CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 +}; + #define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 5e33c7938470..eb916c44884f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1112,7 +1112,10 @@ static int genl_ctrl_event(int event, const struct genl_family *family, struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; + const struct genl_family *rt; + unsigned int opidx; u16 fam_id; + u8 policies:1; }; static const struct nla_policy ctrl_policy_policy[] = { @@ -1127,6 +1130,8 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->attrs; const struct genl_family *rt; + struct genl_ops op; + int err, i; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); @@ -1147,11 +1152,23 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) if (!rt) return -ENOENT; - if (!rt->policy) - return -ENODATA; + ctx->rt = rt; + + for (i = 0; i < genl_get_cmd_cnt(rt); i++) { + genl_get_cmd_by_index(i, rt, &op); + + if (op.policy) { + err = netlink_policy_dump_add_policy(&ctx->state, + op.policy, + op.maxattr); + if (err) + return err; + } + } - return netlink_policy_dump_add_policy(&ctx->state, rt->policy, - rt->maxattr); + if (!ctx->state) + return -ENODATA; + return 0; } static void *ctrl_dumppolicy_prep(struct sk_buff *skb, @@ -1172,12 +1189,78 @@ static void 
*ctrl_dumppolicy_prep(struct sk_buff *skb, return hdr; } +static int ctrl_dumppolicy_put_op(struct sk_buff *skb, + struct netlink_callback *cb, + struct genl_ops *op) +{ + struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; + struct nlattr *nest_pol, *nest_op; + void *hdr; + int idx; + + /* skip if we have nothing to show */ + if (!op->policy) + return 0; + if (!op->doit && + (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP)) + return 0; + + hdr = ctrl_dumppolicy_prep(skb, cb); + if (!hdr) + return -ENOBUFS; + + nest_pol = nla_nest_start(skb, CTRL_ATTR_OP_POLICY); + if (!nest_pol) + goto err; + + nest_op = nla_nest_start(skb, op->cmd); + if (!nest_op) + goto err; + + /* for now both do/dump are always the same */ + idx = netlink_policy_dump_get_policy_idx(ctx->state, + op->policy, + op->maxattr); + + if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) + goto err; + + if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) && + nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) + goto err; + + nla_nest_end(skb, nest_op); + nla_nest_end(skb, nest_pol); + genlmsg_end(skb, hdr); + + return 0; +err: + genlmsg_cancel(skb, hdr); + return -ENOBUFS; +} + static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; + void *hdr; + + if (!ctx->policies) { + while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { + struct genl_ops op; + + genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + + if (ctrl_dumppolicy_put_op(skb, cb, &op)) + return skb->len; + + ctx->opidx++; + } + + /* completed with the per-op policy index list */ + ctx->policies = true; + } while (netlink_policy_dump_loop(ctx->state)) { - void *hdr; struct nlattr *nest; hdr = ctrl_dumppolicy_prep(skb, cb); @@ -1194,14 +1277,13 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); genlmsg_end(skb, hdr); - continue; - -nla_put_failure: - genlmsg_cancel(skb, hdr); - break; } return skb->len; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return skb->len; } static int ctrl_dumppolicy_done(struct netlink_callback *cb) -- cgit v1.2.3 From e992a6eda9a1eeeab73a8d2792464e4a2b1ebc3b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 3 Oct 2020 10:44:46 +0200 Subject: genetlink: allow dumping command-specific policy Right now CTRL_CMD_GETPOLICY can only dump the family-wide policy. Support dumping policy of a specific op. v3: - rebase after per-op policy export and handle that v2: - make cmd U32, just in case. v1: - don't echo op in the output in a naive way, this should make it cleaner to extend the output format for dumping policies for all the commands at once in the future. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20201001225933.1373426-11-kuba@kernel.org Signed-off-by: Johannes Berg Signed-off-by: David S. 
Miller --- include/uapi/linux/genetlink.h | 1 + net/netlink/genetlink.c | 41 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index bc9c98e84828..d83f214b4134 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -65,6 +65,7 @@ enum { CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, CTRL_ATTR_OP_POLICY, + CTRL_ATTR_OP, __CTRL_ATTR_MAX, }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index eb916c44884f..c992424e4d63 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -123,7 +123,7 @@ static void genl_op_from_full(const struct genl_family *family, op->policy = family->policy; } -static int genl_get_cmd_full(u8 cmd, const struct genl_family *family, +static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; @@ -152,7 +152,7 @@ static void genl_op_from_small(const struct genl_family *family, op->policy = family->policy; } -static int genl_get_cmd_small(u8 cmd, const struct genl_family *family, +static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, struct genl_ops *op) { int i; @@ -166,7 +166,7 @@ static int genl_get_cmd_small(u8 cmd, const struct genl_family *family, return -ENOENT; } -static int genl_get_cmd(u8 cmd, const struct genl_family *family, +static int genl_get_cmd(u32 cmd, const struct genl_family *family, struct genl_ops *op) { if (!genl_get_cmd_full(cmd, family, op)) @@ -1114,14 +1114,17 @@ struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; unsigned int opidx; + u32 op; u16 fam_id; - u8 policies:1; + u8 policies:1, + single_op:1; }; static const struct nla_policy ctrl_policy_policy[] = { [CTRL_ATTR_FAMILY_ID] = { .type = NLA_U16 }, [CTRL_ATTR_FAMILY_NAME] = { .type = NLA_NUL_STRING, .len = GENL_NAMSIZ - 1 }, + [CTRL_ATTR_OP] = { .type = NLA_U32 }, }; static int ctrl_dumppolicy_start(struct netlink_callback *cb) @@ -1154,6 +1157,23 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->rt = rt; + if (tb[CTRL_ATTR_OP]) { + ctx->single_op = true; + ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); + + err = genl_get_cmd(ctx->op, rt, &op); + if (err) { + NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); + return err; + } + + if (!op.policy) + return -ENODATA; + + return netlink_policy_dump_add_policy(&ctx->state, op.policy, + op.maxattr); + } + for (i = 0; i < genl_get_cmd_cnt(rt); i++) { genl_get_cmd_by_index(i, rt, &op); @@ -1248,7 +1268,18 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { struct genl_ops op; - genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + if (ctx->single_op) { + int err; + + err = genl_get_cmd(ctx->op, ctx->rt, &op); + if (WARN_ON(err)) + return skb->len; + + /* break out of the loop after this one */ + ctx->opidx = genl_get_cmd_cnt(ctx->rt); + } else { + genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); + } if (ctrl_dumppolicy_put_op(skb, cb, &op)) return skb->len; -- cgit v1.2.3 From 19fbcb36a39eefbe8912a13ccc02e937b1c418d6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:28 +0200 Subject: net/sched: act_vlan: Add {POP,PUSH}_ETH actions Implement TCA_VLAN_ACT_POP_ETH and TCA_VLAN_ACT_PUSH_ETH, to respectively pop and push a base Ethernet header at the beginning of a frame. POP_ETH is just a matter of pulling ETH_HLEN bytes. 
VLAN tags, if any, must be stripped before calling POP_ETH. PUSH_ETH is restricted to skbs with no mac_header, and only the MAC addresses can be configured. The Ethertype is automatically set from skb->protocol. These restrictions ensure that all skb's fields remain consistent, so that this action can't confuse other parts of the networking stack (like GSO). Since openvswitch already had these actions, consolidate the code in skbuff.c (like for vlan and mpls push/pop). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++ include/net/tc_act/tc_vlan.h | 2 ++ include/uapi/linux/tc_act/tc_vlan.h | 4 +++ net/core/skbuff.c | 67 +++++++++++++++++++++++++++++++++++++ net/openvswitch/actions.c | 28 ++++++---------- net/sched/act_vlan.c | 40 ++++++++++++++++++++++ 6 files changed, 126 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d0cf3722bb4..42131e325e27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3573,6 +3573,9 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +int skb_eth_pop(struct sk_buff *skb); +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 4e2502408c31..f051046ba034 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -11,6 +11,8 @@ struct tcf_vlan_params { int tcfv_action; + unsigned char tcfv_push_dst[ETH_ALEN]; + unsigned char tcfv_push_src[ETH_ALEN]; u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 168995b54a70..5b306fe815cc 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,8 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 #define TCA_VLAN_ACT_MODIFY 3 +#define TCA_VLAN_ACT_POP_ETH 4 +#define TCA_VLAN_ACT_PUSH_ETH 5 struct tc_vlan { tc_gen; @@ -30,6 +32,8 @@ enum { TCA_VLAN_PUSH_VLAN_PROTOCOL, TCA_VLAN_PAD, TCA_VLAN_PUSH_VLAN_PRIORITY, + TCA_VLAN_PUSH_ETH_DST, + TCA_VLAN_PUSH_ETH_SRC, __TCA_VLAN_MAX, }; #define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e0774471f56d..75b043accddb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5558,6 +5558,73 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) } EXPORT_SYMBOL(skb_vlan_push); +/** + * skb_eth_pop() - Drop the Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * + * Drop the Ethernet header of @skb. + * + * Expects that skb->data points to the mac header and that no VLAN tags are + * present. + * + * Returns 0 on success, -errno otherwise.
+ */ +int skb_eth_pop(struct sk_buff *skb) +{ + if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || + skb_network_offset(skb) < ETH_HLEN) + return -EPROTO; + + skb_pull_rcsum(skb, ETH_HLEN); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + return 0; +} +EXPORT_SYMBOL(skb_eth_pop); + +/** + * skb_eth_push() - Add a new Ethernet header at the head of a packet + * + * @skb: Socket buffer to modify + * @dst: Destination MAC address of the new header + * @src: Source MAC address of the new header + * + * Prepend @skb with a new Ethernet header. + * + * Expects that skb->data points to the mac header, which must be empty. + * + * Returns 0 on success, -errno otherwise. + */ +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src) +{ + struct ethhdr *eth; + int err; + + if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) + return -EPROTO; + + err = skb_cow_head(skb, sizeof(*eth)); + if (err < 0) + return err; + + skb_push(skb, sizeof(*eth)); + skb_reset_mac_header(skb); + skb_reset_mac_len(skb); + + eth = eth_hdr(skb); + ether_addr_copy(eth->h_dest, dst); + ether_addr_copy(eth->h_source, src); + eth->h_proto = skb->protocol; + + skb_postpush_rcsum(skb, eth, sizeof(*eth)); + + return 0; +} +EXPORT_SYMBOL(skb_eth_push); + /* Update the ethertype of hdr and the skb csum value if required. */ static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 855f2c155956..b87bfc82f44f 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -277,9 +277,11 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, */ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) { - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); + int err; + + err = skb_eth_pop(skb); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_NONE; @@ -290,22 +292,12 @@ static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_eth *ethh) { - struct ethhdr *hdr; - - /* Add the new Ethernet header */ - if (skb_cow_head(skb, ETH_HLEN) < 0) - return -ENOMEM; - - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - hdr = eth_hdr(skb); - ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); - ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); - hdr->h_proto = skb->protocol; + int err; - skb_postpush_rcsum(skb, hdr, ETH_HLEN); + err = skb_eth_push(skb, ethh->addresses.eth_dst, + ethh->addresses.eth_src); + if (err) + return err; /* safe right before invalidate_flow_key */ key->mac_proto = MAC_PROTO_ETHERNET; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index a5ff9f68ab02..8758bd2a78fa 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -77,6 +77,16 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, /* put updated tci as hwaccel tag */ __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; + case TCA_VLAN_ACT_POP_ETH: + err = skb_eth_pop(skb); + if (err) + goto drop; + break; + case TCA_VLAN_ACT_PUSH_ETH: + err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src); + if (err) + goto drop; + break; default: BUG(); } @@ -93,10 +103,13 @@ drop: } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { + [TCA_VLAN_UNSPEC] = { .strict_start_type = 
TCA_VLAN_PUSH_ETH_DST }, [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, + [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR, + [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, @@ -179,6 +192,17 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (tb[TCA_VLAN_PUSH_VLAN_PRIORITY]) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; + case TCA_VLAN_ACT_POP_ETH: + break; + case TCA_VLAN_ACT_PUSH_ETH: + if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) { + if (exists) + tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); + return -EINVAL; + } + break; default: if (exists) tcf_idr_release(*a, bind); @@ -219,6 +243,13 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; + if (action == TCA_VLAN_ACT_PUSH_ETH) { + nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST], + ETH_ALEN); + nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC], + ETH_ALEN); + } + spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); @@ -279,6 +310,15 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, p->tcfv_push_prio)))) goto nla_put_failure; + if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { + if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN, + p->tcfv_push_dst)) + goto nla_put_failure; + if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN, + p->tcfv_push_src)) + goto nla_put_failure; + } + tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; -- cgit v1.2.3 From a45294af9e96a3e060b6272fa7cd2c4b196de335 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:31 +0200 Subject: net/sched: act_mpls: Add action to push MPLS LSE before Ethernet header Define the MAC_PUSH action which pushes an MPLS LSE before the mac header (instead of between the mac and the network headers as the plain PUSH action does). The only special case is when the skb has an offloaded VLAN. In that case, it has to be inlined before pushing the MPLS header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_mpls.h | 1 + net/sched/act_mpls.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h index 9360e95273c7..9e4e8f52a779 100644 --- a/include/uapi/linux/tc_act/tc_mpls.h +++ b/include/uapi/linux/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ #define TCA_MPLS_ACT_PUSH 2 #define TCA_MPLS_ACT_MODIFY 3 #define TCA_MPLS_ACT_DEC_TTL 4 +#define TCA_MPLS_ACT_MAC_PUSH 5 struct tc_mpls { tc_gen; /* generic TC action fields. 
*/ diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 8118e2640979..bb6b715636db 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -87,6 +87,23 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; + case TCA_MPLS_ACT_MAC_PUSH: + if (skb_vlan_tag_present(skb)) { + if (__vlan_insert_inner_tag(skb, skb->vlan_proto, + skb_vlan_tag_get(skb), + ETH_HLEN) < 0) + goto drop; + + skb->protocol = skb->vlan_proto; + __vlan_hwaccel_clear_tag(skb); + } + + new_lse = tcf_mpls_get_lse(NULL, p, mac_len || + !eth_p_mpls(skb->protocol)); + + if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false)) + goto drop; + break; case TCA_MPLS_ACT_MODIFY: new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) @@ -188,6 +205,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } break; case TCA_MPLS_ACT_PUSH: + case TCA_MPLS_ACT_MAC_PUSH: if (!tb[TCA_MPLS_LABEL]) { NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); return -EINVAL; -- cgit v1.2.3 From cf1166349c68816f4259d32559f54972b0d5c1a4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:51 +0200 Subject: net: devlink: Add unused port flavour Not all ports of a switch need to be used, particularly in embedded systems. Add a port flavour for ports which physically exist in the switch, but are not connected to the front panel etc, and so are unused. Having unused ports present in devlink gives a more accurate representation of the hardware. It also allows regions to be associated with such ports, allowing, for example, verifying that unused ports are correctly powered off, or comparing the probable reset defaults of unused ports with those of used ports experiencing issues. Actually registering unused ports and setting the flavour to unused is optional. The DSA core will register all such switch ports, but such ports are expected to be limited in number. Bigger ASICs may decide not to list unused ports. v2: Expand the description about why it is useful Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ba467dc07852..5f1d6c327670 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -197,6 +197,9 @@ enum devlink_port_flavour { * port that faces the PCI VF. */ DEVLINK_PORT_FLAVOUR_VIRTUAL, /* Any virtual port facing the user. */ + DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but + * is not used in any way. + */ }; enum devlink_param_cmode { diff --git a/net/core/devlink.c b/net/core/devlink.c index 0f3c8b2ec056..20224fd1ebaf 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7612,7 +7612,8 @@ static bool devlink_port_type_should_warn(struct devlink_port *devlink_port) { /* Ignore CPU and DSA flavours.
*/ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU && - devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA; + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA && + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED; } #define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600) @@ -7897,6 +7898,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, break; case DEVLINK_PORT_FLAVOUR_CPU: case DEVLINK_PORT_FLAVOUR_DSA: + case DEVLINK_PORT_FLAVOUR_UNUSED: /* As CPU and DSA ports do not have a netdevice associated * case should not ever happen. */ -- cgit v1.2.3 From bdbb4e29df8b790db50cb73ce25d23543329f05f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 5 Oct 2020 15:07:38 -0700 Subject: netlink: add mask validation We don't have good validation policy for existing unsigned int attrs which serve as flags (for new ones we could use NLA_BITFIELD32). With increased use of policy dumping having the validation be expressed as part of the policy is important. Add validation policy in form of a mask of supported/valid bits. Support u64 in the uAPI to be future-proof, but really for now the embedded mask member can only hold 32 bits, so anything with bit 32+ set will always fail validation. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 10 ++++++++++ include/uapi/linux/netlink.h | 2 ++ lib/nlattr.c | 36 ++++++++++++++++++++++++++++++++++++ net/netlink/policy.c | 8 ++++++++ 4 files changed, 56 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netlink.h b/include/net/netlink.h index c5aa46f379bc..2b9e41075f19 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -200,6 +200,7 @@ enum nla_policy_validation { NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, + NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; @@ -317,6 +318,7 @@ struct nla_policy { u16 len; union { const u32 bitfield32_valid; + const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; struct netlink_range_validation *range; @@ -368,6 +370,8 @@ struct nla_policy { (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) +#define NLA_ENSURE_UINT_TYPE(tp) \ + (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ @@ -416,6 +420,12 @@ struct nla_policy { .max = _max, \ } +#define NLA_POLICY_MASK(tp, _mask) { \ + .type = NLA_ENSURE_UINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_MASK, \ + .mask = _mask, \ +} + #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) 
{ \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index eac8a6a648ea..d02e472ba54c 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -331,6 +331,7 @@ enum netlink_attribute_type { * the index, if limited inside the nesting (U32) * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment */ enum netlink_policy_type_attr { @@ -346,6 +347,7 @@ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, NL_POLICY_TYPE_ATTR_PAD, + NL_POLICY_TYPE_ATTR_MASK, /* keep last */ __NL_POLICY_TYPE_ATTR_MAX, diff --git a/lib/nlattr.c b/lib/nlattr.c index 80ff9fe83696..9c99f5daa4d2 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -323,6 +323,37 @@ static int nla_validate_int_range(const struct nla_policy *pt, } } +static int nla_validate_mask(const struct nla_policy *pt, + const struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + u64 value; + + switch (pt->type) { + case NLA_U8: + value = nla_get_u8(nla); + break; + case NLA_U16: + value = nla_get_u16(nla); + break; + case NLA_U32: + value = nla_get_u32(nla); + break; + case NLA_U64: + value = nla_get_u64(nla); + break; + default: + return -EINVAL; + } + + if (value & ~(u64)pt->mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); + return -EINVAL; + } + + return 0; +} + static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) @@ -503,6 +534,11 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (err) return err; break; + case NLA_VALIDATE_MASK: + err = nla_validate_mask(pt, nla, extack); + if (err) + return err; + break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); diff --git a/net/netlink/policy.c b/net/netlink/policy.c index cf23c0151721..ee26d01328ee 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -263,6 +263,14 @@ send_attribute: else type = NL_ATTR_TYPE_U64; + if (pt->validation_type == NLA_VALIDATE_MASK) { + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK, + pt->mask, + NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + nla_get_range_unsigned(pt, &range); if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, -- cgit v1.2.3 From eb88531bdbfaafb827192d1fc6c5a3fcc4fadd96 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sun, 27 Sep 2020 01:24:31 +0900 Subject: can: raw: add missing error queue support Error queues are not yet implemented in CAN-raw sockets. The problem: a userland call to recvmsg(soc, msg, MSG_ERRQUEUE) on a CAN-raw socket would unqueue messages from the normal queue without any kind of error or warning. As such, it prevented CAN drivers from using the functionalities that rely on the error queue such as skb_tx_timestamp(). SCM_CAN_RAW_ERRQUEUE is defined as the type for the CAN raw error queue. SCM stands for "Socket control messages". The name is inspired by SCM_J1939_ERRQUEUE of include/uapi/linux/can/j1939.h.
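For illustration only (a hypothetical userland sketch, not part of this patch; the socket setup and the function name are assumptions), draining the error queue for TX timestamps could look like:

#include <sys/socket.h>
#include <linux/can.h>
#include <linux/errqueue.h>

/* 's' is assumed to be a bound CAN_RAW socket on which SO_TIMESTAMPING
 * was already enabled with the desired TX flags. */
static void drain_can_errqueue(int s)
{
	struct can_frame frame;
	char control[512];
	struct iovec iov = { .iov_base = &frame, .iov_len = sizeof(frame) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg;

	/* with this patch, MSG_ERRQUEUE reads the error queue instead of
	 * silently consuming frames from the normal receive queue */
	if (recvmsg(s, &msg, MSG_ERRQUEUE) < 0)
		return;

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_TIMESTAMPING) {
			struct scm_timestamping *tss = (void *)CMSG_DATA(cmsg);

			(void)tss; /* tss->ts[0] holds the software TX timestamp */
		}
	}
}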
Signed-off-by: Vincent Mailhol
Link: https://lore.kernel.org/r/20200926162527.270030-1-mailhol.vincent@wanadoo.fr
Acked-by: Oliver Hartkopp
Signed-off-by: Marc Kleine-Budde
---
 include/uapi/linux/can/raw.h | 3 +++
 net/can/raw.c                | 4 ++++
 2 files changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h
index 6a11d308eb5c..3386aa81fdf2 100644
--- a/include/uapi/linux/can/raw.h
+++ b/include/uapi/linux/can/raw.h
@@ -49,6 +49,9 @@
 #include

 #define SOL_CAN_RAW (SOL_CAN_BASE + CAN_RAW)
+enum {
+	SCM_CAN_RAW_ERRQUEUE = 1,
+};

 /* for socket options affecting the socket (not the global system) */
diff --git a/net/can/raw.c b/net/can/raw.c
index 24db4b4afdc7..ea70850f9152 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -804,6 +804,10 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 	noblock = flags & MSG_DONTWAIT;
 	flags &= ~MSG_DONTWAIT;

+	if (flags & MSG_ERRQUEUE)
+		return sock_recv_errqueue(sk, msg, size,
+					  SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE);
+
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
 		return err;
-- cgit v1.2.3

From 49f3d12b0f70ea867b891ad2a97f6e51bb564e18 Mon Sep 17 00:00:00 2001
From: Jakub Wilk
Date: Wed, 7 Oct 2020 07:57:17 +0200
Subject: bpf: Fix typo in uapi/linux/bpf.h

Reported-by: Samanta Navarro
Signed-off-by: Jakub Wilk
Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20201007055717.7319-1-jwilk@jwilk.net
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c446394135be..d83561e8cd2c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2253,7 +2253,7 @@ union bpf_attr {
 *	Description
 *		This helper is used in programs implementing policies at the
 *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
-*		if the verdeict eBPF program returns **SK_PASS**), redirect it
+*		if the verdict eBPF program returns **SK_PASS**), redirect it
 *		to the socket referenced by *map* (of type
 *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
 *		egress interfaces can be used for redirection. The
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c446394135be..d83561e8cd2c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2253,7 +2253,7 @@ union bpf_attr {
 *	Description
 *		This helper is used in programs implementing policies at the
 *		skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
-*		if the verdeict eBPF program returns **SK_PASS**), redirect it
+*		if the verdict eBPF program returns **SK_PASS**), redirect it
 *		to the socket referenced by *map* (of type
 *		**BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
 *		egress interfaces can be used for redirection. The
-- cgit v1.2.3

From fb1ff4c1941573aea59e4cb575dc5a723303cd70 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan
Date: Mon, 5 Oct 2020 20:36:45 +0300
Subject: vfio/fsl-mc: Add VFIO framework skeleton for fsl-mc devices

DPAA2 (Data Path Acceleration Architecture) consists of mechanisms for
processing Ethernet packets, queue management, accelerators, etc.

The Management Complex (mc) is a hardware entity that manages the DPAA2
hardware resources. It provides an object-based abstraction for
software drivers to use the DPAA2 hardware.
The MC mediates operations such as the creation, discovery and
destruction of DPAA2 objects. The MC provides memory-mapped I/O command
interfaces (MC portals) which DPAA2 software drivers use to operate on
DPAA2 objects.

A DPRC is a container object that holds other types of DPAA2 objects.
Each object in the DPRC is a Linux device and is bound to a driver. The
MC-bus driver is a platform driver (the fsl-mc bus itself is distinct
from the PCI and platform buses). The DPRC driver does runtime
management of a bus instance. It performs the initial scan of the DPRC
and handles changes in the DPRC configuration (adding/removing
objects).

All objects inside a container share the same hardware isolation
context, meaning that only an entire DPRC can be assigned to a virtual
machine. When a container is assigned to a virtual machine, all the
objects within that container are assigned to that virtual machine.
The guest is not allowed to change the contents (add/remove objects)
of a DPRC container assigned to it; the restriction is set by the host
and enforced by the mc hardware.

The DPAA2 objects can be directly assigned to the guest. However, the
MC portals (the memory-mapped command interface to the MC) need to be
emulated, because there are commands that configure interrupts and
isolation IDs, which are virtual in the guest.

Example:
echo vfio-fsl-mc > /sys/bus/fsl-mc/devices/dprc.2/driver_override
echo dprc.2 > /sys/bus/fsl-mc/drivers/vfio-fsl-mc/bind

dprc.2 is bound to the VFIO driver, and all the objects within dprc.2
will then be bound to the VFIO driver as well.

This patch adds the infrastructure for VFIO support for fsl-mc
devices. Subsequent patches will add support for binding and securely
assigning these devices using VFIO.

More details about the DPAA2 objects can be found here:
Documentation/networking/device_drivers/freescale/dpaa2/overview.rst

Signed-off-by: Bharat Bhushan
Signed-off-by: Diana Craciun
Reviewed-by: Eric Auger
Signed-off-by: Alex Williamson
---
 MAINTAINERS                               |   6 ++
 drivers/vfio/Kconfig                      |   1 +
 drivers/vfio/Makefile                     |   1 +
 drivers/vfio/fsl-mc/Kconfig               |   9 ++
 drivers/vfio/fsl-mc/Makefile              |   4 +
 drivers/vfio/fsl-mc/vfio_fsl_mc.c         | 157 ++++++++++++++++++++++++++++++
 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h |  14 +++
 include/uapi/linux/vfio.h                 |   1 +
 8 files changed, 193 insertions(+)
 create mode 100644 drivers/vfio/fsl-mc/Kconfig
 create mode 100644 drivers/vfio/fsl-mc/Makefile
 create mode 100644 drivers/vfio/fsl-mc/vfio_fsl_mc.c
 create mode 100644 drivers/vfio/fsl-mc/vfio_fsl_mc_private.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index d746519253c3..e955a00af046 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18260,6 +18260,12 @@ F:	drivers/vfio/
 F:	include/linux/vfio.h
 F:	include/uapi/linux/vfio.h

+VFIO FSL-MC DRIVER
+M:	Diana Craciun
+L:	kvm@vger.kernel.org
+S:	Maintained
+F:	drivers/vfio/fsl-mc/
+
 VFIO MEDIATED DEVICE DRIVERS
 M:	Kirti Wankhede
 L:	kvm@vger.kernel.org
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index fd17db9b432f..5533df91b257 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -47,4 +47,5 @@ menuconfig VFIO_NOIOMMU
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
 source "drivers/vfio/mdev/Kconfig"
+source "drivers/vfio/fsl-mc/Kconfig"
 source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index de67c4725cce..fee73f3d9480 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
obj-$(CONFIG_VFIO_PLATFORM) += platform/ obj-$(CONFIG_VFIO_MDEV) += mdev/ +obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/ diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig new file mode 100644 index 000000000000..b1a527d6b6f2 --- /dev/null +++ b/drivers/vfio/fsl-mc/Kconfig @@ -0,0 +1,9 @@ +config VFIO_FSL_MC + tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" + depends on VFIO && FSL_MC_BUS && EVENTFD + help + Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc + (Management Complex) devices. This is required to passthrough + fsl-mc bus devices using the VFIO framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/fsl-mc/Makefile b/drivers/vfio/fsl-mc/Makefile new file mode 100644 index 000000000000..0c6e5d2ddaae --- /dev/null +++ b/drivers/vfio/fsl-mc/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) + +vfio-fsl-mc-y := vfio_fsl_mc.o +obj-$(CONFIG_VFIO_FSL_MC) += vfio-fsl-mc.o diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c new file mode 100644 index 000000000000..a7a483a1e90b --- /dev/null +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* + * Copyright 2013-2016 Freescale Semiconductor Inc. + * Copyright 2016-2017,2019-2020 NXP + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio_fsl_mc_private.h" + +static int vfio_fsl_mc_open(void *device_data) +{ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + return 0; +} + +static void vfio_fsl_mc_release(void *device_data) +{ + module_put(THIS_MODULE); +} + +static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_GET_REGION_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_GET_IRQ_INFO: + { + return -ENOTTY; + } + case VFIO_DEVICE_SET_IRQS: + { + return -ENOTTY; + } + case VFIO_DEVICE_RESET: + { + return -ENOTTY; + } + default: + return -ENOTTY; + } +} + +static ssize_t vfio_fsl_mc_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + return -EINVAL; +} + +static ssize_t vfio_fsl_mc_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + return -EINVAL; +} + +static int vfio_fsl_mc_mmap(void *device_data, struct vm_area_struct *vma) +{ + return -EINVAL; +} + +static const struct vfio_device_ops vfio_fsl_mc_ops = { + .name = "vfio-fsl-mc", + .open = vfio_fsl_mc_open, + .release = vfio_fsl_mc_release, + .ioctl = vfio_fsl_mc_ioctl, + .read = vfio_fsl_mc_read, + .write = vfio_fsl_mc_write, + .mmap = vfio_fsl_mc_mmap, +}; + +static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) +{ + struct iommu_group *group; + struct vfio_fsl_mc_device *vdev; + struct device *dev = &mc_dev->dev; + int ret; + + group = vfio_iommu_group_get(dev); + if (!group) { + dev_err(dev, "VFIO_FSL_MC: No IOMMU group\n"); + return -EINVAL; + } + + vdev = devm_kzalloc(dev, sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + ret = -ENOMEM; + goto out_group_put; + } + + vdev->mc_dev = mc_dev; + + ret = vfio_add_group_dev(dev, &vfio_fsl_mc_ops, vdev); + if (ret) { + dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); + goto out_group_put; + } + return 0; + +out_group_put: + vfio_iommu_group_put(group, dev); + return ret; +} + +static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) +{ + struct vfio_fsl_mc_device *vdev; + struct device *dev = &mc_dev->dev; + + 
vdev = vfio_del_group_dev(dev);
+	if (!vdev)
+		return -EINVAL;
+
+	vfio_iommu_group_put(mc_dev->dev.iommu_group, dev);
+
+	return 0;
+}
+
+static struct fsl_mc_driver vfio_fsl_mc_driver = {
+	.probe		= vfio_fsl_mc_probe,
+	.remove		= vfio_fsl_mc_remove,
+	.driver	= {
+		.name	= "vfio-fsl-mc",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init vfio_fsl_mc_driver_init(void)
+{
+	return fsl_mc_driver_register(&vfio_fsl_mc_driver);
+}
+
+static void __exit vfio_fsl_mc_driver_exit(void)
+{
+	fsl_mc_driver_unregister(&vfio_fsl_mc_driver);
+}
+
+module_init(vfio_fsl_mc_driver_init);
+module_exit(vfio_fsl_mc_driver_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver");
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
new file mode 100644
index 000000000000..e79cc116f6b8
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2016,2019-2020 NXP
+ */
+
+#ifndef VFIO_FSL_MC_PRIVATE_H
+#define VFIO_FSL_MC_PRIVATE_H
+
+struct vfio_fsl_mc_device {
+	struct fsl_mc_device	*mc_dev;
+};
+
+#endif /* VFIO_FSL_MC_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 920470502329..95deac891378 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -201,6 +201,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_AMBA	(1 << 3)	/* vfio-amba device */
 #define VFIO_DEVICE_FLAGS_CCW	(1 << 4)	/* vfio-ccw device */
 #define VFIO_DEVICE_FLAGS_AP	(1 << 5)	/* vfio-ap device */
+#define VFIO_DEVICE_FLAGS_FSL_MC (1 << 6)	/* vfio-fsl-mc device */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
 };
-- cgit v1.2.3

From 0c633f0be1dc70a6db46d90dba4cdae82073350a Mon Sep 17 00:00:00 2001
From: Matthew Rosato
Date: Wed, 7 Oct 2020 14:56:22 -0400
Subject: vfio: Introduce capability definitions for VFIO_DEVICE_GET_INFO

Allow the VFIO_DEVICE_GET_INFO ioctl to include a capability chain.
Add a flag indicating capability chain support, and introduce the
definitions for the first set of capabilities, which are specific to
s390 zPCI devices.

Signed-off-by: Matthew Rosato
Reviewed-by: Cornelia Huck
Signed-off-by: Alex Williamson
---
 include/uapi/linux/vfio.h      | 11 ++++++
 include/uapi/linux/vfio_zdev.h | 78 ++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 include/uapi/linux/vfio_zdev.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 920470502329..04fbe425ad0c 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -201,8 +201,10 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_AMBA	(1 << 3)	/* vfio-amba device */
 #define VFIO_DEVICE_FLAGS_CCW	(1 << 4)	/* vfio-ccw device */
 #define VFIO_DEVICE_FLAGS_AP	(1 << 5)	/* vfio-ap device */
+#define VFIO_DEVICE_FLAGS_CAPS	(1 << 7)	/* Info supports caps */
 	__u32	num_regions;	/* Max region index + 1 */
 	__u32	num_irqs;	/* Max IRQ index + 1 */
+	__u32	cap_offset;	/* Offset within info struct of first cap */
 };
 #define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)

@@ -218,6 +220,15 @@ struct vfio_device_info {
 #define VFIO_DEVICE_API_CCW_STRING		"vfio-ccw"
 #define VFIO_DEVICE_API_AP_STRING		"vfio-ap"

+/*
+ * The following capabilities are unique to s390 zPCI devices.
+ * Their contents are further defined in vfio_zdev.h.
+ */
+#define VFIO_DEVICE_INFO_CAP_ZPCI_BASE		1
+#define VFIO_DEVICE_INFO_CAP_ZPCI_GROUP		2
+#define VFIO_DEVICE_INFO_CAP_ZPCI_UTIL		3
+#define VFIO_DEVICE_INFO_CAP_ZPCI_PFIP		4
+
 /**
  * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
  *				       struct vfio_region_info)
diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h
new file mode 100644
index 000000000000..b4309397b6b2
--- /dev/null
+++ b/include/uapi/linux/vfio_zdev.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * VFIO Region definitions for ZPCI devices
+ *
+ * Copyright IBM Corp. 2020
+ *
+ * Author(s): Pierre Morel
+ *	      Matthew Rosato
+ */
+
+#ifndef _VFIO_ZDEV_H_
+#define _VFIO_ZDEV_H_
+
+#include
+#include
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_BASE - Base PCI Function information
+ *
+ * This capability provides a set of descriptive information about the
+ * associated PCI function.
+ */
+struct vfio_device_info_cap_zpci_base {
+	struct vfio_info_cap_header header;
+	__u64 start_dma;	/* Start of available DMA addresses */
+	__u64 end_dma;		/* End of available DMA addresses */
+	__u16 pchid;		/* Physical Channel ID */
+	__u16 vfn;		/* Virtual function number */
+	__u16 fmb_length;	/* Measurement Block Length (in bytes) */
+	__u8 pft;		/* PCI Function Type */
+	__u8 gid;		/* PCI function group ID */
+};
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_GROUP - Base PCI Function Group information
+ *
+ * This capability provides a set of descriptive information about the group of
+ * PCI functions that the associated device belongs to.
+ */
+struct vfio_device_info_cap_zpci_group {
+	struct vfio_info_cap_header header;
+	__u64 dasm;		/* DMA Address space mask */
+	__u64 msi_addr;		/* MSI address */
+	__u64 flags;
+#define VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH 1 /* Program-specified TLB refresh */
+	__u16 mui;		/* Measurement Block Update Interval */
+	__u16 noi;		/* Maximum number of MSIs */
+	__u16 maxstbl;		/* Maximum Store Block Length */
+	__u8 version;		/* Supported PCI Version */
+};
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_UTIL - Utility String
+ *
+ * This capability provides the utility string for the associated device, which
+ * is a device identifier string made up of EBCDIC characters. 'size' specifies
+ * the length of 'util_str'.
+ */
+struct vfio_device_info_cap_zpci_util {
+	struct vfio_info_cap_header header;
+	__u32 size;
+	__u8 util_str[];
+};
+
+/**
+ * VFIO_DEVICE_INFO_CAP_ZPCI_PFIP - PCI Function Path
+ *
+ * This capability provides the PCI function path string, which is an identifier
+ * that describes the internal hardware path of the device. 'size' specifies
+ * the length of 'pfip'.
+ */
+struct vfio_device_info_cap_zpci_pfip {
+	struct vfio_info_cap_header header;
+	__u32 size;
+	__u8 pfip[];
+};
+
+#endif
-- cgit v1.2.3

From e057dd3fc20ffb3d7f150af46542a51b59b90127 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp
Date: Mon, 28 Sep 2020 22:04:04 +0200
Subject: can: add ISO 15765-2:2016 transport protocol

CAN Transport Protocols offer support for segmented Point-to-Point
communication between CAN nodes via two defined CAN Identifiers.
As CAN frames can only transport a small number of data bytes (max.
8 bytes for 'classic' CAN and max. 64 bytes for CAN FD), this
segmentation is needed to transport longer PDUs, as required e.g. for
vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic.
This protocol driver implements data transfers according to
ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types.
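As an illustration only (not part of this patch), a minimal userland
sketch of the new socket usage might look like this. The helper name,
the 0x7E0/0x7E8 identifier pair (a typical UDS request/response pair)
and the interface lookup are assumptions made for the example, and all
error checking is omitted:

	/* Hypothetical userland helper: send one complete PDU over
	 * ISO-TP. Assumes <linux/can.h>, <linux/can/isotp.h>,
	 * <net/if.h>, <string.h>, <sys/ioctl.h>, <sys/socket.h> and
	 * <unistd.h> are included.
	 */
	static int isotp_send_pdu(const char *ifname, const void *pdu,
				  size_t len)
	{
		struct sockaddr_can addr = { .can_family = AF_CAN };
		struct ifreq ifr = { 0 };
		int s = socket(PF_CAN, SOCK_DGRAM, CAN_ISOTP);

		addr.can_addr.tp.tx_id = 0x7e0;
		addr.can_addr.tp.rx_id = 0x7e8;

		strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
		ioctl(s, SIOCGIFINDEX, &ifr);
		addr.can_ifindex = ifr.ifr_ifindex;

		bind(s, (struct sockaddr *)&addr, sizeof(addr));

		/* a single write() hands over the whole PDU; the
		 * protocol layer segments it into first/consecutive
		 * frames and honours the peer's flow control (BS/STmin)
		 */
		return write(s, pdu, len);
	}

Segmentation and flow control are thus handled entirely inside the
kernel; userland deals only with complete PDUs.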
Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20200928200404.82229-1-socketcan@hartkopp.net [mkl: Removed "WITH Linux-syscall-note" from isotp.c. Fixed indention, a checkpatch warning and typos. Replaced __u{8,32} by u{8,32}. Removed always false (optlen < 0) check in isotp_setsockopt().] Signed-off-by: Marc Kleine-Budde --- MAINTAINERS | 1 + include/uapi/linux/can/isotp.h | 166 +++++ net/can/Kconfig | 13 + net/can/Makefile | 3 + net/can/isotp.c | 1426 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1609 insertions(+) create mode 100644 include/uapi/linux/can/isotp.h create mode 100644 net/can/isotp.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index d651a0934be7..7a8a53adba91 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3912,6 +3912,7 @@ F: include/net/netns/can.h F: include/uapi/linux/can.h F: include/uapi/linux/can/bcm.h F: include/uapi/linux/can/gw.h +F: include/uapi/linux/can/isotp.h F: include/uapi/linux/can/raw.h F: net/can/ diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h new file mode 100644 index 000000000000..553006509f4e --- /dev/null +++ b/include/uapi/linux/can/isotp.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * linux/can/isotp.h + * + * Definitions for isotp CAN sockets (ISO 15765-2:2016) + * + * Copyright (c) 2020 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +#ifndef _UAPI_CAN_ISOTP_H +#define _UAPI_CAN_ISOTP_H + +#include +#include + +#define SOL_CAN_ISOTP (SOL_CAN_BASE + CAN_ISOTP) + +/* for socket options affecting the socket (not the global system) */ + +#define CAN_ISOTP_OPTS 1 /* pass struct can_isotp_options */ + +#define CAN_ISOTP_RECV_FC 2 /* pass struct can_isotp_fc_options */ + +/* sockopts to force stmin timer values for protocol regression tests */ + +#define CAN_ISOTP_TX_STMIN 3 /* pass __u32 value in nano secs */ + /* use this time instead of value */ + /* provided in FC from the receiver */ + +#define CAN_ISOTP_RX_STMIN 4 /* pass __u32 value in nano secs */ + /* ignore received CF frames which */ + /* timestamps differ less than val */ + +#define CAN_ISOTP_LL_OPTS 5 /* pass struct can_isotp_ll_options */ + +struct can_isotp_options { + + __u32 flags; /* set flags for isotp behaviour. */ + /* __u32 value : flags see below */ + + __u32 frame_txtime; /* frame transmission time (N_As/N_Ar) */ + /* __u32 value : time in nano secs */ + + __u8 ext_address; /* set address for extended addressing */ + /* __u8 value : extended address */ + + __u8 txpad_content; /* set content of padding byte (tx) */ + /* __u8 value : content on tx path */ + + __u8 rxpad_content; /* set content of padding byte (rx) */ + /* __u8 value : content on rx path */ + + __u8 rx_ext_address; /* set address for extended addressing */ + /* __u8 value : extended address (rx) */ +}; + +struct can_isotp_fc_options { + + __u8 bs; /* blocksize provided in FC frame */ + /* __u8 value : blocksize. 0 = off */ + + __u8 stmin; /* separation time provided in FC frame */ + /* __u8 value : */ + /* 0x00 - 0x7F : 0 - 127 ms */ + /* 0x80 - 0xF0 : reserved */ + /* 0xF1 - 0xF9 : 100 us - 900 us */ + /* 0xFA - 0xFF : reserved */ + + __u8 wftmax; /* max. number of wait frame transmiss. */ + /* __u8 value : 0 = omit FC N_PDU WT */ +}; + +struct can_isotp_ll_options { + + __u8 mtu; /* generated & accepted CAN frame type */ + /* __u8 value : */ + /* CAN_MTU (16) -> standard CAN 2.0 */ + /* CANFD_MTU (72) -> CAN FD frame */ + + __u8 tx_dl; /* tx link layer data length in bytes */ + /* (configured maximum payload length) */ + /* __u8 value : 8,12,16,20,24,32,48,64 */ + /* => rx path supports all LL_DL values */ + + __u8 tx_flags; /* set into struct canfd_frame.flags */ + /* at frame creation: e.g. 
CANFD_BRS */ + /* Obsolete when the BRS flag is fixed */ + /* by the CAN netdriver configuration */ +}; + +/* flags for isotp behaviour */ + +#define CAN_ISOTP_LISTEN_MODE 0x001 /* listen only (do not send FC) */ +#define CAN_ISOTP_EXTEND_ADDR 0x002 /* enable extended addressing */ +#define CAN_ISOTP_TX_PADDING 0x004 /* enable CAN frame padding tx path */ +#define CAN_ISOTP_RX_PADDING 0x008 /* enable CAN frame padding rx path */ +#define CAN_ISOTP_CHK_PAD_LEN 0x010 /* check received CAN frame padding */ +#define CAN_ISOTP_CHK_PAD_DATA 0x020 /* check received CAN frame padding */ +#define CAN_ISOTP_HALF_DUPLEX 0x040 /* half duplex error state handling */ +#define CAN_ISOTP_FORCE_TXSTMIN 0x080 /* ignore stmin from received FC */ +#define CAN_ISOTP_FORCE_RXSTMIN 0x100 /* ignore CFs depending on rx stmin */ +#define CAN_ISOTP_RX_EXT_ADDR 0x200 /* different rx extended addressing */ +#define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ + + +/* default values */ + +#define CAN_ISOTP_DEFAULT_FLAGS 0 +#define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 +#define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_RECV_BS 0 +#define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 +#define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * Remark on CAN_ISOTP_DEFAULT_RECV_* values: + * + * We can strongly assume, that the Linux Kernel implementation of + * CAN_ISOTP is capable to run with BS=0, STmin=0 and WFTmax=0. + * But as we like to be able to behave as a commonly available ECU, + * these default settings can be changed via sockopts. + * For that reason the STmin value is intentionally _not_ checked for + * consistency and copied directly into the flow control (FC) frame. + * + */ + +#endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/Kconfig b/net/can/Kconfig index 25436a715db3..021fe03a8ed6 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -55,6 +55,19 @@ config CAN_GW source "net/can/j1939/Kconfig" +config CAN_ISOTP + tristate "ISO 15765-2:2016 CAN transport protocol" + default y + help + CAN Transport Protocols offer support for segmented Point-to-Point + communication between CAN nodes via two defined CAN Identifiers. + As CAN frames can only transport a small amount of data bytes + (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this + segmentation is needed to transport longer PDUs as needed e.g. for + vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. + This protocol driver implements data transfers according to + ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. + source "drivers/net/can/Kconfig" endif diff --git a/net/can/Makefile b/net/can/Makefile index 08bd217fc051..58f2c31c1ef3 100644 --- a/net/can/Makefile +++ b/net/can/Makefile @@ -17,3 +17,6 @@ obj-$(CONFIG_CAN_GW) += can-gw.o can-gw-y := gw.o obj-$(CONFIG_CAN_J1939) += j1939/ + +obj-$(CONFIG_CAN_ISOTP) += can-isotp.o +can-isotp-y := isotp.o diff --git a/net/can/isotp.c b/net/can/isotp.c new file mode 100644 index 000000000000..e6ff032b5426 --- /dev/null +++ b/net/can/isotp.c @@ -0,0 +1,1426 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN + * + * This implementation does not provide ISO-TP specific return values to the + * userspace. 
+ * + * - RX path timeout of data reception leads to -ETIMEDOUT + * - RX path SN mismatch leads to -EILSEQ + * - RX path data reception with wrong padding leads to -EBADMSG + * - TX path flowcontrol reception timeout leads to -ECOMM + * - TX path flowcontrol reception overflow leads to -EMSGSIZE + * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG + * - when a transfer (tx) is on the run the next write() blocks until it's done + * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent + * - as we have static buffers the check whether the PDU fits into the buffer + * is done at FF reception time (no support for sending 'wait frames') + * - take care of the tx-queue-len as traffic shaping is still on the TODO list + * + * Copyright (c) 2020 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAN_ISOTP_VERSION "20200928" + +MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Oliver Hartkopp "); +MODULE_ALIAS("can-proto-6"); + +#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ + (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ + (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) + +/* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can + * take full 32 bit values (4 Gbyte). We would need some good concept to handle + * this between user space and kernel space. 
For now increase the static buffer + * to something about 8 kbyte to be able to test this new functionality. + */ +#define MAX_MSG_LENGTH 8200 + +/* N_PCI type values in bits 7-4 of N_PCI bytes */ +#define N_PCI_SF 0x00 /* single frame */ +#define N_PCI_FF 0x10 /* first frame */ +#define N_PCI_CF 0x20 /* consecutive frame */ +#define N_PCI_FC 0x30 /* flow control */ + +#define N_PCI_SZ 1 /* size of the PCI byte #1 */ +#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ +#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ +#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ +#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ +#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ + +#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) + +/* Flow Status given in FC frame */ +#define ISOTP_FC_CTS 0 /* clear to send */ +#define ISOTP_FC_WT 1 /* wait */ +#define ISOTP_FC_OVFLW 2 /* overflow */ + +enum { + ISOTP_IDLE = 0, + ISOTP_WAIT_FIRST_FC, + ISOTP_WAIT_FC, + ISOTP_WAIT_DATA, + ISOTP_SENDING +}; + +struct tpcon { + int idx; + int len; + u8 state; + u8 bs; + u8 sn; + u8 ll_dl; + u8 buf[MAX_MSG_LENGTH + 1]; +}; + +struct isotp_sock { + struct sock sk; + int bound; + int ifindex; + canid_t txid; + canid_t rxid; + ktime_t tx_gap; + ktime_t lastrxcf_tstamp; + struct hrtimer rxtimer, txtimer; + struct can_isotp_options opt; + struct can_isotp_fc_options rxfc, txfc; + struct can_isotp_ll_options ll; + u32 force_tx_stmin; + u32 force_rx_stmin; + struct tpcon rx, tx; + struct notifier_block notifier; + wait_queue_head_t wait; +}; + +static inline struct isotp_sock *isotp_sk(const struct sock *sk) +{ + return (struct isotp_sock *)sk; +} + +static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + rxtimer); + struct sock *sk = &so->sk; + + if (so->rx.state == ISOTP_WAIT_DATA) { + /* we did not get new data frames in time */ + + /* report 'connection timed out' */ + sk->sk_err = ETIMEDOUT; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset rx state */ + so->rx.state = ISOTP_IDLE; + } + + return HRTIMER_NORESTART; +} + +static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) +{ + struct net_device *dev; + struct sk_buff *nskb; + struct canfd_frame *ncf; + struct isotp_sock *so = isotp_sk(sk); + int can_send_ret; + + nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); + if (!nskb) + return 1; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) { + kfree_skb(nskb); + return 1; + } + + can_skb_reserve(nskb); + can_skb_prv(nskb)->ifindex = dev->ifindex; + can_skb_prv(nskb)->skbcnt = 0; + + nskb->dev = dev; + can_skb_set_owner(nskb, sk); + ncf = (struct canfd_frame *)nskb->data; + skb_put(nskb, so->ll.mtu); + + /* create & send flow control reply */ + ncf->can_id = so->txid; + + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); + ncf->len = CAN_MAX_DLEN; + } else { + ncf->len = ae + FC_CONTENT_SZ; + } + + ncf->data[ae] = N_PCI_FC | flowstatus; + ncf->data[ae + 1] = so->rxfc.bs; + ncf->data[ae + 2] = so->rxfc.stmin; + + if (ae) + ncf->data[0] = so->opt.ext_address; + + if (so->ll.mtu == CANFD_MTU) + ncf->flags = so->ll.tx_flags; + + can_send_ret = can_send(nskb, 1); + if (can_send_ret) + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); + 
+ dev_put(dev); + + /* reset blocksize counter */ + so->rx.bs = 0; + + /* reset last CF frame rx timestamp for rx stmin enforcement */ + so->lastrxcf_tstamp = ktime_set(0, 0); + + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + return 0; +} + +static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; + + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); + + memset(addr, 0, sizeof(*addr)); + addr->can_family = AF_CAN; + addr->can_ifindex = skb->dev->ifindex; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} + +static u8 padlen(u8 datalen) +{ + const u8 plen[] = {8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ + 12, 12, 12, 12, /* 9 - 12 */ + 16, 16, 16, 16, /* 13 - 16 */ + 20, 20, 20, 20, /* 17 - 20 */ + 24, 24, 24, 24, /* 21 - 24 */ + 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ + 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ + 48, 48, 48, 48, 48, 48, 48, 48}; /* 41 - 48 */ + + if (datalen > 48) + return 64; + + return plen[datalen]; +} + +/* check for length optimization and return 1/true when the check fails */ +static int check_optimized(struct canfd_frame *cf, int start_index) +{ + /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the + * padding would start at this point. E.g. if the padding would + * start at cf.data[7] cf->len has to be 7 to be optimal. + * Note: The data[] index starts with zero. + */ + if (cf->len <= CAN_MAX_DLEN) + return (cf->len != start_index); + + /* This relation is also valid in the non-linear DLC range, where + * we need to take care of the minimal next possible CAN_DL. + * The correct check would be (padlen(cf->len) != padlen(start_index)). + * But as cf->len can only take discrete values from 12, .., 64 at this + * point the padlen(cf->len) is always equal to cf->len. 
+ */ + return (cf->len != padlen(start_index)); +} + +/* check padding and return 1/true when the check fails */ +static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, + int start_index, u8 content) +{ + int i; + + /* no RX_PADDING value => check length of optimized frame length */ + if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { + if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) + return check_optimized(cf, start_index); + + /* no valid test against empty value => ignore frame */ + return 1; + } + + /* check datalength of correctly padded CAN frame */ + if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && + cf->len != padlen(cf->len)) + return 1; + + /* check padding content */ + if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { + for (i = start_index; i < cf->len; i++) + if (cf->data[i] != content) + return 1; + } + return 0; +} + +static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) +{ + struct sock *sk = &so->sk; + + if (so->tx.state != ISOTP_WAIT_FC && + so->tx.state != ISOTP_WAIT_FIRST_FC) + return 0; + + hrtimer_cancel(&so->txtimer); + + if ((cf->len < ae + FC_CONTENT_SZ) || + ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + return 1; + } + + /* get communication parameters only from the first FC frame */ + if (so->tx.state == ISOTP_WAIT_FIRST_FC) { + so->txfc.bs = cf->data[ae + 1]; + so->txfc.stmin = cf->data[ae + 2]; + + /* fix wrong STmin values according spec */ + if (so->txfc.stmin > 0x7F && + (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) + so->txfc.stmin = 0x7F; + + so->tx_gap = ktime_set(0, 0); + /* add transmission time for CAN frame N_As */ + so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + /* add waiting time for consecutive frames N_Cs */ + if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->force_tx_stmin); + else if (so->txfc.stmin < 0x80) + so->tx_gap = ktime_add_ns(so->tx_gap, + so->txfc.stmin * 1000000); + else + so->tx_gap = ktime_add_ns(so->tx_gap, + (so->txfc.stmin - 0xF0) + * 100000); + so->tx.state = ISOTP_WAIT_FC; + } + + switch (cf->data[ae] & 0x0F) { + case ISOTP_FC_CTS: + so->tx.bs = 0; + so->tx.state = ISOTP_SENDING; + /* start cyclic timer for sending CF frame */ + hrtimer_start(&so->txtimer, so->tx_gap, + HRTIMER_MODE_REL_SOFT); + break; + + case ISOTP_FC_WT: + /* start timer to wait for next FC frame */ + hrtimer_start(&so->txtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + break; + + case ISOTP_FC_OVFLW: + /* overflow on receiver side - report 'message too long' */ + sk->sk_err = EMSGSIZE; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + fallthrough; + + default: + /* stop this tx job */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + } + return 0; +} + +static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, + struct sk_buff *skb, int len) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + if (!len || len > cf->len - pcilen) + return 1; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); 
+ return 1; + } + + nskb = alloc_skb(len, gfp_any()); + if (!nskb) + return 1; + + memcpy(skb_put(nskb, len), &cf->data[pcilen], len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; +} + +static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) +{ + struct isotp_sock *so = isotp_sk(sk); + int i; + int off; + int ff_pci_sz; + + hrtimer_cancel(&so->rxtimer); + so->rx.state = ISOTP_IDLE; + + /* get the used sender LL_DL from the (first) CAN frame data length */ + so->rx.ll_dl = padlen(cf->len); + + /* the first frame has to use the entire frame up to LL_DL length */ + if (cf->len != so->rx.ll_dl) + return 1; + + /* get the FF_DL */ + so->rx.len = (cf->data[ae] & 0x0F) << 8; + so->rx.len += cf->data[ae + 1]; + + /* Check for FF_DL escape sequence supporting 32 bit PDU length */ + if (so->rx.len) { + ff_pci_sz = FF_PCI_SZ12; + } else { + /* FF_DL = 0 => get real length from next 4 bytes */ + so->rx.len = cf->data[ae + 2] << 24; + so->rx.len += cf->data[ae + 3] << 16; + so->rx.len += cf->data[ae + 4] << 8; + so->rx.len += cf->data[ae + 5]; + ff_pci_sz = FF_PCI_SZ32; + } + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; + + if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) + return 1; + + if (so->rx.len > MAX_MSG_LENGTH) { + /* send FC frame with overflow status */ + isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); + return 1; + } + + /* copy the first received data bytes */ + so->rx.idx = 0; + for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) + so->rx.buf[so->rx.idx++] = cf->data[i]; + + /* initial setup for this pdu reception */ + so->rx.sn = 1; + so->rx.state = ISOTP_WAIT_DATA; + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* send our first FC frame */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, + struct sk_buff *skb) +{ + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *nskb; + int i; + + if (so->rx.state != ISOTP_WAIT_DATA) + return 0; + + /* drop if timestamp gap is less than force_rx_stmin nano secs */ + if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { + if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < + so->force_rx_stmin) + return 0; + + so->lastrxcf_tstamp = skb->tstamp; + } + + hrtimer_cancel(&so->rxtimer); + + /* CFs are never longer than the FF */ + if (cf->len > so->rx.ll_dl) + return 1; + + /* CFs have usually the LL_DL length */ + if (cf->len < so->rx.ll_dl) { + /* this is only allowed for the last CF */ + if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) + return 1; + } + + if ((cf->data[ae] & 0x0F) != so->rx.sn) { + /* wrong sn detected - report 'illegal byte sequence' */ + sk->sk_err = EILSEQ; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset rx state */ + so->rx.state = ISOTP_IDLE; + return 1; + } + so->rx.sn++; + so->rx.sn %= 16; + + for (i = ae + N_PCI_SZ; i < cf->len; i++) { + so->rx.buf[so->rx.idx++] = cf->data[i]; + if (so->rx.idx >= so->rx.len) + break; + } + + if (so->rx.idx >= so->rx.len) { + /* we are done */ + so->rx.state = ISOTP_IDLE; + + if ((so->opt.flags & ISOTP_CHECK_PADDING) && + check_pad(so, cf, i + 1, so->opt.rxpad_content)) { + /* malformed PDU - report 'not a data message' */ + sk->sk_err = EBADMSG; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + return 1; + } + + nskb = alloc_skb(so->rx.len, gfp_any()); + if (!nskb) + return 1; + 
+ memcpy(skb_put(nskb, so->rx.len), so->rx.buf, + so->rx.len); + + nskb->tstamp = skb->tstamp; + nskb->dev = skb->dev; + isotp_rcv_skb(nskb, sk); + return 0; + } + + /* no creation of flow control frames */ + if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) + return 0; + + /* perform blocksize handling, if enabled */ + if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { + /* start rx timeout watchdog */ + hrtimer_start(&so->rxtimer, ktime_set(1, 0), + HRTIMER_MODE_REL_SOFT); + return 0; + } + + /* we reached the specified blocksize so->rxfc.bs */ + isotp_send_fc(sk, ae, ISOTP_FC_CTS); + return 0; +} + +static void isotp_rcv(struct sk_buff *skb, void *data) +{ + struct sock *sk = (struct sock *)data; + struct isotp_sock *so = isotp_sk(sk); + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; + u8 n_pci_type, sf_dl; + + /* Strictly receive only frames with the configured MTU size + * => clear separation of CAN2.0 / CAN FD transport channels + */ + if (skb->len != so->ll.mtu) + return; + + cf = (struct canfd_frame *)skb->data; + + /* if enabled: check reception of my configured extended address */ + if (ae && cf->data[0] != so->opt.rx_ext_address) + return; + + n_pci_type = cf->data[ae] & 0xF0; + + if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { + /* check rx/tx path half duplex expectations */ + if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || + (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) + return; + } + + switch (n_pci_type) { + case N_PCI_FC: + /* tx path: flow control frame containing the FC parameters */ + isotp_rcv_fc(so, cf, ae); + break; + + case N_PCI_SF: + /* rx path: single frame + * + * As we do not have a rx.ll_dl configuration, we can only test + * if the CAN frames payload length matches the LL_DL == 8 + * requirements - no matter if it's CAN 2.0 or CAN FD + */ + + /* get the SF_DL from the N_PCI byte */ + sf_dl = cf->data[ae] & 0x0F; + + if (cf->len <= CAN_MAX_DLEN) { + isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); + } else { + if (skb->len == CANFD_MTU) { + /* We have a CAN FD frame and CAN_DL is greater than 8: + * Only frames with the SF_DL == 0 ESC value are valid. + * + * If so take care of the increased SF PCI size + * (SF_PCI_SZ8) to point to the message content behind + * the extended SF PCI info and get the real SF_DL + * length value from the formerly first data byte. 
+ */ + if (sf_dl == 0) + isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, + cf->data[SF_PCI_SZ4 + ae]); + } + } + break; + + case N_PCI_FF: + /* rx path: first frame */ + isotp_rcv_ff(sk, cf, ae); + break; + + case N_PCI_CF: + /* rx path: consecutive frame */ + isotp_rcv_cf(sk, cf, ae, skb); + break; + } +} + +static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae, int off) +{ + int pcilen = N_PCI_SZ + ae + off; + int space = so->tx.ll_dl - pcilen; + int num = min_t(int, so->tx.len - so->tx.idx, space); + int i; + + cf->can_id = so->txid; + cf->len = num + pcilen; + + if (num < space) { + if (so->opt.flags & CAN_ISOTP_TX_PADDING) { + /* user requested padding */ + cf->len = padlen(cf->len); + memset(cf->data, so->opt.txpad_content, cf->len); + } else if (cf->len > CAN_MAX_DLEN) { + /* mandatory padding for CAN FD frames */ + cf->len = padlen(cf->len); + memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, + cf->len); + } + } + + for (i = 0; i < num; i++) + cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; + + if (ae) + cf->data[0] = so->opt.ext_address; +} + +static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, + int ae) +{ + int i; + int ff_pci_sz; + + cf->can_id = so->txid; + cf->len = so->tx.ll_dl; + if (ae) + cf->data[0] = so->opt.ext_address; + + /* create N_PCI bytes with 12/32 bit FF_DL data length */ + if (so->tx.len > 4095) { + /* use 32 bit FF_DL notation */ + cf->data[ae] = N_PCI_FF; + cf->data[ae + 1] = 0; + cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; + cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; + cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; + cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ32; + } else { + /* use 12 bit FF_DL notation */ + cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; + cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; + ff_pci_sz = FF_PCI_SZ12; + } + + /* add first data bytes depending on ae */ + for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) + cf->data[i] = so->tx.buf[so->tx.idx++]; + + so->tx.sn = 1; + so->tx.state = ISOTP_WAIT_FIRST_FC; +} + +static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) +{ + struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, + txtimer); + struct sock *sk = &so->sk; + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + enum hrtimer_restart restart = HRTIMER_NORESTART; + int can_send_ret; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; + + switch (so->tx.state) { + case ISOTP_WAIT_FC: + case ISOTP_WAIT_FIRST_FC: + + /* we did not get any flow control frame in time */ + + /* report 'communication error on send' */ + sk->sk_err = ECOMM; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + /* reset tx state */ + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + break; + + case ISOTP_SENDING: + + /* push out the next segmented pdu */ + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + break; + +isotp_tx_burst: + skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), + gfp_any()); + if (!skb) { + dev_put(dev); + break; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* create consecutive frame */ + isotp_fill_dataframe(cf, so, ae, 0); + + /* place consecutive frame N_PCI in appropriate index */ + cf->data[ae] = N_PCI_CF | so->tx.sn++; + so->tx.sn %= 16; + so->tx.bs++; + + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + can_skb_set_owner(skb, sk); + + can_send_ret = can_send(skb, 1); + if (can_send_ret) + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); + + if (so->tx.idx >= so->tx.len) { + /* we are done */ + so->tx.state = ISOTP_IDLE; + dev_put(dev); + wake_up_interruptible(&so->wait); + break; + } + + if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { + /* stop and wait for FC */ + so->tx.state = ISOTP_WAIT_FC; + dev_put(dev); + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), + ktime_set(1, 0))); + restart = HRTIMER_RESTART; + break; + } + + /* no gap between data frames needed => use burst mode */ + if (!so->tx_gap) + goto isotp_tx_burst; + + /* start timer to send next data frame with correct delay */ + dev_put(dev); + hrtimer_set_expires(&so->txtimer, + ktime_add(ktime_get(), so->tx_gap)); + restart = HRTIMER_RESTART; + break; + + default: + WARN_ON_ONCE(1); + } + + return restart; +} + +static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct sk_buff *skb; + struct net_device *dev; + struct canfd_frame *cf; + int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; + int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; + int off; + int err; + + if (!so->bound) + return -EADDRNOTAVAIL; + + /* we do not support multiple buffers - for now */ + if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) { + if (msg->msg_flags & MSG_DONTWAIT) + return -EAGAIN; + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + } + + if (!size || size > MAX_MSG_LENGTH) + return -EINVAL; + + err = memcpy_from_msg(so->tx.buf, msg, size); + if (err < 0) + return err; + + dev = dev_get_by_index(sock_net(sk), so->ifindex); + if (!dev) + return -ENXIO; + + skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) { + dev_put(dev); + return err; + } + + can_skb_reserve(skb); + can_skb_prv(skb)->ifindex = dev->ifindex; + can_skb_prv(skb)->skbcnt = 0; + + so->tx.state = ISOTP_SENDING; + so->tx.len = size; + so->tx.idx = 0; + + cf = (struct canfd_frame *)skb->data; + skb_put(skb, so->ll.mtu); + + /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ + off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 
1 : 0; + + /* check for single frame transmission depending on TX_DL */ + if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { + /* The message size generally fits into a SingleFrame - good. + * + * SF_DL ESC offset optimization: + * + * When TX_DL is greater 8 but the message would still fit + * into a 8 byte CAN frame, we can omit the offset. + * This prevents a protocol caused length extension from + * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. + */ + if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) + off = 0; + + isotp_fill_dataframe(cf, so, ae, off); + + /* place single frame N_PCI w/o length in appropriate index */ + cf->data[ae] = N_PCI_SF; + + /* place SF_DL size value depending on the SF_DL ESC offset */ + if (off) + cf->data[SF_PCI_SZ4 + ae] = size; + else + cf->data[ae] |= size; + + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); + + /* don't enable wait queue for a single frame transmission */ + wait_tx_done = 0; + } else { + /* send first frame and wait for FC */ + + isotp_create_fframe(cf, so, ae); + + /* start timeout for FC */ + hrtimer_start(&so->txtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); + } + + /* send the first or only CAN frame */ + if (so->ll.mtu == CANFD_MTU) + cf->flags = so->ll.tx_flags; + + skb->dev = dev; + skb->sk = sk; + err = can_send(skb, 1); + dev_put(dev); + if (err) { + printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", + __func__, err); + return err; + } + + if (wait_tx_done) { + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + } + + return size; +} + +static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int err = 0; + int noblock; + + noblock = flags & MSG_DONTWAIT; + flags &= ~MSG_DONTWAIT; + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + return err; + + if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + else + size = skb->len; + + err = memcpy_to_msg(msg, skb->data, size); + if (err < 0) { + skb_free_datagram(sk, skb); + return err; + } + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) { + msg->msg_namelen = sizeof(struct sockaddr_can); + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + } + + skb_free_datagram(sk, skb); + + return size; +} + +static int isotp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so; + struct net *net; + + if (!sk) + return 0; + + so = isotp_sk(sk); + net = sock_net(sk); + + /* wait for complete transmission of current pdu */ + wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + unregister_netdevice_notifier(&so->notifier); + + lock_sock(sk); + + hrtimer_cancel(&so->txtimer); + hrtimer_cancel(&so->rxtimer); + + /* remove current filters & unregister */ + if (so->bound) { + if (so->ifindex) { + struct net_device *dev; + + dev = dev_get_by_index(net, so->ifindex); + if (dev) { + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + so->ifindex = 0; + so->bound = 0; + + sock_orphan(sk); + sock->sk = NULL; + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + struct net *net = sock_net(sk); + int ifindex; + struct net_device *dev; + int err = 0; + int notify_enetdown = 
0; + + if (len < CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)) + return -EINVAL; + + if (addr->can_addr.tp.rx_id == addr->can_addr.tp.tx_id) + return -EADDRNOTAVAIL; + + if ((addr->can_addr.tp.rx_id | addr->can_addr.tp.tx_id) & + (CAN_ERR_FLAG | CAN_RTR_FLAG)) + return -EADDRNOTAVAIL; + + if (!addr->can_ifindex) + return -ENODEV; + + lock_sock(sk); + + if (so->bound && addr->can_ifindex == so->ifindex && + addr->can_addr.tp.rx_id == so->rxid && + addr->can_addr.tp.tx_id == so->txid) + goto out; + + dev = dev_get_by_index(net, addr->can_ifindex); + if (!dev) { + err = -ENODEV; + goto out; + } + if (dev->type != ARPHRD_CAN) { + dev_put(dev); + err = -ENODEV; + goto out; + } + if (dev->mtu < so->ll.mtu) { + dev_put(dev); + err = -EINVAL; + goto out; + } + if (!(dev->flags & IFF_UP)) + notify_enetdown = 1; + + ifindex = dev->ifindex; + + can_rx_register(net, dev, addr->can_addr.tp.rx_id, + SINGLE_MASK(addr->can_addr.tp.rx_id), isotp_rcv, sk, + "isotp", sk); + + dev_put(dev); + + if (so->bound) { + /* unregister old filter */ + if (so->ifindex) { + dev = dev_get_by_index(net, so->ifindex); + if (dev) { + can_rx_unregister(net, dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + dev_put(dev); + } + } + } + + /* switch to new settings */ + so->ifindex = ifindex; + so->rxid = addr->can_addr.tp.rx_id; + so->txid = addr->can_addr.tp.tx_id; + so->bound = 1; + +out: + release_sock(sk); + + if (notify_enetdown) { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + + return err; +} + +static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) +{ + struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + + if (peer) + return -EOPNOTSUPP; + + addr->can_family = AF_CAN; + addr->can_ifindex = so->ifindex; + addr->can_addr.tp.rx_id = so->rxid; + addr->can_addr.tp.tx_id = so->txid; + + return sizeof(*addr); +} + +static int isotp_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int ret = 0; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + + switch (optname) { + case CAN_ISOTP_OPTS: + if (optlen != sizeof(struct can_isotp_options)) + return -EINVAL; + + if (copy_from_sockptr(&so->opt, optval, optlen)) + return -EFAULT; + + /* no separate rx_ext_address is given => use ext_address */ + if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) + so->opt.rx_ext_address = so->opt.ext_address; + break; + + case CAN_ISOTP_RECV_FC: + if (optlen != sizeof(struct can_isotp_fc_options)) + return -EINVAL; + + if (copy_from_sockptr(&so->rxfc, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_TX_STMIN: + if (optlen != sizeof(u32)) + return -EINVAL; + + if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_RX_STMIN: + if (optlen != sizeof(u32)) + return -EINVAL; + + if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) + return -EFAULT; + break; + + case CAN_ISOTP_LL_OPTS: + if (optlen == sizeof(struct can_isotp_ll_options)) { + struct can_isotp_ll_options ll; + + if (copy_from_sockptr(&ll, optval, optlen)) + return -EFAULT; + + /* check for correct ISO 11898-1 DLC data length */ + if (ll.tx_dl != padlen(ll.tx_dl)) + return -EINVAL; + + if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) + return -EINVAL; + + if (ll.mtu == CAN_MTU && ll.tx_dl > CAN_MAX_DLEN) + return -EINVAL; + + memcpy(&so->ll, &ll, 
sizeof(ll)); + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = ll.tx_dl; + } else { + return -EINVAL; + } + break; + + default: + ret = -ENOPROTOOPT; + } + + return ret; +} + +static int isotp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + int len; + void *val; + + if (level != SOL_CAN_ISOTP) + return -EINVAL; + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + + switch (optname) { + case CAN_ISOTP_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_options)); + val = &so->opt; + break; + + case CAN_ISOTP_RECV_FC: + len = min_t(int, len, sizeof(struct can_isotp_fc_options)); + val = &so->rxfc; + break; + + case CAN_ISOTP_TX_STMIN: + len = min_t(int, len, sizeof(u32)); + val = &so->force_tx_stmin; + break; + + case CAN_ISOTP_RX_STMIN: + len = min_t(int, len, sizeof(u32)); + val = &so->force_rx_stmin; + break; + + case CAN_ISOTP_LL_OPTS: + len = min_t(int, len, sizeof(struct can_isotp_ll_options)); + val = &so->ll; + break; + + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, val, len)) + return -EFAULT; + return 0; +} + +static int isotp_notifier(struct notifier_block *nb, unsigned long msg, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct isotp_sock *so = container_of(nb, struct isotp_sock, notifier); + struct sock *sk = &so->sk; + + if (!net_eq(dev_net(dev), sock_net(sk))) + return NOTIFY_DONE; + + if (dev->type != ARPHRD_CAN) + return NOTIFY_DONE; + + if (so->ifindex != dev->ifindex) + return NOTIFY_DONE; + + switch (msg) { + case NETDEV_UNREGISTER: + lock_sock(sk); + /* remove current filters & unregister */ + if (so->bound) + can_rx_unregister(dev_net(dev), dev, so->rxid, + SINGLE_MASK(so->rxid), + isotp_rcv, sk); + + so->ifindex = 0; + so->bound = 0; + release_sock(sk); + + sk->sk_err = ENODEV; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + + case NETDEV_DOWN: + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + break; + } + + return NOTIFY_DONE; +} + +static int isotp_init(struct sock *sk) +{ + struct isotp_sock *so = isotp_sk(sk); + + so->ifindex = 0; + so->bound = 0; + + so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; + so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; + so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; + so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; + so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; + so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; + so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; + so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; + so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; + + /* set ll_dl for tx path to similar place as for rx */ + so->tx.ll_dl = so->ll.tx_dl; + + so->rx.state = ISOTP_IDLE; + so->tx.state = ISOTP_IDLE; + + hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->rxtimer.function = isotp_rx_timer_handler; + hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + so->txtimer.function = isotp_tx_timer_handler; + + init_waitqueue_head(&so->wait); + + so->notifier.notifier_call = isotp_notifier; + register_netdevice_notifier(&so->notifier); + + return 0; +} + +static int isotp_sock_no_ioctlcmd(struct socket 
*sock, unsigned int cmd, + unsigned long arg) +{ + /* no ioctls for socket layer -> hand it down to NIC layer */ + return -ENOIOCTLCMD; +} + +static const struct proto_ops isotp_ops = { + .family = PF_CAN, + .release = isotp_release, + .bind = isotp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = isotp_getname, + .poll = datagram_poll, + .ioctl = isotp_sock_no_ioctlcmd, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = isotp_setsockopt, + .getsockopt = isotp_getsockopt, + .sendmsg = isotp_sendmsg, + .recvmsg = isotp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto isotp_proto __read_mostly = { + .name = "CAN_ISOTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct isotp_sock), + .init = isotp_init, +}; + +static const struct can_proto isotp_can_proto = { + .type = SOCK_DGRAM, + .protocol = CAN_ISOTP, + .ops = &isotp_ops, + .prot = &isotp_proto, +}; + +static __init int isotp_module_init(void) +{ + int err; + + pr_info("can: isotp protocol (rev " CAN_ISOTP_VERSION ")\n"); + + err = can_proto_register(&isotp_can_proto); + if (err < 0) + pr_err("can: registration of isotp protocol failed\n"); + + return err; +} + +static __exit void isotp_module_exit(void) +{ + can_proto_unregister(&isotp_can_proto); +} + +module_init(isotp_module_init); +module_exit(isotp_module_exit); -- cgit v1.2.3 From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 9 Oct 2020 07:03:25 +0000 Subject: bpf: Add tcp_notsent_lowat bpf setsockopt Adding support for TCP_NOTSENT_LOWAT sockoption (https://lwn.net/Articles/560082/) in tcp bpf programs. Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 4 ++++ tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/connect4_prog.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. 
* Return diff --git a/net/core/filter.c b/net/core/filter.c index 05df73780dd3..5da44b11e1ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4827,6 +4827,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, else icsk->icsk_user_timeout = val; break; + case TCP_NOTSENT_LOWAT: + tp->notsent_lowat = val; + sk->sk_write_space(sk); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index b1b2773c0b9d..a943d394fd3a 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -23,6 +23,10 @@ #define TCP_CA_NAME_MAX 16 #endif +#ifndef TCP_NOTSENT_LOWAT +#define TCP_NOTSENT_LOWAT 25 +#endif + #ifndef IFNAMSIZ #define IFNAMSIZ 16 #endif @@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx) return 0; } +static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx) +{ + int lowat = 65535; + + if (ctx->type == SOCK_STREAM) { + if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat))) + return 1; + } + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) if (set_keepalive(ctx)) return 0; + if (set_notsent_lowat(ctx)) + return 0; + if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM) return 0; else if (ctx->type == SOCK_STREAM) -- cgit v1.2.3 From ccdf07219da6bd1f43c6ddcde4c0e36993c7365a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:43 +0300 Subject: devlink: Add reload action option to devlink reload command Add devlink reload action to allow the user to request a specific reload action. The action parameter is optional, if not specified then devlink driver re-init action is used (backward compatible). Note that when required to do firmware activation some drivers may need to reload the driver. On the other hand some drivers may need to reset the firmware to reinitialize the driver entities. Therefore, the devlink reload command returns the actions which were actually performed. Reload actions supported are: driver_reinit: driver entities re-initialization, applying devlink-param and devlink-resource values. fw_activate: firmware activate. 
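For driver authors, the contract reduces to advertising the supported actions in struct devlink_ops and reporting what was actually done from reload_up(). A minimal sketch of that pattern, with hypothetical example_* names (the mlx4/mlx5/mlxsw/netdevsim diffs below carry the real teardown and bring-up logic):

static int example_reload_down(struct devlink *devlink, bool netns_change,
			       enum devlink_reload_action action,
			       struct netlink_ext_ack *extack)
{
	/* quiesce the device; a real driver tears down its entities here */
	return 0;
}

static int example_reload_up(struct devlink *devlink,
			     enum devlink_reload_action action,
			     u32 *actions_performed,
			     struct netlink_ext_ack *extack)
{
	/* the re-initialization itself is omitted in this sketch; report
	 * what was actually done - devlink WARNs if the requested action
	 * is missing from this bitmap
	 */
	*actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
	return 0;
}

static const struct devlink_ops example_devlink_ops = {
	.reload_actions	= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT),
	.reload_down	= example_reload_down,
	.reload_up	= example_reload_up,
};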
command examples: $devlink dev reload pci/0000:82:00.0 action driver_reinit reload_actions_performed: driver_reinit $devlink dev reload pci/0000:82:00.0 action fw_activate reload_actions_performed: driver_reinit fw_activate Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 7 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 7 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 10 ++- drivers/net/netdevsim/dev.c | 8 +- include/net/devlink.h | 7 +- include/uapi/linux/devlink.h | 13 +++ net/core/devlink.c | 98 +++++++++++++++++++++-- 7 files changed, 131 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 70cf24ba71e4..649c5323cf9f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3946,6 +3946,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload, struct devlink *devlink); static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); @@ -3962,14 +3963,15 @@ static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int mlx4_devlink_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int mlx4_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); struct mlx4_dev *dev = &priv->dev; struct mlx4_dev_persistent *persist = dev->persist; int err; + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); err = mlx4_restart_one_up(persist->pdev, true, devlink); if (err) mlx4_err(persist->dev, "mlx4_restart_one_up failed, ret=%d\n", @@ -3980,6 +3982,7 @@ static int mlx4_devlink_reload_up(struct devlink *devlink, static const struct devlink_ops mlx4_devlink_ops = { .port_type_set = mlx4_devlink_port_type_set, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = mlx4_devlink_reload_down, .reload_up = mlx4_devlink_reload_up, }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 9b14e3f805a2..1b248c01a209 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -85,6 +85,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, } static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); @@ -93,11 +94,12 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int mlx5_devlink_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); return mlx5_load_one(dev, false); } @@ -114,6 +116,7 @@ static const struct devlink_ops mlx5_devlink_ops = { #endif .flash_update = mlx5_devlink_flash_update, .info_get = 
mlx5_devlink_info_get, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = mlx5_devlink_reload_down, .reload_up = mlx5_devlink_reload_up, }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index a21afa56e3f7..cd9f56c73827 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1414,7 +1414,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, - bool netns_change, + bool netns_change, enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); @@ -1427,11 +1427,13 @@ mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, } static int -mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE); return mlxsw_core_bus_device_register(mlxsw_core->bus_info, mlxsw_core->bus, mlxsw_core->bus_priv, true, @@ -1564,6 +1566,8 @@ mlxsw_devlink_trap_policer_counter_get(struct devlink *devlink, } static const struct devlink_ops mlxsw_devlink_ops = { + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) | + BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE), .reload_down = mlxsw_devlink_core_bus_device_reload_down, .reload_up = mlxsw_devlink_core_bus_device_reload_up, .port_type_set = mlxsw_devlink_port_type_set, diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 56213ba151f6..b57e35c4ef6f 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -701,7 +701,7 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, - struct netlink_ext_ack *extack) + enum devlink_reload_action action, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -717,8 +717,8 @@ static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, return 0; } -static int nsim_dev_reload_up(struct devlink *devlink, - struct netlink_ext_ack *extack) +static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -730,6 +730,7 @@ static int nsim_dev_reload_up(struct devlink *devlink, return -EINVAL; } + *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); return nsim_dev_reload_create(nsim_dev, extack); } @@ -886,6 +887,7 @@ nsim_dev_devlink_trap_policer_counter_get(struct devlink *devlink, static const struct devlink_ops nsim_dev_devlink_ops = { .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT | DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, + .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = nsim_dev_reload_down, .reload_up = nsim_dev_reload_up, .info_get = nsim_dev_info_get, diff --git a/include/net/devlink.h b/include/net/devlink.h index 237ba5e29a3b..93c535ae5a4b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1150,10 +1150,11 
@@ struct devlink_ops { * implemementation. */ u32 supported_flash_update_params; + unsigned long reload_actions; int (*reload_down)(struct devlink *devlink, bool netns_change, - struct netlink_ext_ack *extack); - int (*reload_up)(struct devlink *devlink, - struct netlink_ext_ack *extack); + enum devlink_reload_action action, struct netlink_ext_ack *extack); + int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5f1d6c327670..74bdad252c36 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -301,6 +301,16 @@ enum { DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE, }; +enum devlink_reload_action { + DEVLINK_RELOAD_ACTION_UNSPEC, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, /* Driver entities re-instantiation */ + DEVLINK_RELOAD_ACTION_FW_ACTIVATE, /* FW activate */ + + /* Add new reload actions above */ + __DEVLINK_RELOAD_ACTION_MAX, + DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -493,6 +503,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ + DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 5c45b3964ec3..c026ed3519c9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -479,6 +479,12 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +static bool +devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) +{ + return test_bit(action, &devlink->ops->reload_actions); +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) @@ -2984,6 +2990,7 @@ bool devlink_is_reload_failed(const struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_is_reload_failed); static int devlink_reload(struct devlink *devlink, struct net *dest_net, + enum devlink_reload_action action, u32 *actions_performed, struct netlink_ext_ack *extack) { int err; @@ -2991,22 +2998,60 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (!devlink->reload_enabled) return -EOPNOTSUPP; - err = devlink->ops->reload_down(devlink, !!dest_net, extack); + err = devlink->ops->reload_down(devlink, !!dest_net, action, extack); if (err) return err; if (dest_net && !net_eq(dest_net, devlink_net(devlink))) devlink_reload_netns_change(devlink, dest_net); - err = devlink->ops->reload_up(devlink, extack); + err = devlink->ops->reload_up(devlink, action, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); - return err; + if (err) + return err; + + WARN_ON(!(*actions_performed & BIT(action))); + return 0; +} + +static int +devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed, + enum devlink_command cmd, struct genl_info *info) +{ + struct sk_buff *msg; + void *hdr; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 
info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd); + if (!hdr) + goto free_msg; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + + if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed, + actions_performed)) + goto nla_put_failure; + genlmsg_end(msg, hdr); + + return genlmsg_reply(msg, info); + +nla_put_failure: + genlmsg_cancel(msg, hdr); +free_msg: + nlmsg_free(msg); + return -EMSGSIZE; } static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + enum devlink_reload_action action; struct net *dest_net = NULL; + u32 actions_performed; int err; if (!devlink_reload_supported(devlink->ops)) @@ -3026,12 +3071,30 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(dest_net); } - err = devlink_reload(devlink, dest_net, info->extack); + if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]); + else + action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT; + + if (!devlink_reload_action_is_supported(devlink, action)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested reload action is not supported by the driver"); + return -EOPNOTSUPP; + } + + err = devlink_reload(devlink, dest_net, action, &actions_performed, info->extack); if (dest_net) put_net(dest_net); - return err; + if (err) + return err; + /* For backward compatibility generate reply only if attributes used by user */ + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + return 0; + + return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, + DEVLINK_CMD_RELOAD, info); } static int devlink_nl_flash_update_fill(struct sk_buff *msg, @@ -7282,6 +7345,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 }, [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 }, [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_ACTION_MAX), }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7615,6 +7680,21 @@ static struct genl_family devlink_nl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), }; +static bool devlink_reload_actions_valid(const struct devlink_ops *ops) +{ + if (!devlink_reload_supported(ops)) { + if (WARN_ON(ops->reload_actions)) + return false; + return true; + } + + if (WARN_ON(!ops->reload_actions || + ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) + return false; + return true; +} + /** * devlink_alloc - Allocate new devlink instance resources * @@ -7631,6 +7711,9 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) if (WARN_ON(!ops)) return NULL; + if (!devlink_reload_actions_valid(ops)) + return NULL; + devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; @@ -9960,6 +10043,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; + u32 actions_performed; int err; /* In case network namespace is getting destroyed, reload @@ -9970,7 +10054,9 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) if (net_eq(devlink_net(devlink), net)) { if (WARN_ON(!devlink_reload_supported(devlink->ops))) continue; - err = devlink_reload(devlink, &init_net, NULL); 
+ err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + &actions_performed, NULL); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } -- cgit v1.2.3 From dc64cc7c63102ac78bac3cfbc00ef3abd7a3fdf3 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:44 +0300 Subject: devlink: Add devlink reload limit option Add a reload limit option to let the user impose restrictions on reload actions. Reload limits supported: no_reset: No reset allowed, no down time allowed, no link flap and no configuration is lost. By default the reload limit is unspecified, so no constraints are imposed on the reload actions. Some combinations of action and limit are invalid. For example, a driver cannot reinitialize its entities without any down time. The no_reset reload limit has a use case later in this patchset: implementing restricted fw_activate on mlx5. The uapi reload limit parameter is kept ready for future support of multiselection. Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 +- drivers/net/ethernet/mellanox/mlxsw/core.c | 4 +- drivers/net/netdevsim/dev.c | 6 +- include/net/devlink.h | 8 +- include/uapi/linux/devlink.h | 14 ++++ net/core/devlink.c | 92 +++++++++++++++++++++-- 7 files changed, 119 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 649c5323cf9f..c326b434734e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3947,6 +3947,7 @@ static int mlx4_restart_one_up(struct pci_dev *pdev, bool reload, static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); @@ -3964,7 +3965,8 @@ static int mlx4_devlink_reload_down(struct devlink *devlink, bool netns_change, } static int mlx4_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlx4_priv *priv = devlink_priv(devlink); struct mlx4_dev *dev = &priv->dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 1b248c01a209..0016041e8779 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -86,6 +86,7 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); @@ -95,7 +96,8 @@ static int mlx5_devlink_reload_down(struct devlink *devlink, bool netns_change, } static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index cd9f56c73827..7f77c2a71d1c 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1415,6 +1415,7 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, static int mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, + enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); @@ -1428,7 +1429,8 @@ mlxsw_devlink_core_bus_device_reload_down(struct devlink *devlink, static int mlxsw_devlink_core_bus_device_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct mlxsw_core *mlxsw_core = devlink_priv(devlink); diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index b57e35c4ef6f..d07061417675 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -701,7 +701,8 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, - enum devlink_reload_action action, struct netlink_ext_ack *extack) + enum devlink_reload_action action, enum devlink_reload_limit limit, + struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); @@ -718,7 +719,8 @@ static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, } static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack) + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); diff --git a/include/net/devlink.h b/include/net/devlink.h index 93c535ae5a4b..9f5c37c391f8 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1151,10 +1151,14 @@ struct devlink_ops { */ u32 supported_flash_update_params; unsigned long reload_actions; + unsigned long reload_limits; int (*reload_down)(struct devlink *devlink, bool netns_change, - enum devlink_reload_action action, struct netlink_ext_ack *extack); + enum devlink_reload_action action, + enum devlink_reload_limit limit, + struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack); + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 74bdad252c36..82a5e66c1518 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -311,6 +311,19 @@ enum devlink_reload_action { DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 }; +enum devlink_reload_limit { + DEVLINK_RELOAD_LIMIT_UNSPEC, /* unspecified, no constraints */ + DEVLINK_RELOAD_LIMIT_NO_RESET, /* No reset allowed, no down time allowed, + * no link flap and no configuration is lost. 
+ */ + + /* Add new reload limit above */ + __DEVLINK_RELOAD_LIMIT_MAX, + DEVLINK_RELOAD_LIMIT_MAX = __DEVLINK_RELOAD_LIMIT_MAX - 1 +}; + +#define DEVLINK_RELOAD_LIMITS_VALID_MASK (BIT(__DEVLINK_RELOAD_LIMIT_MAX) - 1) + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -505,6 +518,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index c026ed3519c9..28b63faa3c6b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -479,12 +479,44 @@ static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink) return 0; } +struct devlink_reload_combination { + enum devlink_reload_action action; + enum devlink_reload_limit limit; +}; + +static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = { + { + /* can't reinitialize driver with no down time */ + .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + .limit = DEVLINK_RELOAD_LIMIT_NO_RESET, + }, +}; + +static bool +devlink_reload_combination_is_invalid(enum devlink_reload_action action, + enum devlink_reload_limit limit) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) + if (devlink_reload_invalid_combinations[i].action == action && + devlink_reload_invalid_combinations[i].limit == limit) + return true; + return false; +} + static bool devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action) { return test_bit(action, &devlink->ops->reload_actions); } +static bool +devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit) +{ + return test_bit(limit, &devlink->ops->reload_limits); +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) @@ -2990,22 +3022,22 @@ bool devlink_is_reload_failed(const struct devlink *devlink) EXPORT_SYMBOL_GPL(devlink_is_reload_failed); static int devlink_reload(struct devlink *devlink, struct net *dest_net, - enum devlink_reload_action action, u32 *actions_performed, - struct netlink_ext_ack *extack) + enum devlink_reload_action action, enum devlink_reload_limit limit, + u32 *actions_performed, struct netlink_ext_ack *extack) { int err; if (!devlink->reload_enabled) return -EOPNOTSUPP; - err = devlink->ops->reload_down(devlink, !!dest_net, action, extack); + err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; if (dest_net && !net_eq(dest_net, devlink_net(devlink))) devlink_reload_netns_change(devlink, dest_net); - err = devlink->ops->reload_up(devlink, action, actions_performed, extack); + err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; @@ -3050,6 +3082,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; enum devlink_reload_action action; + enum devlink_reload_limit limit; struct net *dest_net = NULL; u32 actions_performed; int err; @@ -3082,7 +3115,38 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; } - err = devlink_reload(devlink, dest_net, action, &actions_performed, info->extack); + limit = DEVLINK_RELOAD_LIMIT_UNSPEC; + if 
(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) { + struct nla_bitfield32 limits; + u32 limits_selected; + + limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]); + limits_selected = limits.value & limits.selector; + if (!limits_selected) { + NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected"); + return -EINVAL; + } + for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++) + if (limits_selected & BIT(limit)) + break; + /* UAPI enables multiselection, but currently it is not used */ + if (limits_selected != BIT(limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Multiselection of limit is not supported"); + return -EOPNOTSUPP; + } + if (!devlink_reload_limit_is_supported(devlink, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is not supported by the driver"); + return -EOPNOTSUPP; + } + if (devlink_reload_combination_is_invalid(action, limit)) { + NL_SET_ERR_MSG_MOD(info->extack, + "Requested limit is invalid for this action"); + return -EINVAL; + } + } + err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack); if (dest_net) put_net(dest_net); @@ -3090,7 +3154,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) if (err) return err; /* For backward compatibility generate reply only if attributes used by user */ - if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION]) + if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) return 0; return devlink_nl_reload_actions_performed_snd(devlink, actions_performed, @@ -7347,6 +7411,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PORT_FUNCTION] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_ACTION_MAX), + [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(DEVLINK_RELOAD_LIMITS_VALID_MASK), }; static const struct genl_small_ops devlink_nl_ops[] = { @@ -7682,6 +7747,9 @@ static struct genl_family devlink_nl_family __ro_after_init = { static bool devlink_reload_actions_valid(const struct devlink_ops *ops) { + const struct devlink_reload_combination *comb; + int i; + if (!devlink_reload_supported(ops)) { if (WARN_ON(ops->reload_actions)) return false; @@ -7692,6 +7760,17 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX))) return false; + + if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) || + ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX))) + return false; + + for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) { + comb = &devlink_reload_invalid_combinations[i]; + if (ops->reload_actions == BIT(comb->action) && + ops->reload_limits == BIT(comb->limit)) + return false; + } return true; } @@ -10056,6 +10135,7 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) continue; err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); -- cgit v1.2.3 From a254c264267e8746fb257806c166e54375cf9c06 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:45 +0300 Subject: devlink: Add reload stats Add reload stats to hold the history per reload action type and limit. 
For example, the number of times fw_activate has been performed on this device since the driver module was added or if the firmware activation was performed with or without reset. Add devlink notification on stats update. Expose devlink reload stats to the user through devlink dev get command. Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 8 ++++ include/uapi/linux/devlink.h | 6 +++ net/core/devlink.c | 90 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9f5c37c391f8..d091c6ba82ce 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -20,6 +20,13 @@ #include #include +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + struct devlink_ops; struct devlink { @@ -38,6 +45,7 @@ struct devlink { struct list_head trap_policer_list; const struct devlink_ops *ops; struct xarray snapshot_ids; + struct devlink_dev_stats stats; struct device *dev; possible_net_t _net; struct mutex lock; /* Serializes access to devlink instance specific objects such as diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 82a5e66c1518..ab15fc597b74 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -520,6 +520,12 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ + DEVLINK_ATTR_DEV_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ + DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 28b63faa3c6b..a167c3bb468c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -517,10 +517,66 @@ devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_l return test_bit(limit, &devlink->ops->reload_limits); } +static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_action action, + enum devlink_reload_limit limit, u32 value) +{ + struct nlattr *reload_stats_entry; + + reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY); + if (!reload_stats_entry) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, action) || + nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) + goto nla_put_failure; + nla_nest_end(msg, reload_stats_entry); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_entry); + return -EMSGSIZE; +} + +static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink) +{ + struct nlattr *reload_stats_attr; + int i, j, stat_idx; + u32 value; + + reload_stats_attr = 
nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + + if (!reload_stats_attr) + return -EMSGSIZE; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + if (j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) + continue; + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if (!devlink_reload_action_is_supported(devlink, i) || + devlink_reload_combination_is_invalid(i, j)) + continue; + + stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; + value = devlink->stats.reload_stats[stat_idx]; + if (devlink_reload_stat_put(msg, i, j, value)) + goto nla_put_failure; + } + } + nla_nest_end(msg, reload_stats_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(msg, reload_stats_attr); + return -EMSGSIZE; +} + static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct nlattr *dev_stats; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -532,9 +588,19 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed)) goto nla_put_failure; + dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS); + if (!dev_stats) + goto nla_put_failure; + + if (devlink_reload_stats_put(msg, devlink)) + goto dev_stats_nest_cancel; + + nla_nest_end(msg, dev_stats); genlmsg_end(msg, hdr); return 0; +dev_stats_nest_cancel: + nla_nest_cancel(msg, dev_stats); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; @@ -3021,6 +3087,29 @@ bool devlink_is_reload_failed(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_is_reload_failed); +static void +__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats, + enum devlink_reload_limit limit, u32 actions_performed) +{ + unsigned long actions = actions_performed; + int stat_idx; + int action; + + for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) { + stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action; + reload_stats[stat_idx]++; + } + devlink_notify(devlink, DEVLINK_CMD_NEW); +} + +static void +devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit, + u32 actions_performed) +{ + __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit, + actions_performed); +} + static int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) @@ -3043,6 +3132,7 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, return err; WARN_ON(!(*actions_performed & BIT(action))); + devlink_reload_stats_update(devlink, limit, *actions_performed); return 0; } -- cgit v1.2.3 From 77069ba2e3adf48c472fbbd9cbd7a4f5370b17df Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:46 +0300 Subject: devlink: Add remote reload stats Add remote reload stats to hold the history of actions performed due to devlink reload commands initiated by a remote host. For example, the case where firmware activation with reset finished successfully but was initiated by a remote host. The function devlink_remote_reload_actions_performed() is exported so drivers can report reload actions that were performed but not initiated by their own devlink instance. Expose devlink remote reload stats to the user through devlink dev get command.
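On the driver side, the expected call is a one-liner at the point where the remote-initiated action is detected. A rough sketch, assuming a hypothetical event-handler name and a firmware activation that was performed with reset (hence the unspecified limit):

static void example_peer_fw_activation_event(struct devlink *devlink)
{
	/* the peer host activated new firmware; account for it as a
	 * remote reload action rather than a local one
	 */
	devlink_remote_reload_actions_performed(devlink,
						DEVLINK_RELOAD_LIMIT_UNSPEC,
						BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
}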
Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 remote_reload: driver_reinit 0 fw_activate 0 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 remote_reload: driver_reinit 1 fw_activate 1 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 0, "fw_activate": 0, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 1, "fw_activate": 1, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 +++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 60 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index d091c6ba82ce..d2771e57a278 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -25,6 +25,7 @@ struct devlink_dev_stats { u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; }; struct devlink_ops; @@ -1567,6 +1568,9 @@ void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed); void devlink_flash_update_begin_notify(struct devlink *devlink); void devlink_flash_update_end_notify(struct devlink *devlink); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ab15fc597b74..0113bc4db9f5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -525,6 +525,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + DEVLINK_ATTR_REMOTE_RELOAD_STATS, /* nested */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index a167c3bb468c..dd889334fed9 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -538,28 +538,39 @@ nla_put_failure: return -EMSGSIZE; } -static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink) +static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { struct nlattr *reload_stats_attr; int i, j, stat_idx; u32 value; - reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + if (!is_remote) + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS); + else + reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS); if (!reload_stats_attr) return -EMSGSIZE; for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - if (j != DEVLINK_RELOAD_LIMIT_UNSPEC && + /* Remote stats are shown even if not locally supported. Stats + * of actions with unspecified limit are shown though drivers + * don't need to register unspecified limit. 
+ */ + if (!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && !devlink_reload_limit_is_supported(devlink, j)) continue; for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if (!devlink_reload_action_is_supported(devlink, i) || + if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC || devlink_reload_combination_is_invalid(i, j)) continue; stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i; - value = devlink->stats.reload_stats[stat_idx]; + if (!is_remote) + value = devlink->stats.reload_stats[stat_idx]; + else + value = devlink->stats.remote_reload_stats[stat_idx]; if (devlink_reload_stat_put(msg, i, j, value)) goto nla_put_failure; } @@ -592,7 +603,9 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink, if (!dev_stats) goto nla_put_failure; - if (devlink_reload_stats_put(msg, devlink)) + if (devlink_reload_stats_put(msg, devlink, false)) + goto dev_stats_nest_cancel; + if (devlink_reload_stats_put(msg, devlink, true)) goto dev_stats_nest_cancel; nla_nest_end(msg, dev_stats); @@ -3110,15 +3123,47 @@ devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit l actions_performed); } +/** + * devlink_remote_reload_actions_performed - Update devlink on reload actions + * performed which are not a direct result of devlink reload call. + * + * This should be called by a driver after performing reload actions in case it was not + * a result of devlink reload call. For example fw_activate was performed as a result + * of devlink reload triggered fw_activate on another host. + * The motivation for this function is to keep data on reload actions performed on this + * function whether it was done due to direct devlink reload call or not. + * + * @devlink: devlink + * @limit: reload limit + * @actions_performed: bitmask of actions performed + */ +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed) +{ + if (WARN_ON(!actions_performed || + actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) || + actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) || + limit > DEVLINK_RELOAD_LIMIT_MAX)) + return; + + __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit, + actions_performed); +} +EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed); + static int devlink_reload(struct devlink *devlink, struct net *dest_net, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) { + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; int err; if (!devlink->reload_enabled) return -EOPNOTSUPP; + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats)); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; @@ -3132,6 +3177,9 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, return err; WARN_ON(!(*actions_performed & BIT(action))); + /* Catch driver on updating the remote action within devlink reload */ + WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, + sizeof(remote_reload_stats))); devlink_reload_stats_update(devlink, limit, *actions_performed); return 0; } -- cgit v1.2.3 From 44f3625bc61653ea3bde9960298faf2f5518fda5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 8 Oct 2020 12:45:17 +0200 Subject: netlink: export policy in extended ACK Add a new attribute NLMSGERR_ATTR_POLICY to the extended ACK to advertise 
the policy, e.g. if an attribute was out of range, you'll know the range that's permissible. Add new NL_SET_BAD_ATTR_POLICY() and NL_SET_ERR_MSG_ATTR_POL() macros to set this, since realistically it's only useful to do this when the bad attribute (offset) is also returned. Use it in lib/nlattr.c, which practically does all the policy validation. v2: - add and use netlink_policy_dump_attr_size_estimate() v3: - remove redundant break v4: - really remove redundant break ... sorry Reviewed-by: Jakub Kicinski Signed-off-by: Johannes Berg Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 30 ++++++++++++++-------- include/net/netlink.h | 4 +++ include/uapi/linux/netlink.h | 2 ++ lib/nlattr.c | 35 +++++++++++++------------ net/netlink/af_netlink.c | 5 ++++ net/netlink/policy.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e3e49f0e5c13..666cd0390699 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,12 +68,14 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @policy: policy for a bad attribute * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + const struct nla_policy *policy; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -95,21 +97,29 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) -#define NL_SET_BAD_ATTR(extack, attr) do { \ - if ((extack)) \ +#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ + if ((extack)) { \ (extack)->bad_attr = (attr); \ + (extack)->policy = (pol); \ + } \ } while (0) -#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = msg; \ - struct netlink_ext_ack *__extack = (extack); \ - \ - if (__extack) { \ - __extack->_msg = __msg; \ - __extack->bad_attr = (attr); \ - } \ +#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) + +#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ + static const char __msg[] = msg; \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->_msg = __msg; \ + __extack->bad_attr = (attr); \ + __extack->policy = (pol); \ + } \ } while (0) +#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ + NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/net/netlink.h b/include/net/netlink.h index 2b9e41075f19..7356f41d23ba 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1957,6 +1957,10 @@ int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); +int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); +int netlink_policy_dump_write_attr(struct sk_buff *skb, + const struct nla_policy *pt, + int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index d02e472ba54c..c3816ff7bfc3 100644
--- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -129,6 +129,7 @@ struct nlmsgerr { * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to * be used - in the success case - to identify a created * object or operation or similar (binary) + * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -137,6 +138,7 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_MSG, NLMSGERR_ATTR_OFFS, NLMSGERR_ATTR_COOKIE, + NLMSGERR_ATTR_POLICY, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 diff --git a/lib/nlattr.c b/lib/nlattr.c index 9c99f5daa4d2..74019c8ebf6b 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -96,8 +96,8 @@ static int nla_validate_array(const struct nlattr *head, int len, int maxtype, continue; if (nla_len(entry) < NLA_HDRLEN) { - NL_SET_ERR_MSG_ATTR(extack, entry, - "Array element too short"); + NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, + "Array element too short"); return -ERANGE; } @@ -195,8 +195,8 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "invalid attribute length"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); return -EINVAL; } @@ -208,11 +208,11 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, bool binary = pt->type == NLA_BINARY; if (binary) - NL_SET_ERR_MSG_ATTR(extack, nla, - "binary attribute size out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "binary attribute size out of range"); else - NL_SET_ERR_MSG_ATTR(extack, nla, - "integer out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "integer out of range"); return -ERANGE; } @@ -291,8 +291,8 @@ static int nla_validate_int_range_signed(const struct nla_policy *pt, nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "integer out of range"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "integer out of range"); return -ERANGE; } @@ -377,8 +377,8 @@ static int validate_nla(const struct nlattr *nla, int maxtype, pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "invalid attribute length"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "invalid attribute length"); return -EINVAL; } } @@ -386,14 +386,14 @@ static int validate_nla(const struct nlattr *nla, int maxtype, if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "NLA_F_NESTED is missing"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { - NL_SET_ERR_MSG_ATTR(extack, nla, - "NLA_F_NESTED not expected"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "NLA_F_NESTED not expected"); return -EINVAL; } } @@ -550,7 +550,8 @@ static int validate_nla(const struct nlattr *nla, int maxtype, return 0; out_err: - NL_SET_ERR_MSG_ATTR(extack, nla, "Attribute failed policy validation"); + NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, + "Attribute failed policy validation"); return err; } diff --git 
a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index df675a8e1918..daca50d6bb12 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2420,6 +2420,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, tlvlen += nla_total_size(sizeof(u32)); if (nlk_has_extack && extack && extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); + if (err && nlk_has_extack && extack && extack->policy) + tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy); if (tlvlen) flags |= NLM_F_ACK_TLVS; @@ -2452,6 +2454,9 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); + if (extack->policy) + netlink_policy_dump_write_attr(skb, extack->policy, + NLMSGERR_ATTR_POLICY); } nlmsg_end(skb, rep); diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 4383436759e2..8d7c900e27f4 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -196,12 +196,54 @@ bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state) return !netlink_policy_dump_finished(state); } +int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt) +{ + /* nested + type */ + int common = 2 * nla_attr_size(sizeof(u32)); + + switch (pt->type) { + case NLA_UNSPEC: + case NLA_REJECT: + /* these actually don't need any space */ + return 0; + case NLA_NESTED: + case NLA_NESTED_ARRAY: + /* common, policy idx, policy maxattr */ + return common + 2 * nla_attr_size(sizeof(u32)); + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: + /* maximum is common, u64 min/max with padding */ + return common + + 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64))); + case NLA_BITFIELD32: + return common + nla_attr_size(sizeof(u32)); + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + /* maximum is common, u32 min-length/max-length */ + return common + 2 * nla_attr_size(sizeof(u32)); + case NLA_FLAG: + return common; + } + + /* this should then cause a warning later */ + return 0; +} + static int __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, struct sk_buff *skb, const struct nla_policy *pt, int nestattr) { + int estimate = netlink_policy_dump_attr_size_estimate(pt); enum netlink_attribute_type type; struct nlattr *attr; @@ -334,12 +376,31 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, goto nla_put_failure; nla_nest_end(skb, attr); + WARN_ON(attr->nla_len > estimate); + return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -ENOBUFS; } +/** + * netlink_policy_dump_write_attr - write a given attribute policy + * @skb: the message skb to write to + * @pt: the attribute's policy + * @nestattr: the nested attribute ID to use + * + * Returns: 0 on success, an error code otherwise; -%ENODATA is + * special, indicating that there's no policy data and + * the attribute is generally rejected. 
+ */ +int netlink_policy_dump_write_attr(struct sk_buff *skb, + const struct nla_policy *pt, + int nestattr) +{ + return __netlink_policy_dump_write_attr(NULL, skb, pt, nestattr); +} + /** * netlink_policy_dump_write - write current policy dump attributes * @skb: the message skb to write to -- cgit v1.2.3 From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:01 +0200 Subject: bpf: Improve bpf_redirect_neigh helper description Follow-up to address David's feedback that we should better describe internals of the bpf_redirect_neigh() helper. Suggested-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 +++++++--- tools/include/uapi/linux/bpf.h | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. 
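A minimal sketch of how a tc program might invoke the helper documented above — not part of the series; the uplink ifindex (3), the section name, and the hand-declared helper prototype (pinning the two-argument form this series documents) are illustrative assumptions:

// SPDX-License-Identifier: GPL-2.0
/* Illustrative tc classifier: forward everything out of an assumed
 * uplink (ifindex 3). bpf_redirect_neigh() performs the FIB lookup
 * and neighbor resolution internally, so no L2 addresses are set here. */
#include <linux/bpf.h>

static long (*bpf_redirect_neigh)(__u32 ifindex, __u64 flags) =
	(void *) BPF_FUNC_redirect_neigh;

__attribute__((section("classifier"), used))
int fwd_via_neigh(struct __sk_buff *skb)
{
	return bpf_redirect_neigh(3, 0); /* flags are reserved, must be 0 */
}

char __license[] __attribute__((section("license"), used)) = "GPL";

Since the helper already yields TC_ACT_REDIRECT on success and TC_ACT_SHOT on error, its return value can be handed back to the tc layer directly, as done here.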
-- cgit v1.2.3 From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- drivers/net/veth.c | 9 +++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/bpf.h | 17 +++++++++++++ net/core/dev.c | 15 +++++++++--- net/core/filter.c | 54 ++++++++++++++++++++++++++++++++++++------ tools/include/uapi/linux/bpf.h | 17 +++++++++++++ 6 files changed, 106 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 091e5b4ba042..8c737668008a 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev) return smp_processor_id() % dev->real_num_rx_queues; } +static struct net_device *veth_peer_dev(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + + /* Callers must be under RCU read side. 
*/ + return rcu_dereference(priv->peer); +} + static int veth_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags, bool ndo_xmit) @@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = { .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, .ndo_xdp_xmit = veth_ndo_xdp_xmit, + .ndo_get_peer_dev = veth_peer_dev, }; #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/net/core/dev.c b/net/core/dev.c index 9d55bf5d1a65..7dd015823593 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); static inline struct sk_buff * sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev) + struct net_device *orig_dev, bool *another) { #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); @@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, * redirecting to another netdev */ __skb_push(skb, skb->mac_len); - skb_do_redirect(skb); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } return NULL; case TC_ACT_CONSUMED: return NULL; @@ -5163,7 +5167,12 @@ another_round: skip_taps: #ifdef CONFIG_NET_INGRESS if (static_branch_unlikely(&ingress_needed_key)) { - skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); + bool another = false; + + skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, + &another); + if (another) + goto another_round; if (!skb) goto out; diff --git a/net/core/filter.c b/net/core/filter.c index 5da44b11e1ec..fab951c6be57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2380,8 +2380,9 @@ out: /* Internal, non-exposed redirect flags. */ enum { - BPF_F_NEIGH = (1ULL << 1), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH) + BPF_F_NEIGH = (1ULL << 1), + BPF_F_PEER = (1ULL << 2), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + struct net *net = dev_net(skb->dev); struct net_device *dev; u32 flags = ri->flags; - dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); + dev = dev_get_by_index_rcu(net, ri->tgt_index); ri->tgt_index = 0; - if (unlikely(!dev)) { - kfree_skb(skb); - return -EINVAL; + ri->flags = 0; + if (unlikely(!dev)) + goto out_drop; + if (flags & BPF_F_PEER) { + const struct net_device_ops *ops = dev->netdev_ops; + + if (unlikely(!ops->ndo_get_peer_dev || + !skb_at_tc_ingress(skb))) + goto out_drop; + dev = ops->ndo_get_peer_dev(dev); + if (unlikely(!dev || + !is_skb_forwardable(dev, skb) || + net_eq(net, dev_net(dev)))) + goto out_drop; + skb->dev = dev; + return -EAGAIN; } - return flags & BPF_F_NEIGH ? 
__bpf_redirect_neigh(skb, dev) : __bpf_redirect(skb, dev, flags); +out_drop: + kfree_skb(skb); + return -EINVAL; } BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) @@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + if (unlikely(flags)) + return TC_ACT_SHOT; + + ri->flags = BPF_F_PEER; + ri->tgt_index = ifindex; + + return TC_ACT_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_peer_proto = { + .func = bpf_redirect_peer, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); @@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_redirect_proto; case BPF_FUNC_redirect_neigh: return &bpf_redirect_neigh_proto; + case BPF_FUNC_redirect_peer: + return &bpf_redirect_peer_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. 
Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/arraymap.c | 17 +++++++++++------ kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/verifier.c | 6 ++++-- net/xdp/xskmap.c | 2 +- tools/include/uapi/linux/bpf.h | 3 +++ 7 files changed, 26 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd9..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. 
*/ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bd777dd6f967..c6c81eceb68f 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -16,7 +16,7 @@ #define ARRAY_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \ - BPF_F_PRESERVE_ELEMS) + BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_ARRAY && - attr->map_flags & BPF_F_MMAPABLE) + attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP)) return -EINVAL; if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY && @@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ -static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; @@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) const int map_ptr = BPF_REG_1; const int index = BPF_REG_2; + if (map->map_flags & BPF_F_INNER_MAP) + return -EOPNOTSUPP; + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); if (!map->bypass_spec_v1) { @@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) static bool array_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { - return meta0->max_entries == meta1->max_entries && - bpf_map_meta_equal(meta0, meta1); + if (!bpf_map_meta_equal(meta0, meta1)) + return false; + return meta0->map_flags & BPF_F_INNER_MAP ? 
true : + meta0->max_entries == meta1->max_entries; } struct bpf_iter_seq_array_map_info { @@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 array_of_map_gen_lookup(struct bpf_map *map, +static int array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_array *array = container_of(map, struct bpf_array, map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3395cf140d22..1815e97d4c9c 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) * bpf_prog * __htab_map_lookup_elem */ -static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) return __htab_lru_map_lookup_elem(map, key, false); } -static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +static int htab_lru_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static u32 htab_of_map_gen_lookup(struct bpf_map *map, +static int htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3e36eade3d4..fa5badc9279a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) if (insn->imm == BPF_FUNC_map_lookup_elem && ops->map_gen_lookup) { cnt = ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + if (cnt == -EOPNOTSUPP) + goto patch_map_ops_generic; + if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); - +patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 0c5df593bc56..49da2b8ace8b 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; struct bpf_insn *insn = insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. 
*/ -- cgit v1.2.3 From 60a3815da702fd9e4759945f26cce5c47d3967ad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 8 Oct 2020 01:14:47 +0200 Subject: netfilter: add inet ingress support This patch adds the NF_INET_INGRESS pseudo-hook for the NFPROTO_INET family, mapping the new hook onto the existing NFPROTO_NETDEV and NF_NETDEV_INGRESS hook. The hook does not guarantee that packets are inet only; users must filter out non-IP traffic explicitly. This infrastructure makes it easier to support this new hook in nf_tables. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 1 + net/netfilter/core.c | 103 ++++++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ca9e63d6e0e4..6a6179af0d7c 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,6 +45,7 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, + NF_INET_INGRESS, NF_INET_NUMHOOKS }; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c82f779a587e..63d032191e62 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -281,6 +281,16 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) return NULL; return net->nf.hooks_bridge + hooknum; +#endif +#ifdef CONFIG_NETFILTER_INGRESS + case NFPROTO_INET: + if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) + return NULL; + if (!dev || dev_net(dev) != net) { + WARN_ON_ONCE(1); + return NULL; + } + return &dev->nf_hooks_ingress; #endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) @@ -311,22 +321,56 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return NULL; } +static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, + int hooknum) +{ +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == hooknum) + return -EOPNOTSUPP; +#endif + if (reg->hooknum != hooknum || + !reg->dev || dev_net(reg->dev) != net) + return -EINVAL; + + return 0; +} + static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) { - return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS; + if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || + (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) + return true; + + return false; } static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL - static_key_slow_inc(&nf_hooks_needed[pf][reg->hooknum]); + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); #endif } static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL - static_key_slow_dec(&nf_hooks_needed[pf][reg->hooknum]); + int hooknum; + + if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { + pf = NFPROTO_NETDEV; + hooknum = NF_NETDEV_INGRESS; + } else { + hooknum = reg->hooknum; + } + static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); #endif } @@ -335,15 +379,22 @@ static int __nf_register_net_hook(struct net *net, int pf, { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; + int err; - if (pf == NFPROTO_NETDEV) { -#ifndef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) - return -EOPNOTSUPP; -#endif - if (reg->hooknum != NF_NETDEV_INGRESS || - !reg->dev || dev_net(reg->dev) != net) - return -EINVAL; + switch (pf) { + case NFPROTO_NETDEV: + err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS); + if (err < 0) + return err; + break; + case NFPROTO_INET: + if (reg->hooknum != NF_INET_INGRESS) + break; + + err = nf_ingress_check(net, reg, NF_INET_INGRESS); + if (err < 0) + return err; + break; } pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); @@ -441,8 +492,12 @@ static void __nf_unregister_net_hook(struct net *net, int pf, void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + if (reg->hooknum == NF_INET_INGRESS) { + __nf_unregister_net_hook(net, NFPROTO_INET, reg); + } else { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); + } } else { __nf_unregister_net_hook(net, reg->pf, reg); } @@ -467,14 +522,20 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) int err; if (reg->pf == NFPROTO_INET) { - err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); - if (err < 0) - return err; - - err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); - if (err < 0) { - __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); - return err; + if (reg->hooknum == NF_INET_INGRESS) { + err = __nf_register_net_hook(net, NFPROTO_INET, reg); + if (err < 0) + return err; + } else { + err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); + if (err < 0) + return err; + + err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); + if (err < 0) { + __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); + return err; + } } } else { err = __nf_register_net_hook(net, reg->pf, reg);
-- cgit v1.2.3 From ac911bfeb34b5d79fb4e23a08b8db0b89c529b53 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Oct 2020 09:43:53 +0200 Subject: can: isotp: implement cleanups / improvements from review As pointed out by Jakub Kicinski here: http://lore.kernel.org/r/20201009175751.5c54097f@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com this patch addresses the issues raised in review: - remove empty line in comment - remove default=y for CAN_ISOTP in Kconfig - make use of pr_notice_once() - use GFP_ATOMIC instead of gfp_any() in soft hrtimer context The version strings in the CAN subsystem are removed by a separate patch. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201012074354.25839-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 1 - net/can/Kconfig | 3 ++- net/can/isotp.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index 553006509f4e..7793b26aa154 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -160,7 +160,6 @@ struct can_isotp_ll_options { * these default settings can be changed via sockopts. * For that reason the STmin value is intentionally _not_ checked for * consistency and copied directly into the flow control (FC) frame.
- * */ #endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/Kconfig b/net/can/Kconfig index 021fe03a8ed6..224e5e0283a9 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -57,7 +57,6 @@ source "net/can/j1939/Kconfig" config CAN_ISOTP tristate "ISO 15765-2:2016 CAN transport protocol" - default y help CAN Transport Protocols offer support for segmented Point-to-Point communication between CAN nodes via two defined CAN Identifiers. @@ -67,6 +66,8 @@ config CAN_ISOTP vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. This protocol driver implements data transfers according to ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. + If you want to perform automotive vehicle diagnostic services (UDS), + say 'y'. source "drivers/net/can/Kconfig" diff --git a/net/can/isotp.c b/net/can/isotp.c index e6ff032b5426..ca63061bb932 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -222,8 +222,8 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) can_send_ret = can_send(nskb, 1); if (can_send_ret) - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); dev_put(dev); @@ -769,7 +769,7 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) isotp_tx_burst: skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), - gfp_any()); + GFP_ATOMIC); if (!skb) { dev_put(dev); break; @@ -798,8 +798,8 @@ isotp_tx_burst: can_send_ret = can_send(skb, 1); if (can_send_ret) - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, can_send_ret); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, can_send_ret); if (so->tx.idx >= so->tx.len) { /* we are done */ @@ -942,8 +942,8 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = can_send(skb, 1); dev_put(dev); if (err) { - printk_once(KERN_NOTICE "can-isotp: %s: can_send_ret %d\n", - __func__, err); + pr_notice_once("can-isotp: %s: can_send_ret %d\n", + __func__, err); return err; } -- cgit v1.2.3 From d25e2e9388eda61b6e298585024ee3355f50c493 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Oct 2020 21:34:32 +0200 Subject: netfilter: restore NF_INET_NUMHOOKS This definition is used by the iptables legacy UAPI, restore it. Fixes: d3519cb89f6d ("netfilter: nf_tables: add inet ingress support") Reported-by: Jason A. Donenfeld Tested-by: Jason A. 
Donenfeld Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_tables.h | 4 +++- include/uapi/linux/netfilter.h | 4 ++-- net/netfilter/nf_tables_api.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3965ce18226f..3f7e56b1171e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -14,6 +14,8 @@ #include #include +#define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) + struct module; #define NFT_JUMP_STACK_SIZE 16 @@ -979,7 +981,7 @@ struct nft_chain_type { int family; struct module *owner; unsigned int hook_mask; - nf_hookfn *hooks[NF_MAX_HOOKS]; + nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 6a6179af0d7c..ef9a44286e23 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,8 +45,8 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, - NF_INET_INGRESS, - NF_INET_NUMHOOKS + NF_INET_NUMHOOKS, + NF_INET_INGRESS = NF_INET_NUMHOOKS, }; enum nf_dev_hooks { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f22ad21d0230..7f1c184c00d2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1864,7 +1864,7 @@ static int nft_chain_parse_hook(struct net *net, if (IS_ERR(type)) return PTR_ERR(type); } - if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) + if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num))) return -EOPNOTSUPP; if (type->type == NFT_CHAIN_T_NAT && -- cgit v1.2.3 From 346e320cb2103edef709c4466a29140c4a8e527a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 15 Oct 2020 18:39:27 +0200 Subject: netfilter: nftables: allow re-computing sctp CRC-32C in 'payload' statements nftables payload statements are used to mangle SCTP headers, but they can only replace the Internet Checksum. As a consequence, nftables rules that mangle sport/dport/vtag in SCTP headers potentially generate packets that are discarded by the receiver, unless the CRC-32C is "offloaded" (e.g. the rule mangles an skb whose 'ip_summed' equals 'CHECKSUM_PARTIAL'). Fix this by extending the uAPI definitions and the L4 checksum update function so that userspace programs (e.g. nft) can instruct the kernel to compute CRC-32C in SCTP headers. Also ensure that LIBCRC32C is built if NF_TABLES is 'y' or 'm' in the kernel build configuration.
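For context, a sketch (not from the patch) of what re-computing the CRC-32C entails: zero sctphdr->checksum, run CRC-32C — the Castagnoli polynomial, reflected form 0x82f63b78 — over the entire SCTP packet, and store the inverted result little-endian; this is the same computation the kernel's sctp_compute_cksum() performs. A table-less userspace equivalent:

#include <stddef.h>
#include <stdint.h>

/* Illustrative CRC-32C per RFC 3309; assumes the caller has already
 * zeroed the checksum field inside pkt before computing. */
static uint32_t sctp_crc32c(const uint8_t *pkt, size_t len)
{
	uint32_t crc = 0xffffffff;

	while (len--) {
		crc ^= *pkt++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82f63b78 & -(crc & 1));
	}
	return ~crc; /* stored little-endian in the wire header */
}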
Signed-off-by: Davide Caratti Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/Kconfig | 1 + net/netfilter/nft_payload.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 352ee51707a1..98272cb5f617 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,10 +749,12 @@ enum nft_payload_bases { * * @NFT_PAYLOAD_CSUM_NONE: no checksumming * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + * @NFT_PAYLOAD_CSUM_SCTP: CRC-32c, for use in SCTP header (RFC 3309) */ enum nft_payload_csum_types { NFT_PAYLOAD_CSUM_NONE, NFT_PAYLOAD_CSUM_INET, + NFT_PAYLOAD_CSUM_SCTP, }; enum nft_payload_csum_flags { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 25313c29d799..52370211e46b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -441,6 +441,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK + select LIBCRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7a2e59638499..dcd3c7b8a367 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -22,6 +22,7 @@ #include #include #include +#include static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, struct vlan_ethhdr *veth) @@ -484,6 +485,19 @@ static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, return 0; } +static int nft_payload_csum_sctp(struct sk_buff *skb, int offset) +{ + struct sctphdr *sh; + + if (skb_ensure_writable(skb, offset + sizeof(*sh))) + return -1; + + sh = (struct sctphdr *)(skb->data + offset); + sh->checksum = sctp_compute_cksum(skb, offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + return 0; +} + static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, struct sk_buff *skb, __wsum fsum, __wsum tsum) @@ -587,6 +601,13 @@ static void nft_payload_set_eval(const struct nft_expr *expr, skb_store_bits(skb, offset, src, priv->len) < 0) goto err; + if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && + pkt->tprot == IPPROTO_SCTP && + skb->ip_summed != CHECKSUM_PARTIAL) { + if (nft_payload_csum_sctp(skb, pkt->xt.thoff)) + goto err; + } + return; err: regs->verdict.code = NFT_BREAK; @@ -623,6 +644,13 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; + case NFT_PAYLOAD_CSUM_SCTP: + if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER) + return -EINVAL; + + if (priv->csum_offset != offsetof(struct sctphdr, checksum)) + return -EINVAL; + break; default: return -EOPNOTSUPP; } -- cgit v1.2.3 From 8c39076c276be0b31982e44654e2c2357473258a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 16 Oct 2020 09:25:45 -0400 Subject: NFSv4.2: support EXCHGID4_FLAG_SUPP_FENCE_OPS 4.2 EXCHANGE_ID flag RFC 7862 introduced a new flag that either client or server is allowed to set: EXCHGID4_FLAG_SUPP_FENCE_OPS. Client needs to update its bitmask to allow for this flag value. 
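As an arithmetic sanity check (illustrative, using the constants from the diff below): the v4.2 response mask differs from the earlier one only by the fence-ops bit, so a confirmed v4.2 reply carrying the new flag passes the updated check while still failing the old one:

#include <assert.h>
#include <stdint.h>

#define EXCHGID4_FLAG_SUPP_FENCE_OPS	0x00000004
#define EXCHGID4_FLAG_MASK_R		0x80070103	/* minor version < 2 */
#define EXCHGID4_2_FLAG_MASK_R		0x80070107	/* minor version >= 2 */

int main(void)
{
	/* the v4.2 mask adds exactly the fence-ops bit */
	assert(EXCHGID4_2_FLAG_MASK_R ==
	       (EXCHGID4_FLAG_MASK_R | EXCHGID4_FLAG_SUPP_FENCE_OPS));

	/* EXCHGID4_FLAG_CONFIRMED_R plus the new flag */
	uint32_t flags = 0x80000000 | EXCHGID4_FLAG_SUPP_FENCE_OPS;

	assert((flags & ~EXCHGID4_2_FLAG_MASK_R) == 0);	/* accepted now */
	assert((flags & ~EXCHGID4_FLAG_MASK_R) != 0);	/* rejected before */
	return 0;
}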
v2: changed minor version argument to unsigned int Signed-off-by: Olga Kornievskaia CC: Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++++--- include/uapi/linux/nfs4.h | 3 +++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2e33995691f5..9e0ca9b2b210 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8133,9 +8133,11 @@ int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, * both PNFS and NON_PNFS flags set, and not having one of NON_PNFS, PNFS, or * DS flags set. */ -static int nfs4_check_cl_exchange_flags(u32 flags) +static int nfs4_check_cl_exchange_flags(u32 flags, u32 version) { - if (flags & ~EXCHGID4_FLAG_MASK_R) + if (version >= 2 && (flags & ~EXCHGID4_2_FLAG_MASK_R)) + goto out_inval; + else if (version < 2 && (flags & ~EXCHGID4_FLAG_MASK_R)) goto out_inval; if ((flags & EXCHGID4_FLAG_USE_PNFS_MDS) && (flags & EXCHGID4_FLAG_USE_NON_PNFS)) @@ -8548,7 +8550,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre if (status != 0) goto out; - status = nfs4_check_cl_exchange_flags(resp->flags); + status = nfs4_check_cl_exchange_flags(resp->flags, + clp->cl_mvops->minor_version); if (status != 0) goto out; diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index bf197e99b98f..ed5415e0f1c1 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -139,6 +139,8 @@ #define EXCHGID4_FLAG_UPD_CONFIRMED_REC_A 0x40000000 #define EXCHGID4_FLAG_CONFIRMED_R 0x80000000 + +#define EXCHGID4_FLAG_SUPP_FENCE_OPS 0x00000004 /* * Since the validity of these bits depends on whether * they're set in the argument or response, have separate @@ -146,6 +148,7 @@ */ #define EXCHGID4_FLAG_MASK_A 0x40070103 #define EXCHGID4_FLAG_MASK_R 0x80070103 +#define EXCHGID4_2_FLAG_MASK_R 0x80070107 #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 -- cgit v1.2.3 From c6e5f02b5281a3166a9b7b4d66830cc234421ba5 Mon Sep 17 00:00:00 2001 From: "Saheed O. Bolarinwa" Date: Thu, 15 Oct 2020 14:30:31 -0500 Subject: PCI/ASPM: Remove struct aspm_register_info.support Previously we stored the "ASPM Support" field from the Link Capabilities register in the struct aspm_register_info. Read the Link Capabilities directly when needed and remove it from the struct aspm_register_info. No functional change intended. [bhelgaas: remove pci_dev cached copy since LNKCAP isn't truly read-only, add PCI_EXP_LNKCAP_ASPM_L0S & PCI_EXP_LNKCAP_ASPM_L1, check them directly instead of adding aspm_support()] Link: https://lore.kernel.org/r/20201015193039.12585-5-helgaas@kernel.org Signed-off-by: Saheed O. 
Bolarinwa Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 25 ++++++++++++++----------- include/uapi/linux/pci_regs.h | 2 ++ 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index 0725511cbeb5..82ce34e2ef53 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -381,7 +381,6 @@ static void encode_l12_threshold(u32 threshold_us, u32 *scale, u32 *value) } struct aspm_register_info { - u32 support:2; u32 enabled:2; u32 latency_encoding_l0s; u32 latency_encoding_l1; @@ -400,7 +399,6 @@ static void pcie_get_aspm_reg(struct pci_dev *pdev, u32 reg32; pcie_capability_read_dword(pdev, PCI_EXP_LNKCAP, ®32); - info->support = (reg32 & PCI_EXP_LNKCAP_ASPMS) >> 10; info->latency_encoding_l0s = (reg32 & PCI_EXP_LNKCAP_L0SEL) >> 12; info->latency_encoding_l1 = (reg32 & PCI_EXP_LNKCAP_L1EL) >> 15; pcie_capability_read_word(pdev, PCI_EXP_LNKCTL, ®16); @@ -550,6 +548,7 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) { struct pci_dev *child = link->downstream, *parent = link->pdev; + u32 parent_lnkcap, child_lnkcap; struct pci_bus *linkbus = parent->subordinate; struct aspm_register_info upreg, dwreg; @@ -560,24 +559,26 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) return; } - /* Get upstream/downstream components' register state */ - pcie_get_aspm_reg(parent, &upreg); - pcie_get_aspm_reg(child, &dwreg); - /* * If ASPM not supported, don't mess with the clocks and link, * bail out now. */ - if (!(upreg.support & dwreg.support)) + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); + pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); + if (!(parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPMS)) return; /* Configure common clock before checking latencies */ pcie_aspm_configure_common_clock(link); /* - * Re-read upstream/downstream components' register state - * after clock configuration + * Re-read upstream/downstream components' register state after + * clock configuration. L0s & L1 exit latencies in the otherwise + * read-only Link Capabilities may change depending on common clock + * configuration (PCIe r5.0, sec 7.5.3.6). */ + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &parent_lnkcap); + pcie_capability_read_dword(child, PCI_EXP_LNKCAP, &child_lnkcap); pcie_get_aspm_reg(parent, &upreg); pcie_get_aspm_reg(child, &dwreg); @@ -588,8 +589,9 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) * given link unless components on both sides of the link each * support L0s. 
*/ - if (dwreg.support & upreg.support & PCIE_LINK_STATE_L0S) + if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S) link->aspm_support |= ASPM_STATE_L0S; + if (dwreg.enabled & PCIE_LINK_STATE_L0S) link->aspm_enabled |= ASPM_STATE_L0S_UP; if (upreg.enabled & PCIE_LINK_STATE_L0S) @@ -598,8 +600,9 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) link->latency_dw.l0s = calc_l0s_latency(dwreg.latency_encoding_l0s); /* Setup L1 state */ - if (upreg.support & dwreg.support & PCIE_LINK_STATE_L1) + if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1) link->aspm_support |= ASPM_STATE_L1; + if (upreg.enabled & dwreg.enabled & PCIE_LINK_STATE_L1) link->aspm_enabled |= ASPM_STATE_L1; link->latency_up.l1 = calc_l1_latency(upreg.latency_encoding_l1); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f9701410d3b5..06846ec2e071 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -532,6 +532,8 @@ #define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ +#define PCI_EXP_LNKCAP_ASPM_L0S 0x00000400 /* ASPM L0s Support */ +#define PCI_EXP_LNKCAP_ASPM_L1 0x00000800 /* ASPM L1 Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ #define PCI_EXP_LNKCAP_L1EL 0x00038000 /* L1 Exit Latency */ #define PCI_EXP_LNKCAP_CLKPM 0x00040000 /* Clock Power Management */ -- cgit v1.2.3 From df8f10587d3d11b055d54138994a1a9a681da0c4 Mon Sep 17 00:00:00 2001 From: "Saheed O. Bolarinwa" Date: Thu, 15 Oct 2020 14:30:39 -0500 Subject: PCI/ASPM: Remove struct pcie_link_state.l1ss Previously we computed L1.2 parameters in the enumeration path, saved them in struct pcie_link_state.l1ss, and programmed them into the devices whenever we enabled or disabled L1.2 on the link. But these parameters are constant and don't need to be updated when enabling/disabling L1.2. Compute and program the L1.2 parameters once during enumeration and remove the struct pcie_link_state.l1ss member. No functional change intended. [bhelgaas: rework to program L1.2 parameters during enumeration] Link: https://lore.kernel.org/r/20201015193039.12585-13-helgaas@kernel.org Signed-off-by: Saheed O. Bolarinwa Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/aspm.c | 84 +++++++++++++++++++++++++------------------ include/uapi/linux/pci_regs.h | 1 + 2 files changed, 50 insertions(+), 35 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c index d76f23908d67..ac0557a305af 100644 --- a/drivers/pci/pcie/aspm.c +++ b/drivers/pci/pcie/aspm.c @@ -74,12 +74,6 @@ struct pcie_link_state { * has one slot under it, so at most there are 8 functions. 
*/ struct aspm_latency acceptable[8]; - - /* L1 PM Substate info */ - struct { - u32 ctl1; /* value to be programmed in ctl1 */ - u32 ctl2; /* value to be programmed in ctl2 */ - } l1ss; }; static int aspm_disabled, aspm_force; @@ -461,8 +455,9 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, struct pci_dev *child = link->downstream, *parent = link->pdev; u32 val1, val2, scale1, scale2; u32 t_common_mode, t_power_on, l1_2_threshold, scale, value; - - link->l1ss.ctl1 = link->l1ss.ctl2 = 0; + u32 ctl1 = 0, ctl2 = 0; + u32 pctl1, pctl2, cctl1, cctl2; + u32 pl1_2_enables, cl1_2_enables; if (!(link->aspm_support & ASPM_STATE_L1_2_MASK)) return; @@ -480,10 +475,10 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, if (calc_l1ss_pwron(parent, scale1, val1) > calc_l1ss_pwron(child, scale2, val2)) { - link->l1ss.ctl2 |= scale1 | (val1 << 3); + ctl2 |= scale1 | (val1 << 3); t_power_on = calc_l1ss_pwron(parent, scale1, val1); } else { - link->l1ss.ctl2 |= scale2 | (val2 << 3); + ctl2 |= scale2 | (val2 << 3); t_power_on = calc_l1ss_pwron(child, scale2, val2); } @@ -499,7 +494,50 @@ static void aspm_calc_l1ss_info(struct pcie_link_state *link, */ l1_2_threshold = 2 + 4 + t_common_mode + t_power_on; encode_l12_threshold(l1_2_threshold, &scale, &value); - link->l1ss.ctl1 |= t_common_mode << 8 | scale << 29 | value << 16; + ctl1 |= t_common_mode << 8 | scale << 29 | value << 16; + + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1, &pctl1); + pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, &pctl2); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL1, &cctl1); + pci_read_config_dword(child, child->l1ss + PCI_L1SS_CTL2, &cctl2); + + if (ctl1 == pctl1 && ctl1 == cctl1 && + ctl2 == pctl2 && ctl2 == cctl2) + return; + + /* Disable L1.2 while updating. 
See PCIe r5.0, sec 5.5.4, 7.8.3.3 */ + pl1_2_enables = pctl1 & PCI_L1SS_CTL1_L1_2_MASK; + cl1_2_enables = cctl1 & PCI_L1SS_CTL1_L1_2_MASK; + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_L1_2_MASK, 0); + } + + /* Program T_POWER_ON times in both ports */ + pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, ctl2); + pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, ctl2); + + /* Program Common_Mode_Restore_Time in upstream device */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_CM_RESTORE_TIME, ctl1); + + /* Program LTR_L1.2_THRESHOLD time in both ports */ + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, + PCI_L1SS_CTL1_LTR_L12_TH_VALUE | + PCI_L1SS_CTL1_LTR_L12_TH_SCALE, ctl1); + + if (pl1_2_enables || cl1_2_enables) { + pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, 0, + pl1_2_enables); + pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, 0, + cl1_2_enables); + } } static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist) @@ -679,30 +717,6 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state) PCI_EXP_LNKCTL_ASPM_L1, 0); } - if (enable_req & ASPM_STATE_L1_2_MASK) { - - /* Program T_POWER_ON times in both ports */ - pci_write_config_dword(parent, parent->l1ss + PCI_L1SS_CTL2, - link->l1ss.ctl2); - pci_write_config_dword(child, child->l1ss + PCI_L1SS_CTL2, - link->l1ss.ctl2); - - /* Program Common_Mode_Restore_Time in upstream device */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_CM_RESTORE_TIME, - link->l1ss.ctl1); - - /* Program LTR_L1.2_THRESHOLD time in both ports */ - pci_clear_and_set_dword(parent, parent->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, - link->l1ss.ctl1); - pci_clear_and_set_dword(child, child->l1ss + PCI_L1SS_CTL1, - PCI_L1SS_CTL1_LTR_L12_TH_VALUE | - PCI_L1SS_CTL1_LTR_L12_TH_SCALE, - link->l1ss.ctl1); - } - val = 0; if (state & ASPM_STATE_L1_1) val |= PCI_L1SS_CTL1_ASPM_L1_1; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 06846ec2e071..c7e0acba0e20 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1058,6 +1058,7 @@ #define PCI_L1SS_CTL1_PCIPM_L1_1 0x00000002 /* PCI-PM L1.1 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_2 0x00000004 /* ASPM L1.2 Enable */ #define PCI_L1SS_CTL1_ASPM_L1_1 0x00000008 /* ASPM L1.1 Enable */ +#define PCI_L1SS_CTL1_L1_2_MASK 0x00000005 #define PCI_L1SS_CTL1_L1SS_MASK 0x0000000f #define PCI_L1SS_CTL1_CM_RESTORE_TIME 0x0000ff00 /* Common_Mode_Restore_Time */ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ -- cgit v1.2.3 From f3d301c1f2f5676465cdf3259737ea19cc82731f Mon Sep 17 00:00:00 2001 From: Al Grant Date: Mon, 21 Sep 2020 21:46:37 +0100 Subject: perf: correct SNOOPX field offset perf_event.h has macros that define the field offsets in the data_src bitmask in perf records. The SNOOPX and REMOTE offsets were both 37. These are distinct fields, and the bitfield layout in perf_mem_data_src confirms that SNOOPX should be at offset 38. 
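To make the collision concrete — an illustrative check, reusing the offsets named in the message above (REMOTE at 37, SNOOPX corrected to 38):

#include <stdint.h>
#include <stdio.h>

#define PERF_MEM_REMOTE_SHIFT	37
#define PERF_MEM_SNOOPX_FWD	0x01

int main(void)
{
	uint64_t bad  = (uint64_t)PERF_MEM_SNOOPX_FWD << 37; /* old offset */
	uint64_t good = (uint64_t)PERF_MEM_SNOOPX_FWD << 38; /* fixed offset */

	/* prints "bad=1 good=0": the old encoding spilled into REMOTE */
	printf("bad=%d good=%d\n",
	       !!(bad  & (1ULL << PERF_MEM_REMOTE_SHIFT)),
	       !!(good & (1ULL << PERF_MEM_REMOTE_SHIFT)));
	return 0;
}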
Fixes: 52839e653b5629bd ("perf tools: Add support for printing new mem_info encodings") Signed-off-by: Al Grant Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Link: https://lkml.kernel.org/r/4ac9f5cc-4388-b34a-9999-418a4099415d@foss.arm.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 077e7ee69e3d..b95d3c485d27 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1196,7 +1196,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ /* 1 free */ -#define PERF_MEM_SNOOPX_SHIFT 37 +#define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ #define PERF_MEM_LOCK_NA 0x01 /* not available */ -- cgit v1.2.3 From 66570e966dd9cb4fd57811d0056c6472a14a2c41 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Tue, 18 Aug 2020 15:24:28 +0000 Subject: kvm: x86: only provide PV features if enabled in guest's CPUID KVM unconditionally provides PV features to the guest, regardless of the configured CPUID. An unwitting guest that doesn't check KVM_CPUID_FEATURES before use could access paravirt features that userspace did not intend to provide. Fix this by checking the guest's CPUID before performing any paravirtual operations. Introduce a capability, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, to gate the aforementioned enforcement. Migrating a VM from a host w/o this patch to a host with this patch could silently change the ABI exposed to the guest, warranting that we default to the old behavior and opt-in for the new one. Reviewed-by: Jim Mattson Reviewed-by: Peter Shier Signed-off-by: Oliver Upton Change-Id: I202a0926f65035b872bfe8ad15307c026de59a98 Message-Id: <20200818152429.1923996-4-oupton@google.com> Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 11 +++++++ arch/x86/include/asm/kvm_host.h | 15 +++++++++ arch/x86/kvm/cpuid.c | 7 +++++ arch/x86/kvm/cpuid.h | 10 ++++++ arch/x86/kvm/x86.c | 67 ++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/kvm.h | 1 + 6 files changed, 106 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 9ece9a827a58..76317221d29f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6380,3 +6380,14 @@ ranges that KVM should reject access to. In combination with KVM_CAP_X86_USER_SPACE_MSR, this allows user space to trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. + + +8.26 KVM_CAP_ENFORCE_PV_CPUID +----------------------------- + +Architectures: x86 + +When enabled, KVM will disable paravirtual features provided to the +guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf +(0x40000001). Otherwise, a guest may use the paravirtual features +regardless of what has actually been exposed through the CPUID leaf. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d0f77235da92..15e51343957e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -789,6 +789,21 @@ struct kvm_vcpu_arch { /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; + + /* pv related cpuid info */ + struct { + /* + * value of the eax register in the KVM_CPUID_FEATURES CPUID + * leaf. 
+ */ + u32 features; + + /* + * indicates whether pv emulation should be disabled if features + * are not present in the guest's cpuid + */ + bool enforce; + } pv_cpuid; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 37c3668a774f..d253c023ee76 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -107,6 +107,13 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); + /* + * save the feature bitmap to avoid cpuid lookup for every PV + * operation + */ + if (best) + vcpu->arch.pv_cpuid.features = best->eax; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); if (best) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1d2c4f2e4bb6..bf8577947ed2 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include "x86.h" #include #include +#include extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); @@ -313,4 +314,13 @@ static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); } +static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, + unsigned int kvm_feature) +{ + if (!vcpu->arch.pv_cpuid.enforce) + return true; + + return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); +} + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b928e092da03..ca940de53e18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2877,6 +2877,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) if (data & 0x30) return 1; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + if (!lapic_in_kernel(vcpu)) return data ? 1 : 0; @@ -2954,10 +2962,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. 
*/ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb_guest(vcpu); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st->preempted & KVM_VCPU_FLUSH_TLB); + if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + } vcpu->arch.st.preempted = 0; @@ -3118,30 +3128,54 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; case MSR_KVM_SYSTEM_TIME: - kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; kvm_check_async_pf_completion(vcpu); } break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -3158,11 +3192,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -3658,6 +3698,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4528,6 +4569,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_x86_ops.enable_direct_tlbflush(vcpu); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + + return 0; + default: return -EINVAL; } @@ -8000,11 +8046,16 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); kvm_sched_yield(vcpu->kvm, a1); ret = 0; @@ -8015,9 +8066,15 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + kvm_sched_yield(vcpu->kvm, a0); ret = 0; break; diff --git a/include/uapi/linux/kvm.h 
b/include/uapi/linux/kvm.h index 58f43aa1fc21..ca41220b40b8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1052,6 +1052,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_STEAL_TIME 187 #define KVM_CAP_X86_USER_SPACE_MSR 188 #define KVM_CAP_X86_MSR_FILTER 189 +#define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From ba452c9e996d8a4c347b32805f91abb70de5de7e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 20 Oct 2020 23:25:56 +0200 Subject: bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the discussion in [0], update the bpf_redirect_neigh() helper to accept an optional parameter specifying the nexthop information. This makes it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without incurring a duplicate FIB lookup - since the FIB lookup helper will return the nexthop information even if no neighbour is present, this can simply be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen. [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk --- include/linux/filter.h | 9 +++ include/uapi/linux/bpf.h | 22 ++++-- net/core/filter.c | 158 ++++++++++++++++++++++++++--------------- scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 22 ++++-- 5 files changed, 145 insertions(+), 67 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..72d62cbc1578 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u32 nh_family; + union { + u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. 
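+ * If *params* is supplied, its *nh_family* member selects whether + * *ipv4_nh* or *ipv6_nh* holds the nexthop address.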
* * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..6d0fa65a4a46 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) - goto out_drop; + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2252,7 +2264,8 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device 
*dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; @@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2334,37 @@ out_drop: return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; - } + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, &rt->dst); + } - err = bpf_out_neigh_v4(net, skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2348,14 +2377,16 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return 
__bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2382,7 +2413,8 @@ out: enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); @@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; - ri->flags = BPF_F_NEIGH; + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + return TC_ACT_REDIRECT; } @@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. 
The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ -- cgit v1.2.3 From 1b48dc03e575a872404f33b04cd237953c5d7498 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 23 Oct 2020 17:00:42 +0800 Subject: vhost: vdpa: report iova range This patch introduces a new ioctl for the vhost-vdpa device that reports the iova range supported by the device. For a device that implements the get_iova_range() method, we fetch the range from the vDPA device. If the device doesn't implement get_iova_range() but depends on the platform IOMMU, we query it via DOMAIN_ATTR_GEOMETRY; otherwise [0, ULLONG_MAX] is assumed. For safety, this patch also rejects any map request that falls outside the valid range. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20201023090043.14430-3-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 41 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vhost.h | 4 ++++ include/uapi/linux/vhost_types.h | 9 +++++++++ 3 files changed, 54 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a2dbc85e0b0d..846de69d9c01 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -47,6 +47,7 @@ struct vhost_vdpa { int minor; struct eventfd_ctx *config_ctx; int in_batch; + struct vdpa_iova_range range; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -337,6 +338,16 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) return 0; } +static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp) +{ + struct vhost_vdpa_iova_range range = { + .first = v->range.first, + .last = v->range.last, + }; + + return copy_to_user(argp, &range, sizeof(range)); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -471,6 +482,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, features = VHOST_VDPA_BACKEND_FEATURES; r = copy_to_user(featurep, &features, sizeof(features)); break; + case VHOST_VDPA_GET_IOVA_RANGE: + r = vhost_vdpa_get_iova_range(v, argp); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) @@ -597,6 +611,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, long pinned; int ret = 0; + if (msg->iova < v->range.first || + msg->iova + msg->size - 1 > v->range.last) + return -EINVAL; + if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; @@ -783,6 +801,27 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v) v->domain = NULL; } +static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v) +{ + struct vdpa_iova_range *range = &v->range; + struct iommu_domain_geometry geo; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_iova_range) { + *range = ops->get_iova_range(vdpa); + } else if (v->domain && + !iommu_domain_get_attr(v->domain, + DOMAIN_ATTR_GEOMETRY, &geo) && + geo.force_aperture) { + range->first = geo.aperture_start; + range->last = geo.aperture_end; + } else { + range->first = 0; + range->last = ULLONG_MAX; + } +} +
static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -823,6 +862,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) if (r) goto err_init_iotlb; + vhost_vdpa_set_iova_range(v); + filep->private_data = v; return 0; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 75232185324a..c998860d7bbc 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -146,4 +146,8 @@ /* Set event fd for config interrupt*/ #define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) + +/* Get the valid iova range */ +#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ + struct vhost_vdpa_iova_range) #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 9a269a88a6ff..f7f6a3a28977 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -138,6 +138,15 @@ struct vhost_vdpa_config { __u8 buf[0]; }; +/* vhost vdpa IOVA range + * @first: First address that can be mapped by vhost-vDPA + * @last: Last address that can be mapped by vhost-vDPA + */ +struct vhost_vdpa_iova_range { + __u64 first; + __u64 last; +}; + /* Feature bits */ /* Log all write descriptors. Can be changed while device is active. */ #define VHOST_F_LOG_ALL 26 -- cgit v1.2.3 From 5760648e63e6c1006a3ed0bfc2167f623b8bcbcd Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:25 +0800 Subject: gpio: uapi: fix kernel-doc warnings Fix kernel-doc warnings, specifically gpioline_info_changed.padding is not documented and 'GPIO event types' describes defines, which are not documented by kernel-doc. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-2-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 07865c601099..b0d5e7a1c693 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -346,6 +346,7 @@ enum { * @timestamp: estimate of time of status change occurrence, in nanoseconds * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED * and GPIOLINE_CHANGED_CONFIG + * @padding: reserved for future use * * Note: struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can @@ -469,7 +470,7 @@ struct gpioevent_request { int fd; }; -/** +/* * GPIO event types */ #define GPIOEVENT_EVENT_RISING_EDGE 0x01 -- cgit v1.2.3 From f20160217537e9006ce4a625da62b358416fc4ed Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:26 +0800 Subject: gpio: uapi: comment consistency Make debounce_period_us field documentation consistent with other fields in the union. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-3-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index b0d5e7a1c693..1fdb0e851f83 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -98,7 +98,7 @@ struct gpio_v2_line_values { * identifying which field of the attribute union is in use. 
* @GPIO_V2_LINE_ATTR_ID_FLAGS: flags field is in use * @GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES: values field is in use - * @GPIO_V2_LINE_ATTR_ID_DEBOUNCE: debounce_period_us is in use + * @GPIO_V2_LINE_ATTR_ID_DEBOUNCE: debounce_period_us field is in use */ enum gpio_v2_line_attr_id { GPIO_V2_LINE_ATTR_ID_FLAGS = 1, -- cgit v1.2.3 From 2cc522d3931ba2aa744d09d41f874d61bf3a1851 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:27 +0800 Subject: gpio: uapi: kernel-doc formatting improvements Add kernel-doc formatting to all references to structs, enums, fields and constants, and move deprecation warnings into the Note section of the deprecated struct. Replace 'OR:ed' with 'added', as the former looks odd. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-4-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 93 ++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 1fdb0e851f83..32dd18f238c3 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -110,17 +110,17 @@ enum gpio_v2_line_attr_id { * struct gpio_v2_line_attribute - a configurable attribute of a line * @id: attribute identifier with value from &enum gpio_v2_line_attr_id * @padding: reserved for future use and must be zero filled - * @flags: if id is GPIO_V2_LINE_ATTR_ID_FLAGS, the flags for the GPIO - * line, with values from enum gpio_v2_line_flag, such as - * GPIO_V2_LINE_FLAG_ACTIVE_LOW, GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed + * @flags: if id is %GPIO_V2_LINE_ATTR_ID_FLAGS, the flags for the GPIO + * line, with values from &enum gpio_v2_line_flag, such as + * %GPIO_V2_LINE_FLAG_ACTIVE_LOW, %GPIO_V2_LINE_FLAG_OUTPUT etc, added * together. This overrides the default flags contained in the &struct * gpio_v2_line_config for the associated line. - * @values: if id is GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES, a bitmap + * @values: if id is %GPIO_V2_LINE_ATTR_ID_OUTPUT_VALUES, a bitmap * containing the values to which the lines will be set, with each bit * number corresponding to the index into &struct * gpio_v2_line_request.offsets. - * @debounce_period_us: if id is GPIO_V2_LINE_ATTR_ID_DEBOUNCE, the desired - * debounce period, in microseconds + * @debounce_period_us: if id is %GPIO_V2_LINE_ATTR_ID_DEBOUNCE, the + * desired debounce period, in microseconds */ struct gpio_v2_line_attribute { __u32 id; @@ -147,12 +147,12 @@ struct gpio_v2_line_config_attribute { /** * struct gpio_v2_line_config - Configuration for GPIO lines - * @flags: flags for the GPIO lines, with values from enum - * gpio_v2_line_flag, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, - * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together. This is the default for + * @flags: flags for the GPIO lines, with values from &enum + * gpio_v2_line_flag, such as %GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * %GPIO_V2_LINE_FLAG_OUTPUT etc, added together. This is the default for * all requested lines but may be overridden for particular lines using - * attrs. - * @num_attrs: the number of attributes in attrs + * @attrs. + * @num_attrs: the number of attributes in @attrs * @padding: reserved for future use and must be zero filled * @attrs: the configuration attributes associated with the requested * lines. 
Any attribute should only be associated with a particular line @@ -175,17 +175,17 @@ struct gpio_v2_line_config { * "my-bitbanged-relay" * @config: requested configuration for the lines. * @num_lines: number of lines requested in this request, i.e. the number - * of valid fields in the GPIO_V2_LINES_MAX sized arrays, set to 1 to + * of valid fields in the %GPIO_V2_LINES_MAX sized arrays, set to 1 to * request a single line * @event_buffer_size: a suggested minimum number of line events that the * kernel should buffer. This is only relevant if edge detection is * enabled in the configuration. Note that this is only a suggested value * and the kernel may allocate a larger buffer or cap the size of the * buffer. If this field is zero then the buffer size defaults to a minimum - * of num_lines*16. + * of @num_lines * 16. * @padding: reserved for future use and must be zero filled * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINE_IOCTL operation, zero or negative value means + * after a %GPIO_GET_LINE_IOCTL operation, zero or negative value means * error */ struct gpio_v2_line_request { @@ -207,11 +207,12 @@ struct gpio_v2_line_request { * @consumer: a functional name for the consumer of this GPIO line as set * by whatever is using it, will be empty if there is no current user but * may also be empty if the consumer doesn't set this up - * @flags: flags for the GPIO line, such as GPIO_V2_LINE_FLAG_ACTIVE_LOW, - * GPIO_V2_LINE_FLAG_OUTPUT etc, OR:ed together * @offset: the local offset on this GPIO chip, fill this in when * requesting the line information from the kernel - * @num_attrs: the number of attributes in attrs + * @num_attrs: the number of attributes in @attrs + * @flags: flags for the GPIO lines, with values from &enum + * gpio_v2_line_flag, such as %GPIO_V2_LINE_FLAG_ACTIVE_LOW, + * %GPIO_V2_LINE_FLAG_OUTPUT etc, added together. * @attrs: the configuration attributes associated with the line * @padding: reserved for future use */ @@ -244,7 +245,7 @@ enum gpio_v2_line_changed_type { * of a GPIO line * @info: updated line information * @timestamp_ns: estimate of time of status change occurrence, in nanoseconds - * @event_type: the type of change with a value from enum + * @event_type: the type of change with a value from &enum * gpio_v2_line_changed_type * @padding: reserved for future use */ @@ -269,10 +270,10 @@ enum gpio_v2_line_event_id { /** * struct gpio_v2_line_event - The actual event being pushed to userspace * @timestamp_ns: best estimate of time of event occurrence, in nanoseconds. - * The timestamp_ns is read from CLOCK_MONOTONIC and is intended to allow the - * accurate measurement of the time between events. It does not provide + * The @timestamp_ns is read from %CLOCK_MONOTONIC and is intended to allow + * the accurate measurement of the time between events. It does not provide * the wall-clock time. - * @id: event identifier with value from enum gpio_v2_line_event_id + * @id: event identifier with value from &enum gpio_v2_line_event_id * @offset: the offset of the line that triggered the event * @seqno: the sequence number for this event in the sequence of events for * all the lines in this line request @@ -319,8 +320,8 @@ struct gpio_v2_line_event { * whatever is using it, will be empty if there is no current user but may * also be empty if the consumer doesn't set this up * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_info instead. 
+ * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_info instead. */ struct gpioline_info { __u32 line_offset; @@ -344,18 +345,18 @@ enum { * of a GPIO line * @info: updated line information * @timestamp: estimate of time of status change occurrence, in nanoseconds - * @event_type: one of GPIOLINE_CHANGED_REQUESTED, GPIOLINE_CHANGED_RELEASED - * and GPIOLINE_CHANGED_CONFIG + * @event_type: one of %GPIOLINE_CHANGED_REQUESTED, + * %GPIOLINE_CHANGED_RELEASED and %GPIOLINE_CHANGED_CONFIG * @padding: reserved for future use * - * Note: struct gpioline_info embedded here has 32-bit alignment on its own, + * The &struct gpioline_info embedded here has 32-bit alignment on its own, * but it works fine with 64-bit alignment too. With its 72 byte size, we can * guarantee there are no implicit holes between it and subsequent members. * The 20-byte padding at the end makes sure we don't add any implicit padding * at the end of the structure on 64-bit architectures. * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_info_changed instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_info_changed instead. */ struct gpioline_info_changed { struct gpioline_info info; @@ -379,13 +380,13 @@ struct gpioline_info_changed { * @lineoffsets: an array of desired lines, specified by offset index for the * associated GPIO device * @flags: desired flags for the desired GPIO lines, such as - * GPIOHANDLE_REQUEST_OUTPUT, GPIOHANDLE_REQUEST_ACTIVE_LOW etc, OR:ed + * %GPIOHANDLE_REQUEST_OUTPUT, %GPIOHANDLE_REQUEST_ACTIVE_LOW etc, added * together. Note that even if multiple lines are requested, the same flags * must be applicable to all of them, if you want lines with individual * flags set, request them one by one. It is possible to select * a batch of input or output lines, but they must all have the same * characteristics, i.e. all inputs or all outputs, all active low etc - * @default_values: if the GPIOHANDLE_REQUEST_OUTPUT is set for a requested + * @default_values: if the %GPIOHANDLE_REQUEST_OUTPUT is set for a requested * line, this specifies the default output value, should be 0 (low) or * 1 (high), anything else than 0 or 1 will be interpreted as 1 (high) * @consumer_label: a desired consumer label for the selected GPIO line(s) @@ -393,11 +394,11 @@ struct gpioline_info_changed { * @lines: number of lines requested in this request, i.e. the number of * valid fields in the above arrays, set to 1 to request a single line * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINEHANDLE_IOCTL operation, zero or negative value + * after a %GPIO_GET_LINEHANDLE_IOCTL operation, zero or negative value * means error * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_request instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_request instead. 
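+ * A single &struct gpio_v2_line_request covers up to %GPIO_V2_LINES_MAX + * lines and supports per-line attribute overrides.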
*/ struct gpiohandle_request { __u32 lineoffsets[GPIOHANDLES_MAX]; @@ -411,15 +412,15 @@ struct gpiohandle_request { /** * struct gpiohandle_config - Configuration for a GPIO handle request * @flags: updated flags for the requested GPIO lines, such as - * GPIOHANDLE_REQUEST_OUTPUT, GPIOHANDLE_REQUEST_ACTIVE_LOW etc, OR:ed + * %GPIOHANDLE_REQUEST_OUTPUT, %GPIOHANDLE_REQUEST_ACTIVE_LOW etc, added * together - * @default_values: if the GPIOHANDLE_REQUEST_OUTPUT is set in flags, + * @default_values: if the %GPIOHANDLE_REQUEST_OUTPUT is set in flags, * this specifies the default output value, should be 0 (low) or * 1 (high), anything else than 0 or 1 will be interpreted as 1 (high) * @padding: reserved for future use and should be zero filled * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_config instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_config instead. */ struct gpiohandle_config { __u32 flags; @@ -433,8 +434,8 @@ struct gpiohandle_config { * state of a line, when setting the state of lines these should contain * the desired target state * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_values instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_values instead. */ struct gpiohandle_data { __u8 values[GPIOHANDLES_MAX]; @@ -450,17 +451,17 @@ struct gpiohandle_data { * @lineoffset: the desired line to subscribe to events from, specified by * offset index for the associated GPIO device * @handleflags: desired handle flags for the desired GPIO line, such as - * GPIOHANDLE_REQUEST_ACTIVE_LOW or GPIOHANDLE_REQUEST_OPEN_DRAIN + * %GPIOHANDLE_REQUEST_ACTIVE_LOW or %GPIOHANDLE_REQUEST_OPEN_DRAIN * @eventflags: desired flags for the desired GPIO event line, such as - * GPIOEVENT_REQUEST_RISING_EDGE or GPIOEVENT_REQUEST_FALLING_EDGE + * %GPIOEVENT_REQUEST_RISING_EDGE or %GPIOEVENT_REQUEST_FALLING_EDGE * @consumer_label: a desired consumer label for the selected GPIO line(s) * such as "my-listener" * @fd: if successful this field will contain a valid anonymous file handle - * after a GPIO_GET_LINEEVENT_IOCTL operation, zero or negative value + * after a %GPIO_GET_LINEEVENT_IOCTL operation, zero or negative value * means error * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_request instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_request instead. */ struct gpioevent_request { __u32 lineoffset; @@ -481,8 +482,8 @@ struct gpioevent_request { * @timestamp: best estimate of time of event occurrence, in nanoseconds * @id: event identifier * - * This struct is part of ABI v1 and is deprecated. - * Use struct gpio_v2_line_event instead. + * Note: This struct is part of ABI v1 and is deprecated. + * Use &struct gpio_v2_line_event instead. */ struct gpioevent_data { __u64 timestamp; -- cgit v1.2.3 From c303c51c87a61ace7330b5e0217468b1b8f98a75 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:28 +0800 Subject: gpio: uapi: remove whitespace Remove leading whitespace in ABI v1 comment. 
Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-5-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index 32dd18f238c3..ad3f56dd87ec 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -292,7 +292,7 @@ struct gpio_v2_line_event { }; /* - * ABI v1 + * ABI v1 * * This version of the ABI is deprecated. * Use the latest version of the ABI, defined above, instead. -- cgit v1.2.3 From 2f84a2de539cc4301a332c2c76473fc25baf21b7 Mon Sep 17 00:00:00 2001 From: Kent Gibson Date: Mon, 5 Oct 2020 15:03:29 +0800 Subject: gpio: uapi: clarify the meaning of 'empty' char arrays Clarify that a char array containing a string is considered 'empty' if the first character is the null terminator. The remaining characters are not relevant to this determination. Signed-off-by: Kent Gibson Reviewed-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20201005070329.21055-6-warthog618@gmail.com Signed-off-by: Linus Walleij --- include/uapi/linux/gpio.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h index ad3f56dd87ec..2072c260f5d0 100644 --- a/include/uapi/linux/gpio.h +++ b/include/uapi/linux/gpio.h @@ -26,7 +26,7 @@ * struct gpiochip_info - Information about a certain GPIO chip * @name: the Linux kernel name of this GPIO chip * @label: a functional name for this GPIO chip, such as a product - * number, may be empty + * number, may be empty (i.e. label[0] == '\0') * @lines: number of GPIO lines on this chip */ struct gpiochip_info { @@ -203,7 +203,7 @@ struct gpio_v2_line_request { * struct gpio_v2_line_info - Information about a certain GPIO line * @name: the name of this GPIO line, such as the output pin of the line on * the chip, a rail or a pin header name on a board, as specified by the - * GPIO chip, may be empty + * GPIO chip, may be empty (i.e. name[0] == '\0') * @consumer: a functional name for the consumer of this GPIO line as set * by whatever is using it, will be empty if there is no current user but * may also be empty if the consumer doesn't set this up @@ -315,7 +315,7 @@ struct gpio_v2_line_event { * @flags: various flags for this line * @name: the name of this GPIO line, such as the output pin of the line on the * chip, a rail or a pin header name on a board, as specified by the gpio - * chip, may be empty + * chip, may be empty (i.e. name[0] == '\0') * @consumer: a functional name for the consumer of this GPIO line as set by * whatever is using it, will be empty if there is no current user but may * also be empty if the consumer doesn't set this up -- cgit v1.2.3 From 80ade22c06ca115b81dd168e99479c8e09843513 Mon Sep 17 00:00:00 2001 From: Sudeep Dutt Date: Tue, 27 Oct 2020 20:14:15 -0700 Subject: misc: mic: remove the MIC drivers This patch removes the MIC drivers from the kernel tree since the corresponding devices have been discontinued. Removing the dma and char-misc changes in one patch and merging via the char-misc tree is best to avoid any potential build breakage. 
Cc: Nikhil Rao Reviewed-by: Ashutosh Dixit Signed-off-by: Sudeep Dutt Acked-By: Vinod Koul Reviewed-by: Sherry Sun Link: https://lore.kernel.org/r/8c1443136563de34699d2c084df478181c205db4.1603854416.git.sudeep.dutt@intel.com Signed-off-by: Greg Kroah-Hartman --- Documentation/misc-devices/mic/index.rst | 16 - Documentation/misc-devices/mic/mic_overview.rst | 85 - Documentation/misc-devices/mic/scif_overview.rst | 108 -- MAINTAINERS | 16 - drivers/dma/Kconfig | 18 - drivers/dma/Makefile | 1 - drivers/dma/mic_x100_dma.c | 770 --------- drivers/dma/mic_x100_dma.h | 275 --- drivers/misc/Kconfig | 1 - drivers/misc/Makefile | 1 - drivers/misc/mic/Kconfig | 141 -- drivers/misc/mic/Makefile | 12 - drivers/misc/mic/bus/Makefile | 9 - drivers/misc/mic/bus/cosm_bus.c | 130 -- drivers/misc/mic/bus/cosm_bus.h | 125 -- drivers/misc/mic/bus/mic_bus.c | 194 --- drivers/misc/mic/bus/scif_bus.c | 201 --- drivers/misc/mic/bus/scif_bus.h | 125 -- drivers/misc/mic/bus/vop_bus.c | 194 --- drivers/misc/mic/bus/vop_bus.h | 129 -- drivers/misc/mic/card/Makefile | 11 - drivers/misc/mic/card/mic_debugfs.c | 85 - drivers/misc/mic/card/mic_device.c | 417 ----- drivers/misc/mic/card/mic_device.h | 137 -- drivers/misc/mic/card/mic_x100.c | 347 ---- drivers/misc/mic/card/mic_x100.h | 37 - drivers/misc/mic/common/mic_dev.h | 55 - drivers/misc/mic/cosm/Makefile | 11 - drivers/misc/mic/cosm/cosm_debugfs.c | 116 -- drivers/misc/mic/cosm/cosm_main.c | 382 ----- drivers/misc/mic/cosm/cosm_main.h | 61 - drivers/misc/mic/cosm/cosm_scif_server.c | 399 ----- drivers/misc/mic/cosm/cosm_sysfs.c | 449 ----- drivers/misc/mic/cosm_client/Makefile | 8 - drivers/misc/mic/cosm_client/cosm_scif_client.c | 269 --- drivers/misc/mic/host/Makefile | 12 - drivers/misc/mic/host/mic_boot.c | 588 ------- drivers/misc/mic/host/mic_debugfs.c | 149 -- drivers/misc/mic/host/mic_device.h | 157 -- drivers/misc/mic/host/mic_intr.c | 635 ------- drivers/misc/mic/host/mic_intr.h | 137 -- drivers/misc/mic/host/mic_main.c | 335 ---- drivers/misc/mic/host/mic_smpt.c | 427 ----- drivers/misc/mic/host/mic_smpt.h | 87 - drivers/misc/mic/host/mic_x100.c | 585 ------- drivers/misc/mic/host/mic_x100.h | 77 - drivers/misc/mic/scif/Makefile | 21 - drivers/misc/mic/scif/scif_api.c | 1485 ----------------- drivers/misc/mic/scif/scif_debugfs.c | 116 -- drivers/misc/mic/scif/scif_dma.c | 1940 ---------------------- drivers/misc/mic/scif/scif_epd.c | 357 ---- drivers/misc/mic/scif/scif_epd.h | 200 --- drivers/misc/mic/scif/scif_fd.c | 462 ------ drivers/misc/mic/scif/scif_fence.c | 783 --------- drivers/misc/mic/scif/scif_main.c | 351 ---- drivers/misc/mic/scif/scif_main.h | 274 --- drivers/misc/mic/scif/scif_map.h | 127 -- drivers/misc/mic/scif/scif_mmap.c | 690 -------- drivers/misc/mic/scif/scif_nm.c | 229 --- drivers/misc/mic/scif/scif_nodeqp.c | 1349 --------------- drivers/misc/mic/scif/scif_nodeqp.h | 221 --- drivers/misc/mic/scif/scif_peer_bus.c | 175 -- drivers/misc/mic/scif/scif_peer_bus.h | 23 - drivers/misc/mic/scif/scif_ports.c | 116 -- drivers/misc/mic/scif/scif_rb.c | 240 --- drivers/misc/mic/scif/scif_rb.h | 100 -- drivers/misc/mic/scif/scif_rma.c | 1760 -------------------- drivers/misc/mic/scif/scif_rma.h | 477 ------ drivers/misc/mic/scif/scif_rma_list.c | 282 ---- drivers/misc/mic/scif/scif_rma_list.h | 48 - drivers/misc/mic/vop/Makefile | 10 - drivers/misc/mic/vop/vop_debugfs.c | 184 -- drivers/misc/mic/vop/vop_main.c | 784 --------- drivers/misc/mic/vop/vop_main.h | 158 -- drivers/misc/mic/vop/vop_vringh.c | 1166 ------------- include/linux/mic_bus.h 
| 100 -- include/linux/scif.h | 1339 --------------- include/uapi/linux/mic_common.h | 235 --- include/uapi/linux/mic_ioctl.h | 77 - samples/mic/mpssd/.gitignore | 2 - samples/mic/mpssd/Makefile | 28 - samples/mic/mpssd/micctrl | 162 -- samples/mic/mpssd/mpss | 189 --- samples/mic/mpssd/mpssd.c | 1815 -------------------- samples/mic/mpssd/mpssd.h | 89 - samples/mic/mpssd/sysfs.c | 91 - 86 files changed, 26779 deletions(-) delete mode 100644 Documentation/misc-devices/mic/index.rst delete mode 100644 Documentation/misc-devices/mic/mic_overview.rst delete mode 100644 Documentation/misc-devices/mic/scif_overview.rst delete mode 100644 drivers/dma/mic_x100_dma.c delete mode 100644 drivers/dma/mic_x100_dma.h delete mode 100644 drivers/misc/mic/Kconfig delete mode 100644 drivers/misc/mic/Makefile delete mode 100644 drivers/misc/mic/bus/Makefile delete mode 100644 drivers/misc/mic/bus/cosm_bus.c delete mode 100644 drivers/misc/mic/bus/cosm_bus.h delete mode 100644 drivers/misc/mic/bus/mic_bus.c delete mode 100644 drivers/misc/mic/bus/scif_bus.c delete mode 100644 drivers/misc/mic/bus/scif_bus.h delete mode 100644 drivers/misc/mic/bus/vop_bus.c delete mode 100644 drivers/misc/mic/bus/vop_bus.h delete mode 100644 drivers/misc/mic/card/Makefile delete mode 100644 drivers/misc/mic/card/mic_debugfs.c delete mode 100644 drivers/misc/mic/card/mic_device.c delete mode 100644 drivers/misc/mic/card/mic_device.h delete mode 100644 drivers/misc/mic/card/mic_x100.c delete mode 100644 drivers/misc/mic/card/mic_x100.h delete mode 100644 drivers/misc/mic/common/mic_dev.h delete mode 100644 drivers/misc/mic/cosm/Makefile delete mode 100644 drivers/misc/mic/cosm/cosm_debugfs.c delete mode 100644 drivers/misc/mic/cosm/cosm_main.c delete mode 100644 drivers/misc/mic/cosm/cosm_main.h delete mode 100644 drivers/misc/mic/cosm/cosm_scif_server.c delete mode 100644 drivers/misc/mic/cosm/cosm_sysfs.c delete mode 100644 drivers/misc/mic/cosm_client/Makefile delete mode 100644 drivers/misc/mic/cosm_client/cosm_scif_client.c delete mode 100644 drivers/misc/mic/host/Makefile delete mode 100644 drivers/misc/mic/host/mic_boot.c delete mode 100644 drivers/misc/mic/host/mic_debugfs.c delete mode 100644 drivers/misc/mic/host/mic_device.h delete mode 100644 drivers/misc/mic/host/mic_intr.c delete mode 100644 drivers/misc/mic/host/mic_intr.h delete mode 100644 drivers/misc/mic/host/mic_main.c delete mode 100644 drivers/misc/mic/host/mic_smpt.c delete mode 100644 drivers/misc/mic/host/mic_smpt.h delete mode 100644 drivers/misc/mic/host/mic_x100.c delete mode 100644 drivers/misc/mic/host/mic_x100.h delete mode 100644 drivers/misc/mic/scif/Makefile delete mode 100644 drivers/misc/mic/scif/scif_api.c delete mode 100644 drivers/misc/mic/scif/scif_debugfs.c delete mode 100644 drivers/misc/mic/scif/scif_dma.c delete mode 100644 drivers/misc/mic/scif/scif_epd.c delete mode 100644 drivers/misc/mic/scif/scif_epd.h delete mode 100644 drivers/misc/mic/scif/scif_fd.c delete mode 100644 drivers/misc/mic/scif/scif_fence.c delete mode 100644 drivers/misc/mic/scif/scif_main.c delete mode 100644 drivers/misc/mic/scif/scif_main.h delete mode 100644 drivers/misc/mic/scif/scif_map.h delete mode 100644 drivers/misc/mic/scif/scif_mmap.c delete mode 100644 drivers/misc/mic/scif/scif_nm.c delete mode 100644 drivers/misc/mic/scif/scif_nodeqp.c delete mode 100644 drivers/misc/mic/scif/scif_nodeqp.h delete mode 100644 drivers/misc/mic/scif/scif_peer_bus.c delete mode 100644 drivers/misc/mic/scif/scif_peer_bus.h delete mode 100644 
drivers/misc/mic/scif/scif_ports.c delete mode 100644 drivers/misc/mic/scif/scif_rb.c delete mode 100644 drivers/misc/mic/scif/scif_rb.h delete mode 100644 drivers/misc/mic/scif/scif_rma.c delete mode 100644 drivers/misc/mic/scif/scif_rma.h delete mode 100644 drivers/misc/mic/scif/scif_rma_list.c delete mode 100644 drivers/misc/mic/scif/scif_rma_list.h delete mode 100644 drivers/misc/mic/vop/Makefile delete mode 100644 drivers/misc/mic/vop/vop_debugfs.c delete mode 100644 drivers/misc/mic/vop/vop_main.c delete mode 100644 drivers/misc/mic/vop/vop_main.h delete mode 100644 drivers/misc/mic/vop/vop_vringh.c delete mode 100644 include/linux/mic_bus.h delete mode 100644 include/linux/scif.h delete mode 100644 include/uapi/linux/mic_common.h delete mode 100644 include/uapi/linux/mic_ioctl.h delete mode 100644 samples/mic/mpssd/.gitignore delete mode 100644 samples/mic/mpssd/Makefile delete mode 100755 samples/mic/mpssd/micctrl delete mode 100755 samples/mic/mpssd/mpss delete mode 100644 samples/mic/mpssd/mpssd.c delete mode 100644 samples/mic/mpssd/mpssd.h delete mode 100644 samples/mic/mpssd/sysfs.c (limited to 'include/uapi/linux') diff --git a/Documentation/misc-devices/mic/index.rst b/Documentation/misc-devices/mic/index.rst deleted file mode 100644 index 3a8d06367ef1..000000000000 --- a/Documentation/misc-devices/mic/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -============================================= -Intel Many Integrated Core (MIC) architecture -============================================= - -.. toctree:: - :maxdepth: 1 - - mic_overview - scif_overview - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/misc-devices/mic/mic_overview.rst b/Documentation/misc-devices/mic/mic_overview.rst deleted file mode 100644 index 17d956bdaf7c..000000000000 --- a/Documentation/misc-devices/mic/mic_overview.rst +++ /dev/null @@ -1,85 +0,0 @@ -====================================================== -Intel Many Integrated Core (MIC) architecture overview -====================================================== - -An Intel MIC X100 device is a PCIe form factor add-in coprocessor -card based on the Intel Many Integrated Core (MIC) architecture -that runs a Linux OS. It is a PCIe endpoint in a platform and therefore -implements the three required standard address spaces i.e. configuration, -memory and I/O. The host OS loads a device driver as is typical for -PCIe devices. The card itself runs a bootstrap after reset that -transfers control to the card OS downloaded from the host driver. The -host driver supports OSPM suspend and resume operations. It shuts down -the card during suspend and reboots the card OS during resume. -The card OS as shipped by Intel is a Linux kernel with modifications -for the X100 devices. - -Since it is a PCIe card, it does not have the ability to host hardware -devices for networking, storage and console. We provide these devices -on X100 coprocessors thus enabling a self-bootable equivalent -environment for applications. A key benefit of our solution is that it -leverages the standard virtio framework for network, disk and console -devices, though in our case the virtio framework is used across a PCIe -bus. A Virtio Over PCIe (VOP) driver allows creating user space -backends or devices on the host which are used to probe virtio drivers -for these devices on the MIC card. The existing VRINGH infrastructure -in the kernel is used to access virtio rings from the host. 
The card -VOP driver allows card virtio drivers to communicate with their user -space backends on the host via a device page. Ring 3 apps on the host -can add, remove and configure virtio devices. A thin MIC specific -virtio_config_ops is implemented which is borrowed heavily from -previous similar implementations in lguest and s390. - -MIC PCIe card has a dma controller with 8 channels. These channels are -shared between the host s/w and the card s/w. 0 to 3 are used by host -and 4 to 7 by card. As the dma device doesn't show up as PCIe device, -a virtual bus called mic bus is created and virtual dma devices are -created on it by the host/card drivers. On host the channels are private -and used only by the host driver to transfer data for the virtio devices. - -The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a -low level communications API across PCIe currently implemented for MIC. -More details are available at scif_overview.txt. - -The Coprocessor State Management (COSM) driver on the host allows for -boot, shutdown and reset of Intel MIC devices. It communicates with a COSM -"client" driver on the MIC cards over SCIF to perform these functions. - -Here is a block diagram of the various components described above. The -virtio backends are situated on the host rather than the card given better -single threaded performance for the host compared to MIC, the ability of -the host to initiate DMA's to/from the card using the MIC DMA engine and -the fact that the virtio block storage backend can only be on the host:: - - +----------+ | +----------+ - | Card OS | | | Host OS | - +----------+ | +----------+ - | - +-------+ +--------+ +------+ | +---------+ +--------+ +--------+ - | Virtio| |Virtio | |Virtio| | |Virtio | |Virtio | |Virtio | - | Net | |Console | |Block | | |Net | |Console | |Block | - | Driver| |Driver | |Driver| | |backend | |backend | |backend | - +---+---+ +---+----+ +--+---+ | +---------+ +----+---+ +--------+ - | | | | | | | - | | | |User | | | - | | | |------|------------|--+------|------- - +---------+---------+ |Kernel | - | | | - +---------+ +---+----+ +------+ | +------+ +------+ +--+---+ +-------+ - |MIC DMA | | VOP | | SCIF | | | SCIF | | COSM | | VOP | |MIC DMA| - +---+-----+ +---+----+ +--+---+ | +--+---+ +--+---+ +------+ +----+--+ - | | | | | | | - +---+-----+ +---+----+ +--+---+ | +--+---+ +--+---+ +------+ +----+--+ - |MIC | | VOP | |SCIF | | |SCIF | | COSM | | VOP | | MIC | - |HW Bus | | HW Bus| |HW Bus| | |HW Bus| | Bus | |HW Bus| |HW Bus | - +---------+ +--------+ +--+---+ | +--+---+ +------+ +------+ +-------+ - | | | | | | | - | +-----------+--+ | | | +---------------+ | - | |Intel MIC | | | | |Intel MIC | | - | |Card Driver | | | | |Host Driver | | - +---+--------------+------+ | +----+---------------+-----+ - | | | - +-------------------------------------------------------------+ - | | - | PCIe Bus | - +-------------------------------------------------------------+ diff --git a/Documentation/misc-devices/mic/scif_overview.rst b/Documentation/misc-devices/mic/scif_overview.rst deleted file mode 100644 index 4c8ad9e43706..000000000000 --- a/Documentation/misc-devices/mic/scif_overview.rst +++ /dev/null @@ -1,108 +0,0 @@ -======================================== -Symmetric Communication Interface (SCIF) -======================================== - -The Symmetric Communication Interface (SCIF (pronounced as skiff)) is a low -level communications API across PCIe currently implemented for MIC. 
Currently -SCIF provides inter-node communication within a single host platform, where a -node is a MIC Coprocessor or Xeon based host. SCIF abstracts the details of -communicating over the PCIe bus while providing an API that is symmetric -across all the nodes in the PCIe network. An important design objective for SCIF -is to deliver the maximum possible performance given the communication -abilities of the hardware. SCIF has been used to implement an offload compiler -runtime and OFED support for MPI implementations for MIC coprocessors. - -SCIF API Components -=================== - -The SCIF API has the following parts: - -1. Connection establishment using a client server model -2. Byte stream messaging intended for short messages -3. Node enumeration to determine online nodes -4. Poll semantics for detection of incoming connections and messages -5. Memory registration to pin down pages -6. Remote memory mapping for low latency CPU accesses via mmap -7. Remote DMA (RDMA) for high bandwidth DMA transfers -8. Fence APIs for RDMA synchronization - -SCIF exposes the notion of a connection which can be used by peer processes on -nodes in a SCIF PCIe "network" to share memory "windows" and to communicate. A -process in a SCIF node initiates a SCIF connection to a peer process on a -different node via a SCIF "endpoint". SCIF endpoints support messaging APIs -which are similar to connection oriented socket APIs. Connected SCIF endpoints -can also register local memory which is followed by data transfer using either -DMA, CPU copies or remote memory mapping via mmap. SCIF supports both user and -kernel mode clients which are functionally equivalent. - -SCIF Performance for MIC -======================== - -DMA bandwidth comparison between the TCP (over ethernet over PCIe) stack versus -SCIF shows the performance advantages of SCIF for HPC applications and -runtimes:: - - Comparison of TCP and SCIF based BW - - Throughput (GB/sec) - 8 + PCIe Bandwidth ****** - + TCP ###### - 7 + ************************************** SCIF %%%%%% - | %%%%%%%%%%%%%%%%%%% - 6 + %%%% - | %% - | %%% - 5 + %% - | %% - 4 + %% - | %% - 3 + %% - | % - 2 + %% - | %% - | % - 1 + - + ###################################### - 0 +++---+++--+--+-+--+--+-++-+--+-++-+--+-++-+- - 1 10 100 1000 10000 100000 - Transfer Size (KBytes) - -SCIF allows memory sharing via mmap(..) between processes on different PCIe -nodes and thus provides bare-metal PCIe latency. The round trip SCIF mmap -latency from the host to an x100 MIC for an 8 byte message is 0.44 usecs. - -SCIF has a user space library which is a thin IOCTL wrapper providing a user -space API similar to the kernel API in scif.h. The SCIF user space library -is distributed @ https://software.intel.com/en-us/mic-developer - -Here is some pseudo code for an example of how two applications on two PCIe -nodes would typically use the SCIF API:: - - Process A (on node A) Process B (on node B) - - /* get online node information */ - scif_get_node_ids(..) scif_get_node_ids(..) - scif_open(..) scif_open(..) - scif_bind(..) scif_bind(..) - scif_listen(..) - scif_accept(..) scif_connect(..) - /* SCIF connection established */ - - /* Send and receive short messages */ - scif_send(..)/scif_recv(..) scif_send(..)/scif_recv(..) - - /* Register memory */ - scif_register(..) scif_register(..) - - /* RDMA */ - scif_readfrom(..)/scif_writeto(..) scif_readfrom(..)/scif_writeto(..) - - /* Fence DMAs */ - scif_fence_signal(..) scif_fence_signal(..) - - mmap(..) mmap(..) 
- - /* Access remote registered memory */ - - /* Close the endpoints */ - scif_close(..) scif_close(..) diff --git a/MAINTAINERS b/MAINTAINERS index e73636b75f29..9289a9b43a51 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8976,22 +8976,6 @@ S: Supported W: https://01.org/linux-acpi F: drivers/platform/x86/intel_menlow.c -INTEL MIC DRIVERS (mic) -M: Sudeep Dutt -M: Ashutosh Dixit -S: Supported -W: https://github.com/sudeepdutt/mic -W: http://software.intel.com/en-us/mic-developer -F: Documentation/misc-devices/mic/ -F: drivers/dma/mic_x100_dma.c -F: drivers/dma/mic_x100_dma.h -F: drivers/misc/mic/ -F: include/linux/mic_bus.h -F: include/linux/scif.h -F: include/uapi/linux/mic_common.h -F: include/uapi/linux/mic_ioctl.h -F: include/uapi/linux/scif_ioctl.h - INTEL P-Unit IPC DRIVER M: Zha Qipeng L: platform-driver-x86@vger.kernel.org diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 518a1437862a..90284ffda58a 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -318,24 +318,6 @@ config INTEL_IOP_ADMA help Enable support for the Intel(R) IOP Series RAID engines. -config INTEL_MIC_X100_DMA - tristate "Intel MIC X100 DMA Driver" - depends on 64BIT && X86 && INTEL_MIC_BUS - select DMA_ENGINE - help - This enables DMA support for the Intel Many Integrated Core - (MIC) family of PCIe form factor coprocessor X100 devices that - run a 64 bit Linux OS. This driver will be used by both MIC - host and card drivers. - - If you are building host kernel with a MIC device or a card - kernel for a MIC device, then say M (recommended) or Y, else - say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - config K3_DMA tristate "Hisilicon K3 DMA support" depends on ARCH_HI3xxx || ARCH_HISI || COMPILE_TEST diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index e60f81331d4c..948a8da05f8b 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -44,7 +44,6 @@ obj-$(CONFIG_INTEL_IDMA64) += idma64.o obj-$(CONFIG_INTEL_IOATDMA) += ioat/ obj-$(CONFIG_INTEL_IDXD) += idxd/ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o -obj-$(CONFIG_INTEL_MIC_X100_DMA) += mic_x100_dma.o obj-$(CONFIG_K3_DMA) += k3dma.o obj-$(CONFIG_LPC18XX_DMAMUX) += lpc18xx-dmamux.o obj-$(CONFIG_MILBEAUT_HDMAC) += milbeaut-hdmac.o diff --git a/drivers/dma/mic_x100_dma.c b/drivers/dma/mic_x100_dma.c deleted file mode 100644 index fea8608a7810..000000000000 --- a/drivers/dma/mic_x100_dma.c +++ /dev/null @@ -1,770 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel MIC X100 DMA Driver. - * - * Adapted from IOAT dma driver. - */ -#include -#include -#include -#include - -#include "mic_x100_dma.h" - -#define MIC_DMA_MAX_XFER_SIZE_CARD (1 * 1024 * 1024 -\ - MIC_DMA_ALIGN_BYTES) -#define MIC_DMA_MAX_XFER_SIZE_HOST (1 * 1024 * 1024 >> 1) -#define MIC_DMA_DESC_TYPE_SHIFT 60 -#define MIC_DMA_MEMCPY_LEN_SHIFT 46 -#define MIC_DMA_STAT_INTR_SHIFT 59 - -/* high-water mark for pushing dma descriptors */ -static int mic_dma_pending_level = 4; - -/* Status descriptor is used to write a 64 bit value to a memory location */ -enum mic_dma_desc_format_type { - MIC_DMA_MEMCPY = 1, - MIC_DMA_STATUS, -}; - -static inline u32 mic_dma_hw_ring_inc(u32 val) -{ - return (val + 1) % MIC_DMA_DESC_RX_SIZE; -} - -static inline u32 mic_dma_hw_ring_dec(u32 val) -{ - return val ? 
val - 1 : MIC_DMA_DESC_RX_SIZE - 1; -} - -static inline void mic_dma_hw_ring_inc_head(struct mic_dma_chan *ch) -{ - ch->head = mic_dma_hw_ring_inc(ch->head); -} - -/* Prepare a memcpy desc */ -static inline void mic_dma_memcpy_desc(struct mic_dma_desc *desc, - dma_addr_t src_phys, dma_addr_t dst_phys, u64 size) -{ - u64 qw0, qw1; - - qw0 = src_phys; - qw0 |= (size >> MIC_DMA_ALIGN_SHIFT) << MIC_DMA_MEMCPY_LEN_SHIFT; - qw1 = MIC_DMA_MEMCPY; - qw1 <<= MIC_DMA_DESC_TYPE_SHIFT; - qw1 |= dst_phys; - desc->qw0 = qw0; - desc->qw1 = qw1; -} - -/* Prepare a status desc. with @data to be written at @dst_phys */ -static inline void mic_dma_prep_status_desc(struct mic_dma_desc *desc, u64 data, - dma_addr_t dst_phys, bool generate_intr) -{ - u64 qw0, qw1; - - qw0 = data; - qw1 = (u64) MIC_DMA_STATUS << MIC_DMA_DESC_TYPE_SHIFT | dst_phys; - if (generate_intr) - qw1 |= (1ULL << MIC_DMA_STAT_INTR_SHIFT); - desc->qw0 = qw0; - desc->qw1 = qw1; -} - -static void mic_dma_cleanup(struct mic_dma_chan *ch) -{ - struct dma_async_tx_descriptor *tx; - u32 tail; - u32 last_tail; - - spin_lock(&ch->cleanup_lock); - tail = mic_dma_read_cmp_cnt(ch); - /* - * This is the barrier pair for smp_wmb() in fn. - * mic_dma_tx_submit_unlock. It's required so that we read the - * updated cookie value from tx->cookie. - */ - smp_rmb(); - for (last_tail = ch->last_tail; tail != last_tail;) { - tx = &ch->tx_array[last_tail]; - if (tx->cookie) { - dma_cookie_complete(tx); - dmaengine_desc_get_callback_invoke(tx, NULL); - tx->callback = NULL; - } - last_tail = mic_dma_hw_ring_inc(last_tail); - } - /* finish all completion callbacks before incrementing tail */ - smp_mb(); - ch->last_tail = last_tail; - spin_unlock(&ch->cleanup_lock); -} - -static u32 mic_dma_ring_count(u32 head, u32 tail) -{ - u32 count; - - if (head >= tail) - count = (tail - 0) + (MIC_DMA_DESC_RX_SIZE - head); - else - count = tail - head; - return count - 1; -} - -/* Returns the num. 
of free descriptors on success, -ENOMEM on failure */ -static int mic_dma_avail_desc_ring_space(struct mic_dma_chan *ch, int required) -{ - struct device *dev = mic_dma_ch_to_device(ch); - u32 count; - - count = mic_dma_ring_count(ch->head, ch->last_tail); - if (count < required) { - mic_dma_cleanup(ch); - count = mic_dma_ring_count(ch->head, ch->last_tail); - } - - if (count < required) { - dev_dbg(dev, "Not enough desc space"); - dev_dbg(dev, "%s %d required=%u, avail=%u\n", - __func__, __LINE__, required, count); - return -ENOMEM; - } else { - return count; - } -} - -/* Program memcpy descriptors into the descriptor ring and update s/w head ptr*/ -static int mic_dma_prog_memcpy_desc(struct mic_dma_chan *ch, dma_addr_t src, - dma_addr_t dst, size_t len) -{ - size_t current_transfer_len; - size_t max_xfer_size = to_mic_dma_dev(ch)->max_xfer_size; - /* 3 is added to make sure we have enough space for status desc */ - int num_desc = len / max_xfer_size + 3; - int ret; - - if (len % max_xfer_size) - num_desc++; - - ret = mic_dma_avail_desc_ring_space(ch, num_desc); - if (ret < 0) - return ret; - do { - current_transfer_len = min(len, max_xfer_size); - mic_dma_memcpy_desc(&ch->desc_ring[ch->head], - src, dst, current_transfer_len); - mic_dma_hw_ring_inc_head(ch); - len -= current_transfer_len; - dst = dst + current_transfer_len; - src = src + current_transfer_len; - } while (len > 0); - return 0; -} - -/* It's a h/w quirk and h/w needs 2 status descriptors for every status desc */ -static void mic_dma_prog_intr(struct mic_dma_chan *ch) -{ - mic_dma_prep_status_desc(&ch->desc_ring[ch->head], 0, - ch->status_dest_micpa, false); - mic_dma_hw_ring_inc_head(ch); - mic_dma_prep_status_desc(&ch->desc_ring[ch->head], 0, - ch->status_dest_micpa, true); - mic_dma_hw_ring_inc_head(ch); -} - -/* Wrapper function to program memcpy descriptors/status descriptors */ -static int mic_dma_do_dma(struct mic_dma_chan *ch, int flags, dma_addr_t src, - dma_addr_t dst, size_t len) -{ - if (len && -ENOMEM == mic_dma_prog_memcpy_desc(ch, src, dst, len)) { - return -ENOMEM; - } else { - /* 3 is the maximum number of status descriptors */ - int ret = mic_dma_avail_desc_ring_space(ch, 3); - - if (ret < 0) - return ret; - } - - /* Above mic_dma_prog_memcpy_desc() makes sure we have enough space */ - if (flags & DMA_PREP_FENCE) { - mic_dma_prep_status_desc(&ch->desc_ring[ch->head], 0, - ch->status_dest_micpa, false); - mic_dma_hw_ring_inc_head(ch); - } - - if (flags & DMA_PREP_INTERRUPT) - mic_dma_prog_intr(ch); - - return 0; -} - -static inline void mic_dma_issue_pending(struct dma_chan *ch) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - - spin_lock(&mic_ch->issue_lock); - /* - * Write to head triggers h/w to act on the descriptors. - * On MIC, writing the same head value twice causes - * a h/w error. On second write, h/w assumes we filled - * the entire ring & overwrote some of the descriptors. 
- */ - if (mic_ch->issued == mic_ch->submitted) - goto out; - mic_ch->issued = mic_ch->submitted; - /* - * make descriptor updates visible before advancing head, - * this is purposefully not smp_wmb() since we are also - * publishing the descriptor updates to a dma device - */ - wmb(); - mic_dma_write_reg(mic_ch, MIC_DMA_REG_DHPR, mic_ch->issued); -out: - spin_unlock(&mic_ch->issue_lock); -} - -static inline void mic_dma_update_pending(struct mic_dma_chan *ch) -{ - if (mic_dma_ring_count(ch->issued, ch->submitted) - > mic_dma_pending_level) - mic_dma_issue_pending(&ch->api_ch); -} - -static dma_cookie_t mic_dma_tx_submit_unlock(struct dma_async_tx_descriptor *tx) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(tx->chan); - dma_cookie_t cookie; - - dma_cookie_assign(tx); - cookie = tx->cookie; - /* - * We need an smp write barrier here because another CPU might see - * an update to submitted and update h/w head even before we - * assigned a cookie to this tx. - */ - smp_wmb(); - mic_ch->submitted = mic_ch->head; - spin_unlock(&mic_ch->prep_lock); - mic_dma_update_pending(mic_ch); - return cookie; -} - -static inline struct dma_async_tx_descriptor * -allocate_tx(struct mic_dma_chan *ch) -{ - u32 idx = mic_dma_hw_ring_dec(ch->head); - struct dma_async_tx_descriptor *tx = &ch->tx_array[idx]; - - dma_async_tx_descriptor_init(tx, &ch->api_ch); - tx->tx_submit = mic_dma_tx_submit_unlock; - return tx; -} - -/* Program a status descriptor with dst as address and value to be written */ -static struct dma_async_tx_descriptor * -mic_dma_prep_status_lock(struct dma_chan *ch, dma_addr_t dst, u64 src_val, - unsigned long flags) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - int result; - - spin_lock(&mic_ch->prep_lock); - result = mic_dma_avail_desc_ring_space(mic_ch, 4); - if (result < 0) - goto error; - mic_dma_prep_status_desc(&mic_ch->desc_ring[mic_ch->head], src_val, dst, - false); - mic_dma_hw_ring_inc_head(mic_ch); - result = mic_dma_do_dma(mic_ch, flags, 0, 0, 0); - if (result < 0) - goto error; - - return allocate_tx(mic_ch); -error: - dev_err(mic_dma_ch_to_device(mic_ch), - "Error enqueueing dma status descriptor, error=%d\n", result); - spin_unlock(&mic_ch->prep_lock); - return NULL; -} - -/* - * Prepare a memcpy descriptor to be added to the ring. - * Note that the temporary descriptor adds an extra overhead of copying the - * descriptor to ring. 
So, we copy directly to the descriptor ring - */ -static struct dma_async_tx_descriptor * -mic_dma_prep_memcpy_lock(struct dma_chan *ch, dma_addr_t dma_dest, - dma_addr_t dma_src, size_t len, unsigned long flags) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - struct device *dev = mic_dma_ch_to_device(mic_ch); - int result; - - if (!len && !flags) - return NULL; - - spin_lock(&mic_ch->prep_lock); - result = mic_dma_do_dma(mic_ch, flags, dma_src, dma_dest, len); - if (result >= 0) - return allocate_tx(mic_ch); - dev_err(dev, "Error enqueueing dma, error=%d\n", result); - spin_unlock(&mic_ch->prep_lock); - return NULL; -} - -static struct dma_async_tx_descriptor * -mic_dma_prep_interrupt_lock(struct dma_chan *ch, unsigned long flags) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - int ret; - - spin_lock(&mic_ch->prep_lock); - ret = mic_dma_do_dma(mic_ch, flags, 0, 0, 0); - if (!ret) - return allocate_tx(mic_ch); - spin_unlock(&mic_ch->prep_lock); - return NULL; -} - -/* Return the status of the transaction */ -static enum dma_status -mic_dma_tx_status(struct dma_chan *ch, dma_cookie_t cookie, - struct dma_tx_state *txstate) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - - if (DMA_COMPLETE != dma_cookie_status(ch, cookie, txstate)) - mic_dma_cleanup(mic_ch); - - return dma_cookie_status(ch, cookie, txstate); -} - -static irqreturn_t mic_dma_thread_fn(int irq, void *data) -{ - mic_dma_cleanup((struct mic_dma_chan *)data); - return IRQ_HANDLED; -} - -static irqreturn_t mic_dma_intr_handler(int irq, void *data) -{ - struct mic_dma_chan *ch = ((struct mic_dma_chan *)data); - - mic_dma_ack_interrupt(ch); - return IRQ_WAKE_THREAD; -} - -static int mic_dma_alloc_desc_ring(struct mic_dma_chan *ch) -{ - u64 desc_ring_size = MIC_DMA_DESC_RX_SIZE * sizeof(*ch->desc_ring); - struct device *dev = &to_mbus_device(ch)->dev; - - desc_ring_size = ALIGN(desc_ring_size, MIC_DMA_ALIGN_BYTES); - ch->desc_ring = kzalloc(desc_ring_size, GFP_KERNEL); - - if (!ch->desc_ring) - return -ENOMEM; - - ch->desc_ring_micpa = dma_map_single(dev, ch->desc_ring, - desc_ring_size, DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, ch->desc_ring_micpa)) - goto map_error; - - ch->tx_array = vzalloc(array_size(MIC_DMA_DESC_RX_SIZE, - sizeof(*ch->tx_array))); - if (!ch->tx_array) - goto tx_error; - return 0; -tx_error: - dma_unmap_single(dev, ch->desc_ring_micpa, desc_ring_size, - DMA_BIDIRECTIONAL); -map_error: - kfree(ch->desc_ring); - return -ENOMEM; -} - -static void mic_dma_free_desc_ring(struct mic_dma_chan *ch) -{ - u64 desc_ring_size = MIC_DMA_DESC_RX_SIZE * sizeof(*ch->desc_ring); - - vfree(ch->tx_array); - desc_ring_size = ALIGN(desc_ring_size, MIC_DMA_ALIGN_BYTES); - dma_unmap_single(&to_mbus_device(ch)->dev, ch->desc_ring_micpa, - desc_ring_size, DMA_BIDIRECTIONAL); - kfree(ch->desc_ring); - ch->desc_ring = NULL; -} - -static void mic_dma_free_status_dest(struct mic_dma_chan *ch) -{ - dma_unmap_single(&to_mbus_device(ch)->dev, ch->status_dest_micpa, - L1_CACHE_BYTES, DMA_BIDIRECTIONAL); - kfree(ch->status_dest); -} - -static int mic_dma_alloc_status_dest(struct mic_dma_chan *ch) -{ - struct device *dev = &to_mbus_device(ch)->dev; - - ch->status_dest = kzalloc(L1_CACHE_BYTES, GFP_KERNEL); - if (!ch->status_dest) - return -ENOMEM; - ch->status_dest_micpa = dma_map_single(dev, ch->status_dest, - L1_CACHE_BYTES, DMA_BIDIRECTIONAL); - if (dma_mapping_error(dev, ch->status_dest_micpa)) { - kfree(ch->status_dest); - ch->status_dest = NULL; - return -ENOMEM; - } - return 0; -} - -static int 
mic_dma_check_chan(struct mic_dma_chan *ch) -{ - if (mic_dma_read_reg(ch, MIC_DMA_REG_DCHERR) || - mic_dma_read_reg(ch, MIC_DMA_REG_DSTAT) & MIC_DMA_CHAN_QUIESCE) { - mic_dma_disable_chan(ch); - mic_dma_chan_mask_intr(ch); - dev_err(mic_dma_ch_to_device(ch), - "%s %d error setting up mic dma chan %d\n", - __func__, __LINE__, ch->ch_num); - return -EBUSY; - } - return 0; -} - -static int mic_dma_chan_setup(struct mic_dma_chan *ch) -{ - if (MIC_DMA_CHAN_MIC == ch->owner) - mic_dma_chan_set_owner(ch); - mic_dma_disable_chan(ch); - mic_dma_chan_mask_intr(ch); - mic_dma_write_reg(ch, MIC_DMA_REG_DCHERRMSK, 0); - mic_dma_chan_set_desc_ring(ch); - ch->last_tail = mic_dma_read_reg(ch, MIC_DMA_REG_DTPR); - ch->head = ch->last_tail; - ch->issued = 0; - mic_dma_chan_unmask_intr(ch); - mic_dma_enable_chan(ch); - return mic_dma_check_chan(ch); -} - -static void mic_dma_chan_destroy(struct mic_dma_chan *ch) -{ - mic_dma_disable_chan(ch); - mic_dma_chan_mask_intr(ch); -} - -static int mic_dma_setup_irq(struct mic_dma_chan *ch) -{ - ch->cookie = - to_mbus_hw_ops(ch)->request_threaded_irq(to_mbus_device(ch), - mic_dma_intr_handler, mic_dma_thread_fn, - "mic dma_channel", ch, ch->ch_num); - return PTR_ERR_OR_ZERO(ch->cookie); -} - -static inline void mic_dma_free_irq(struct mic_dma_chan *ch) -{ - to_mbus_hw_ops(ch)->free_irq(to_mbus_device(ch), ch->cookie, ch); -} - -static int mic_dma_chan_init(struct mic_dma_chan *ch) -{ - int ret = mic_dma_alloc_desc_ring(ch); - - if (ret) - goto ring_error; - ret = mic_dma_alloc_status_dest(ch); - if (ret) - goto status_error; - ret = mic_dma_chan_setup(ch); - if (ret) - goto chan_error; - return ret; -chan_error: - mic_dma_free_status_dest(ch); -status_error: - mic_dma_free_desc_ring(ch); -ring_error: - return ret; -} - -static int mic_dma_drain_chan(struct mic_dma_chan *ch) -{ - struct dma_async_tx_descriptor *tx; - int err = 0; - dma_cookie_t cookie; - - tx = mic_dma_prep_memcpy_lock(&ch->api_ch, 0, 0, 0, DMA_PREP_FENCE); - if (!tx) { - err = -ENOMEM; - goto error; - } - - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) - err = -ENOMEM; - else - err = dma_sync_wait(&ch->api_ch, cookie); - if (err) { - dev_err(mic_dma_ch_to_device(ch), "%s %d TO chan 0x%x\n", - __func__, __LINE__, ch->ch_num); - err = -EIO; - } -error: - mic_dma_cleanup(ch); - return err; -} - -static inline void mic_dma_chan_uninit(struct mic_dma_chan *ch) -{ - mic_dma_chan_destroy(ch); - mic_dma_cleanup(ch); - mic_dma_free_status_dest(ch); - mic_dma_free_desc_ring(ch); -} - -static int mic_dma_init(struct mic_dma_device *mic_dma_dev, - enum mic_dma_chan_owner owner) -{ - int i, first_chan = mic_dma_dev->start_ch; - struct mic_dma_chan *ch; - int ret; - - for (i = first_chan; i < first_chan + MIC_DMA_NUM_CHAN; i++) { - ch = &mic_dma_dev->mic_ch[i]; - ch->ch_num = i; - ch->owner = owner; - spin_lock_init(&ch->cleanup_lock); - spin_lock_init(&ch->prep_lock); - spin_lock_init(&ch->issue_lock); - ret = mic_dma_setup_irq(ch); - if (ret) - goto error; - } - return 0; -error: - for (i = i - 1; i >= first_chan; i--) - mic_dma_free_irq(ch); - return ret; -} - -static void mic_dma_uninit(struct mic_dma_device *mic_dma_dev) -{ - int i, first_chan = mic_dma_dev->start_ch; - struct mic_dma_chan *ch; - - for (i = first_chan; i < first_chan + MIC_DMA_NUM_CHAN; i++) { - ch = &mic_dma_dev->mic_ch[i]; - mic_dma_free_irq(ch); - } -} - -static int mic_dma_alloc_chan_resources(struct dma_chan *ch) -{ - int ret = mic_dma_chan_init(to_mic_dma_chan(ch)); - if (ret) - return ret; - return MIC_DMA_DESC_RX_SIZE; -} 
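For context: consumers never call the functions above directly; they drive the driver through the generic dmaengine client API. A minimal sketch follows, assuming a channel already obtained with dma_request_channel() and DMA-mapped source/destination addresses; demo_mic_dma_copy() is a hypothetical name:

    #include <linux/dmaengine.h>
    #include <linux/errno.h>

    static int demo_mic_dma_copy(struct dma_chan *chan, dma_addr_t dst,
                                 dma_addr_t src, size_t len)
    {
            struct dma_async_tx_descriptor *tx;
            dma_cookie_t cookie;

            tx = dmaengine_prep_dma_memcpy(chan, dst, src, len,
                                           DMA_PREP_INTERRUPT);
            if (!tx)
                    return -ENOMEM; /* descriptor ring full */

            cookie = dmaengine_submit(tx);
            if (dma_submit_error(cookie))
                    return -ENOMEM;

            /* Kicks the device_issue_pending hook, mic_dma_issue_pending(). */
            dma_async_issue_pending(chan);

            /* Polling helper for brevity; real clients use callbacks. */
            return dma_sync_wait(chan, cookie) == DMA_COMPLETE ? 0 : -EIO;
    }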
- -static void mic_dma_free_chan_resources(struct dma_chan *ch) -{ - struct mic_dma_chan *mic_ch = to_mic_dma_chan(ch); - mic_dma_drain_chan(mic_ch); - mic_dma_chan_uninit(mic_ch); -} - -/* Set the fn. handlers and register the dma device with dma api */ -static int mic_dma_register_dma_device(struct mic_dma_device *mic_dma_dev, - enum mic_dma_chan_owner owner) -{ - int i, first_chan = mic_dma_dev->start_ch; - - dma_cap_zero(mic_dma_dev->dma_dev.cap_mask); - /* - * This dma engine is not capable of host memory to host memory - * transfers - */ - dma_cap_set(DMA_MEMCPY, mic_dma_dev->dma_dev.cap_mask); - - if (MIC_DMA_CHAN_HOST == owner) - dma_cap_set(DMA_PRIVATE, mic_dma_dev->dma_dev.cap_mask); - mic_dma_dev->dma_dev.device_alloc_chan_resources = - mic_dma_alloc_chan_resources; - mic_dma_dev->dma_dev.device_free_chan_resources = - mic_dma_free_chan_resources; - mic_dma_dev->dma_dev.device_tx_status = mic_dma_tx_status; - mic_dma_dev->dma_dev.device_prep_dma_memcpy = mic_dma_prep_memcpy_lock; - mic_dma_dev->dma_dev.device_prep_dma_imm_data = - mic_dma_prep_status_lock; - mic_dma_dev->dma_dev.device_prep_dma_interrupt = - mic_dma_prep_interrupt_lock; - mic_dma_dev->dma_dev.device_issue_pending = mic_dma_issue_pending; - mic_dma_dev->dma_dev.copy_align = MIC_DMA_ALIGN_SHIFT; - INIT_LIST_HEAD(&mic_dma_dev->dma_dev.channels); - for (i = first_chan; i < first_chan + MIC_DMA_NUM_CHAN; i++) { - mic_dma_dev->mic_ch[i].api_ch.device = &mic_dma_dev->dma_dev; - dma_cookie_init(&mic_dma_dev->mic_ch[i].api_ch); - list_add_tail(&mic_dma_dev->mic_ch[i].api_ch.device_node, - &mic_dma_dev->dma_dev.channels); - } - return dmaenginem_async_device_register(&mic_dma_dev->dma_dev); -} - -/* - * Initializes dma channels and registers the dma device with the - * dma engine api. 
- */ -static struct mic_dma_device *mic_dma_dev_reg(struct mbus_device *mbdev, - enum mic_dma_chan_owner owner) -{ - struct mic_dma_device *mic_dma_dev; - int ret; - struct device *dev = &mbdev->dev; - - mic_dma_dev = devm_kzalloc(dev, sizeof(*mic_dma_dev), GFP_KERNEL); - if (!mic_dma_dev) { - ret = -ENOMEM; - goto alloc_error; - } - mic_dma_dev->mbdev = mbdev; - mic_dma_dev->dma_dev.dev = dev; - mic_dma_dev->mmio = mbdev->mmio_va; - if (MIC_DMA_CHAN_HOST == owner) { - mic_dma_dev->start_ch = 0; - mic_dma_dev->max_xfer_size = MIC_DMA_MAX_XFER_SIZE_HOST; - } else { - mic_dma_dev->start_ch = 4; - mic_dma_dev->max_xfer_size = MIC_DMA_MAX_XFER_SIZE_CARD; - } - ret = mic_dma_init(mic_dma_dev, owner); - if (ret) - goto init_error; - ret = mic_dma_register_dma_device(mic_dma_dev, owner); - if (ret) - goto reg_error; - return mic_dma_dev; -reg_error: - mic_dma_uninit(mic_dma_dev); -init_error: - mic_dma_dev = NULL; -alloc_error: - dev_err(dev, "Error at %s %d ret=%d\n", __func__, __LINE__, ret); - return mic_dma_dev; -} - -static void mic_dma_dev_unreg(struct mic_dma_device *mic_dma_dev) -{ - mic_dma_uninit(mic_dma_dev); -} - -/* DEBUGFS CODE */ -static int mic_dma_reg_show(struct seq_file *s, void *pos) -{ - struct mic_dma_device *mic_dma_dev = s->private; - int i, chan_num, first_chan = mic_dma_dev->start_ch; - struct mic_dma_chan *ch; - - seq_printf(s, "SBOX_DCR: %#x\n", - mic_dma_mmio_read(&mic_dma_dev->mic_ch[first_chan], - MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR)); - seq_puts(s, "DMA Channel Registers\n"); - seq_printf(s, "%-10s| %-10s %-10s %-10s %-10s %-10s", - "Channel", "DCAR", "DTPR", "DHPR", "DRAR_HI", "DRAR_LO"); - seq_printf(s, " %-11s %-14s %-10s\n", "DCHERR", "DCHERRMSK", "DSTAT"); - for (i = first_chan; i < first_chan + MIC_DMA_NUM_CHAN; i++) { - ch = &mic_dma_dev->mic_ch[i]; - chan_num = ch->ch_num; - seq_printf(s, "%-10i| %-#10x %-#10x %-#10x %-#10x", - chan_num, - mic_dma_read_reg(ch, MIC_DMA_REG_DCAR), - mic_dma_read_reg(ch, MIC_DMA_REG_DTPR), - mic_dma_read_reg(ch, MIC_DMA_REG_DHPR), - mic_dma_read_reg(ch, MIC_DMA_REG_DRAR_HI)); - seq_printf(s, " %-#10x %-#10x %-#14x %-#10x\n", - mic_dma_read_reg(ch, MIC_DMA_REG_DRAR_LO), - mic_dma_read_reg(ch, MIC_DMA_REG_DCHERR), - mic_dma_read_reg(ch, MIC_DMA_REG_DCHERRMSK), - mic_dma_read_reg(ch, MIC_DMA_REG_DSTAT)); - } - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(mic_dma_reg); - -/* Debugfs parent dir */ -static struct dentry *mic_dma_dbg; - -static int mic_dma_driver_probe(struct mbus_device *mbdev) -{ - struct mic_dma_device *mic_dma_dev; - enum mic_dma_chan_owner owner; - - if (MBUS_DEV_DMA_MIC == mbdev->id.device) - owner = MIC_DMA_CHAN_MIC; - else - owner = MIC_DMA_CHAN_HOST; - - mic_dma_dev = mic_dma_dev_reg(mbdev, owner); - dev_set_drvdata(&mbdev->dev, mic_dma_dev); - - if (mic_dma_dbg) { - mic_dma_dev->dbg_dir = debugfs_create_dir(dev_name(&mbdev->dev), - mic_dma_dbg); - debugfs_create_file("mic_dma_reg", 0444, mic_dma_dev->dbg_dir, - mic_dma_dev, &mic_dma_reg_fops); - } - return 0; -} - -static void mic_dma_driver_remove(struct mbus_device *mbdev) -{ - struct mic_dma_device *mic_dma_dev; - - mic_dma_dev = dev_get_drvdata(&mbdev->dev); - debugfs_remove_recursive(mic_dma_dev->dbg_dir); - mic_dma_dev_unreg(mic_dma_dev); -} - -static struct mbus_device_id id_table[] = { - {MBUS_DEV_DMA_MIC, MBUS_DEV_ANY_ID}, - {MBUS_DEV_DMA_HOST, MBUS_DEV_ANY_ID}, - {0}, -}; - -static struct mbus_driver mic_dma_driver = { - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = mic_dma_driver_probe, - .remove = 
mic_dma_driver_remove, -}; - -static int __init mic_x100_dma_init(void) -{ - int rc = mbus_register_driver(&mic_dma_driver); - if (rc) - return rc; - mic_dma_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); - return 0; -} - -static void __exit mic_x100_dma_exit(void) -{ - debugfs_remove_recursive(mic_dma_dbg); - mbus_unregister_driver(&mic_dma_driver); -} - -module_init(mic_x100_dma_init); -module_exit(mic_x100_dma_exit); - -MODULE_DEVICE_TABLE(mbus, id_table); -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC X100 DMA Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/mic_x100_dma.h b/drivers/dma/mic_x100_dma.h deleted file mode 100644 index 68ef43a91714..000000000000 --- a/drivers/dma/mic_x100_dma.h +++ /dev/null @@ -1,275 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel MIC X100 DMA Driver. - * - * Adapted from IOAT dma driver. - */ -#ifndef _MIC_X100_DMA_H_ -#define _MIC_X100_DMA_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "dmaengine.h" - -/* - * MIC has a total of 8 dma channels. - * Four channels are assigned for host SW use & the remaining for MIC SW. - * MIC DMA transfer size & addresses need to be 64 byte aligned. - */ -#define MIC_DMA_MAX_NUM_CHAN 8 -#define MIC_DMA_NUM_CHAN 4 -#define MIC_DMA_ALIGN_SHIFT DMAENGINE_ALIGN_64_BYTES -#define MIC_DMA_ALIGN_BYTES (1 << MIC_DMA_ALIGN_SHIFT) -#define MIC_DMA_DESC_RX_SIZE (128 * 1024 - 4) - -/* - * Register descriptions - * All the registers are 32 bit registers. - * DCR is a global register and all others are per-channel. - * DCR - bits 0, 2, 4, 6, 8, 10, 12, 14 - enable bits for channels 0 to 7 - * bits 1, 3, 5, 7, 9, 11, 13, 15 - owner bits for channels 0 to 7 - * DCAR - bit 24 & 25 interrupt masks for mic owned & host owned channels - * DHPR - head of the descriptor ring updated by s/w - * DTPR - tail of the descriptor ring updated by h/w - * DRAR_LO - lower 32 bits of descriptor ring's mic address - * DRAR_HI - 3:0 - remaining 4 bits of descriptor ring's mic address - * 20:4 descriptor ring size - * 25:21 mic smpt entry number - * DSTAT - 16:0 h/w completion count; 31:28 dma engine status - * DCHERR - this register is non-zero on error - * DCHERRMSK - interrupt mask register - */ -#define MIC_DMA_HW_CMP_CNT_MASK 0x1ffff -#define MIC_DMA_CHAN_QUIESCE 0x20000000 -#define MIC_DMA_SBOX_BASE 0x00010000 -#define MIC_DMA_SBOX_DCR 0x0000A280 -#define MIC_DMA_SBOX_CH_BASE 0x0001A000 -#define MIC_DMA_SBOX_CHAN_OFF 0x40 -#define MIC_DMA_SBOX_DCAR_IM0 (0x1 << 24) -#define MIC_DMA_SBOX_DCAR_IM1 (0x1 << 25) -#define MIC_DMA_SBOX_DRARHI_SYS_MASK (0x1 << 26) -#define MIC_DMA_REG_DCAR 0 -#define MIC_DMA_REG_DHPR 4 -#define MIC_DMA_REG_DTPR 8 -#define MIC_DMA_REG_DRAR_LO 20 -#define MIC_DMA_REG_DRAR_HI 24 -#define MIC_DMA_REG_DSTAT 32 -#define MIC_DMA_REG_DCHERR 44 -#define MIC_DMA_REG_DCHERRMSK 48 - -/* HW dma desc */ -struct mic_dma_desc { - u64 qw0; - u64 qw1; -}; - -enum mic_dma_chan_owner { - MIC_DMA_CHAN_MIC = 0, - MIC_DMA_CHAN_HOST -}; - -/* - * mic_dma_chan - channel specific information - * @ch_num: channel number - * @owner: owner of this channel - * @last_tail: cached value of descriptor ring tail - * @head: index of next descriptor in desc_ring - * @issued: hardware notification point - * @submitted: index that will be used to submit descriptors to h/w - * @api_ch: dma engine api channel - * @desc_ring: dma descriptor ring - * @desc_ring_micpa: mic 
physical address of desc_ring - * @status_dest: destination for status (fence) descriptor - * @status_dest_micpa: mic address for status_dest, - * DMA controller uses this address - * @tx_array: array of async_tx - * @cleanup_lock: lock held when processing completed tx - * @prep_lock: lock held in prep_memcpy & released in tx_submit - * @issue_lock: lock used to synchronize writes to head - * @cookie: mic_irq cookie used with mic irq request - */ -struct mic_dma_chan { - int ch_num; - enum mic_dma_chan_owner owner; - u32 last_tail; - u32 head; - u32 issued; - u32 submitted; - struct dma_chan api_ch; - struct mic_dma_desc *desc_ring; - dma_addr_t desc_ring_micpa; - u64 *status_dest; - dma_addr_t status_dest_micpa; - struct dma_async_tx_descriptor *tx_array; - spinlock_t cleanup_lock; - spinlock_t prep_lock; - spinlock_t issue_lock; - struct mic_irq *cookie; -}; - -/* - * struct mic_dma_device - per mic device - * @mic_ch: dma channels - * @dma_dev: underlying dma device - * @mbdev: mic bus dma device - * @mmio: virtual address of the mmio space - * @dbg_dir: debugfs directory - * @start_ch: first channel number that can be used - * @max_xfer_size: maximum transfer size per dma descriptor - */ -struct mic_dma_device { - struct mic_dma_chan mic_ch[MIC_DMA_MAX_NUM_CHAN]; - struct dma_device dma_dev; - struct mbus_device *mbdev; - void __iomem *mmio; - struct dentry *dbg_dir; - int start_ch; - size_t max_xfer_size; -}; - -static inline struct mic_dma_chan *to_mic_dma_chan(struct dma_chan *ch) -{ - return container_of(ch, struct mic_dma_chan, api_ch); -} - -static inline struct mic_dma_device *to_mic_dma_dev(struct mic_dma_chan *ch) -{ - return - container_of((const typeof(((struct mic_dma_device *)0)->mic_ch)*) - (ch - ch->ch_num), struct mic_dma_device, mic_ch); -} - -static inline struct mbus_device *to_mbus_device(struct mic_dma_chan *ch) -{ - return to_mic_dma_dev(ch)->mbdev; -} - -static inline struct mbus_hw_ops *to_mbus_hw_ops(struct mic_dma_chan *ch) -{ - return to_mbus_device(ch)->hw_ops; -} - -static inline struct device *mic_dma_ch_to_device(struct mic_dma_chan *ch) -{ - return to_mic_dma_dev(ch)->dma_dev.dev; -} - -static inline void __iomem *mic_dma_chan_to_mmio(struct mic_dma_chan *ch) -{ - return to_mic_dma_dev(ch)->mmio; -} - -static inline u32 mic_dma_read_reg(struct mic_dma_chan *ch, u32 reg) -{ - return ioread32(mic_dma_chan_to_mmio(ch) + MIC_DMA_SBOX_CH_BASE + - ch->ch_num * MIC_DMA_SBOX_CHAN_OFF + reg); -} - -static inline void mic_dma_write_reg(struct mic_dma_chan *ch, u32 reg, u32 val) -{ - iowrite32(val, mic_dma_chan_to_mmio(ch) + MIC_DMA_SBOX_CH_BASE + - ch->ch_num * MIC_DMA_SBOX_CHAN_OFF + reg); -} - -static inline u32 mic_dma_mmio_read(struct mic_dma_chan *ch, u32 offset) -{ - return ioread32(mic_dma_chan_to_mmio(ch) + offset); -} - -static inline void mic_dma_mmio_write(struct mic_dma_chan *ch, u32 val, - u32 offset) -{ - iowrite32(val, mic_dma_chan_to_mmio(ch) + offset); -} - -static inline u32 mic_dma_read_cmp_cnt(struct mic_dma_chan *ch) -{ - return mic_dma_read_reg(ch, MIC_DMA_REG_DSTAT) & - MIC_DMA_HW_CMP_CNT_MASK; -} - -static inline void mic_dma_chan_set_owner(struct mic_dma_chan *ch) -{ - u32 dcr = mic_dma_mmio_read(ch, MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR); - u32 chan_num = ch->ch_num; - - dcr = (dcr & ~(0x1 << (chan_num * 2))) | (ch->owner << (chan_num * 2)); - mic_dma_mmio_write(ch, dcr, MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR); -} - -static inline void mic_dma_enable_chan(struct mic_dma_chan *ch) -{ - u32 dcr = mic_dma_mmio_read(ch, MIC_DMA_SBOX_BASE + 
MIC_DMA_SBOX_DCR); - - dcr |= 2 << (ch->ch_num << 1); - mic_dma_mmio_write(ch, dcr, MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR); -} - -static inline void mic_dma_disable_chan(struct mic_dma_chan *ch) -{ - u32 dcr = mic_dma_mmio_read(ch, MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR); - - dcr &= ~(2 << (ch->ch_num << 1)); - mic_dma_mmio_write(ch, dcr, MIC_DMA_SBOX_BASE + MIC_DMA_SBOX_DCR); -} - -static void mic_dma_chan_set_desc_ring(struct mic_dma_chan *ch) -{ - u32 drar_hi; - dma_addr_t desc_ring_micpa = ch->desc_ring_micpa; - - drar_hi = (MIC_DMA_DESC_RX_SIZE & 0x1ffff) << 4; - if (MIC_DMA_CHAN_MIC == ch->owner) { - drar_hi |= (desc_ring_micpa >> 32) & 0xf; - } else { - drar_hi |= MIC_DMA_SBOX_DRARHI_SYS_MASK; - drar_hi |= ((desc_ring_micpa >> 34) - & 0x1f) << 21; - drar_hi |= (desc_ring_micpa >> 32) & 0x3; - } - mic_dma_write_reg(ch, MIC_DMA_REG_DRAR_LO, (u32) desc_ring_micpa); - mic_dma_write_reg(ch, MIC_DMA_REG_DRAR_HI, drar_hi); -} - -static inline void mic_dma_chan_mask_intr(struct mic_dma_chan *ch) -{ - u32 dcar = mic_dma_read_reg(ch, MIC_DMA_REG_DCAR); - - if (MIC_DMA_CHAN_MIC == ch->owner) - dcar |= MIC_DMA_SBOX_DCAR_IM0; - else - dcar |= MIC_DMA_SBOX_DCAR_IM1; - mic_dma_write_reg(ch, MIC_DMA_REG_DCAR, dcar); -} - -static inline void mic_dma_chan_unmask_intr(struct mic_dma_chan *ch) -{ - u32 dcar = mic_dma_read_reg(ch, MIC_DMA_REG_DCAR); - - if (MIC_DMA_CHAN_MIC == ch->owner) - dcar &= ~MIC_DMA_SBOX_DCAR_IM0; - else - dcar &= ~MIC_DMA_SBOX_DCAR_IM1; - mic_dma_write_reg(ch, MIC_DMA_REG_DCAR, dcar); -} - -static void mic_dma_ack_interrupt(struct mic_dma_chan *ch) -{ - if (MIC_DMA_CHAN_MIC == ch->owner) { - /* HW errata */ - mic_dma_chan_mask_intr(ch); - mic_dma_chan_unmask_intr(ch); - } - to_mbus_hw_ops(ch)->ack_interrupt(to_mbus_device(ch), ch->ch_num); -} -#endif diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index d5ce8082b0a0..fafa8b0d8099 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -474,7 +474,6 @@ source "drivers/misc/lis3lv02d/Kconfig" source "drivers/misc/altera-stapl/Kconfig" source "drivers/misc/mei/Kconfig" source "drivers/misc/vmw_vmci/Kconfig" -source "drivers/misc/mic/Kconfig" source "drivers/misc/genwqe/Kconfig" source "drivers/misc/echo/Kconfig" source "drivers/misc/cxl/Kconfig" diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 2521359e8ef7..d23231e73330 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -46,7 +46,6 @@ obj-$(CONFIG_VMWARE_VMCI) += vmw_vmci/ obj-$(CONFIG_LATTICE_ECP3_CONFIG) += lattice-ecp3-config.o obj-$(CONFIG_SRAM) += sram.o obj-$(CONFIG_SRAM_EXEC) += sram-exec.o -obj-y += mic/ obj-$(CONFIG_GENWQE) += genwqe/ obj-$(CONFIG_ECHO) += echo/ obj-$(CONFIG_CXL_BASE) += cxl/ diff --git a/drivers/misc/mic/Kconfig b/drivers/misc/mic/Kconfig deleted file mode 100644 index 8a7c2c5711d5..000000000000 --- a/drivers/misc/mic/Kconfig +++ /dev/null @@ -1,141 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menu "Intel MIC & related support" - -config INTEL_MIC_BUS - tristate "Intel MIC Bus Driver" - depends on 64BIT && PCI && X86 - select DMA_OPS - help - This option is selected by any driver which registers a - device or driver on the MIC Bus, such as CONFIG_INTEL_MIC_HOST, - CONFIG_INTEL_MIC_CARD, CONFIG_INTEL_MIC_X100_DMA etc. - - If you are building a host/card kernel with an Intel MIC device - then say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . 
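A driver ties into this option by registering a struct mbus_driver, following the same pattern as the mic_x100_dma driver above. A minimal skeleton, with hypothetical demo_* names and an empty remove path:

    #include <linux/device.h>
    #include <linux/mic_bus.h>
    #include <linux/module.h>

    static int demo_probe(struct mbus_device *mbdev)
    {
            dev_info(&mbdev->dev, "bound to device %#x\n", mbdev->id.device);
            return 0;
    }

    static void demo_remove(struct mbus_device *mbdev)
    {
            /* undo whatever demo_probe() set up */
    }

    static struct mbus_device_id demo_id_table[] = {
            {MBUS_DEV_DMA_HOST, MBUS_DEV_ANY_ID},
            {0},
    };

    static struct mbus_driver demo_driver = {
            .driver.name  = "demo_mbus",
            .driver.owner = THIS_MODULE,
            .id_table     = demo_id_table,
            .probe        = demo_probe,
            .remove       = demo_remove,
    };

    module_driver(demo_driver, mbus_register_driver, mbus_unregister_driver);

    MODULE_LICENSE("GPL v2");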
- -config SCIF_BUS - tristate "SCIF Bus Driver" - depends on 64BIT && PCI && X86 - select DMA_OPS - help - This option is selected by any driver which registers a - device or driver on the SCIF Bus, such as CONFIG_INTEL_MIC_HOST - and CONFIG_INTEL_MIC_CARD. - - If you are building a host/card kernel with an Intel MIC device - then say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -config VOP_BUS - tristate "VOP Bus Driver" - depends on HAS_DMA - select DMA_OPS - help - This option is selected by any driver which registers a - device or driver on the VOP Bus, such as CONFIG_INTEL_MIC_HOST - and CONFIG_INTEL_MIC_CARD. - - If you are building a host/card kernel with an Intel MIC device - then say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -config INTEL_MIC_HOST - tristate "Intel MIC Host Driver" - depends on 64BIT && PCI && X86 - depends on INTEL_MIC_BUS && SCIF_BUS && MIC_COSM && VOP_BUS - select DMA_OPS - help - This enables Host Driver support for the Intel Many Integrated - Core (MIC) family of PCIe form factor coprocessor devices that - run a 64 bit Linux OS. The driver manages card OS state and - enables communication between host and card. Intel MIC X100 - devices are currently supported. - - If you are building a host kernel with an Intel MIC device then - say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -config INTEL_MIC_CARD - tristate "Intel MIC Card Driver" - depends on 64BIT && X86 - depends on INTEL_MIC_BUS && SCIF_BUS && MIC_COSM && VOP_BUS - select VIRTIO - help - This enables card driver support for the Intel Many Integrated - Core (MIC) device family. The card driver communicates shutdown/ - crash events to the host and allows registration/configuration of - virtio devices. Intel MIC X100 devices are currently supported. - - If you are building a card kernel for an Intel MIC device then - say M (recommended) or Y, else say N. If unsure say N. - - For more information see - . - -config SCIF - tristate "SCIF Driver" - depends on 64BIT && PCI && X86 && SCIF_BUS && IOMMU_SUPPORT - select IOMMU_IOVA - help - This enables SCIF Driver support for the Intel Many Integrated - Core (MIC) family of PCIe form factor coprocessor devices that - run a 64 bit Linux OS. The Symmetric Communication Interface - (SCIF (pronounced as skiff)) is a low level communications API - across PCIe currently implemented for MIC. - - If you are building a host kernel with an Intel MIC device then - say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -config MIC_COSM - tristate "Intel MIC Coprocessor State Management (COSM) Drivers" - depends on 64BIT && PCI && X86 && SCIF - help - This enables COSM driver support for the Intel Many - Integrated Core (MIC) family of PCIe form factor coprocessor - devices. COSM drivers implement functions such as boot, - shutdown, reset and reboot of MIC devices. - - If you are building a host kernel with an Intel MIC device then - say M (recommended) or Y, else say N. If unsure say N. 
- - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -config VOP - tristate "VOP Driver" - depends on VOP_BUS - select VHOST_RING - select VIRTIO - help - This enables VOP (Virtio over PCIe) Driver support for the Intel - Many Integrated Core (MIC) family of PCIe form factor coprocessor - devices. The VOP driver allows virtio drivers, e.g. net, console - and block drivers, on the card connect to user space virtio - devices on the host. - - If you are building a host kernel with an Intel MIC device then - say M (recommended) or Y, else say N. If unsure say N. - - More information about the Intel MIC family as well as the Linux - OS and tools for MIC to use with this driver are available from - . - -endmenu diff --git a/drivers/misc/mic/Makefile b/drivers/misc/mic/Makefile deleted file mode 100644 index 1a43622b183f..000000000000 --- a/drivers/misc/mic/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile - Intel MIC Linux driver. -# Copyright(c) 2013, Intel Corporation. -# -obj-$(CONFIG_INTEL_MIC_HOST) += host/ -obj-$(CONFIG_INTEL_MIC_CARD) += card/ -obj-y += bus/ -obj-$(CONFIG_SCIF) += scif/ -obj-$(CONFIG_MIC_COSM) += cosm/ -obj-$(CONFIG_MIC_COSM) += cosm_client/ -obj-$(CONFIG_VOP) += vop/ diff --git a/drivers/misc/mic/bus/Makefile b/drivers/misc/mic/bus/Makefile deleted file mode 100644 index 0a6aa21b2f67..000000000000 --- a/drivers/misc/mic/bus/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile - Intel MIC Linux driver. -# Copyright(c) 2014, Intel Corporation. -# -obj-$(CONFIG_INTEL_MIC_BUS) += mic_bus.o -obj-$(CONFIG_SCIF_BUS) += scif_bus.o -obj-$(CONFIG_MIC_COSM) += cosm_bus.o -obj-$(CONFIG_VOP_BUS) += vop_bus.o diff --git a/drivers/misc/mic/bus/cosm_bus.c b/drivers/misc/mic/bus/cosm_bus.c deleted file mode 100644 index 5f2141c71738..000000000000 --- a/drivers/misc/mic/bus/cosm_bus.c +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC COSM Bus Driver - */ -#include -#include -#include -#include "cosm_bus.h" - -/* Unique numbering for cosm devices. 
*/ -static DEFINE_IDA(cosm_index_ida); - -static int cosm_dev_probe(struct device *d) -{ - struct cosm_device *dev = dev_to_cosm(d); - struct cosm_driver *drv = drv_to_cosm(dev->dev.driver); - - return drv->probe(dev); -} - -static int cosm_dev_remove(struct device *d) -{ - struct cosm_device *dev = dev_to_cosm(d); - struct cosm_driver *drv = drv_to_cosm(dev->dev.driver); - - drv->remove(dev); - return 0; -} - -static struct bus_type cosm_bus = { - .name = "cosm_bus", - .probe = cosm_dev_probe, - .remove = cosm_dev_remove, -}; - -int cosm_register_driver(struct cosm_driver *driver) -{ - driver->driver.bus = &cosm_bus; - return driver_register(&driver->driver); -} -EXPORT_SYMBOL_GPL(cosm_register_driver); - -void cosm_unregister_driver(struct cosm_driver *driver) -{ - driver_unregister(&driver->driver); -} -EXPORT_SYMBOL_GPL(cosm_unregister_driver); - -static inline void cosm_release_dev(struct device *d) -{ - struct cosm_device *cdev = dev_to_cosm(d); - - kfree(cdev); -} - -struct cosm_device * -cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops) -{ - struct cosm_device *cdev; - int ret; - - cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); - if (!cdev) - return ERR_PTR(-ENOMEM); - - cdev->dev.parent = pdev; - cdev->dev.release = cosm_release_dev; - cdev->hw_ops = hw_ops; - dev_set_drvdata(&cdev->dev, cdev); - cdev->dev.bus = &cosm_bus; - - /* Assign a unique device index and hence name */ - ret = ida_simple_get(&cosm_index_ida, 0, 0, GFP_KERNEL); - if (ret < 0) - goto free_cdev; - - cdev->index = ret; - cdev->dev.id = ret; - dev_set_name(&cdev->dev, "cosm-dev%u", cdev->index); - - ret = device_register(&cdev->dev); - if (ret) - goto ida_remove; - return cdev; -ida_remove: - ida_simple_remove(&cosm_index_ida, cdev->index); -free_cdev: - put_device(&cdev->dev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL_GPL(cosm_register_device); - -void cosm_unregister_device(struct cosm_device *dev) -{ - int index = dev->index; /* save for after device release */ - - device_unregister(&dev->dev); - ida_simple_remove(&cosm_index_ida, index); -} -EXPORT_SYMBOL_GPL(cosm_unregister_device); - -struct cosm_device *cosm_find_cdev_by_id(int id) -{ - struct device *dev = subsys_find_device_by_id(&cosm_bus, id, NULL); - - return dev ? container_of(dev, struct cosm_device, dev) : NULL; -} -EXPORT_SYMBOL_GPL(cosm_find_cdev_by_id); - -static int __init cosm_init(void) -{ - return bus_register(&cosm_bus); -} - -static void __exit cosm_exit(void) -{ - bus_unregister(&cosm_bus); - ida_destroy(&cosm_index_ida); -} - -core_initcall(cosm_init); -module_exit(cosm_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC card OS state management bus driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/bus/cosm_bus.h b/drivers/misc/mic/bus/cosm_bus.h deleted file mode 100644 index d50d7aea168d..000000000000 --- a/drivers/misc/mic/bus/cosm_bus.h +++ /dev/null @@ -1,125 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC COSM Bus Driver - */ -#ifndef _COSM_BUS_H_ -#define _COSM_BUS_H_ - -#include -#include -#include "../common/mic_dev.h" - -/** - * cosm_device - representation of a cosm device - * - * @attr_group: Pointer to list of sysfs attribute groups. - * @sdev: Device for sysfs entries. - * @state: MIC state. - * @prev_state: MIC state previous to MIC_RESETTING - * @shutdown_status: MIC status reported by card for shutdown/crashes. 
- * @shutdown_status_int: Internal shutdown status maintained by the driver - * @cosm_mutex: Mutex for synchronizing access to data structures. - * @reset_trigger_work: Work for triggering reset requests. - * @scif_work: Work for handling per device SCIF connections - * @cmdline: Kernel command line. - * @firmware: Firmware file name. - * @ramdisk: Ramdisk file name. - * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates. - * @log_buf_addr: Log buffer address for MIC. - * @log_buf_len: Log buffer length address for MIC. - * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes. - * @hw_ops: the hardware bus ops for this device. - * @dev: underlying device. - * @index: unique position on the cosm bus - * @dbg_dir: debug fs directory - * @newepd: new endpoint from scif accept to be assigned to this cdev - * @epd: SCIF endpoint for this cdev - * @heartbeat_watchdog_enable: if heartbeat watchdog is enabled for this cdev - * @sysfs_heartbeat_enable: sysfs setting for disabling heartbeat notification - */ -struct cosm_device { - const struct attribute_group **attr_group; - struct device *sdev; - u8 state; - u8 prev_state; - u8 shutdown_status; - u8 shutdown_status_int; - struct mutex cosm_mutex; - struct work_struct reset_trigger_work; - struct work_struct scif_work; - char *cmdline; - char *firmware; - char *ramdisk; - char *bootmode; - void *log_buf_addr; - int *log_buf_len; - struct kernfs_node *state_sysfs; - struct cosm_hw_ops *hw_ops; - struct device dev; - int index; - struct dentry *dbg_dir; - scif_epd_t newepd; - scif_epd_t epd; - bool heartbeat_watchdog_enable; - bool sysfs_heartbeat_enable; -}; - -/** - * cosm_driver - operations for a cosm driver - * - * @driver: underlying device driver (populate name and owner). - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. 
- */ -struct cosm_driver { - struct device_driver driver; - int (*probe)(struct cosm_device *dev); - void (*remove)(struct cosm_device *dev); -}; - -/** - * cosm_hw_ops - cosm bus ops - * - * @reset: trigger MIC reset - * @force_reset: force MIC reset - * @post_reset: inform MIC reset is complete - * @ready: is MIC ready for OS download - * @start: boot MIC - * @stop: prepare MIC for reset - * @family: return MIC HW family string - * @stepping: return MIC HW stepping string - * @aper: return MIC PCIe aperture - */ -struct cosm_hw_ops { - void (*reset)(struct cosm_device *cdev); - void (*force_reset)(struct cosm_device *cdev); - void (*post_reset)(struct cosm_device *cdev, enum mic_states state); - bool (*ready)(struct cosm_device *cdev); - int (*start)(struct cosm_device *cdev, int id); - void (*stop)(struct cosm_device *cdev, bool force); - ssize_t (*family)(struct cosm_device *cdev, char *buf); - ssize_t (*stepping)(struct cosm_device *cdev, char *buf); - struct mic_mw *(*aper)(struct cosm_device *cdev); -}; - -struct cosm_device * -cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops); -void cosm_unregister_device(struct cosm_device *dev); -int cosm_register_driver(struct cosm_driver *drv); -void cosm_unregister_driver(struct cosm_driver *drv); -struct cosm_device *cosm_find_cdev_by_id(int id); - -static inline struct cosm_device *dev_to_cosm(struct device *dev) -{ - return container_of(dev, struct cosm_device, dev); -} - -static inline struct cosm_driver *drv_to_cosm(struct device_driver *drv) -{ - return container_of(drv, struct cosm_driver, driver); -} -#endif /* _COSM_BUS_H */ diff --git a/drivers/misc/mic/bus/mic_bus.c b/drivers/misc/mic/bus/mic_bus.c deleted file mode 100644 index a08cb29692a8..000000000000 --- a/drivers/misc/mic/bus/mic_bus.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel MIC Bus driver. - * - * This implementation is very similar to the the virtio bus driver - * implementation @ drivers/virtio/virtio.c - */ -#include -#include -#include -#include -#include - -static ssize_t device_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct mbus_device *dev = dev_to_mbus(d); - return sprintf(buf, "0x%04x\n", dev->id.device); -} -static DEVICE_ATTR_RO(device); - -static ssize_t vendor_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct mbus_device *dev = dev_to_mbus(d); - return sprintf(buf, "0x%04x\n", dev->id.vendor); -} -static DEVICE_ATTR_RO(vendor); - -static ssize_t modalias_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct mbus_device *dev = dev_to_mbus(d); - return sprintf(buf, "mbus:d%08Xv%08X\n", - dev->id.device, dev->id.vendor); -} -static DEVICE_ATTR_RO(modalias); - -static struct attribute *mbus_dev_attrs[] = { - &dev_attr_device.attr, - &dev_attr_vendor.attr, - &dev_attr_modalias.attr, - NULL, -}; -ATTRIBUTE_GROUPS(mbus_dev); - -static inline int mbus_id_match(const struct mbus_device *dev, - const struct mbus_device_id *id) -{ - if (id->device != dev->id.device && id->device != MBUS_DEV_ANY_ID) - return 0; - - return id->vendor == MBUS_DEV_ANY_ID || id->vendor == dev->id.vendor; -} - -/* - * This looks through all the IDs a driver claims to support. If any of them - * match, we return 1 and the kernel will call mbus_dev_probe(). 
- */ -static int mbus_dev_match(struct device *dv, struct device_driver *dr) -{ - unsigned int i; - struct mbus_device *dev = dev_to_mbus(dv); - const struct mbus_device_id *ids; - - ids = drv_to_mbus(dr)->id_table; - for (i = 0; ids[i].device; i++) - if (mbus_id_match(dev, &ids[i])) - return 1; - return 0; -} - -static int mbus_uevent(struct device *dv, struct kobj_uevent_env *env) -{ - struct mbus_device *dev = dev_to_mbus(dv); - - return add_uevent_var(env, "MODALIAS=mbus:d%08Xv%08X", - dev->id.device, dev->id.vendor); -} - -static int mbus_dev_probe(struct device *d) -{ - int err; - struct mbus_device *dev = dev_to_mbus(d); - struct mbus_driver *drv = drv_to_mbus(dev->dev.driver); - - err = drv->probe(dev); - if (!err) - if (drv->scan) - drv->scan(dev); - return err; -} - -static int mbus_dev_remove(struct device *d) -{ - struct mbus_device *dev = dev_to_mbus(d); - struct mbus_driver *drv = drv_to_mbus(dev->dev.driver); - - drv->remove(dev); - return 0; -} - -static struct bus_type mic_bus = { - .name = "mic_bus", - .match = mbus_dev_match, - .dev_groups = mbus_dev_groups, - .uevent = mbus_uevent, - .probe = mbus_dev_probe, - .remove = mbus_dev_remove, -}; - -int mbus_register_driver(struct mbus_driver *driver) -{ - driver->driver.bus = &mic_bus; - return driver_register(&driver->driver); -} -EXPORT_SYMBOL_GPL(mbus_register_driver); - -void mbus_unregister_driver(struct mbus_driver *driver) -{ - driver_unregister(&driver->driver); -} -EXPORT_SYMBOL_GPL(mbus_unregister_driver); - -static void mbus_release_dev(struct device *d) -{ - struct mbus_device *mbdev = dev_to_mbus(d); - kfree(mbdev); -} - -struct mbus_device * -mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops, - struct mbus_hw_ops *hw_ops, int index, - void __iomem *mmio_va) -{ - int ret; - struct mbus_device *mbdev; - - mbdev = kzalloc(sizeof(*mbdev), GFP_KERNEL); - if (!mbdev) - return ERR_PTR(-ENOMEM); - - mbdev->mmio_va = mmio_va; - mbdev->dev.parent = pdev; - mbdev->id.device = id; - mbdev->id.vendor = MBUS_DEV_ANY_ID; - mbdev->dev.dma_ops = dma_ops; - mbdev->dev.dma_mask = &mbdev->dev.coherent_dma_mask; - dma_set_mask(&mbdev->dev, DMA_BIT_MASK(64)); - mbdev->dev.release = mbus_release_dev; - mbdev->hw_ops = hw_ops; - mbdev->dev.bus = &mic_bus; - mbdev->index = index; - dev_set_name(&mbdev->dev, "mbus-dev%u", mbdev->index); - /* - * device_register() causes the bus infrastructure to look for a - * matching driver. - */ - ret = device_register(&mbdev->dev); - if (ret) - goto free_mbdev; - return mbdev; -free_mbdev: - put_device(&mbdev->dev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL_GPL(mbus_register_device); - -void mbus_unregister_device(struct mbus_device *mbdev) -{ - device_unregister(&mbdev->dev); -} -EXPORT_SYMBOL_GPL(mbus_unregister_device); - -static int __init mbus_init(void) -{ - return bus_register(&mic_bus); -} - -static void __exit mbus_exit(void) -{ - bus_unregister(&mic_bus); -} - -core_initcall(mbus_init); -module_exit(mbus_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC Bus driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/bus/scif_bus.c b/drivers/misc/mic/bus/scif_bus.c deleted file mode 100644 index ad7c3604f151..000000000000 --- a/drivers/misc/mic/bus/scif_bus.c +++ /dev/null @@ -1,201 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel Symmetric Communications Interface Bus driver. 
- */ -#include -#include -#include -#include - -#include "scif_bus.h" - -static ssize_t device_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct scif_hw_dev *dev = dev_to_scif(d); - - return sprintf(buf, "0x%04x\n", dev->id.device); -} -static DEVICE_ATTR_RO(device); - -static ssize_t vendor_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct scif_hw_dev *dev = dev_to_scif(d); - - return sprintf(buf, "0x%04x\n", dev->id.vendor); -} -static DEVICE_ATTR_RO(vendor); - -static ssize_t modalias_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct scif_hw_dev *dev = dev_to_scif(d); - - return sprintf(buf, "scif:d%08Xv%08X\n", - dev->id.device, dev->id.vendor); -} -static DEVICE_ATTR_RO(modalias); - -static struct attribute *scif_dev_attrs[] = { - &dev_attr_device.attr, - &dev_attr_vendor.attr, - &dev_attr_modalias.attr, - NULL, -}; -ATTRIBUTE_GROUPS(scif_dev); - -static inline int scif_id_match(const struct scif_hw_dev *dev, - const struct scif_hw_dev_id *id) -{ - if (id->device != dev->id.device && id->device != SCIF_DEV_ANY_ID) - return 0; - - return id->vendor == SCIF_DEV_ANY_ID || id->vendor == dev->id.vendor; -} - -/* - * This looks through all the IDs a driver claims to support. If any of them - * match, we return 1 and the kernel will call scif_dev_probe(). - */ -static int scif_dev_match(struct device *dv, struct device_driver *dr) -{ - unsigned int i; - struct scif_hw_dev *dev = dev_to_scif(dv); - const struct scif_hw_dev_id *ids; - - ids = drv_to_scif(dr)->id_table; - for (i = 0; ids[i].device; i++) - if (scif_id_match(dev, &ids[i])) - return 1; - return 0; -} - -static int scif_uevent(struct device *dv, struct kobj_uevent_env *env) -{ - struct scif_hw_dev *dev = dev_to_scif(dv); - - return add_uevent_var(env, "MODALIAS=scif:d%08Xv%08X", - dev->id.device, dev->id.vendor); -} - -static int scif_dev_probe(struct device *d) -{ - struct scif_hw_dev *dev = dev_to_scif(d); - struct scif_driver *drv = drv_to_scif(dev->dev.driver); - - return drv->probe(dev); -} - -static int scif_dev_remove(struct device *d) -{ - struct scif_hw_dev *dev = dev_to_scif(d); - struct scif_driver *drv = drv_to_scif(dev->dev.driver); - - drv->remove(dev); - return 0; -} - -static struct bus_type scif_bus = { - .name = "scif_bus", - .match = scif_dev_match, - .dev_groups = scif_dev_groups, - .uevent = scif_uevent, - .probe = scif_dev_probe, - .remove = scif_dev_remove, -}; - -int scif_register_driver(struct scif_driver *driver) -{ - driver->driver.bus = &scif_bus; - return driver_register(&driver->driver); -} -EXPORT_SYMBOL_GPL(scif_register_driver); - -void scif_unregister_driver(struct scif_driver *driver) -{ - driver_unregister(&driver->driver); -} -EXPORT_SYMBOL_GPL(scif_unregister_driver); - -static void scif_release_dev(struct device *d) -{ - struct scif_hw_dev *sdev = dev_to_scif(d); - - kfree(sdev); -} - -struct scif_hw_dev * -scif_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops, - struct scif_hw_ops *hw_ops, u8 dnode, u8 snode, - struct mic_mw *mmio, struct mic_mw *aper, void *dp, - void __iomem *rdp, struct dma_chan **chan, int num_chan, - bool card_rel_da) -{ - int ret; - struct scif_hw_dev *sdev; - - sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); - if (!sdev) - return ERR_PTR(-ENOMEM); - - sdev->dev.parent = pdev; - sdev->id.device = id; - sdev->id.vendor = SCIF_DEV_ANY_ID; - sdev->dev.dma_ops = dma_ops; - sdev->dev.release = scif_release_dev; - sdev->hw_ops = hw_ops; - sdev->dnode = dnode; - 
sdev->snode = snode; - dev_set_drvdata(&sdev->dev, sdev); - sdev->dev.bus = &scif_bus; - sdev->mmio = mmio; - sdev->aper = aper; - sdev->dp = dp; - sdev->rdp = rdp; - sdev->dev.dma_mask = &sdev->dev.coherent_dma_mask; - dma_set_mask(&sdev->dev, DMA_BIT_MASK(64)); - sdev->dma_ch = chan; - sdev->num_dma_ch = num_chan; - sdev->card_rel_da = card_rel_da; - dev_set_name(&sdev->dev, "scif-dev%u", sdev->dnode); - /* - * device_register() causes the bus infrastructure to look for a - * matching driver. - */ - ret = device_register(&sdev->dev); - if (ret) - goto free_sdev; - return sdev; -free_sdev: - put_device(&sdev->dev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL_GPL(scif_register_device); - -void scif_unregister_device(struct scif_hw_dev *sdev) -{ - device_unregister(&sdev->dev); -} -EXPORT_SYMBOL_GPL(scif_unregister_device); - -static int __init scif_init(void) -{ - return bus_register(&scif_bus); -} - -static void __exit scif_exit(void) -{ - bus_unregister(&scif_bus); -} - -core_initcall(scif_init); -module_exit(scif_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) SCIF Bus driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/bus/scif_bus.h b/drivers/misc/mic/bus/scif_bus.h deleted file mode 100644 index 4981eb56f879..000000000000 --- a/drivers/misc/mic/bus/scif_bus.h +++ /dev/null @@ -1,125 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel Symmetric Communications Interface Bus driver. - */ -#ifndef _SCIF_BUS_H_ -#define _SCIF_BUS_H_ -/* - * Everything a scif driver needs to work with any particular scif - * hardware abstraction layer. - */ -#include - -#include -#include "../common/mic_dev.h" - -struct scif_hw_dev_id { - u32 device; - u32 vendor; -}; - -#define MIC_SCIF_DEV 1 -#define SCIF_DEV_ANY_ID 0xffffffff - -/** - * scif_hw_dev - representation of a hardware device abstracted for scif - * @hw_ops: the hardware ops supported by this device - * @id: the device type identification (used to match it with a driver) - * @mmio: MMIO memory window - * @aper: Aperture memory window - * @dev: underlying device - * @dnode - The destination node which this device will communicate with. - * @snode - The source node for this device. - * @dp - Self device page - * @rdp - Remote device page - * @dma_ch - Array of DMA channels - * @num_dma_ch - Number of DMA channels available - * @card_rel_da - Set to true if DMA addresses programmed in the DMA engine - * are relative to the card point of view - */ -struct scif_hw_dev { - struct scif_hw_ops *hw_ops; - struct scif_hw_dev_id id; - struct mic_mw *mmio; - struct mic_mw *aper; - struct device dev; - u8 dnode; - u8 snode; - void *dp; - void __iomem *rdp; - struct dma_chan **dma_ch; - int num_dma_ch; - bool card_rel_da; -}; - -/** - * scif_driver - operations for a scif I/O driver - * @driver: underlying device driver (populate name and owner). - * @id_table: the ids serviced by this driver. - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. - */ -struct scif_driver { - struct device_driver driver; - const struct scif_hw_dev_id *id_table; - int (*probe)(struct scif_hw_dev *dev); - void (*remove)(struct scif_hw_dev *dev); -}; - -/** - * scif_hw_ops - Hardware operations for accessing a SCIF device on the SCIF bus. - * - * @next_db: Obtain the next available doorbell. 
- * @request_irq: Request an interrupt on a particular doorbell. - * @free_irq: Free an interrupt requested previously. - * @ack_interrupt: acknowledge an interrupt in the ISR. - * @send_intr: Send an interrupt to the remote node on a specified doorbell. - * @send_p2p_intr: Send an interrupt to the peer node on a specified doorbell - * which is specifically targeted for a peer to peer node. - * @remap: Map a buffer with the specified physical address and length. - * @unmap: Unmap a buffer previously mapped. - */ -struct scif_hw_ops { - int (*next_db)(struct scif_hw_dev *sdev); - struct mic_irq * (*request_irq)(struct scif_hw_dev *sdev, - irqreturn_t (*func)(int irq, - void *data), - const char *name, void *data, - int db); - void (*free_irq)(struct scif_hw_dev *sdev, - struct mic_irq *cookie, void *data); - void (*ack_interrupt)(struct scif_hw_dev *sdev, int num); - void (*send_intr)(struct scif_hw_dev *sdev, int db); - void (*send_p2p_intr)(struct scif_hw_dev *sdev, int db, - struct mic_mw *mw); - void __iomem * (*remap)(struct scif_hw_dev *sdev, - phys_addr_t pa, size_t len); - void (*unmap)(struct scif_hw_dev *sdev, void __iomem *va); -}; - -int scif_register_driver(struct scif_driver *driver); -void scif_unregister_driver(struct scif_driver *driver); -struct scif_hw_dev * -scif_register_device(struct device *pdev, int id, - const struct dma_map_ops *dma_ops, - struct scif_hw_ops *hw_ops, u8 dnode, u8 snode, - struct mic_mw *mmio, struct mic_mw *aper, - void *dp, void __iomem *rdp, - struct dma_chan **chan, int num_chan, - bool card_rel_da); -void scif_unregister_device(struct scif_hw_dev *sdev); - -static inline struct scif_hw_dev *dev_to_scif(struct device *dev) -{ - return container_of(dev, struct scif_hw_dev, dev); -} - -static inline struct scif_driver *drv_to_scif(struct device_driver *drv) -{ - return container_of(drv, struct scif_driver, driver); -} -#endif /* _SCIF_BUS_H */ diff --git a/drivers/misc/mic/bus/vop_bus.c b/drivers/misc/mic/bus/vop_bus.c deleted file mode 100644 index 6935ddca1bd5..000000000000 --- a/drivers/misc/mic/bus/vop_bus.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Intel Virtio Over PCIe (VOP) Bus driver. 
- */ -#include -#include -#include -#include - -#include "vop_bus.h" - -static ssize_t device_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct vop_device *dev = dev_to_vop(d); - - return sprintf(buf, "0x%04x\n", dev->id.device); -} -static DEVICE_ATTR_RO(device); - -static ssize_t vendor_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct vop_device *dev = dev_to_vop(d); - - return sprintf(buf, "0x%04x\n", dev->id.vendor); -} -static DEVICE_ATTR_RO(vendor); - -static ssize_t modalias_show(struct device *d, - struct device_attribute *attr, char *buf) -{ - struct vop_device *dev = dev_to_vop(d); - - return sprintf(buf, "vop:d%08Xv%08X\n", - dev->id.device, dev->id.vendor); -} -static DEVICE_ATTR_RO(modalias); - -static struct attribute *vop_dev_attrs[] = { - &dev_attr_device.attr, - &dev_attr_vendor.attr, - &dev_attr_modalias.attr, - NULL, -}; -ATTRIBUTE_GROUPS(vop_dev); - -static inline int vop_id_match(const struct vop_device *dev, - const struct vop_device_id *id) -{ - if (id->device != dev->id.device && id->device != VOP_DEV_ANY_ID) - return 0; - - return id->vendor == VOP_DEV_ANY_ID || id->vendor == dev->id.vendor; -} - -/* - * This looks through all the IDs a driver claims to support. If any of them - * match, we return 1 and the kernel will call vop_dev_probe(). - */ -static int vop_dev_match(struct device *dv, struct device_driver *dr) -{ - unsigned int i; - struct vop_device *dev = dev_to_vop(dv); - const struct vop_device_id *ids; - - ids = drv_to_vop(dr)->id_table; - for (i = 0; ids[i].device; i++) - if (vop_id_match(dev, &ids[i])) - return 1; - return 0; -} - -static int vop_uevent(struct device *dv, struct kobj_uevent_env *env) -{ - struct vop_device *dev = dev_to_vop(dv); - - return add_uevent_var(env, "MODALIAS=vop:d%08Xv%08X", - dev->id.device, dev->id.vendor); -} - -static int vop_dev_probe(struct device *d) -{ - struct vop_device *dev = dev_to_vop(d); - struct vop_driver *drv = drv_to_vop(dev->dev.driver); - - return drv->probe(dev); -} - -static int vop_dev_remove(struct device *d) -{ - struct vop_device *dev = dev_to_vop(d); - struct vop_driver *drv = drv_to_vop(dev->dev.driver); - - drv->remove(dev); - return 0; -} - -static struct bus_type vop_bus = { - .name = "vop_bus", - .match = vop_dev_match, - .dev_groups = vop_dev_groups, - .uevent = vop_uevent, - .probe = vop_dev_probe, - .remove = vop_dev_remove, -}; - -int vop_register_driver(struct vop_driver *driver) -{ - driver->driver.bus = &vop_bus; - return driver_register(&driver->driver); -} -EXPORT_SYMBOL_GPL(vop_register_driver); - -void vop_unregister_driver(struct vop_driver *driver) -{ - driver_unregister(&driver->driver); -} -EXPORT_SYMBOL_GPL(vop_unregister_driver); - -static void vop_release_dev(struct device *d) -{ - struct vop_device *dev = dev_to_vop(d); - - kfree(dev); -} - -struct vop_device * -vop_register_device(struct device *pdev, int id, - const struct dma_map_ops *dma_ops, - struct vop_hw_ops *hw_ops, u8 dnode, struct mic_mw *aper, - struct dma_chan *chan) -{ - int ret; - struct vop_device *vdev; - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return ERR_PTR(-ENOMEM); - - vdev->dev.parent = pdev; - vdev->id.device = id; - vdev->id.vendor = VOP_DEV_ANY_ID; - vdev->dev.dma_ops = dma_ops; - vdev->dev.dma_mask = &vdev->dev.coherent_dma_mask; - dma_set_mask(&vdev->dev, DMA_BIT_MASK(64)); - vdev->dev.release = vop_release_dev; - vdev->hw_ops = hw_ops; - vdev->dev.bus = &vop_bus; - vdev->dnode = dnode; - vdev->aper = aper; - vdev->dma_ch 
= chan; - vdev->index = dnode - 1; - dev_set_name(&vdev->dev, "vop-dev%u", vdev->index); - /* - * device_register() causes the bus infrastructure to look for a - * matching driver. - */ - ret = device_register(&vdev->dev); - if (ret) - goto free_vdev; - return vdev; -free_vdev: - put_device(&vdev->dev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL_GPL(vop_register_device); - -void vop_unregister_device(struct vop_device *dev) -{ - device_unregister(&dev->dev); -} -EXPORT_SYMBOL_GPL(vop_unregister_device); - -static int __init vop_init(void) -{ - return bus_register(&vop_bus); -} - -static void __exit vop_exit(void) -{ - bus_unregister(&vop_bus); -} - -core_initcall(vop_init); -module_exit(vop_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) VOP Bus driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/bus/vop_bus.h b/drivers/misc/mic/bus/vop_bus.h deleted file mode 100644 index 4fa02808c1e2..000000000000 --- a/drivers/misc/mic/bus/vop_bus.h +++ /dev/null @@ -1,129 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Intel Virtio over PCIe Bus driver. - */ -#ifndef _VOP_BUS_H_ -#define _VOP_BUS_H_ -/* - * Everything a vop driver needs to work with any particular vop - * implementation. - */ -#include -#include - -#include "../common/mic_dev.h" - -struct vop_device_id { - u32 device; - u32 vendor; -}; - -#define VOP_DEV_TRNSP 1 -#define VOP_DEV_ANY_ID 0xffffffff -/* - * Size of the internal buffer used during DMA's as an intermediate buffer - * for copy to/from user. Must be an integral number of pages. - */ -#define VOP_INT_DMA_BUF_SIZE PAGE_ALIGN(64 * 1024ULL) - -/** - * vop_device - representation of a device using vop - * @hw_ops: the hardware ops supported by this device. - * @id: the device type identification (used to match it with a driver). - * @dev: underlying device. - * @dnode - The destination node which this device will communicate with. - * @aper: Aperture memory window - * @dma_ch - DMA channel - * @index: unique position on the vop bus - */ -struct vop_device { - struct vop_hw_ops *hw_ops; - struct vop_device_id id; - struct device dev; - u8 dnode; - struct mic_mw *aper; - struct dma_chan *dma_ch; - int index; -}; - -/** - * vop_driver - operations for a vop I/O driver - * @driver: underlying device driver (populate name and owner). - * @id_table: the ids serviced by this driver. - * @probe: the function to call when a device is found. Returns 0 or -errno. - * @remove: the function to call when a device is removed. - */ -struct vop_driver { - struct device_driver driver; - const struct vop_device_id *id_table; - int (*probe)(struct vop_device *dev); - void (*remove)(struct vop_device *dev); -}; - -/** - * vop_hw_ops - Hardware operations for accessing a VOP device on the VOP bus. - * - * @next_db: Obtain the next available doorbell. - * @request_irq: Request an interrupt on a particular doorbell. - * @free_irq: Free an interrupt requested previously. - * @ack_interrupt: acknowledge an interrupt in the ISR. - * @get_remote_dp: Get access to the virtio device page used by the remote - * node to add/remove/configure virtio devices. - * @get_dp: Get access to the virtio device page used by the self - * node to add/remove/configure virtio devices. - * @send_intr: Send an interrupt to the peer node on a specified doorbell. - * @remap: Map a buffer with the specified DMA address and length. - * @unmap: Unmap a buffer previously mapped. 
- * @dma_filter: The DMA filter function to use for obtaining access to - * a DMA channel on the peer node. - */ -struct vop_hw_ops { - int (*next_db)(struct vop_device *vpdev); - struct mic_irq *(*request_irq)(struct vop_device *vpdev, - irqreturn_t (*func)(int irq, void *data), - const char *name, void *data, - int intr_src); - void (*free_irq)(struct vop_device *vpdev, - struct mic_irq *cookie, void *data); - void (*ack_interrupt)(struct vop_device *vpdev, int num); - void __iomem * (*get_remote_dp)(struct vop_device *vpdev); - void * (*get_dp)(struct vop_device *vpdev); - void (*send_intr)(struct vop_device *vpdev, int db); - void __iomem * (*remap)(struct vop_device *vpdev, - dma_addr_t pa, size_t len); - void (*unmap)(struct vop_device *vpdev, void __iomem *va); -}; - -struct vop_device * -vop_register_device(struct device *pdev, int id, - const struct dma_map_ops *dma_ops, - struct vop_hw_ops *hw_ops, u8 dnode, struct mic_mw *aper, - struct dma_chan *chan); -void vop_unregister_device(struct vop_device *dev); -int vop_register_driver(struct vop_driver *drv); -void vop_unregister_driver(struct vop_driver *drv); - -/* - * module_vop_driver() - Helper macro for drivers that don't do - * anything special in module init/exit. This eliminates a lot of - * boilerplate. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit() - */ -#define module_vop_driver(__vop_driver) \ - module_driver(__vop_driver, vop_register_driver, \ - vop_unregister_driver) - -static inline struct vop_device *dev_to_vop(struct device *dev) -{ - return container_of(dev, struct vop_device, dev); -} - -static inline struct vop_driver *drv_to_vop(struct device_driver *drv) -{ - return container_of(drv, struct vop_driver, driver); -} -#endif /* _VOP_BUS_H */ diff --git a/drivers/misc/mic/card/Makefile b/drivers/misc/mic/card/Makefile deleted file mode 100644 index 921a7e7e0fbd..000000000000 --- a/drivers/misc/mic/card/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile - Intel MIC Linux driver. -# Copyright(c) 2013, Intel Corporation. -# -ccflags-y += -DINTEL_MIC_CARD - -obj-$(CONFIG_INTEL_MIC_CARD) += mic_card.o -mic_card-y += mic_x100.o -mic_card-y += mic_device.o -mic_card-y += mic_debugfs.o diff --git a/drivers/misc/mic/card/mic_debugfs.c b/drivers/misc/mic/card/mic_debugfs.c deleted file mode 100644 index 4c326e8f4d99..000000000000 --- a/drivers/misc/mic/card/mic_debugfs.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Disclaimer: The codes contained in these modules may be specific to - * the Intel Software Development Platform codenamed: Knights Ferry, and - * the Intel product codenamed: Knights Corner, and are not backward - * compatible with other Intel products. Additionally, Intel will NOT - * support the codes or instruction set in future products. - * - * Intel MIC Card driver. - */ -#include -#include -#include -#include -#include - -#include "../common/mic_dev.h" -#include "mic_device.h" - -/* Debugfs parent dir */ -static struct dentry *mic_dbg; - -/* - * mic_intr_show - Send interrupts to host. 
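
The module_vop_driver() helper defined above removes the usual module init/exit boilerplate for bus clients. A hypothetical transport driver, sketched against the vop_bus.h interface being deleted (the names and include path are illustrative; VOP_DEV_TRNSP and VOP_DEV_ANY_ID come from the header above):

#include <linux/module.h>
#include "vop_bus.h"

static int example_vop_probe(struct vop_device *vpdev)
{
	/* A real client would access the device page via vpdev->hw_ops here. */
	return 0;
}

static void example_vop_remove(struct vop_device *vpdev)
{
}

static const struct vop_device_id example_vop_id_table[] = {
	{ VOP_DEV_TRNSP, VOP_DEV_ANY_ID },
	{ 0 },
};

static struct vop_driver example_vop_driver = {
	.driver.name	= "example_vop_client",
	.driver.owner	= THIS_MODULE,
	.id_table	= example_vop_id_table,
	.probe		= example_vop_probe,
	.remove		= example_vop_remove,
};

module_vop_driver(example_vop_driver);
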
- */ -static int mic_intr_show(struct seq_file *s, void *unused) -{ - struct mic_driver *mdrv = s->private; - struct mic_device *mdev = &mdrv->mdev; - - mic_send_intr(mdev, 0); - msleep(1000); - mic_send_intr(mdev, 1); - msleep(1000); - mic_send_intr(mdev, 2); - msleep(1000); - mic_send_intr(mdev, 3); - msleep(1000); - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(mic_intr); - -/* - * mic_create_card_debug_dir - Initialize MIC debugfs entries. - */ -void __init mic_create_card_debug_dir(struct mic_driver *mdrv) -{ - if (!mic_dbg) - return; - - mdrv->dbg_dir = debugfs_create_dir(mdrv->name, mic_dbg); - - debugfs_create_file("intr_test", 0444, mdrv->dbg_dir, mdrv, - &mic_intr_fops); -} - -/* - * mic_delete_card_debug_dir - Uninitialize MIC debugfs entries. - */ -void mic_delete_card_debug_dir(struct mic_driver *mdrv) -{ - debugfs_remove_recursive(mdrv->dbg_dir); -} - -/* - * mic_init_card_debugfs - Initialize global debugfs entry. - */ -void __init mic_init_card_debugfs(void) -{ - mic_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); -} - -/* - * mic_exit_card_debugfs - Uninitialize global debugfs entry - */ -void mic_exit_card_debugfs(void) -{ - debugfs_remove(mic_dbg); -} diff --git a/drivers/misc/mic/card/mic_device.c b/drivers/misc/mic/card/mic_device.c deleted file mode 100644 index a15606259bdc..000000000000 --- a/drivers/misc/mic/card/mic_device.c +++ /dev/null @@ -1,417 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Disclaimer: The codes contained in these modules may be specific to - * the Intel Software Development Platform codenamed: Knights Ferry, and - * the Intel product codenamed: Knights Corner, and are not backward - * compatible with other Intel products. Additionally, Intel will NOT - * support the codes or instruction set in future products. - * - * Intel MIC Card driver. - */ -#include -#include -#include -#include -#include -#include - -#include -#include "../common/mic_dev.h" -#include "mic_device.h" - -static struct mic_driver *g_drv; - -static int __init mic_dp_init(void) -{ - struct mic_driver *mdrv = g_drv; - struct mic_device *mdev = &mdrv->mdev; - struct mic_bootparam __iomem *bootparam; - u64 lo, hi, dp_dma_addr; - u32 magic; - - lo = mic_read_spad(&mdrv->mdev, MIC_DPLO_SPAD); - hi = mic_read_spad(&mdrv->mdev, MIC_DPHI_SPAD); - - dp_dma_addr = lo | (hi << 32); - mdrv->dp = mic_card_map(mdev, dp_dma_addr, MIC_DP_SIZE); - if (!mdrv->dp) { - dev_err(mdrv->dev, "Cannot remap Aperture BAR\n"); - return -ENOMEM; - } - bootparam = mdrv->dp; - magic = ioread32(&bootparam->magic); - if (MIC_MAGIC != magic) { - dev_err(mdrv->dev, "bootparam magic mismatch 0x%x\n", magic); - return -EIO; - } - return 0; -} - -/* Uninitialize the device page */ -static void mic_dp_uninit(void) -{ - mic_card_unmap(&g_drv->mdev, g_drv->dp); -} - -/** - * mic_request_card_irq - request an irq. - * - * @handler: interrupt handler passed to request_threaded_irq. - * @thread_fn: thread fn. passed to request_threaded_irq. - * @name: The ASCII name of the callee requesting the irq. - * @data: private data that is returned back when calling the - * function handler. - * @index: The doorbell index of the requester. - * - * returns: The cookie that is transparent to the caller. Passed - * back when calling mic_free_irq. An appropriate error code - * is returned on failure. Caller needs to use IS_ERR(return_val) - * to check for failure and PTR_ERR(return_val) to obtained the - * error code. 
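
Hypothetical usage of the cookie contract described here, following the IS_ERR()/PTR_ERR() convention the comment prescribes (the handler and data are placeholders; mic_next_card_db() is defined below):

static int example_use_doorbell(irq_handler_t example_handler, void *example_data)
{
	struct mic_irq *cookie;
	int db = mic_next_card_db();	/* doorbell with the lowest usage count */

	cookie = mic_request_card_irq(example_handler, NULL, "example-db",
				      example_data, db);
	if (IS_ERR(cookie))
		return PTR_ERR(cookie);

	/* ... doorbell db is now serviced by example_handler ... */

	mic_free_card_irq(cookie, example_data);
	return 0;
}
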
- * - */ -struct mic_irq * -mic_request_card_irq(irq_handler_t handler, - irq_handler_t thread_fn, const char *name, - void *data, int index) -{ - int rc = 0; - unsigned long cookie; - struct mic_driver *mdrv = g_drv; - - rc = request_threaded_irq(mic_db_to_irq(mdrv, index), handler, - thread_fn, 0, name, data); - if (rc) { - dev_err(mdrv->dev, "request_threaded_irq failed rc = %d\n", rc); - goto err; - } - mdrv->irq_info.irq_usage_count[index]++; - cookie = index; - return (struct mic_irq *)cookie; -err: - return ERR_PTR(rc); -} - -/** - * mic_free_card_irq - free irq. - * - * @cookie: cookie obtained during a successful call to mic_request_threaded_irq - * @data: private data specified by the calling function during the - * mic_request_threaded_irq - * - * returns: none. - */ -void mic_free_card_irq(struct mic_irq *cookie, void *data) -{ - int index; - struct mic_driver *mdrv = g_drv; - - index = (unsigned long)cookie & 0xFFFFU; - free_irq(mic_db_to_irq(mdrv, index), data); - mdrv->irq_info.irq_usage_count[index]--; -} - -/** - * mic_next_card_db - Get the doorbell with minimum usage count. - * - * Returns the irq index. - */ -int mic_next_card_db(void) -{ - int i; - int index = 0; - struct mic_driver *mdrv = g_drv; - - for (i = 0; i < mdrv->intr_info.num_intr; i++) { - if (mdrv->irq_info.irq_usage_count[i] < - mdrv->irq_info.irq_usage_count[index]) - index = i; - } - - return index; -} - -/** - * mic_init_irq - Initialize irq information. - * - * Returns 0 in success. Appropriate error code on failure. - */ -static int mic_init_irq(void) -{ - struct mic_driver *mdrv = g_drv; - - mdrv->irq_info.irq_usage_count = kzalloc((sizeof(u32) * - mdrv->intr_info.num_intr), - GFP_KERNEL); - if (!mdrv->irq_info.irq_usage_count) - return -ENOMEM; - return 0; -} - -/** - * mic_uninit_irq - Uninitialize irq information. - * - * None. 
- */ -static void mic_uninit_irq(void) -{ - struct mic_driver *mdrv = g_drv; - - kfree(mdrv->irq_info.irq_usage_count); -} - -static inline struct mic_driver *scdev_to_mdrv(struct scif_hw_dev *scdev) -{ - return dev_get_drvdata(scdev->dev.parent); -} - -static struct mic_irq * -___mic_request_irq(struct scif_hw_dev *scdev, - irqreturn_t (*func)(int irq, void *data), - const char *name, void *data, - int db) -{ - return mic_request_card_irq(func, NULL, name, data, db); -} - -static void -___mic_free_irq(struct scif_hw_dev *scdev, - struct mic_irq *cookie, void *data) -{ - return mic_free_card_irq(cookie, data); -} - -static void ___mic_ack_interrupt(struct scif_hw_dev *scdev, int num) -{ - struct mic_driver *mdrv = scdev_to_mdrv(scdev); - - mic_ack_interrupt(&mdrv->mdev); -} - -static int ___mic_next_db(struct scif_hw_dev *scdev) -{ - return mic_next_card_db(); -} - -static void ___mic_send_intr(struct scif_hw_dev *scdev, int db) -{ - struct mic_driver *mdrv = scdev_to_mdrv(scdev); - - mic_send_intr(&mdrv->mdev, db); -} - -static void ___mic_send_p2p_intr(struct scif_hw_dev *scdev, int db, - struct mic_mw *mw) -{ - mic_send_p2p_intr(db, mw); -} - -static void __iomem * -___mic_ioremap(struct scif_hw_dev *scdev, - phys_addr_t pa, size_t len) -{ - struct mic_driver *mdrv = scdev_to_mdrv(scdev); - - return mic_card_map(&mdrv->mdev, pa, len); -} - -static void ___mic_iounmap(struct scif_hw_dev *scdev, void __iomem *va) -{ - struct mic_driver *mdrv = scdev_to_mdrv(scdev); - - mic_card_unmap(&mdrv->mdev, va); -} - -static struct scif_hw_ops scif_hw_ops = { - .request_irq = ___mic_request_irq, - .free_irq = ___mic_free_irq, - .ack_interrupt = ___mic_ack_interrupt, - .next_db = ___mic_next_db, - .send_intr = ___mic_send_intr, - .send_p2p_intr = ___mic_send_p2p_intr, - .remap = ___mic_ioremap, - .unmap = ___mic_iounmap, -}; - -static inline struct mic_driver *vpdev_to_mdrv(struct vop_device *vpdev) -{ - return dev_get_drvdata(vpdev->dev.parent); -} - -static struct mic_irq * -__mic_request_irq(struct vop_device *vpdev, - irqreturn_t (*func)(int irq, void *data), - const char *name, void *data, int intr_src) -{ - return mic_request_card_irq(func, NULL, name, data, intr_src); -} - -static void __mic_free_irq(struct vop_device *vpdev, - struct mic_irq *cookie, void *data) -{ - return mic_free_card_irq(cookie, data); -} - -static void __mic_ack_interrupt(struct vop_device *vpdev, int num) -{ - struct mic_driver *mdrv = vpdev_to_mdrv(vpdev); - - mic_ack_interrupt(&mdrv->mdev); -} - -static int __mic_next_db(struct vop_device *vpdev) -{ - return mic_next_card_db(); -} - -static void __iomem *__mic_get_remote_dp(struct vop_device *vpdev) -{ - struct mic_driver *mdrv = vpdev_to_mdrv(vpdev); - - return mdrv->dp; -} - -static void __mic_send_intr(struct vop_device *vpdev, int db) -{ - struct mic_driver *mdrv = vpdev_to_mdrv(vpdev); - - mic_send_intr(&mdrv->mdev, db); -} - -static void __iomem *__mic_ioremap(struct vop_device *vpdev, - dma_addr_t pa, size_t len) -{ - struct mic_driver *mdrv = vpdev_to_mdrv(vpdev); - - return mic_card_map(&mdrv->mdev, pa, len); -} - -static void __mic_iounmap(struct vop_device *vpdev, void __iomem *va) -{ - struct mic_driver *mdrv = vpdev_to_mdrv(vpdev); - - mic_card_unmap(&mdrv->mdev, va); -} - -static struct vop_hw_ops vop_hw_ops = { - .request_irq = __mic_request_irq, - .free_irq = __mic_free_irq, - .ack_interrupt = __mic_ack_interrupt, - .next_db = __mic_next_db, - .get_remote_dp = __mic_get_remote_dp, - .send_intr = __mic_send_intr, - .remap = __mic_ioremap, - .unmap = 
__mic_iounmap, -}; - -static int mic_request_dma_chans(struct mic_driver *mdrv) -{ - dma_cap_mask_t mask; - struct dma_chan *chan; - - dma_cap_zero(mask); - dma_cap_set(DMA_MEMCPY, mask); - - do { - chan = dma_request_channel(mask, NULL, NULL); - if (chan) { - mdrv->dma_ch[mdrv->num_dma_ch++] = chan; - if (mdrv->num_dma_ch >= MIC_MAX_DMA_CHAN) - break; - } - } while (chan); - dev_info(mdrv->dev, "DMA channels # %d\n", mdrv->num_dma_ch); - return mdrv->num_dma_ch; -} - -static void mic_free_dma_chans(struct mic_driver *mdrv) -{ - int i = 0; - - for (i = 0; i < mdrv->num_dma_ch; i++) { - dma_release_channel(mdrv->dma_ch[i]); - mdrv->dma_ch[i] = NULL; - } - mdrv->num_dma_ch = 0; -} - -/* - * mic_driver_init - MIC driver initialization tasks. - * - * Returns 0 in success. Appropriate error code on failure. - */ -int __init mic_driver_init(struct mic_driver *mdrv) -{ - int rc; - struct mic_bootparam __iomem *bootparam; - u8 node_id; - - g_drv = mdrv; - /* Unloading the card module is not supported. */ - if (!try_module_get(mdrv->dev->driver->owner)) { - rc = -ENODEV; - goto done; - } - rc = mic_dp_init(); - if (rc) - goto put; - rc = mic_init_irq(); - if (rc) - goto dp_uninit; - if (!mic_request_dma_chans(mdrv)) { - rc = -ENODEV; - goto irq_uninit; - } - mdrv->vpdev = vop_register_device(mdrv->dev, VOP_DEV_TRNSP, - NULL, &vop_hw_ops, 0, - NULL, mdrv->dma_ch[0]); - if (IS_ERR(mdrv->vpdev)) { - rc = PTR_ERR(mdrv->vpdev); - goto dma_free; - } - bootparam = mdrv->dp; - node_id = ioread8(&bootparam->node_id); - mdrv->scdev = scif_register_device(mdrv->dev, MIC_SCIF_DEV, - NULL, &scif_hw_ops, - 0, node_id, &mdrv->mdev.mmio, NULL, - NULL, mdrv->dp, mdrv->dma_ch, - mdrv->num_dma_ch, true); - if (IS_ERR(mdrv->scdev)) { - rc = PTR_ERR(mdrv->scdev); - goto vop_remove; - } - mic_create_card_debug_dir(mdrv); -done: - return rc; -vop_remove: - vop_unregister_device(mdrv->vpdev); -dma_free: - mic_free_dma_chans(mdrv); -irq_uninit: - mic_uninit_irq(); -dp_uninit: - mic_dp_uninit(); -put: - module_put(mdrv->dev->driver->owner); - return rc; -} - -/* - * mic_driver_uninit - MIC driver uninitialization tasks. - * - * Returns None - */ -void mic_driver_uninit(struct mic_driver *mdrv) -{ - mic_delete_card_debug_dir(mdrv); - scif_unregister_device(mdrv->scdev); - vop_unregister_device(mdrv->vpdev); - mic_free_dma_chans(mdrv); - mic_uninit_irq(); - mic_dp_uninit(); - module_put(mdrv->dev->driver->owner); -} diff --git a/drivers/misc/mic/card/mic_device.h b/drivers/misc/mic/card/mic_device.h deleted file mode 100644 index d6cc69a235a3..000000000000 --- a/drivers/misc/mic/card/mic_device.h +++ /dev/null @@ -1,137 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Disclaimer: The codes contained in these modules may be specific to - * the Intel Software Development Platform codenamed: Knights Ferry, and - * the Intel product codenamed: Knights Corner, and are not backward - * compatible with other Intel products. Additionally, Intel will NOT - * support the codes or instruction set in future products. - * - * Intel MIC Card driver. 
- */ -#ifndef _MIC_CARD_DEVICE_H_ -#define _MIC_CARD_DEVICE_H_ - -#include -#include -#include -#include -#include "../bus/scif_bus.h" -#include "../bus/vop_bus.h" - -/** - * struct mic_intr_info - Contains h/w specific interrupt sources info - * - * @num_intr: The number of irqs available - */ -struct mic_intr_info { - u32 num_intr; -}; - -/** - * struct mic_irq_info - OS specific irq information - * - * @irq_usage_count: usage count array tracking the number of sources - * assigned for each irq. - */ -struct mic_irq_info { - int *irq_usage_count; -}; - -/** - * struct mic_device - MIC device information. - * - * @mmio: MMIO bar information. - */ -struct mic_device { - struct mic_mw mmio; -}; - -/** - * struct mic_driver - MIC card driver information. - * - * @name: Name for MIC driver. - * @dbg_dir: debugfs directory of this MIC device. - * @dev: The device backing this MIC. - * @dp: The pointer to the virtio device page. - * @mdev: MIC device information for the host. - * @hotplug_work: Hot plug work for adding/removing virtio devices. - * @irq_info: The OS specific irq information - * @intr_info: H/W specific interrupt information. - * @dma_mbdev: dma device on the MIC virtual bus. - * @dma_ch - Array of DMA channels - * @num_dma_ch - Number of DMA channels available - * @scdev: SCIF device on the SCIF virtual bus. - * @vpdev: Virtio over PCIe device on the VOP virtual bus. - */ -struct mic_driver { - char name[20]; - struct dentry *dbg_dir; - struct device *dev; - void __iomem *dp; - struct mic_device mdev; - struct work_struct hotplug_work; - struct mic_irq_info irq_info; - struct mic_intr_info intr_info; - struct mbus_device *dma_mbdev; - struct dma_chan *dma_ch[MIC_MAX_DMA_CHAN]; - int num_dma_ch; - struct scif_hw_dev *scdev; - struct vop_device *vpdev; -}; - -/** - * struct mic_irq - opaque pointer used as cookie - */ -struct mic_irq; - -/** - * mic_mmio_read - read from an MMIO register. - * @mw: MMIO register base virtual address. - * @offset: register offset. - * - * RETURNS: register value. - */ -static inline u32 mic_mmio_read(struct mic_mw *mw, u32 offset) -{ - return ioread32(mw->va + offset); -} - -/** - * mic_mmio_write - write to an MMIO register. - * @mw: MMIO register base virtual address. - * @val: the data value to put into the register - * @offset: register offset. - * - * RETURNS: none. 
- */ -static inline void -mic_mmio_write(struct mic_mw *mw, u32 val, u32 offset) -{ - iowrite32(val, mw->va + offset); -} - -int mic_driver_init(struct mic_driver *mdrv); -void mic_driver_uninit(struct mic_driver *mdrv); -int mic_next_card_db(void); -struct mic_irq * -mic_request_card_irq(irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int db); -void mic_free_card_irq(struct mic_irq *cookie, void *data); -u32 mic_read_spad(struct mic_device *mdev, unsigned int idx); -void mic_send_intr(struct mic_device *mdev, int doorbell); -void mic_send_p2p_intr(int doorbell, struct mic_mw *mw); -int mic_db_to_irq(struct mic_driver *mdrv, int db); -u32 mic_ack_interrupt(struct mic_device *mdev); -void mic_hw_intr_init(struct mic_driver *mdrv); -void __iomem * -mic_card_map(struct mic_device *mdev, dma_addr_t addr, size_t size); -void mic_card_unmap(struct mic_device *mdev, void __iomem *addr); -void __init mic_create_card_debug_dir(struct mic_driver *mdrv); -void mic_delete_card_debug_dir(struct mic_driver *mdrv); -void __init mic_init_card_debugfs(void); -void mic_exit_card_debugfs(void); -#endif diff --git a/drivers/misc/mic/card/mic_x100.c b/drivers/misc/mic/card/mic_x100.c deleted file mode 100644 index c8bff2916d3d..000000000000 --- a/drivers/misc/mic/card/mic_x100.c +++ /dev/null @@ -1,347 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Disclaimer: The codes contained in these modules may be specific to - * the Intel Software Development Platform codenamed: Knights Ferry, and - * the Intel product codenamed: Knights Corner, and are not backward - * compatible with other Intel products. Additionally, Intel will NOT - * support the codes or instruction set in future products. - * - * Intel MIC Card driver. - */ -#include -#include -#include - -#include "../common/mic_dev.h" -#include "mic_device.h" -#include "mic_x100.h" - -static const char mic_driver_name[] = "mic"; - -static struct mic_driver g_drv; - -/** - * mic_read_spad - read from the scratchpad register - * @mdev: pointer to mic_device instance - * @idx: index to scratchpad register, 0 based - * - * This function allows reading of the 32bit scratchpad register. - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -u32 mic_read_spad(struct mic_device *mdev, unsigned int idx) -{ - return mic_mmio_read(&mdev->mmio, - MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_SPAD0 + idx * 4); -} - -/** - * __mic_send_intr - Send interrupt to Host. - * @mdev: pointer to mic_device instance - * @doorbell: Doorbell number. - */ -void mic_send_intr(struct mic_device *mdev, int doorbell) -{ - struct mic_mw *mw = &mdev->mmio; - - if (doorbell > MIC_X100_MAX_DOORBELL_IDX) - return; - /* Ensure that the interrupt is ordered w.r.t previous stores. */ - wmb(); - mic_mmio_write(mw, MIC_X100_SBOX_SDBIC0_DBREQ_BIT, - MIC_X100_SBOX_BASE_ADDRESS + - (MIC_X100_SBOX_SDBIC0 + (4 * doorbell))); -} - -/* - * mic_x100_send_sbox_intr - Send an MIC_X100_SBOX interrupt to MIC. - */ -static void mic_x100_send_sbox_intr(struct mic_mw *mw, int doorbell) -{ - u64 apic_icr_offset = MIC_X100_SBOX_APICICR0 + doorbell * 8; - u32 apicicr_low = mic_mmio_read(mw, MIC_X100_SBOX_BASE_ADDRESS + - apic_icr_offset); - - /* for MIC we need to make sure we "hit" the send_icr bit (13) */ - apicicr_low = (apicicr_low | (1 << 13)); - /* - * Ensure that the interrupt is ordered w.r.t. previous stores - * to main memory. 
Fence instructions are not implemented in X100 - * since execution is in order but a compiler barrier is still - * required. - */ - wmb(); - mic_mmio_write(mw, apicicr_low, - MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset); -} - -static void mic_x100_send_rdmasr_intr(struct mic_mw *mw, int doorbell) -{ - int rdmasr_offset = MIC_X100_SBOX_RDMASR0 + (doorbell << 2); - /* - * Ensure that the interrupt is ordered w.r.t. previous stores - * to main memory. Fence instructions are not implemented in X100 - * since execution is in order but a compiler barrier is still - * required. - */ - wmb(); - mic_mmio_write(mw, 0, MIC_X100_SBOX_BASE_ADDRESS + rdmasr_offset); -} - -/** - * mic_ack_interrupt - Device specific interrupt handling. - * @mdev: pointer to mic_device instance - * - * Returns: bitmask of doorbell events triggered. - */ -u32 mic_ack_interrupt(struct mic_device *mdev) -{ - return 0; -} - -static inline int mic_get_sbox_irq(int db) -{ - return MIC_X100_IRQ_BASE + db; -} - -static inline int mic_get_rdmasr_irq(int index) -{ - return MIC_X100_RDMASR_IRQ_BASE + index; -} - -void mic_send_p2p_intr(int db, struct mic_mw *mw) -{ - int rdmasr_index; - - if (db < MIC_X100_NUM_SBOX_IRQ) { - mic_x100_send_sbox_intr(mw, db); - } else { - rdmasr_index = db - MIC_X100_NUM_SBOX_IRQ; - mic_x100_send_rdmasr_intr(mw, rdmasr_index); - } -} - -/** - * mic_hw_intr_init - Initialize h/w specific interrupt - * information. - * @mdrv: pointer to mic_driver - */ -void mic_hw_intr_init(struct mic_driver *mdrv) -{ - mdrv->intr_info.num_intr = MIC_X100_NUM_SBOX_IRQ + - MIC_X100_NUM_RDMASR_IRQ; -} - -/** - * mic_db_to_irq - Retrieve irq number corresponding to a doorbell. - * @mdrv: pointer to mic_driver - * @db: The doorbell obtained for which the irq is needed. Doorbell - * may correspond to an sbox doorbell or an rdmasr index. - * - * Returns the irq corresponding to the doorbell. - */ -int mic_db_to_irq(struct mic_driver *mdrv, int db) -{ - int rdmasr_index; - - /* - * The total number of doorbell interrupts on the card are 16. Indices - * 0-8 falls in the SBOX category and 8-15 fall in the RDMASR category. - */ - if (db < MIC_X100_NUM_SBOX_IRQ) { - return mic_get_sbox_irq(db); - } else { - rdmasr_index = db - MIC_X100_NUM_SBOX_IRQ; - return mic_get_rdmasr_irq(rdmasr_index); - } -} - -/* - * mic_card_map - Allocate virtual address for a remote memory region. - * @mdev: pointer to mic_device instance. - * @addr: Remote DMA address. - * @size: Size of the region. - * - * Returns: Virtual address backing the remote memory region. - */ -void __iomem * -mic_card_map(struct mic_device *mdev, dma_addr_t addr, size_t size) -{ - return ioremap(addr, size); -} - -/* - * mic_card_unmap - Unmap the virtual address for a remote memory region. - * @mdev: pointer to mic_device instance. - * @addr: Virtual address for remote memory region. - * - * Returns: None. 
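
Concretely, with the values defined in mic_x100.h further down in this patch (MIC_X100_NUM_SBOX_IRQ = 8, MIC_X100_IRQ_BASE = 26, MIC_X100_RDMASR_IRQ_BASE = 17), mic_db_to_irq() above maps doorbell 3 to irq 26 + 3 = 29 via the SBOX path, while doorbell 10 becomes rdmasr index 10 - 8 = 2 and hence irq 17 + 2 = 19; per the code, doorbells 0-7 take the SBOX path and 8-15 the RDMASR path.
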
- */ -void mic_card_unmap(struct mic_device *mdev, void __iomem *addr) -{ - iounmap(addr); -} - -static inline struct mic_driver *mbdev_to_mdrv(struct mbus_device *mbdev) -{ - return dev_get_drvdata(mbdev->dev.parent); -} - -static struct mic_irq * -_mic_request_threaded_irq(struct mbus_device *mbdev, - irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int intr_src) -{ - int rc = 0; - unsigned int irq = intr_src; - unsigned long cookie = irq; - - rc = request_threaded_irq(irq, handler, thread_fn, 0, name, data); - if (rc) { - dev_err(mbdev_to_mdrv(mbdev)->dev, - "request_threaded_irq failed rc = %d\n", rc); - return ERR_PTR(rc); - } - return (struct mic_irq *)cookie; -} - -static void _mic_free_irq(struct mbus_device *mbdev, - struct mic_irq *cookie, void *data) -{ - unsigned long irq = (unsigned long)cookie; - free_irq(irq, data); -} - -static void _mic_ack_interrupt(struct mbus_device *mbdev, int num) -{ - mic_ack_interrupt(&mbdev_to_mdrv(mbdev)->mdev); -} - -static struct mbus_hw_ops mbus_hw_ops = { - .request_threaded_irq = _mic_request_threaded_irq, - .free_irq = _mic_free_irq, - .ack_interrupt = _mic_ack_interrupt, -}; - -static int __init mic_probe(struct platform_device *pdev) -{ - struct mic_driver *mdrv = &g_drv; - struct mic_device *mdev = &mdrv->mdev; - int rc = 0; - - mdrv->dev = &pdev->dev; - snprintf(mdrv->name, sizeof(mic_driver_name), mic_driver_name); - - /* FIXME: use dma_set_mask_and_coherent() and check result */ - dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); - - mdev->mmio.pa = MIC_X100_MMIO_BASE; - mdev->mmio.len = MIC_X100_MMIO_LEN; - mdev->mmio.va = devm_ioremap(&pdev->dev, MIC_X100_MMIO_BASE, - MIC_X100_MMIO_LEN); - if (!mdev->mmio.va) { - dev_err(&pdev->dev, "Cannot remap MMIO BAR\n"); - rc = -EIO; - goto done; - } - mic_hw_intr_init(mdrv); - platform_set_drvdata(pdev, mdrv); - mdrv->dma_mbdev = mbus_register_device(mdrv->dev, MBUS_DEV_DMA_MIC, - NULL, &mbus_hw_ops, 0, - mdrv->mdev.mmio.va); - if (IS_ERR(mdrv->dma_mbdev)) { - rc = PTR_ERR(mdrv->dma_mbdev); - dev_err(&pdev->dev, "mbus_add_device failed rc %d\n", rc); - goto done; - } - rc = mic_driver_init(mdrv); - if (rc) { - dev_err(&pdev->dev, "mic_driver_init failed rc %d\n", rc); - goto remove_dma; - } -done: - return rc; -remove_dma: - mbus_unregister_device(mdrv->dma_mbdev); - return rc; -} - -static int mic_remove(struct platform_device *pdev) -{ - struct mic_driver *mdrv = &g_drv; - - mic_driver_uninit(mdrv); - mbus_unregister_device(mdrv->dma_mbdev); - return 0; -} - -static void mic_platform_shutdown(struct platform_device *pdev) -{ - mic_remove(pdev); -} - -static struct platform_driver __refdata mic_platform_driver = { - .probe = mic_probe, - .remove = mic_remove, - .shutdown = mic_platform_shutdown, - .driver = { - .name = mic_driver_name, - }, -}; - -static struct platform_device *mic_platform_dev; - -static int __init mic_init(void) -{ - int ret; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!(c->x86 == 11 && c->x86_model == 1)) { - ret = -ENODEV; - pr_err("%s not running on X100 ret %d\n", __func__, ret); - goto done; - } - - request_module("mic_x100_dma"); - mic_init_card_debugfs(); - - mic_platform_dev = platform_device_register_simple(mic_driver_name, - 0, NULL, 0); - ret = PTR_ERR_OR_ZERO(mic_platform_dev); - if (ret) { - pr_err("platform_device_register_full ret %d\n", ret); - goto cleanup_debugfs; - } - ret = platform_driver_register(&mic_platform_driver); - if (ret) { - pr_err("platform_driver_register ret %d\n", ret); - goto device_unregister; 
- } - return ret; - -device_unregister: - platform_device_unregister(mic_platform_dev); -cleanup_debugfs: - mic_exit_card_debugfs(); -done: - return ret; -} - -static void __exit mic_exit(void) -{ - platform_driver_unregister(&mic_platform_driver); - platform_device_unregister(mic_platform_dev); - mic_exit_card_debugfs(); -} - -module_init(mic_init); -module_exit(mic_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC X100 Card driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/card/mic_x100.h b/drivers/misc/mic/card/mic_x100.h deleted file mode 100644 index 46644dde0c07..000000000000 --- a/drivers/misc/mic/card/mic_x100.h +++ /dev/null @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Disclaimer: The codes contained in these modules may be specific to - * the Intel Software Development Platform codenamed: Knights Ferry, and - * the Intel product codenamed: Knights Corner, and are not backward - * compatible with other Intel products. Additionally, Intel will NOT - * support the codes or instruction set in future products. - * - * Intel MIC Card driver. - */ -#ifndef _MIC_X100_CARD_H_ -#define _MIC_X100_CARD_H_ - -#define MIC_X100_MMIO_BASE 0x08007C0000ULL -#define MIC_X100_MMIO_LEN 0x00020000ULL -#define MIC_X100_SBOX_BASE_ADDRESS 0x00010000ULL - -#define MIC_X100_SBOX_SPAD0 0x0000AB20 -#define MIC_X100_SBOX_SDBIC0 0x0000CC90 -#define MIC_X100_SBOX_SDBIC0_DBREQ_BIT 0x80000000 -#define MIC_X100_SBOX_RDMASR0 0x0000B180 -#define MIC_X100_SBOX_APICICR0 0x0000A9D0 - -#define MIC_X100_MAX_DOORBELL_IDX 8 - -#define MIC_X100_NUM_SBOX_IRQ 8 -#define MIC_X100_NUM_RDMASR_IRQ 8 -#define MIC_X100_SBOX_IRQ_BASE 0 -#define MIC_X100_RDMASR_IRQ_BASE 17 - -#define MIC_X100_IRQ_BASE 26 - -#endif diff --git a/drivers/misc/mic/common/mic_dev.h b/drivers/misc/mic/common/mic_dev.h deleted file mode 100644 index f94f08df0260..000000000000 --- a/drivers/misc/mic/common/mic_dev.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC driver. - */ -#ifndef __MIC_DEV_H__ -#define __MIC_DEV_H__ - -/* The maximum number of MIC devices supported in a single host system. */ -#define MIC_MAX_NUM_DEVS 128 - -/** - * enum mic_hw_family - The hardware family to which a device belongs. - */ -enum mic_hw_family { - MIC_FAMILY_X100 = 0, - MIC_FAMILY_X200, - MIC_FAMILY_UNKNOWN, - MIC_FAMILY_LAST -}; - -/** - * struct mic_mw - MIC memory window - * - * @pa: Base physical address. - * @va: Base ioremap'd virtual address. - * @len: Size of the memory window. - */ -struct mic_mw { - phys_addr_t pa; - void __iomem *va; - resource_size_t len; -}; - -/* - * Scratch pad register offsets used by the host to communicate - * device page DMA address to the card. - */ -#define MIC_DPLO_SPAD 14 -#define MIC_DPHI_SPAD 15 - -/* - * These values are supposed to be in the config_change field of the - * device page when the host sends a config change interrupt to the card. 
- */ -#define MIC_VIRTIO_PARAM_DEV_REMOVE 0x1 -#define MIC_VIRTIO_PARAM_CONFIG_CHANGED 0x2 - -/* Maximum number of DMA channels */ -#define MIC_MAX_DMA_CHAN 4 - -#endif diff --git a/drivers/misc/mic/cosm/Makefile b/drivers/misc/mic/cosm/Makefile deleted file mode 100644 index 97d74cb12030..000000000000 --- a/drivers/misc/mic/cosm/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile - Intel MIC Coprocessor State Management (COSM) Driver -# Copyright(c) 2015, Intel Corporation. -# -obj-$(CONFIG_MIC_COSM) += mic_cosm.o - -mic_cosm-objs := cosm_main.o -mic_cosm-objs += cosm_debugfs.o -mic_cosm-objs += cosm_sysfs.o -mic_cosm-objs += cosm_scif_server.o diff --git a/drivers/misc/mic/cosm/cosm_debugfs.c b/drivers/misc/mic/cosm/cosm_debugfs.c deleted file mode 100644 index cb55653cf1f9..000000000000 --- a/drivers/misc/mic/cosm/cosm_debugfs.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC Coprocessor State Management (COSM) Driver - */ - -#include -#include -#include -#include "cosm_main.h" - -/* Debugfs parent dir */ -static struct dentry *cosm_dbg; - -/* - * log_buf_show - Display MIC kernel log buffer - * - * log_buf addr/len is read from System.map by user space - * and populated in sysfs entries. - */ -static int log_buf_show(struct seq_file *s, void *unused) -{ - void __iomem *log_buf_va; - int __iomem *log_buf_len_va; - struct cosm_device *cdev = s->private; - void *kva; - int size; - u64 aper_offset; - - if (!cdev || !cdev->log_buf_addr || !cdev->log_buf_len) - goto done; - - mutex_lock(&cdev->cosm_mutex); - switch (cdev->state) { - case MIC_BOOTING: - case MIC_ONLINE: - case MIC_SHUTTING_DOWN: - break; - default: - goto unlock; - } - - /* - * Card kernel will never be relocated and any kernel text/data mapping - * can be translated to phys address by subtracting __START_KERNEL_map. 
- */ - aper_offset = (u64)cdev->log_buf_len - __START_KERNEL_map; - log_buf_len_va = cdev->hw_ops->aper(cdev)->va + aper_offset; - aper_offset = (u64)cdev->log_buf_addr - __START_KERNEL_map; - log_buf_va = cdev->hw_ops->aper(cdev)->va + aper_offset; - - size = ioread32(log_buf_len_va); - kva = kmalloc(size, GFP_KERNEL); - if (!kva) - goto unlock; - - memcpy_fromio(kva, log_buf_va, size); - seq_write(s, kva, size); - kfree(kva); -unlock: - mutex_unlock(&cdev->cosm_mutex); -done: - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(log_buf); - -/* - * force_reset_show - Force MIC reset - * - * Invokes the force_reset COSM bus op instead of the standard reset - * op in case a force reset of the MIC device is required - */ -static int force_reset_show(struct seq_file *s, void *pos) -{ - struct cosm_device *cdev = s->private; - - cosm_stop(cdev, true); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(force_reset); - -void cosm_create_debug_dir(struct cosm_device *cdev) -{ - char name[16]; - - if (!cosm_dbg) - return; - - scnprintf(name, sizeof(name), "mic%d", cdev->index); - cdev->dbg_dir = debugfs_create_dir(name, cosm_dbg); - - debugfs_create_file("log_buf", 0444, cdev->dbg_dir, cdev, - &log_buf_fops); - debugfs_create_file("force_reset", 0444, cdev->dbg_dir, cdev, - &force_reset_fops); -} - -void cosm_delete_debug_dir(struct cosm_device *cdev) -{ - debugfs_remove_recursive(cdev->dbg_dir); -} - -void cosm_init_debugfs(void) -{ - cosm_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); -} - -void cosm_exit_debugfs(void) -{ - debugfs_remove(cosm_dbg); -} diff --git a/drivers/misc/mic/cosm/cosm_main.c b/drivers/misc/mic/cosm/cosm_main.c deleted file mode 100644 index ebb0eac43754..000000000000 --- a/drivers/misc/mic/cosm/cosm_main.c +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC Coprocessor State Management (COSM) Driver - */ - -#include -#include -#include -#include -#include -#include "cosm_main.h" - -static const char cosm_driver_name[] = "mic"; - -/* COSM ID allocator */ -static struct ida g_cosm_ida; -/* Class of MIC devices for sysfs accessibility. */ -static struct class *g_cosm_class; -/* Number of MIC devices */ -static atomic_t g_num_dev; - -/** - * cosm_hw_reset - Issue a HW reset for the MIC device - * @cdev: pointer to cosm_device instance - * @force: force a MIC to reset even if it is already reset and ready - */ -static void cosm_hw_reset(struct cosm_device *cdev, bool force) -{ - int i; - -#define MIC_RESET_TO (45) - if (force && cdev->hw_ops->force_reset) - cdev->hw_ops->force_reset(cdev); - else - cdev->hw_ops->reset(cdev); - - for (i = 0; i < MIC_RESET_TO; i++) { - if (cdev->hw_ops->ready(cdev)) { - cosm_set_state(cdev, MIC_READY); - return; - } - /* - * Resets typically take 10s of seconds to complete. - * Since an MMIO read is required to check if the - * firmware is ready or not, a 1 second delay works nicely. - */ - msleep(1000); - } - cosm_set_state(cdev, MIC_RESET_FAILED); -} - -/** - * cosm_start - Start the MIC - * @cdev: pointer to cosm_device instance - * - * This function prepares an MIC for boot and initiates boot. - * RETURNS: An appropriate -ERRNO error value on error, or 0 for success. 
- */ -int cosm_start(struct cosm_device *cdev) -{ - const struct cred *orig_cred; - struct cred *override_cred; - int rc; - - mutex_lock(&cdev->cosm_mutex); - if (!cdev->bootmode) { - dev_err(&cdev->dev, "%s %d bootmode not set\n", - __func__, __LINE__); - rc = -EINVAL; - goto unlock_ret; - } -retry: - if (cdev->state != MIC_READY) { - dev_err(&cdev->dev, "%s %d MIC state not READY\n", - __func__, __LINE__); - rc = -EINVAL; - goto unlock_ret; - } - if (!cdev->hw_ops->ready(cdev)) { - cosm_hw_reset(cdev, false); - /* - * The state will either be MIC_READY if the reset succeeded - * or MIC_RESET_FAILED if the firmware reset failed. - */ - goto retry; - } - - /* - * Set credentials to root to allow non-root user to download initramsfs - * with 600 permissions - */ - override_cred = prepare_creds(); - if (!override_cred) { - dev_err(&cdev->dev, "%s %d prepare_creds failed\n", - __func__, __LINE__); - rc = -ENOMEM; - goto unlock_ret; - } - override_cred->fsuid = GLOBAL_ROOT_UID; - orig_cred = override_creds(override_cred); - - rc = cdev->hw_ops->start(cdev, cdev->index); - - revert_creds(orig_cred); - put_cred(override_cred); - if (rc) - goto unlock_ret; - - /* - * If linux is being booted, card is treated 'online' only - * when the scif interface in the card is up. If anything else - * is booted, we set card to 'online' immediately. - */ - if (!strcmp(cdev->bootmode, "linux")) - cosm_set_state(cdev, MIC_BOOTING); - else - cosm_set_state(cdev, MIC_ONLINE); -unlock_ret: - mutex_unlock(&cdev->cosm_mutex); - if (rc) - dev_err(&cdev->dev, "cosm_start failed rc %d\n", rc); - return rc; -} - -/** - * cosm_stop - Prepare the MIC for reset and trigger reset - * @cdev: pointer to cosm_device instance - * @force: force a MIC to reset even if it is already reset and ready. - * - * RETURNS: None - */ -void cosm_stop(struct cosm_device *cdev, bool force) -{ - mutex_lock(&cdev->cosm_mutex); - if (cdev->state != MIC_READY || force) { - /* - * Don't call hw_ops if they have been called previously. - * stop(..) calls device_unregister and will crash the system if - * called multiple times. - */ - u8 state = cdev->state == MIC_RESETTING ? - cdev->prev_state : cdev->state; - bool call_hw_ops = state != MIC_RESET_FAILED && - state != MIC_READY; - - if (cdev->state != MIC_RESETTING) - cosm_set_state(cdev, MIC_RESETTING); - cdev->heartbeat_watchdog_enable = false; - if (call_hw_ops) - cdev->hw_ops->stop(cdev, force); - cosm_hw_reset(cdev, force); - cosm_set_shutdown_status(cdev, MIC_NOP); - if (call_hw_ops && cdev->hw_ops->post_reset) - cdev->hw_ops->post_reset(cdev, cdev->state); - } - mutex_unlock(&cdev->cosm_mutex); - flush_work(&cdev->scif_work); -} - -/** - * cosm_reset_trigger_work - Trigger MIC reset - * @work: The work structure - * - * This work is scheduled whenever the host wants to reset the MIC. - */ -static void cosm_reset_trigger_work(struct work_struct *work) -{ - struct cosm_device *cdev = container_of(work, struct cosm_device, - reset_trigger_work); - cosm_stop(cdev, false); -} - -/** - * cosm_reset - Schedule MIC reset - * @cdev: pointer to cosm_device instance - * - * RETURNS: An -EINVAL if the card is already READY or 0 for success. 
- */ -int cosm_reset(struct cosm_device *cdev) -{ - int rc = 0; - - mutex_lock(&cdev->cosm_mutex); - if (cdev->state != MIC_READY) { - if (cdev->state != MIC_RESETTING) { - cdev->prev_state = cdev->state; - cosm_set_state(cdev, MIC_RESETTING); - schedule_work(&cdev->reset_trigger_work); - } - } else { - dev_err(&cdev->dev, "%s %d MIC is READY\n", __func__, __LINE__); - rc = -EINVAL; - } - mutex_unlock(&cdev->cosm_mutex); - return rc; -} - -/** - * cosm_shutdown - Initiate MIC shutdown. - * @cdev: pointer to cosm_device instance - * - * RETURNS: None - */ -int cosm_shutdown(struct cosm_device *cdev) -{ - struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN }; - int rc = 0; - - mutex_lock(&cdev->cosm_mutex); - if (cdev->state != MIC_ONLINE) { - rc = -EINVAL; - dev_err(&cdev->dev, "%s %d skipping shutdown in state: %s\n", - __func__, __LINE__, cosm_state_string[cdev->state]); - goto err; - } - - if (!cdev->epd) { - rc = -ENOTCONN; - dev_err(&cdev->dev, "%s %d scif endpoint not connected rc %d\n", - __func__, __LINE__, rc); - goto err; - } - - rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); - if (rc < 0) { - dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n", - __func__, __LINE__, rc); - goto err; - } - cdev->heartbeat_watchdog_enable = false; - cosm_set_state(cdev, MIC_SHUTTING_DOWN); - rc = 0; -err: - mutex_unlock(&cdev->cosm_mutex); - return rc; -} - -static int cosm_driver_probe(struct cosm_device *cdev) -{ - int rc; - - /* Initialize SCIF server at first probe */ - if (atomic_add_return(1, &g_num_dev) == 1) { - rc = cosm_scif_init(); - if (rc) - goto scif_exit; - } - mutex_init(&cdev->cosm_mutex); - INIT_WORK(&cdev->reset_trigger_work, cosm_reset_trigger_work); - INIT_WORK(&cdev->scif_work, cosm_scif_work); - cdev->sysfs_heartbeat_enable = true; - cosm_sysfs_init(cdev); - cdev->sdev = device_create_with_groups(g_cosm_class, cdev->dev.parent, - MKDEV(0, cdev->index), cdev, cdev->attr_group, - "mic%d", cdev->index); - if (IS_ERR(cdev->sdev)) { - rc = PTR_ERR(cdev->sdev); - dev_err(&cdev->dev, "device_create_with_groups failed rc %d\n", - rc); - goto scif_exit; - } - - cdev->state_sysfs = sysfs_get_dirent(cdev->sdev->kobj.sd, - "state"); - if (!cdev->state_sysfs) { - rc = -ENODEV; - dev_err(&cdev->dev, "sysfs_get_dirent failed rc %d\n", rc); - goto destroy_device; - } - cosm_create_debug_dir(cdev); - return 0; -destroy_device: - device_destroy(g_cosm_class, MKDEV(0, cdev->index)); -scif_exit: - if (atomic_dec_and_test(&g_num_dev)) - cosm_scif_exit(); - return rc; -} - -static void cosm_driver_remove(struct cosm_device *cdev) -{ - cosm_delete_debug_dir(cdev); - sysfs_put(cdev->state_sysfs); - device_destroy(g_cosm_class, MKDEV(0, cdev->index)); - flush_work(&cdev->reset_trigger_work); - cosm_stop(cdev, false); - if (atomic_dec_and_test(&g_num_dev)) - cosm_scif_exit(); - - /* These sysfs entries might have allocated */ - kfree(cdev->cmdline); - kfree(cdev->firmware); - kfree(cdev->ramdisk); - kfree(cdev->bootmode); -} - -static int cosm_suspend(struct device *dev) -{ - struct cosm_device *cdev = dev_to_cosm(dev); - - mutex_lock(&cdev->cosm_mutex); - switch (cdev->state) { - /** - * Suspend/freeze hooks in userspace have already shutdown the card. - * Card should be 'ready' in most cases. It is however possible that - * some userspace application initiated a boot. In those cases, we - * simply reset the card. 
- */ - case MIC_ONLINE: - case MIC_BOOTING: - case MIC_SHUTTING_DOWN: - mutex_unlock(&cdev->cosm_mutex); - cosm_stop(cdev, false); - break; - default: - mutex_unlock(&cdev->cosm_mutex); - break; - } - return 0; -} - -static const struct dev_pm_ops cosm_pm_ops = { - .suspend = cosm_suspend, - .freeze = cosm_suspend -}; - -static struct cosm_driver cosm_driver = { - .driver = { - .name = KBUILD_MODNAME, - .owner = THIS_MODULE, - .pm = &cosm_pm_ops, - }, - .probe = cosm_driver_probe, - .remove = cosm_driver_remove -}; - -static int __init cosm_init(void) -{ - int ret; - - cosm_init_debugfs(); - - g_cosm_class = class_create(THIS_MODULE, cosm_driver_name); - if (IS_ERR(g_cosm_class)) { - ret = PTR_ERR(g_cosm_class); - pr_err("class_create failed ret %d\n", ret); - goto cleanup_debugfs; - } - - ida_init(&g_cosm_ida); - ret = cosm_register_driver(&cosm_driver); - if (ret) { - pr_err("cosm_register_driver failed ret %d\n", ret); - goto ida_destroy; - } - return 0; -ida_destroy: - ida_destroy(&g_cosm_ida); - class_destroy(g_cosm_class); -cleanup_debugfs: - cosm_exit_debugfs(); - return ret; -} - -static void __exit cosm_exit(void) -{ - cosm_unregister_driver(&cosm_driver); - ida_destroy(&g_cosm_ida); - class_destroy(g_cosm_class); - cosm_exit_debugfs(); -} - -module_init(cosm_init); -module_exit(cosm_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC Coprocessor State Management (COSM) Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/cosm/cosm_main.h b/drivers/misc/mic/cosm/cosm_main.h deleted file mode 100644 index 5188ad245814..000000000000 --- a/drivers/misc/mic/cosm/cosm_main.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. 
- * - * Intel MIC Coprocessor State Management (COSM) Driver - */ -#ifndef _COSM_COSM_H_ -#define _COSM_COSM_H_ - -#include -#include "../bus/cosm_bus.h" - -#define COSM_HEARTBEAT_SEND_SEC 30 -#define SCIF_COSM_LISTEN_PORT 201 - -/** - * enum COSM msg id's - * @COSM_MSG_SHUTDOWN: host->card trigger shutdown - * @COSM_MSG_SYNC_TIME: host->card send host time to card to sync time - * @COSM_MSG_HEARTBEAT: card->host heartbeat - * @COSM_MSG_SHUTDOWN_STATUS: card->host with shutdown status as payload - */ -enum cosm_msg_id { - COSM_MSG_SHUTDOWN, - COSM_MSG_SYNC_TIME, - COSM_MSG_HEARTBEAT, - COSM_MSG_SHUTDOWN_STATUS, -}; - -struct cosm_msg { - u64 id; - union { - u64 shutdown_status; - struct { - u64 tv_sec; - u64 tv_nsec; - } timespec; - }; -}; - -extern const char * const cosm_state_string[]; -extern const char * const cosm_shutdown_status_string[]; - -void cosm_sysfs_init(struct cosm_device *cdev); -int cosm_start(struct cosm_device *cdev); -void cosm_stop(struct cosm_device *cdev, bool force); -int cosm_reset(struct cosm_device *cdev); -int cosm_shutdown(struct cosm_device *cdev); -void cosm_set_state(struct cosm_device *cdev, u8 state); -void cosm_set_shutdown_status(struct cosm_device *cdev, u8 status); -void cosm_init_debugfs(void); -void cosm_exit_debugfs(void); -void cosm_create_debug_dir(struct cosm_device *cdev); -void cosm_delete_debug_dir(struct cosm_device *cdev); -int cosm_scif_init(void); -void cosm_scif_exit(void); -void cosm_scif_work(struct work_struct *work); - -#endif diff --git a/drivers/misc/mic/cosm/cosm_scif_server.c b/drivers/misc/mic/cosm/cosm_scif_server.c deleted file mode 100644 index 7baec9fd8756..000000000000 --- a/drivers/misc/mic/cosm/cosm_scif_server.c +++ /dev/null @@ -1,399 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC Coprocessor State Management (COSM) Driver - */ -#include -#include - -#include "cosm_main.h" - -/* - * The COSM driver uses SCIF to communicate between the management node and the - * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b) - * receive a shutdown status back from the card upon completion of shutdown and - * (c) receive periodic heartbeat messages from the card used to deduce if the - * card has crashed. - * - * A COSM server consisting of a SCIF listening endpoint waits for incoming - * connections from the card. Upon acceptance of the connection, a separate - * work-item is scheduled to handle SCIF message processing for that card. The - * life-time of this work-item is therefore the time from which the connection - * from a card is accepted to the time at which the connection is closed. A new - * work-item starts each time the card boots and is alive till the card (a) - * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is - * unloaded. - * - * From the point of view of COSM interactions with SCIF during card - * shutdown, reset and crash are as follows: - * - * Card shutdown - * ------------- - * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN - * message from the host. - * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting - * in scif_remove(..) getting called on the card - * 3. scif_remove -> scif_stop -> scif_handle_remove_node -> - * scif_peer_unregister_device -> device_unregister for the host peer device - * 4. During device_unregister remove(..) 
method of cosm_client is invoked which
- *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
- *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
- *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
- *    up the host COSM thread blocked in scif_poll(..) resulting in
- *    scif_poll(..) returning EPOLLHUP.
- * 5. On the card, scif_peer_release_dev is next called which results in an
- *    SCIF_EXIT message being sent to the host and after receiving the
- *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
- *    complete.
- * 6. As part of the SCIF_EXIT message processing on the host, host sends a
- *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
- *    starts a similar SCIF peer device teardown sequence on the host
- *    corresponding to the card being shut down.
- *
- * Card reset
- * ----------
- * The case of interest here is when the card has not been previously shut down
- * since most of the steps below are skipped in that case:
-
- * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
- *    which unregisters the SCIF HW device resulting in scif_remove(..) being
- *    called on the host.
- * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
- *    SCIF_EXIT message being sent to the card.
- * 3. The card executes scif_stop() as part of SCIF_EXIT message
- *    processing. This results in the COSM endpoint on the card being closed and
- *    the SCIF host peer device on the card getting unregistered similar to
- *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
- *    host returns EPOLLHUP as a result.
- * 4. On the host, card peer device unregister and SCIF HW remove(..) also
- *    subsequently complete.
- *
- * Card crash
- * ----------
- * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
- * message from the card which would result in scif_poll(..) returning
- * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
- * message to itself resulting in the card SCIF peer device being unregistered,
- * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
- * scif_invalidate_ep call sequence which sets the endpoint state to
- * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP.
- */
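For orientation, the traffic described above reduces to a short sequence. This summary is reconstructed from the handlers in this file and from cosm_scif_client.c (removed later in this patch); the 30 s and 40 s figures come from COSM_HEARTBEAT_SEND_SEC and the timeout macros just below:

/*
 * host cosm_send_time()     --COSM_MSG_SYNC_TIME-------->  card cosm_set_time()   (at connect)
 * host cosm_shutdown()      --COSM_MSG_SHUTDOWN--------->  card cosm_client_recv(),
 *                                                          which calls orderly_poweroff(true)
 * card cosm_reboot_event()  --COSM_MSG_SHUTDOWN_STATUS-->  host cosm_scif_recv()
 * card cosm_scif_client()   --COSM_MSG_HEARTBEAT-------->  host scif_poll() wakeup (every 30 s)
 * host cosm_scif_work()     marks the card crashed after ~40 s without any message
 */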
-
-#define COSM_SCIF_BACKLOG 16
-#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
-#define COSM_HEARTBEAT_TIMEOUT_SEC \
-		(COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
-#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
-
-static struct task_struct *server_thread;
-static scif_epd_t listen_epd;
-
-/* Publish MIC card's shutdown status to user space MIC daemon */
-static void cosm_update_mic_status(struct cosm_device *cdev)
-{
-	if (cdev->shutdown_status_int != MIC_NOP) {
-		cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
-		cdev->shutdown_status_int = MIC_NOP;
-	}
-}
-
-/* Store MIC card's shutdown status internally when it is received */
-static void cosm_shutdown_status_int(struct cosm_device *cdev,
-				     enum mic_status shutdown_status)
-{
-	switch (shutdown_status) {
-	case MIC_HALTED:
-	case MIC_POWER_OFF:
-	case MIC_RESTART:
-	case MIC_CRASHED:
-		break;
-	default:
-		dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
-			__func__, __LINE__, shutdown_status);
-		return;
-	}
-	cdev->shutdown_status_int = shutdown_status;
-	cdev->heartbeat_watchdog_enable = false;
-
-	if (cdev->state != MIC_SHUTTING_DOWN)
-		cosm_set_state(cdev, MIC_SHUTTING_DOWN);
-}
-
-/* Non-blocking recv. Read and process all available messages */
-static void cosm_scif_recv(struct cosm_device *cdev)
-{
-	struct cosm_msg msg;
-	int rc;
-
-	while (1) {
-		rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
-		if (!rc) {
-			break;
-		} else if (rc < 0) {
-			dev_dbg(&cdev->dev, "%s: %d rc %d\n",
-				__func__, __LINE__, rc);
-			break;
-		}
-		dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
-			__func__, __LINE__, rc, msg.id);
-
-		switch (msg.id) {
-		case COSM_MSG_SHUTDOWN_STATUS:
-			cosm_shutdown_status_int(cdev, msg.shutdown_status);
-			break;
-		case COSM_MSG_HEARTBEAT:
-			/* Nothing to do, heartbeat only unblocks scif_poll */
-			break;
-		default:
-			dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
-				__func__, __LINE__, msg.id);
-			break;
-		}
-	}
-}
-
-/* Publish crashed status for this MIC card */
-static void cosm_set_crashed(struct cosm_device *cdev)
-{
-	dev_err(&cdev->dev, "node alive timeout\n");
-	cosm_shutdown_status_int(cdev, MIC_CRASHED);
-	cosm_update_mic_status(cdev);
-}
-
-/* Send host time to the MIC card to sync system time between host and MIC */
-static void cosm_send_time(struct cosm_device *cdev)
-{
-	struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
-	struct timespec64 ts;
-	int rc;
-
-	ktime_get_real_ts64(&ts);
-	msg.timespec.tv_sec = ts.tv_sec;
-	msg.timespec.tv_nsec = ts.tv_nsec;
-
-	rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
-	if (rc < 0)
-		dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
-			__func__, __LINE__, rc);
-}
-
-/*
- * Close this cosm_device's endpoint after its peer endpoint on the card has
- * been closed. In all cases except MIC card crash EPOLLHUP on the host is
- * triggered by the client's endpoint being closed.
- */
-static void cosm_scif_close(struct cosm_device *cdev)
-{
-	/*
-	 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
-	 * reboot notifier when shutdown is still not complete, we notify mpssd
-	 * to reset the card when SCIF endpoint is closed.
-	 */
-	cosm_update_mic_status(cdev);
-	scif_close(cdev->epd);
-	cdev->epd = NULL;
-	dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
-}
-
-/*
- * Set card state to ONLINE when a new SCIF connection from a MIC card is
- * received.
Normally the state is BOOTING when the connection comes in, but can - * be ONLINE if cosm_client driver on the card was unloaded and then reloaded. - */ -static int cosm_set_online(struct cosm_device *cdev) -{ - int rc = 0; - - if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) { - cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable; - cdev->epd = cdev->newepd; - if (cdev->state == MIC_BOOTING) - cosm_set_state(cdev, MIC_ONLINE); - cosm_send_time(cdev); - dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); - } else { - dev_warn(&cdev->dev, "%s %d not going online in state: %s\n", - __func__, __LINE__, cosm_state_string[cdev->state]); - rc = -EINVAL; - } - /* Drop reference acquired by bus_find_device in the server thread */ - put_device(&cdev->dev); - return rc; -} - -/* - * Work function for handling work for a SCIF connection from a particular MIC - * card. It first sets the card state to ONLINE and then calls scif_poll to - * block on activity such as incoming messages on the SCIF endpoint. When the - * endpoint is closed, the work function exits, completing its life cycle, from - * MIC card boot to card shutdown/reset/crash. - */ -void cosm_scif_work(struct work_struct *work) -{ - struct cosm_device *cdev = container_of(work, struct cosm_device, - scif_work); - struct scif_pollepd pollepd; - int rc; - - mutex_lock(&cdev->cosm_mutex); - if (cosm_set_online(cdev)) - goto exit; - - while (1) { - pollepd.epd = cdev->epd; - pollepd.events = EPOLLIN; - - /* Drop the mutex before blocking in scif_poll(..) */ - mutex_unlock(&cdev->cosm_mutex); - /* poll(..) with timeout on our endpoint */ - rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC); - mutex_lock(&cdev->cosm_mutex); - if (rc < 0) { - dev_err(&cdev->dev, "%s %d scif_poll rc %d\n", - __func__, __LINE__, rc); - continue; - } - - /* There is a message from the card */ - if (pollepd.revents & EPOLLIN) - cosm_scif_recv(cdev); - - /* The peer endpoint is closed or this endpoint disconnected */ - if (pollepd.revents & EPOLLHUP) { - cosm_scif_close(cdev); - break; - } - - /* Did we timeout from poll? */ - if (!rc && cdev->heartbeat_watchdog_enable) - cosm_set_crashed(cdev); - } -exit: - dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__); - mutex_unlock(&cdev->cosm_mutex); -} - -/* - * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC - * cards, finds the correct cosm_device to associate that connection with and - * schedules individual work items for each MIC card. 
- */ -static int cosm_scif_server(void *unused) -{ - struct cosm_device *cdev; - scif_epd_t newepd; - struct scif_port_id port_id; - int rc; - - allow_signal(SIGKILL); - - while (!kthread_should_stop()) { - rc = scif_accept(listen_epd, &port_id, &newepd, - SCIF_ACCEPT_SYNC); - if (rc < 0) { - if (-ERESTARTSYS != rc) - pr_err("%s %d rc %d\n", __func__, __LINE__, rc); - continue; - } - - /* - * Associate the incoming connection with a particular - * cosm_device, COSM device ID == SCIF node ID - 1 - */ - cdev = cosm_find_cdev_by_id(port_id.node - 1); - if (!cdev) - continue; - cdev->newepd = newepd; - schedule_work(&cdev->scif_work); - } - - pr_debug("%s %d Server thread stopped\n", __func__, __LINE__); - return 0; -} - -static int cosm_scif_listen(void) -{ - int rc; - - listen_epd = scif_open(); - if (!listen_epd) { - pr_err("%s %d scif_open failed\n", __func__, __LINE__); - return -ENOMEM; - } - - rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT); - if (rc < 0) { - pr_err("%s %d scif_bind failed rc %d\n", - __func__, __LINE__, rc); - goto err; - } - - rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG); - if (rc < 0) { - pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc); - goto err; - } - pr_debug("%s %d listen_epd set up\n", __func__, __LINE__); - return 0; -err: - scif_close(listen_epd); - listen_epd = NULL; - return rc; -} - -static void cosm_scif_listen_exit(void) -{ - pr_debug("%s %d closing listen_epd\n", __func__, __LINE__); - if (listen_epd) { - scif_close(listen_epd); - listen_epd = NULL; - } -} - -/* - * Create a listening SCIF endpoint and a server kthread which accepts incoming - * SCIF connections from MIC cards - */ -int cosm_scif_init(void) -{ - int rc = cosm_scif_listen(); - - if (rc) { - pr_err("%s %d cosm_scif_listen rc %d\n", - __func__, __LINE__, rc); - goto err; - } - - server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server"); - if (IS_ERR(server_thread)) { - rc = PTR_ERR(server_thread); - pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc); - goto listen_exit; - } - return 0; -listen_exit: - cosm_scif_listen_exit(); -err: - return rc; -} - -/* Stop the running server thread and close the listening SCIF endpoint */ -void cosm_scif_exit(void) -{ - int rc; - - if (!IS_ERR_OR_NULL(server_thread)) { - rc = send_sig(SIGKILL, server_thread, 0); - if (rc) { - pr_err("%s %d send_sig rc %d\n", - __func__, __LINE__, rc); - return; - } - kthread_stop(server_thread); - } - - cosm_scif_listen_exit(); -} diff --git a/drivers/misc/mic/cosm/cosm_sysfs.c b/drivers/misc/mic/cosm/cosm_sysfs.c deleted file mode 100644 index e6dac967c1af..000000000000 --- a/drivers/misc/mic/cosm/cosm_sysfs.c +++ /dev/null @@ -1,449 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel MIC Coprocessor State Management (COSM) Driver - */ -#include -#include "cosm_main.h" - -/* - * A state-to-string lookup table, for exposing a human readable state - * via sysfs. Always keep in sync with enum cosm_states - */ -const char * const cosm_state_string[] = { - [MIC_READY] = "ready", - [MIC_BOOTING] = "booting", - [MIC_ONLINE] = "online", - [MIC_SHUTTING_DOWN] = "shutting_down", - [MIC_RESETTING] = "resetting", - [MIC_RESET_FAILED] = "reset_failed", -}; - -/* - * A shutdown-status-to-string lookup table, for exposing a human - * readable state via sysfs. 
Always keep in sync with enum cosm_shutdown_status - */ -const char * const cosm_shutdown_status_string[] = { - [MIC_NOP] = "nop", - [MIC_CRASHED] = "crashed", - [MIC_HALTED] = "halted", - [MIC_POWER_OFF] = "poweroff", - [MIC_RESTART] = "restart", -}; - -void cosm_set_shutdown_status(struct cosm_device *cdev, u8 shutdown_status) -{ - dev_dbg(&cdev->dev, "Shutdown Status %s -> %s\n", - cosm_shutdown_status_string[cdev->shutdown_status], - cosm_shutdown_status_string[shutdown_status]); - cdev->shutdown_status = shutdown_status; -} - -void cosm_set_state(struct cosm_device *cdev, u8 state) -{ - dev_dbg(&cdev->dev, "State %s -> %s\n", - cosm_state_string[cdev->state], - cosm_state_string[state]); - cdev->state = state; - sysfs_notify_dirent(cdev->state_sysfs); -} - -static ssize_t -family_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - return cdev->hw_ops->family(cdev, buf); -} -static DEVICE_ATTR_RO(family); - -static ssize_t -stepping_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - return cdev->hw_ops->stepping(cdev, buf); -} -static DEVICE_ATTR_RO(stepping); - -static ssize_t -state_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev || cdev->state >= MIC_LAST) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%s\n", - cosm_state_string[cdev->state]); -} - -static ssize_t -state_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - int rc; - - if (!cdev) - return -EINVAL; - - if (sysfs_streq(buf, "boot")) { - rc = cosm_start(cdev); - goto done; - } - if (sysfs_streq(buf, "reset")) { - rc = cosm_reset(cdev); - goto done; - } - - if (sysfs_streq(buf, "shutdown")) { - rc = cosm_shutdown(cdev); - goto done; - } - rc = -EINVAL; -done: - if (rc) - count = rc; - return count; -} -static DEVICE_ATTR_RW(state); - -static ssize_t shutdown_status_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev || cdev->shutdown_status >= MIC_STATUS_LAST) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%s\n", - cosm_shutdown_status_string[cdev->shutdown_status]); -} -static DEVICE_ATTR_RO(shutdown_status); - -static ssize_t -heartbeat_enable_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%d\n", cdev->sysfs_heartbeat_enable); -} - -static ssize_t -heartbeat_enable_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - int enable; - int ret; - - if (!cdev) - return -EINVAL; - - mutex_lock(&cdev->cosm_mutex); - ret = kstrtoint(buf, 10, &enable); - if (ret) - goto unlock; - - cdev->sysfs_heartbeat_enable = enable; - /* if state is not online, cdev->heartbeat_watchdog_enable is 0 */ - if (cdev->state == MIC_ONLINE) - cdev->heartbeat_watchdog_enable = enable; - ret = count; -unlock: - mutex_unlock(&cdev->cosm_mutex); - return ret; -} -static DEVICE_ATTR_RW(heartbeat_enable); - -static ssize_t -cmdline_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = 
dev_get_drvdata(dev); - char *cmdline; - - if (!cdev) - return -EINVAL; - - cmdline = cdev->cmdline; - - if (cmdline) - return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline); - return 0; -} - -static ssize_t -cmdline_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - mutex_lock(&cdev->cosm_mutex); - kfree(cdev->cmdline); - - cdev->cmdline = kmalloc(count + 1, GFP_KERNEL); - if (!cdev->cmdline) { - count = -ENOMEM; - goto unlock; - } - - strncpy(cdev->cmdline, buf, count); - - if (cdev->cmdline[count - 1] == '\n') - cdev->cmdline[count - 1] = '\0'; - else - cdev->cmdline[count] = '\0'; -unlock: - mutex_unlock(&cdev->cosm_mutex); - return count; -} -static DEVICE_ATTR_RW(cmdline); - -static ssize_t -firmware_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - char *firmware; - - if (!cdev) - return -EINVAL; - - firmware = cdev->firmware; - - if (firmware) - return scnprintf(buf, PAGE_SIZE, "%s\n", firmware); - return 0; -} - -static ssize_t -firmware_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - mutex_lock(&cdev->cosm_mutex); - kfree(cdev->firmware); - - cdev->firmware = kmalloc(count + 1, GFP_KERNEL); - if (!cdev->firmware) { - count = -ENOMEM; - goto unlock; - } - strncpy(cdev->firmware, buf, count); - - if (cdev->firmware[count - 1] == '\n') - cdev->firmware[count - 1] = '\0'; - else - cdev->firmware[count] = '\0'; -unlock: - mutex_unlock(&cdev->cosm_mutex); - return count; -} -static DEVICE_ATTR_RW(firmware); - -static ssize_t -ramdisk_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - char *ramdisk; - - if (!cdev) - return -EINVAL; - - ramdisk = cdev->ramdisk; - - if (ramdisk) - return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk); - return 0; -} - -static ssize_t -ramdisk_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - mutex_lock(&cdev->cosm_mutex); - kfree(cdev->ramdisk); - - cdev->ramdisk = kmalloc(count + 1, GFP_KERNEL); - if (!cdev->ramdisk) { - count = -ENOMEM; - goto unlock; - } - - strncpy(cdev->ramdisk, buf, count); - - if (cdev->ramdisk[count - 1] == '\n') - cdev->ramdisk[count - 1] = '\0'; - else - cdev->ramdisk[count] = '\0'; -unlock: - mutex_unlock(&cdev->cosm_mutex); - return count; -} -static DEVICE_ATTR_RW(ramdisk); - -static ssize_t -bootmode_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - char *bootmode; - - if (!cdev) - return -EINVAL; - - bootmode = cdev->bootmode; - - if (bootmode) - return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode); - return 0; -} - -static ssize_t -bootmode_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "flash")) - return -EINVAL; - - mutex_lock(&cdev->cosm_mutex); - kfree(cdev->bootmode); - - cdev->bootmode = kmalloc(count + 1, GFP_KERNEL); - if (!cdev->bootmode) { - count = -ENOMEM; - goto unlock; - } - - strncpy(cdev->bootmode, buf, count); - - if 
(cdev->bootmode[count - 1] == '\n') - cdev->bootmode[count - 1] = '\0'; - else - cdev->bootmode[count] = '\0'; -unlock: - mutex_unlock(&cdev->cosm_mutex); - return count; -} -static DEVICE_ATTR_RW(bootmode); - -static ssize_t -log_buf_addr_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_addr); -} - -static ssize_t -log_buf_addr_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - int ret; - unsigned long addr; - - if (!cdev) - return -EINVAL; - - ret = kstrtoul(buf, 16, &addr); - if (ret) - goto exit; - - cdev->log_buf_addr = (void *)addr; - ret = count; -exit: - return ret; -} -static DEVICE_ATTR_RW(log_buf_addr); - -static ssize_t -log_buf_len_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - - if (!cdev) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_len); -} - -static ssize_t -log_buf_len_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cosm_device *cdev = dev_get_drvdata(dev); - int ret; - unsigned long addr; - - if (!cdev) - return -EINVAL; - - ret = kstrtoul(buf, 16, &addr); - if (ret) - goto exit; - - cdev->log_buf_len = (int *)addr; - ret = count; -exit: - return ret; -} -static DEVICE_ATTR_RW(log_buf_len); - -static struct attribute *cosm_default_attrs[] = { - &dev_attr_family.attr, - &dev_attr_stepping.attr, - &dev_attr_state.attr, - &dev_attr_shutdown_status.attr, - &dev_attr_heartbeat_enable.attr, - &dev_attr_cmdline.attr, - &dev_attr_firmware.attr, - &dev_attr_ramdisk.attr, - &dev_attr_bootmode.attr, - &dev_attr_log_buf_addr.attr, - &dev_attr_log_buf_len.attr, - - NULL -}; - -ATTRIBUTE_GROUPS(cosm_default); - -void cosm_sysfs_init(struct cosm_device *cdev) -{ - cdev->attr_group = cosm_default_groups; -} diff --git a/drivers/misc/mic/cosm_client/Makefile b/drivers/misc/mic/cosm_client/Makefile deleted file mode 100644 index 5b62270bc2ab..000000000000 --- a/drivers/misc/mic/cosm_client/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile - Intel MIC COSM Client Driver -# Copyright(c) 2015, Intel Corporation. -# -obj-$(CONFIG_MIC_COSM) += cosm_client.o - -cosm_client-objs += cosm_scif_client.o diff --git a/drivers/misc/mic/cosm_client/cosm_scif_client.c b/drivers/misc/mic/cosm_client/cosm_scif_client.c deleted file mode 100644 index a03213dd9319..000000000000 --- a/drivers/misc/mic/cosm_client/cosm_scif_client.c +++ /dev/null @@ -1,269 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. 
- * - * Intel MIC COSM Client Driver - */ -#include -#include -#include -#include -#include - -#include "../cosm/cosm_main.h" - -#define COSM_SCIF_MAX_RETRIES 10 -#define COSM_HEARTBEAT_SEND_MSEC (COSM_HEARTBEAT_SEND_SEC * MSEC_PER_SEC) - -static struct task_struct *client_thread; -static scif_epd_t client_epd; -static struct scif_peer_dev *client_spdev; - -/* - * Reboot notifier: receives shutdown status from the OS and communicates it - * back to the COSM process on the host - */ -static int cosm_reboot_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN_STATUS }; - int rc; - - event = (event == SYS_RESTART) ? SYSTEM_RESTART : event; - dev_info(&client_spdev->dev, "%s %d received event %ld\n", - __func__, __LINE__, event); - - msg.shutdown_status = event; - rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); - if (rc < 0) - dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n", - __func__, __LINE__, rc); - - return NOTIFY_DONE; -} - -static struct notifier_block cosm_reboot = { - .notifier_call = cosm_reboot_event, -}; - -/* Set system time from timespec value received from the host */ -static void cosm_set_time(struct cosm_msg *msg) -{ - struct timespec64 ts = { - .tv_sec = msg->timespec.tv_sec, - .tv_nsec = msg->timespec.tv_nsec, - }; - int rc = do_settimeofday64(&ts); - - if (rc) - dev_err(&client_spdev->dev, "%s: %d settimeofday rc %d\n", - __func__, __LINE__, rc); -} - -/* COSM client receive message processing */ -static void cosm_client_recv(void) -{ - struct cosm_msg msg; - int rc; - - while (1) { - rc = scif_recv(client_epd, &msg, sizeof(msg), 0); - if (!rc) { - return; - } else if (rc < 0) { - dev_err(&client_spdev->dev, "%s: %d rc %d\n", - __func__, __LINE__, rc); - return; - } - - dev_dbg(&client_spdev->dev, "%s: %d rc %d id 0x%llx\n", - __func__, __LINE__, rc, msg.id); - - switch (msg.id) { - case COSM_MSG_SYNC_TIME: - cosm_set_time(&msg); - break; - case COSM_MSG_SHUTDOWN: - orderly_poweroff(true); - break; - default: - dev_err(&client_spdev->dev, "%s: %d unknown id %lld\n", - __func__, __LINE__, msg.id); - break; - } - } -} - -/* Initiate connection to the COSM server on the host */ -static int cosm_scif_connect(void) -{ - struct scif_port_id port_id; - int i, rc; - - client_epd = scif_open(); - if (!client_epd) { - dev_err(&client_spdev->dev, "%s %d scif_open failed\n", - __func__, __LINE__); - return -ENOMEM; - } - - port_id.node = 0; - port_id.port = SCIF_COSM_LISTEN_PORT; - - for (i = 0; i < COSM_SCIF_MAX_RETRIES; i++) { - rc = scif_connect(client_epd, &port_id); - if (rc < 0) - msleep(1000); - else - break; - } - - if (rc < 0) { - dev_err(&client_spdev->dev, "%s %d scif_connect rc %d\n", - __func__, __LINE__, rc); - scif_close(client_epd); - client_epd = NULL; - } - return rc < 0 ? 
rc : 0; -} - -/* Close host SCIF connection */ -static void cosm_scif_connect_exit(void) -{ - if (client_epd) { - scif_close(client_epd); - client_epd = NULL; - } -} - -/* - * COSM SCIF client thread function: waits for messages from the host and sends - * a heartbeat to the host - */ -static int cosm_scif_client(void *unused) -{ - struct cosm_msg msg = { .id = COSM_MSG_HEARTBEAT }; - struct scif_pollepd pollepd; - int rc; - - allow_signal(SIGKILL); - - while (!kthread_should_stop()) { - pollepd.epd = client_epd; - pollepd.events = EPOLLIN; - - rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_SEND_MSEC); - if (rc < 0) { - if (-EINTR != rc) - dev_err(&client_spdev->dev, - "%s %d scif_poll rc %d\n", - __func__, __LINE__, rc); - continue; - } - - if (pollepd.revents & EPOLLIN) - cosm_client_recv(); - - msg.id = COSM_MSG_HEARTBEAT; - rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); - if (rc < 0) - dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n", - __func__, __LINE__, rc); - } - - dev_dbg(&client_spdev->dev, "%s %d Client thread stopped\n", - __func__, __LINE__); - return 0; -} - -static void cosm_scif_probe(struct scif_peer_dev *spdev) -{ - int rc; - - dev_dbg(&spdev->dev, "%s %d: dnode %d\n", - __func__, __LINE__, spdev->dnode); - - /* We are only interested in the host with spdev->dnode == 0 */ - if (spdev->dnode) - return; - - client_spdev = spdev; - rc = cosm_scif_connect(); - if (rc) - goto exit; - - rc = register_reboot_notifier(&cosm_reboot); - if (rc) { - dev_err(&spdev->dev, - "reboot notifier registration failed rc %d\n", rc); - goto connect_exit; - } - - client_thread = kthread_run(cosm_scif_client, NULL, "cosm_client"); - if (IS_ERR(client_thread)) { - rc = PTR_ERR(client_thread); - dev_err(&spdev->dev, "%s %d kthread_run rc %d\n", - __func__, __LINE__, rc); - goto unreg_reboot; - } - return; -unreg_reboot: - unregister_reboot_notifier(&cosm_reboot); -connect_exit: - cosm_scif_connect_exit(); -exit: - client_spdev = NULL; -} - -static void cosm_scif_remove(struct scif_peer_dev *spdev) -{ - int rc; - - dev_dbg(&spdev->dev, "%s %d: dnode %d\n", - __func__, __LINE__, spdev->dnode); - - if (spdev->dnode) - return; - - if (!IS_ERR_OR_NULL(client_thread)) { - rc = send_sig(SIGKILL, client_thread, 0); - if (rc) { - pr_err("%s %d send_sig rc %d\n", - __func__, __LINE__, rc); - return; - } - kthread_stop(client_thread); - } - unregister_reboot_notifier(&cosm_reboot); - cosm_scif_connect_exit(); - client_spdev = NULL; -} - -static struct scif_client scif_client_cosm = { - .name = KBUILD_MODNAME, - .probe = cosm_scif_probe, - .remove = cosm_scif_remove, -}; - -static int __init cosm_client_init(void) -{ - int rc = scif_client_register(&scif_client_cosm); - - if (rc) - pr_err("scif_client_register failed rc %d\n", rc); - return rc; -} - -static void __exit cosm_client_exit(void) -{ - scif_client_unregister(&scif_client_cosm); -} - -module_init(cosm_client_init); -module_exit(cosm_client_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC card OS state management client driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/host/Makefile b/drivers/misc/mic/host/Makefile deleted file mode 100644 index 25f153367980..000000000000 --- a/drivers/misc/mic/host/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile - Intel MIC Linux driver. -# Copyright(c) 2013, Intel Corporation. 
-# -obj-$(CONFIG_INTEL_MIC_HOST) += mic_host.o -mic_host-objs := mic_main.o -mic_host-objs += mic_x100.o -mic_host-objs += mic_smpt.o -mic_host-objs += mic_intr.o -mic_host-objs += mic_boot.o -mic_host-objs += mic_debugfs.o diff --git a/drivers/misc/mic/host/mic_boot.c b/drivers/misc/mic/host/mic_boot.c deleted file mode 100644 index 8cb85b8b3e19..000000000000 --- a/drivers/misc/mic/host/mic_boot.c +++ /dev/null @@ -1,588 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#include -#include -#include -#include -#include -#include -#include -#include "../bus/scif_bus.h" -#include "../bus/vop_bus.h" -#include "../common/mic_dev.h" -#include "mic_device.h" -#include "mic_smpt.h" - -static inline struct mic_device *vpdev_to_mdev(struct device *dev) -{ - return dev_get_drvdata(dev->parent); -} - -static dma_addr_t -_mic_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - void *va = phys_to_virt(page_to_phys(page)) + offset; - struct mic_device *mdev = vpdev_to_mdev(dev); - - return mic_map_single(mdev, va, size); -} - -static void _mic_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct mic_device *mdev = vpdev_to_mdev(dev); - - mic_unmap_single(mdev, dma_addr, size); -} - -static const struct dma_map_ops _mic_dma_ops = { - .map_page = _mic_dma_map_page, - .unmap_page = _mic_dma_unmap_page, -}; - -static struct mic_irq * -__mic_request_irq(struct vop_device *vpdev, - irqreturn_t (*func)(int irq, void *data), - const char *name, void *data, int intr_src) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - return mic_request_threaded_irq(mdev, func, NULL, name, data, - intr_src, MIC_INTR_DB); -} - -static void __mic_free_irq(struct vop_device *vpdev, - struct mic_irq *cookie, void *data) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - mic_free_irq(mdev, cookie, data); -} - -static void __mic_ack_interrupt(struct vop_device *vpdev, int num) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - mdev->ops->intr_workarounds(mdev); -} - -static int __mic_next_db(struct vop_device *vpdev) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - return mic_next_db(mdev); -} - -static void *__mic_get_dp(struct vop_device *vpdev) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - return mdev->dp; -} - -static void __iomem *__mic_get_remote_dp(struct vop_device *vpdev) -{ - return NULL; -} - -static void __mic_send_intr(struct vop_device *vpdev, int db) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - mdev->ops->send_intr(mdev, db); -} - -static void __iomem *__mic_ioremap(struct vop_device *vpdev, - dma_addr_t pa, size_t len) -{ - struct mic_device *mdev = vpdev_to_mdev(&vpdev->dev); - - return mdev->aper.va + pa; -} - -static void __mic_iounmap(struct vop_device *vpdev, void __iomem *va) -{ - /* nothing to do */ -} - -static struct vop_hw_ops vop_hw_ops = { - .request_irq = __mic_request_irq, - .free_irq = __mic_free_irq, - .ack_interrupt = __mic_ack_interrupt, - .next_db = __mic_next_db, - .get_dp = __mic_get_dp, - .get_remote_dp = __mic_get_remote_dp, - .send_intr = __mic_send_intr, - .remap = __mic_ioremap, - .unmap = __mic_iounmap, -}; - -static inline struct mic_device *scdev_to_mdev(struct scif_hw_dev *scdev) -{ - return 
dev_get_drvdata(scdev->dev.parent); -} - -static void *__mic_dma_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs) -{ - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - dma_addr_t tmp; - void *va = kzalloc(size, gfp); - - if (va) { - tmp = mic_map_single(mdev, va, size); - if (dma_mapping_error(dev, tmp)) { - kfree(va); - va = NULL; - } else { - *dma_handle = tmp; - } - } - return va; -} - -static void __mic_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - - mic_unmap_single(mdev, dma_handle, size); - kfree(vaddr); -} - -static dma_addr_t -__mic_dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - void *va = phys_to_virt(page_to_phys(page)) + offset; - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - - return mic_map_single(mdev, va, size); -} - -static void -__mic_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - - mic_unmap_single(mdev, dma_addr, size); -} - -static int __mic_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - struct scatterlist *s; - int i, j, ret; - dma_addr_t da; - - ret = dma_map_sg(&mdev->pdev->dev, sg, nents, dir); - if (ret <= 0) - return 0; - - for_each_sg(sg, s, nents, i) { - da = mic_map(mdev, sg_dma_address(s) + s->offset, s->length); - if (!da) - goto err; - sg_dma_address(s) = da; - } - return nents; -err: - for_each_sg(sg, s, i, j) { - mic_unmap(mdev, sg_dma_address(s), s->length); - sg_dma_address(s) = mic_to_dma_addr(mdev, sg_dma_address(s)); - } - dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir); - return 0; -} - -static void __mic_dma_unmap_sg(struct device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction dir, - unsigned long attrs) -{ - struct scif_hw_dev *scdev = dev_get_drvdata(dev); - struct mic_device *mdev = scdev_to_mdev(scdev); - struct scatterlist *s; - dma_addr_t da; - int i; - - for_each_sg(sg, s, nents, i) { - da = mic_to_dma_addr(mdev, sg_dma_address(s)); - mic_unmap(mdev, sg_dma_address(s), s->length); - sg_dma_address(s) = da; - } - dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir); -} - -static const struct dma_map_ops __mic_dma_ops = { - .alloc = __mic_dma_alloc, - .free = __mic_dma_free, - .map_page = __mic_dma_map_page, - .unmap_page = __mic_dma_unmap_page, - .map_sg = __mic_dma_map_sg, - .unmap_sg = __mic_dma_unmap_sg, -}; - -static struct mic_irq * -___mic_request_irq(struct scif_hw_dev *scdev, - irqreturn_t (*func)(int irq, void *data), - const char *name, - void *data, int db) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - return mic_request_threaded_irq(mdev, func, NULL, name, data, - db, MIC_INTR_DB); -} - -static void -___mic_free_irq(struct scif_hw_dev *scdev, - struct mic_irq *cookie, void *data) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - mic_free_irq(mdev, cookie, data); -} - -static void ___mic_ack_interrupt(struct scif_hw_dev *scdev, int 
num) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - mdev->ops->intr_workarounds(mdev); -} - -static int ___mic_next_db(struct scif_hw_dev *scdev) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - return mic_next_db(mdev); -} - -static void ___mic_send_intr(struct scif_hw_dev *scdev, int db) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - mdev->ops->send_intr(mdev, db); -} - -static void __iomem *___mic_ioremap(struct scif_hw_dev *scdev, - phys_addr_t pa, size_t len) -{ - struct mic_device *mdev = scdev_to_mdev(scdev); - - return mdev->aper.va + pa; -} - -static void ___mic_iounmap(struct scif_hw_dev *scdev, void __iomem *va) -{ - /* nothing to do */ -} - -static struct scif_hw_ops scif_hw_ops = { - .request_irq = ___mic_request_irq, - .free_irq = ___mic_free_irq, - .ack_interrupt = ___mic_ack_interrupt, - .next_db = ___mic_next_db, - .send_intr = ___mic_send_intr, - .remap = ___mic_ioremap, - .unmap = ___mic_iounmap, -}; - -static inline struct mic_device *mbdev_to_mdev(struct mbus_device *mbdev) -{ - return dev_get_drvdata(mbdev->dev.parent); -} - -static dma_addr_t -mic_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - void *va = phys_to_virt(page_to_phys(page)) + offset; - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - return mic_map_single(mdev, va, size); -} - -static void -mic_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - mic_unmap_single(mdev, dma_addr, size); -} - -static const struct dma_map_ops mic_dma_ops = { - .map_page = mic_dma_map_page, - .unmap_page = mic_dma_unmap_page, -}; - -static struct mic_irq * -_mic_request_threaded_irq(struct mbus_device *mbdev, - irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int intr_src) -{ - return mic_request_threaded_irq(mbdev_to_mdev(mbdev), handler, - thread_fn, name, data, - intr_src, MIC_INTR_DMA); -} - -static void _mic_free_irq(struct mbus_device *mbdev, - struct mic_irq *cookie, void *data) -{ - mic_free_irq(mbdev_to_mdev(mbdev), cookie, data); -} - -static void _mic_ack_interrupt(struct mbus_device *mbdev, int num) -{ - struct mic_device *mdev = mbdev_to_mdev(mbdev); - mdev->ops->intr_workarounds(mdev); -} - -static struct mbus_hw_ops mbus_hw_ops = { - .request_threaded_irq = _mic_request_threaded_irq, - .free_irq = _mic_free_irq, - .ack_interrupt = _mic_ack_interrupt, -}; - -/* Initialize the MIC bootparams */ -void mic_bootparam_init(struct mic_device *mdev) -{ - struct mic_bootparam *bootparam = mdev->dp; - - bootparam->magic = cpu_to_le32(MIC_MAGIC); - bootparam->h2c_config_db = -1; - bootparam->node_id = mdev->id + 1; - bootparam->scif_host_dma_addr = 0x0; - bootparam->scif_card_dma_addr = 0x0; - bootparam->c2h_scif_db = -1; - bootparam->h2c_scif_db = -1; -} - -static inline struct mic_device *cosmdev_to_mdev(struct cosm_device *cdev) -{ - return dev_get_drvdata(cdev->dev.parent); -} - -static void _mic_reset(struct cosm_device *cdev) -{ - struct mic_device *mdev = cosmdev_to_mdev(cdev); - - mdev->ops->reset_fw_ready(mdev); - mdev->ops->reset(mdev); -} - -static bool _mic_ready(struct cosm_device *cdev) -{ - struct mic_device *mdev = cosmdev_to_mdev(cdev); - - return mdev->ops->is_fw_ready(mdev); -} - -/** - * mic_request_dma_chans - Request DMA channels - * @mdev: pointer to mic_device instance - * - * returns number of DMA 
channels acquired
- */
-static int mic_request_dma_chans(struct mic_device *mdev)
-{
-	dma_cap_mask_t mask;
-	struct dma_chan *chan;
-
-	dma_cap_zero(mask);
-	dma_cap_set(DMA_MEMCPY, mask);
-
-	do {
-		chan = dma_request_channel(mask, mdev->ops->dma_filter,
-					   &mdev->pdev->dev);
-		if (chan) {
-			mdev->dma_ch[mdev->num_dma_ch++] = chan;
-			if (mdev->num_dma_ch >= MIC_MAX_DMA_CHAN)
-				break;
-		}
-	} while (chan);
-	dev_info(&mdev->pdev->dev, "DMA channels # %d\n", mdev->num_dma_ch);
-	return mdev->num_dma_ch;
-}
-
-/**
- * mic_free_dma_chans - release DMA channels
- * @mdev: pointer to mic_device instance
- *
- * returns none
- */
-static void mic_free_dma_chans(struct mic_device *mdev)
-{
-	int i = 0;
-
-	for (i = 0; i < mdev->num_dma_ch; i++) {
-		dma_release_channel(mdev->dma_ch[i]);
-		mdev->dma_ch[i] = NULL;
-	}
-	mdev->num_dma_ch = 0;
-}
-
-/**
- * _mic_start - Start the MIC.
- * @cdev: pointer to cosm_device instance
- * @id: MIC device id/index provided by COSM used in other drivers like SCIF
- *
- * This function prepares an MIC for boot and initiates boot.
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- *
- * For all cosm_hw_ops the caller holds a mutex to ensure serialization.
- */
-static int _mic_start(struct cosm_device *cdev, int id)
-{
-	struct mic_device *mdev = cosmdev_to_mdev(cdev);
-	int rc;
-
-	mic_bootparam_init(mdev);
-	mdev->dma_mbdev = mbus_register_device(&mdev->pdev->dev,
-					       MBUS_DEV_DMA_HOST, &mic_dma_ops,
-					       &mbus_hw_ops, id, mdev->mmio.va);
-	if (IS_ERR(mdev->dma_mbdev)) {
-		rc = PTR_ERR(mdev->dma_mbdev);
-		goto unlock_ret;
-	}
-	if (!mic_request_dma_chans(mdev)) {
-		rc = -ENODEV;
-		goto dma_remove;
-	}
-	mdev->scdev = scif_register_device(&mdev->pdev->dev, MIC_SCIF_DEV,
-					   &__mic_dma_ops, &scif_hw_ops,
-					   id + 1, 0, &mdev->mmio,
-					   &mdev->aper, mdev->dp, NULL,
-					   mdev->dma_ch, mdev->num_dma_ch,
-					   true);
-	if (IS_ERR(mdev->scdev)) {
-		rc = PTR_ERR(mdev->scdev);
-		goto dma_free;
-	}
-
-	mdev->vpdev = vop_register_device(&mdev->pdev->dev,
-					  VOP_DEV_TRNSP, &_mic_dma_ops,
-					  &vop_hw_ops, id + 1, &mdev->aper,
-					  mdev->dma_ch[0]);
-	if (IS_ERR(mdev->vpdev)) {
-		rc = PTR_ERR(mdev->vpdev);
-		goto scif_remove;
-	}
-
-	rc = mdev->ops->load_mic_fw(mdev, NULL);
-	if (rc)
-		goto vop_remove;
-	mic_smpt_restore(mdev);
-	mic_intr_restore(mdev);
-	mdev->intr_ops->enable_interrupts(mdev);
-	mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr);
-	mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32);
-	mdev->ops->send_firmware_intr(mdev);
-	goto unlock_ret;
-vop_remove:
-	vop_unregister_device(mdev->vpdev);
-scif_remove:
-	scif_unregister_device(mdev->scdev);
-dma_free:
-	mic_free_dma_chans(mdev);
-dma_remove:
-	mbus_unregister_device(mdev->dma_mbdev);
-unlock_ret:
-	return rc;
-}
-
-/**
- * _mic_stop - Prepare the MIC for reset and trigger reset.
- * @cdev: pointer to cosm_device instance
- * @force: force a MIC to reset even if it is already offline.
- *
- * RETURNS: None.
- */
-static void _mic_stop(struct cosm_device *cdev, bool force)
-{
-	struct mic_device *mdev = cosmdev_to_mdev(cdev);
-
-	/*
-	 * Since SCIF handles card shutdown and reset (using COSM), it will
-	 * be the first to be registered and the last to be
-	 * unregistered.
-	 */
-	vop_unregister_device(mdev->vpdev);
-	scif_unregister_device(mdev->scdev);
-	mic_free_dma_chans(mdev);
-	mbus_unregister_device(mdev->dma_mbdev);
-	mic_bootparam_init(mdev);
-}
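A small arithmetic note on the scratchpad handoff near the end of _mic_start(): dp_dma_addr is 64 bits wide but each scratchpad register holds 32, so the address is published as a low/high pair. A sketch of the matching card-side reassembly (the card driver is outside this hunk; read_spad() is a hypothetical accessor mirroring mic_hw_ops->read_spad):

static u64 mic_card_dp_dma_addr(void *card)
{
	u32 lo = read_spad(card, MIC_DPLO_SPAD);	/* bits 31..0 */
	u32 hi = read_spad(card, MIC_DPHI_SPAD);	/* bits 63..32 */

	/* inverse of the two write_spad() calls above */
	return ((u64)hi << 32) | lo;
}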
-
-static ssize_t _mic_family(struct cosm_device *cdev, char *buf)
-{
-	struct mic_device *mdev = cosmdev_to_mdev(cdev);
-	static const char *family[MIC_FAMILY_LAST] = { "x100", "Unknown" };
-
-	return scnprintf(buf, PAGE_SIZE, "%s\n", family[mdev->family]);
-}
-
-static ssize_t _mic_stepping(struct cosm_device *cdev, char *buf)
-{
-	struct mic_device *mdev = cosmdev_to_mdev(cdev);
-	const char *string = "??";
-
-	switch (mdev->stepping) {
-	case MIC_A0_STEP:
-		string = "A0";
-		break;
-	case MIC_B0_STEP:
-		string = "B0";
-		break;
-	case MIC_B1_STEP:
-		string = "B1";
-		break;
-	case MIC_C0_STEP:
-		string = "C0";
-		break;
-	default:
-		break;
-	}
-	return scnprintf(buf, PAGE_SIZE, "%s\n", string);
-}
-
-static struct mic_mw *_mic_aper(struct cosm_device *cdev)
-{
-	struct mic_device *mdev = cosmdev_to_mdev(cdev);
-
-	return &mdev->aper;
-}
-
-struct cosm_hw_ops cosm_hw_ops = {
-	.reset = _mic_reset,
-	.force_reset = _mic_reset,
-	.post_reset = NULL,
-	.ready = _mic_ready,
-	.start = _mic_start,
-	.stop = _mic_stop,
-	.family = _mic_family,
-	.stepping = _mic_stepping,
-	.aper = _mic_aper,
-};
diff --git a/drivers/misc/mic/host/mic_debugfs.c b/drivers/misc/mic/host/mic_debugfs.c
deleted file mode 100644
index ffda740e20d5..000000000000
--- a/drivers/misc/mic/host/mic_debugfs.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel MIC Platform Software Stack (MPSS)
- *
- * Copyright(c) 2013 Intel Corporation.
- *
- * Intel MIC Host driver.
- */
-#include
-#include
-#include
-
-#include
-#include "../common/mic_dev.h"
-#include "mic_device.h"
-#include "mic_smpt.h"
-
-/* Debugfs parent dir */
-static struct dentry *mic_dbg;
-
-static int mic_smpt_show(struct seq_file *s, void *pos)
-{
-	int i;
-	struct mic_device *mdev = s->private;
-	unsigned long flags;
-
-	seq_printf(s, "MIC %-2d |%-10s| %-14s %-10s\n",
-		   mdev->id, "SMPT entry", "SW DMA addr", "RefCount");
-	seq_puts(s, "====================================================\n");
-
-	if (mdev->smpt) {
-		struct mic_smpt_info *smpt_info = mdev->smpt;
-		spin_lock_irqsave(&smpt_info->smpt_lock, flags);
-		for (i = 0; i < smpt_info->info.num_reg; i++) {
-			seq_printf(s, "%9s|%-10d| %-#14llx %-10lld\n",
-				   " ", i, smpt_info->entry[i].dma_addr,
-				   smpt_info->entry[i].ref_count);
-		}
-		spin_unlock_irqrestore(&smpt_info->smpt_lock, flags);
-	}
-	seq_puts(s, "====================================================\n");
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(mic_smpt);
-
-static int mic_post_code_show(struct seq_file *s, void *pos)
-{
-	struct mic_device *mdev = s->private;
-	u32 reg = mdev->ops->get_postcode(mdev);
-
-	seq_printf(s, "%c%c", reg & 0xff, (reg >> 8) & 0xff);
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(mic_post_code);
-
-static int mic_msi_irq_info_show(struct seq_file *s, void *pos)
-{
-	struct mic_device *mdev = s->private;
-	int reg;
-	int i, j;
-	u16 entry;
-	u16 vector;
-	struct pci_dev *pdev = mdev->pdev;
-
-	if (pci_dev_msi_enabled(pdev)) {
-		for (i = 0; i < mdev->irq_info.num_vectors; i++) {
-			if (pdev->msix_enabled) {
-				entry = mdev->irq_info.msix_entries[i].entry;
-				vector = mdev->irq_info.msix_entries[i].vector;
-			} else {
-				entry = 0;
-				vector = pdev->irq;
-			}
-
-			reg = mdev->intr_ops->read_msi_to_src_map(mdev, entry);
-
-			seq_printf(s, "%s %-10d %s %-10d MXAR[%d]: %08X\n",
-				   "IRQ:", vector, "Entry:", entry, i, reg);
-
seq_printf(s, "%-10s", "offset:"); - for (j = (MIC_NUM_OFFSETS - 1); j >= 0; j--) - seq_printf(s, "%4d ", j); - seq_puts(s, "\n"); - - - seq_printf(s, "%-10s", "count:"); - for (j = (MIC_NUM_OFFSETS - 1); j >= 0; j--) - seq_printf(s, "%4d ", - (mdev->irq_info.mic_msi_map[i] & - BIT(j)) ? 1 : 0); - seq_puts(s, "\n\n"); - } - } else { - seq_puts(s, "MSI/MSIx interrupts not enabled\n"); - } - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(mic_msi_irq_info); - -/* - * mic_create_debug_dir - Initialize MIC debugfs entries. - */ -void mic_create_debug_dir(struct mic_device *mdev) -{ - char name[16]; - - if (!mic_dbg) - return; - - scnprintf(name, sizeof(name), "mic%d", mdev->id); - mdev->dbg_dir = debugfs_create_dir(name, mic_dbg); - - debugfs_create_file("smpt", 0444, mdev->dbg_dir, mdev, - &mic_smpt_fops); - - debugfs_create_file("post_code", 0444, mdev->dbg_dir, mdev, - &mic_post_code_fops); - - debugfs_create_file("msi_irq_info", 0444, mdev->dbg_dir, mdev, - &mic_msi_irq_info_fops); -} - -/* - * mic_delete_debug_dir - Uninitialize MIC debugfs entries. - */ -void mic_delete_debug_dir(struct mic_device *mdev) -{ - debugfs_remove_recursive(mdev->dbg_dir); -} - -/* - * mic_init_debugfs - Initialize global debugfs entry. - */ -void __init mic_init_debugfs(void) -{ - mic_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); -} - -/* - * mic_exit_debugfs - Uninitialize global debugfs entry - */ -void mic_exit_debugfs(void) -{ - debugfs_remove(mic_dbg); -} diff --git a/drivers/misc/mic/host/mic_device.h b/drivers/misc/mic/host/mic_device.h deleted file mode 100644 index 41bcd308ae59..000000000000 --- a/drivers/misc/mic/host/mic_device.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#ifndef _MIC_DEVICE_H_ -#define _MIC_DEVICE_H_ - -#include -#include -#include -#include -#include -#include -#include -#include "../bus/scif_bus.h" -#include "../bus/vop_bus.h" -#include "../bus/cosm_bus.h" -#include "mic_intr.h" - -/** - * enum mic_stepping - MIC stepping ids. - */ -enum mic_stepping { - MIC_A0_STEP = 0x0, - MIC_B0_STEP = 0x10, - MIC_B1_STEP = 0x11, - MIC_C0_STEP = 0x20, -}; - -extern struct cosm_hw_ops cosm_hw_ops; - -/** - * struct mic_device - MIC device information for each card. - * - * @mmio: MMIO bar information. - * @aper: Aperture bar information. - * @family: The MIC family to which this device belongs. - * @ops: MIC HW specific operations. - * @id: The unique device id for this MIC device. - * @stepping: Stepping ID. - * @pdev: Underlying PCI device. - * @mic_mutex: Mutex for synchronizing access to mic_device. - * @intr_ops: HW specific interrupt operations. - * @smpt_ops: Hardware specific SMPT operations. - * @smpt: MIC SMPT information. - * @intr_info: H/W specific interrupt information. - * @irq_info: The OS specific irq information - * @dbg_dir: debugfs directory of this MIC device. - * @bootaddr: MIC boot address. - * @dp: virtio device page - * @dp_dma_addr: virtio device page DMA address. - * @dma_mbdev: MIC BUS DMA device. - * @dma_ch - Array of DMA channels - * @num_dma_ch - Number of DMA channels available - * @scdev: SCIF device on the SCIF virtual bus. - * @vpdev: Virtio over PCIe device on the VOP virtual bus. 
- * @cosm_dev: COSM device - */ -struct mic_device { - struct mic_mw mmio; - struct mic_mw aper; - enum mic_hw_family family; - struct mic_hw_ops *ops; - int id; - enum mic_stepping stepping; - struct pci_dev *pdev; - struct mutex mic_mutex; - struct mic_hw_intr_ops *intr_ops; - struct mic_smpt_ops *smpt_ops; - struct mic_smpt_info *smpt; - struct mic_intr_info *intr_info; - struct mic_irq_info irq_info; - struct dentry *dbg_dir; - u32 bootaddr; - void *dp; - dma_addr_t dp_dma_addr; - struct mbus_device *dma_mbdev; - struct dma_chan *dma_ch[MIC_MAX_DMA_CHAN]; - int num_dma_ch; - struct scif_hw_dev *scdev; - struct vop_device *vpdev; - struct cosm_device *cosm_dev; -}; - -/** - * struct mic_hw_ops - MIC HW specific operations. - * @aper_bar: Aperture bar resource number. - * @mmio_bar: MMIO bar resource number. - * @read_spad: Read from scratch pad register. - * @write_spad: Write to scratch pad register. - * @send_intr: Send an interrupt for a particular doorbell on the card. - * @ack_interrupt: Hardware specific operations to ack the h/w on - * receipt of an interrupt. - * @intr_workarounds: Hardware specific workarounds needed after - * handling an interrupt. - * @reset: Reset the remote processor. - * @reset_fw_ready: Reset firmware ready field. - * @is_fw_ready: Check if firmware is ready for OS download. - * @send_firmware_intr: Send an interrupt to the card firmware. - * @load_mic_fw: Load firmware segments required to boot the card - * into card memory. This includes the kernel, command line, ramdisk etc. - * @get_postcode: Get post code status from firmware. - * @dma_filter: DMA filter function to be used. - */ -struct mic_hw_ops { - u8 aper_bar; - u8 mmio_bar; - u32 (*read_spad)(struct mic_device *mdev, unsigned int idx); - void (*write_spad)(struct mic_device *mdev, unsigned int idx, u32 val); - void (*send_intr)(struct mic_device *mdev, int doorbell); - u32 (*ack_interrupt)(struct mic_device *mdev); - void (*intr_workarounds)(struct mic_device *mdev); - void (*reset)(struct mic_device *mdev); - void (*reset_fw_ready)(struct mic_device *mdev); - bool (*is_fw_ready)(struct mic_device *mdev); - void (*send_firmware_intr)(struct mic_device *mdev); - int (*load_mic_fw)(struct mic_device *mdev, const char *buf); - u32 (*get_postcode)(struct mic_device *mdev); - bool (*dma_filter)(struct dma_chan *chan, void *param); -}; - -/** - * mic_mmio_read - read from an MMIO register. - * @mw: MMIO register base virtual address. - * @offset: register offset. - * - * RETURNS: register value. - */ -static inline u32 mic_mmio_read(struct mic_mw *mw, u32 offset) -{ - return ioread32(mw->va + offset); -} - -/** - * mic_mmio_write - write to an MMIO register. - * @mw: MMIO register base virtual address. - * @val: the data value to put into the register - * @offset: register offset. - * - * RETURNS: none. - */ -static inline void -mic_mmio_write(struct mic_mw *mw, u32 val, u32 offset) -{ - iowrite32(val, mw->va + offset); -} - -void mic_bootparam_init(struct mic_device *mdev); -void mic_create_debug_dir(struct mic_device *dev); -void mic_delete_debug_dir(struct mic_device *dev); -void __init mic_init_debugfs(void); -void mic_exit_debugfs(void); -#endif diff --git a/drivers/misc/mic/host/mic_intr.c b/drivers/misc/mic/host/mic_intr.c deleted file mode 100644 index 85b3221b5d40..000000000000 --- a/drivers/misc/mic/host/mic_intr.c +++ /dev/null @@ -1,635 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. 
- *
- * Intel MIC Host driver.
- */
-#include
-#include
-
-#include "../common/mic_dev.h"
-#include "mic_device.h"
-
-static irqreturn_t mic_thread_fn(int irq, void *dev)
-{
-	struct mic_device *mdev = dev;
-	struct mic_intr_info *intr_info = mdev->intr_info;
-	struct mic_irq_info *irq_info = &mdev->irq_info;
-	struct mic_intr_cb *intr_cb;
-	struct pci_dev *pdev = mdev->pdev;
-	int i;
-
-	spin_lock(&irq_info->mic_thread_lock);
-	for (i = intr_info->intr_start_idx[MIC_INTR_DB];
-			i < intr_info->intr_len[MIC_INTR_DB]; i++)
-		if (test_and_clear_bit(i, &irq_info->mask)) {
-			list_for_each_entry(intr_cb, &irq_info->cb_list[i],
-					    list)
-				if (intr_cb->thread_fn)
-					intr_cb->thread_fn(pdev->irq,
-							   intr_cb->data);
-		}
-	spin_unlock(&irq_info->mic_thread_lock);
-	return IRQ_HANDLED;
-}
-/**
- * mic_interrupt - Generic interrupt handler for
- * MSI and INTx based interrupts.
- * @irq: interrupt to handle (unused)
- * @dev: pointer to the mic_device instance
- */
-static irqreturn_t mic_interrupt(int irq, void *dev)
-{
-	struct mic_device *mdev = dev;
-	struct mic_intr_info *intr_info = mdev->intr_info;
-	struct mic_irq_info *irq_info = &mdev->irq_info;
-	struct mic_intr_cb *intr_cb;
-	struct pci_dev *pdev = mdev->pdev;
-	u32 mask;
-	int i;
-
-	mask = mdev->ops->ack_interrupt(mdev);
-	if (!mask)
-		return IRQ_NONE;
-
-	spin_lock(&irq_info->mic_intr_lock);
-	for (i = intr_info->intr_start_idx[MIC_INTR_DB];
-			i < intr_info->intr_len[MIC_INTR_DB]; i++)
-		if (mask & BIT(i)) {
-			list_for_each_entry(intr_cb, &irq_info->cb_list[i],
-					    list)
-				if (intr_cb->handler)
-					intr_cb->handler(pdev->irq,
-							 intr_cb->data);
-			set_bit(i, &irq_info->mask);
-		}
-	spin_unlock(&irq_info->mic_intr_lock);
-	return IRQ_WAKE_THREAD;
-}
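The pair above follows the standard request_threaded_irq() contract: the hard handler only acks the hardware and returns IRQ_WAKE_THREAD, deferring callback dispatch to the threaded handler. A minimal standalone sketch of that contract, with hypothetical device helpers (demo_pending/demo_ack/demo_work are placeholders, not this driver's API):

static irqreturn_t demo_hard_handler(int irq, void *data)
{
	if (!demo_pending(data))	/* placeholder: is this our interrupt? */
		return IRQ_NONE;	/* not ours; let other handlers run */
	demo_ack(data);			/* placeholder: quiesce the hardware */
	return IRQ_WAKE_THREAD;		/* run demo_thread_handler() next */
}

static irqreturn_t demo_thread_handler(int irq, void *data)
{
	demo_work(data);		/* placeholder: heavier, sleepable work */
	return IRQ_HANDLED;
}

/* wired up with:
 * request_threaded_irq(irq, demo_hard_handler, demo_thread_handler,
 *			0, "demo", data);
 */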
- */ -static struct mic_intr_cb *mic_register_intr_callback(struct mic_device *mdev, - u8 idx, irq_handler_t handler, irq_handler_t thread_fn, - void *data) -{ - struct mic_intr_cb *intr_cb; - unsigned long flags; - int rc; - intr_cb = kmalloc(sizeof(*intr_cb), GFP_KERNEL); - - if (!intr_cb) - return ERR_PTR(-ENOMEM); - - intr_cb->handler = handler; - intr_cb->thread_fn = thread_fn; - intr_cb->data = data; - intr_cb->cb_id = ida_simple_get(&mdev->irq_info.cb_ida, - 0, 0, GFP_KERNEL); - if (intr_cb->cb_id < 0) { - rc = intr_cb->cb_id; - goto ida_fail; - } - - spin_lock(&mdev->irq_info.mic_thread_lock); - spin_lock_irqsave(&mdev->irq_info.mic_intr_lock, flags); - list_add_tail(&intr_cb->list, &mdev->irq_info.cb_list[idx]); - spin_unlock_irqrestore(&mdev->irq_info.mic_intr_lock, flags); - spin_unlock(&mdev->irq_info.mic_thread_lock); - - return intr_cb; -ida_fail: - kfree(intr_cb); - return ERR_PTR(rc); -} - -/** - * mic_unregister_intr_callback - Unregister the callback handler - * identified by its callback id. - * - * @mdev: pointer to the mic_device instance - * @idx: The callback structure id to be unregistered. - * Return the source id that was unregistered or MIC_NUM_OFFSETS if no - * such callback handler was found. - */ -static u8 mic_unregister_intr_callback(struct mic_device *mdev, u32 idx) -{ - struct list_head *pos, *tmp; - struct mic_intr_cb *intr_cb; - unsigned long flags; - int i; - - spin_lock(&mdev->irq_info.mic_thread_lock); - spin_lock_irqsave(&mdev->irq_info.mic_intr_lock, flags); - for (i = 0; i < MIC_NUM_OFFSETS; i++) { - list_for_each_safe(pos, tmp, &mdev->irq_info.cb_list[i]) { - intr_cb = list_entry(pos, struct mic_intr_cb, list); - if (intr_cb->cb_id == idx) { - list_del(pos); - ida_simple_remove(&mdev->irq_info.cb_ida, - intr_cb->cb_id); - kfree(intr_cb); - spin_unlock_irqrestore( - &mdev->irq_info.mic_intr_lock, flags); - spin_unlock(&mdev->irq_info.mic_thread_lock); - return i; - } - } - } - spin_unlock_irqrestore(&mdev->irq_info.mic_intr_lock, flags); - spin_unlock(&mdev->irq_info.mic_thread_lock); - return MIC_NUM_OFFSETS; -} - -/** - * mic_setup_msix - Initializes MSIx interrupts. - * - * @mdev: pointer to mic_device instance - * @pdev: PCI device structure - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int mic_setup_msix(struct mic_device *mdev, struct pci_dev *pdev) -{ - int rc, i; - int entry_size = sizeof(*mdev->irq_info.msix_entries); - - mdev->irq_info.msix_entries = kmalloc_array(MIC_MIN_MSIX, - entry_size, GFP_KERNEL); - if (!mdev->irq_info.msix_entries) { - rc = -ENOMEM; - goto err_nomem1; - } - - for (i = 0; i < MIC_MIN_MSIX; i++) - mdev->irq_info.msix_entries[i].entry = i; - - rc = pci_enable_msix_exact(pdev, mdev->irq_info.msix_entries, - MIC_MIN_MSIX); - if (rc) { - dev_dbg(&pdev->dev, "Error enabling MSIx. rc = %d\n", rc); - goto err_enable_msix; - } - - mdev->irq_info.num_vectors = MIC_MIN_MSIX; - mdev->irq_info.mic_msi_map = kzalloc((sizeof(u32) * - mdev->irq_info.num_vectors), GFP_KERNEL); - - if (!mdev->irq_info.mic_msi_map) { - rc = -ENOMEM; - goto err_nomem2; - } - - dev_dbg(&mdev->pdev->dev, - "%d MSIx irqs setup\n", mdev->irq_info.num_vectors); - return 0; -err_nomem2: - pci_disable_msix(pdev); -err_enable_msix: - kfree(mdev->irq_info.msix_entries); -err_nomem1: - mdev->irq_info.num_vectors = 0; - return rc; -} - -/** - * mic_setup_callbacks - Initialize data structures needed - * to handle callbacks. 
- * - * @mdev: pointer to mic_device instance - */ -static int mic_setup_callbacks(struct mic_device *mdev) -{ - int i; - - mdev->irq_info.cb_list = kmalloc_array(MIC_NUM_OFFSETS, - sizeof(*mdev->irq_info.cb_list), - GFP_KERNEL); - if (!mdev->irq_info.cb_list) - return -ENOMEM; - - for (i = 0; i < MIC_NUM_OFFSETS; i++) - INIT_LIST_HEAD(&mdev->irq_info.cb_list[i]); - ida_init(&mdev->irq_info.cb_ida); - spin_lock_init(&mdev->irq_info.mic_intr_lock); - spin_lock_init(&mdev->irq_info.mic_thread_lock); - return 0; -} - -/** - * mic_release_callbacks - Uninitialize data structures needed - * to handle callbacks. - * - * @mdev: pointer to mic_device instance - */ -static void mic_release_callbacks(struct mic_device *mdev) -{ - unsigned long flags; - struct list_head *pos, *tmp; - struct mic_intr_cb *intr_cb; - int i; - - spin_lock(&mdev->irq_info.mic_thread_lock); - spin_lock_irqsave(&mdev->irq_info.mic_intr_lock, flags); - for (i = 0; i < MIC_NUM_OFFSETS; i++) { - if (list_empty(&mdev->irq_info.cb_list[i])) - break; - - list_for_each_safe(pos, tmp, &mdev->irq_info.cb_list[i]) { - intr_cb = list_entry(pos, struct mic_intr_cb, list); - list_del(pos); - ida_simple_remove(&mdev->irq_info.cb_ida, - intr_cb->cb_id); - kfree(intr_cb); - } - } - spin_unlock_irqrestore(&mdev->irq_info.mic_intr_lock, flags); - spin_unlock(&mdev->irq_info.mic_thread_lock); - ida_destroy(&mdev->irq_info.cb_ida); - kfree(mdev->irq_info.cb_list); -} - -/** - * mic_setup_msi - Initializes MSI interrupts. - * - * @mdev: pointer to mic_device instance - * @pdev: PCI device structure - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int mic_setup_msi(struct mic_device *mdev, struct pci_dev *pdev) -{ - int rc; - - rc = pci_enable_msi(pdev); - if (rc) { - dev_dbg(&pdev->dev, "Error enabling MSI. rc = %d\n", rc); - return rc; - } - - mdev->irq_info.num_vectors = 1; - mdev->irq_info.mic_msi_map = kzalloc((sizeof(u32) * - mdev->irq_info.num_vectors), GFP_KERNEL); - - if (!mdev->irq_info.mic_msi_map) { - rc = -ENOMEM; - goto err_nomem1; - } - - rc = mic_setup_callbacks(mdev); - if (rc) { - dev_err(&pdev->dev, "Error setting up callbacks\n"); - goto err_nomem2; - } - - rc = request_threaded_irq(pdev->irq, mic_interrupt, mic_thread_fn, - 0, "mic-msi", mdev); - if (rc) { - dev_err(&pdev->dev, "Error allocating MSI interrupt\n"); - goto err_irq_req_fail; - } - - dev_dbg(&pdev->dev, "%d MSI irqs setup\n", mdev->irq_info.num_vectors); - return 0; -err_irq_req_fail: - mic_release_callbacks(mdev); -err_nomem2: - kfree(mdev->irq_info.mic_msi_map); -err_nomem1: - pci_disable_msi(pdev); - mdev->irq_info.num_vectors = 0; - return rc; -} - -/** - * mic_setup_intx - Initializes legacy interrupts. - * - * @mdev: pointer to mic_device instance - * @pdev: PCI device structure - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int mic_setup_intx(struct mic_device *mdev, struct pci_dev *pdev) -{ - int rc; - - /* Enable intx */ - pci_intx(pdev, 1); - rc = mic_setup_callbacks(mdev); - if (rc) { - dev_err(&pdev->dev, "Error setting up callbacks\n"); - goto err_nomem; - } - - rc = request_threaded_irq(pdev->irq, mic_interrupt, mic_thread_fn, - IRQF_SHARED, "mic-intx", mdev); - if (rc) - goto err; - - dev_dbg(&pdev->dev, "intx irq setup\n"); - return 0; -err: - mic_release_callbacks(mdev); -err_nomem: - return rc; -} - -/** - * mic_next_db - Retrieve the next doorbell interrupt source id. - * The id is picked sequentially from the available pool of - * doorbell ids. 
- * - * @mdev: pointer to the mic_device instance. - * - * Returns the next doorbell interrupt source. - */ -int mic_next_db(struct mic_device *mdev) -{ - int next_db; - - next_db = mdev->irq_info.next_avail_src % - mdev->intr_info->intr_len[MIC_INTR_DB]; - mdev->irq_info.next_avail_src++; - return next_db; -} - -#define COOKIE_ID_SHIFT 16 -#define GET_ENTRY(cookie) ((cookie) & 0xFFFF) -#define GET_OFFSET(cookie) ((cookie) >> COOKIE_ID_SHIFT) -#define MK_COOKIE(x, y) ((x) | (y) << COOKIE_ID_SHIFT) - -/** - * mic_request_threaded_irq - request an irq. mic_mutex needs - * to be held before calling this function. - * - * @mdev: pointer to mic_device instance - * @handler: The callback function that handles the interrupt. - * The function needs to call ack_interrupts - * (mdev->ops->ack_interrupt(mdev)) when handling the interrupts. - * @thread_fn: thread fn required by request_threaded_irq. - * @name: The ASCII name of the callee requesting the irq. - * @data: private data that is returned back when calling the - * function handler. - * @intr_src: The source id of the requester. It's the doorbell id - * for Doorbell interrupts and DMA channel id for DMA interrupts. - * @type: The type of interrupt. Values defined in mic_intr_type - * - * returns: The cookie that is transparent to the caller. Passed - * back when calling mic_free_irq. An appropriate error code - * is returned on failure. Caller needs to use IS_ERR(return_val) - * to check for failure and PTR_ERR(return_val) to obtain the - * error code. - * - */ -struct mic_irq * -mic_request_threaded_irq(struct mic_device *mdev, - irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int intr_src, - enum mic_intr_type type) -{ - u16 offset; - int rc = 0; - struct msix_entry *msix = NULL; - unsigned long cookie = 0; - u16 entry; - struct mic_intr_cb *intr_cb; - struct pci_dev *pdev = mdev->pdev; - - offset = mic_map_src_to_offset(mdev, intr_src, type); - if (offset >= MIC_NUM_OFFSETS) { - dev_err(&mdev->pdev->dev, - "Error mapping index %d to a valid source id.\n", - intr_src); - rc = -EINVAL; - goto err; - } - - if (mdev->irq_info.num_vectors > 1) { - msix = mic_get_available_vector(mdev); - if (!msix) { - dev_err(&mdev->pdev->dev, - "No MSIx vectors available for use.\n"); - rc = -ENOSPC; - goto err; - } - - rc = request_threaded_irq(msix->vector, handler, thread_fn, - 0, name, data); - if (rc) { - dev_dbg(&mdev->pdev->dev, - "request irq failed rc = %d\n", rc); - goto err; - } - entry = msix->entry; - mdev->irq_info.mic_msi_map[entry] |= BIT(offset); - mdev->intr_ops->program_msi_to_src_map(mdev, - entry, offset, true); - cookie = MK_COOKIE(entry, offset); - dev_dbg(&mdev->pdev->dev, "irq: %d assigned for src: %d\n", - msix->vector, intr_src); - } else { - intr_cb = mic_register_intr_callback(mdev, offset, handler, - thread_fn, data); - if (IS_ERR(intr_cb)) { - dev_err(&mdev->pdev->dev, - "No available callback entries for use\n"); - rc = PTR_ERR(intr_cb); - goto err; - } - - entry = 0; - if (pci_dev_msi_enabled(pdev)) { - mdev->irq_info.mic_msi_map[entry] |= (1 << offset); - mdev->intr_ops->program_msi_to_src_map(mdev, - entry, offset, true); - } - cookie = MK_COOKIE(entry, intr_cb->cb_id); - dev_dbg(&mdev->pdev->dev, "callback %d registered for src: %d\n", - intr_cb->cb_id, intr_src); - } - return (struct mic_irq *)cookie; -err: - return ERR_PTR(rc); -} - -/** - * mic_free_irq - free irq. mic_mutex - * needs to be held before calling this function. 
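
The cookie returned by mic_request_threaded_irq() is just two 16-bit fields packed into a pointer-sized integer: the MSI-X table entry in the low half, and either the doorbell offset (MSI-X) or the callback id (MSI/INTx) in the high half. A standalone round-trip of the packing macros; the macros are copied from the code above, the main() harness is illustrative only.

#include <stdio.h>

#define COOKIE_ID_SHIFT 16
#define GET_ENTRY(cookie) ((cookie) & 0xFFFF)
#define GET_OFFSET(cookie) ((cookie) >> COOKIE_ID_SHIFT)
#define MK_COOKIE(x, y) ((x) | (y) << COOKIE_ID_SHIFT)

int main(void)
{
	unsigned long cookie = MK_COOKIE(3UL, 17UL); /* entry 3, offset/cb_id 17 */

	/* mic_free_irq() recovers both halves from the opaque cookie */
	printf("entry=%lu offset=%lu\n",
	       GET_ENTRY(cookie), GET_OFFSET(cookie));   /* entry=3 offset=17 */
	return 0;
}
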
- * - * @mdev: pointer to mic_device instance - * @cookie: cookie obtained during a successful call to mic_request_threaded_irq - * @data: private data specified by the calling function during the - * mic_request_threaded_irq - * - * returns: none. - */ -void mic_free_irq(struct mic_device *mdev, - struct mic_irq *cookie, void *data) -{ - u32 offset; - u32 entry; - u8 src_id; - unsigned int irq; - struct pci_dev *pdev = mdev->pdev; - - entry = GET_ENTRY((unsigned long)cookie); - offset = GET_OFFSET((unsigned long)cookie); - if (mdev->irq_info.num_vectors > 1) { - if (entry >= mdev->irq_info.num_vectors) { - dev_warn(&mdev->pdev->dev, - "entry %d should be < num_irq %d\n", - entry, mdev->irq_info.num_vectors); - return; - } - irq = mdev->irq_info.msix_entries[entry].vector; - free_irq(irq, data); - mdev->irq_info.mic_msi_map[entry] &= ~(BIT(offset)); - mdev->intr_ops->program_msi_to_src_map(mdev, - entry, offset, false); - - dev_dbg(&mdev->pdev->dev, "irq: %d freed\n", irq); - } else { - irq = pdev->irq; - src_id = mic_unregister_intr_callback(mdev, offset); - if (src_id >= MIC_NUM_OFFSETS) { - dev_warn(&mdev->pdev->dev, "Error unregistering callback\n"); - return; - } - if (pci_dev_msi_enabled(pdev)) { - mdev->irq_info.mic_msi_map[entry] &= ~(BIT(src_id)); - mdev->intr_ops->program_msi_to_src_map(mdev, - entry, src_id, false); - } - dev_dbg(&mdev->pdev->dev, "callback %d unregistered for src: %d\n", - offset, src_id); - } -} - -/** - * mic_setup_interrupts - Initializes interrupts. - * - * @mdev: pointer to mic_device instance - * @pdev: PCI device structure - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -int mic_setup_interrupts(struct mic_device *mdev, struct pci_dev *pdev) -{ - int rc; - - rc = mic_setup_msix(mdev, pdev); - if (!rc) - goto done; - - rc = mic_setup_msi(mdev, pdev); - if (!rc) - goto done; - - rc = mic_setup_intx(mdev, pdev); - if (rc) { - dev_err(&mdev->pdev->dev, "no usable interrupts\n"); - return rc; - } -done: - mdev->intr_ops->enable_interrupts(mdev); - return 0; -} - -/** - * mic_free_interrupts - Frees interrupts setup by mic_setup_interrupts - * - * @mdev: pointer to mic_device instance - * @pdev: PCI device structure - * - * returns none. - */ -void mic_free_interrupts(struct mic_device *mdev, struct pci_dev *pdev) -{ - int i; - - mdev->intr_ops->disable_interrupts(mdev); - if (mdev->irq_info.num_vectors > 1) { - for (i = 0; i < mdev->irq_info.num_vectors; i++) { - if (mdev->irq_info.mic_msi_map[i]) - dev_warn(&pdev->dev, "irq %d may still be in use.\n", - mdev->irq_info.msix_entries[i].vector); - } - kfree(mdev->irq_info.mic_msi_map); - kfree(mdev->irq_info.msix_entries); - pci_disable_msix(pdev); - } else { - if (pci_dev_msi_enabled(pdev)) { - free_irq(pdev->irq, mdev); - kfree(mdev->irq_info.mic_msi_map); - pci_disable_msi(pdev); - } else { - free_irq(pdev->irq, mdev); - } - mic_release_callbacks(mdev); - } -} - -/** - * mic_intr_restore - Restore MIC interrupt registers. - * - * @mdev: pointer to mic_device instance. - * - * Restore the interrupt registers to values previously - * stored in the SW data structures. mic_mutex needs to - * be held before calling this function. - * - * returns None. 
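
mic_setup_interrupts() above degrades gracefully: it tries MSI-X first, then MSI, then legacy INTx, and enables interrupts only once one mechanism sticks. A self-contained sketch of that cascade; the three setup_*() stubs are hypothetical stand-ins for the real setup routines.

#include <stdio.h>

static int setup_msix(void) { return -1; }  /* pretend MSI-X is unavailable */
static int setup_msi(void)  { return 0;  }  /* pretend MSI works */
static int setup_intx(void) { return 0;  }

int main(void)
{
	if (!setup_msix())
		printf("using MSI-X\n");
	else if (!setup_msi())
		printf("using MSI\n");
	else if (!setup_intx())
		printf("using legacy INTx\n");
	else
		return 1;       /* no usable interrupts */
	/* ...the driver calls enable_interrupts() at this point... */
	return 0;
}
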
- */ -void mic_intr_restore(struct mic_device *mdev) -{ - int entry, offset; - struct pci_dev *pdev = mdev->pdev; - - if (!pci_dev_msi_enabled(pdev)) - return; - - for (entry = 0; entry < mdev->irq_info.num_vectors; entry++) { - for (offset = 0; offset < MIC_NUM_OFFSETS; offset++) { - if (mdev->irq_info.mic_msi_map[entry] & BIT(offset)) - mdev->intr_ops->program_msi_to_src_map(mdev, - entry, offset, true); - } - } -} diff --git a/drivers/misc/mic/host/mic_intr.h b/drivers/misc/mic/host/mic_intr.h deleted file mode 100644 index b14ba818006f..000000000000 --- a/drivers/misc/mic/host/mic_intr.h +++ /dev/null @@ -1,137 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#ifndef _MIC_INTR_H_ -#define _MIC_INTR_H_ - -#include -#include -/* - * The minimum number of msix vectors required for normal operation. - * 3 for virtio network, console and block devices. - * 1 for card shutdown notifications. - * 4 for host owned DMA channels. - * 1 for SCIF - */ -#define MIC_MIN_MSIX 9 -#define MIC_NUM_OFFSETS 32 - -/** - * mic_intr_source - The type of source that will generate - * the interrupt.The number of types needs to be in sync with - * MIC_NUM_INTR_TYPES - * - * MIC_INTR_DB: The source is a doorbell - * MIC_INTR_DMA: The source is a DMA channel - * MIC_INTR_ERR: The source is an error interrupt e.g. SBOX ERR - * MIC_NUM_INTR_TYPES: Total number of interrupt sources. - */ -enum mic_intr_type { - MIC_INTR_DB = 0, - MIC_INTR_DMA, - MIC_INTR_ERR, - MIC_NUM_INTR_TYPES -}; - -/** - * struct mic_intr_info - Contains h/w specific interrupt sources - * information. - * - * @intr_start_idx: Contains the starting indexes of the - * interrupt types. - * @intr_len: Contains the length of the interrupt types. - */ -struct mic_intr_info { - u16 intr_start_idx[MIC_NUM_INTR_TYPES]; - u16 intr_len[MIC_NUM_INTR_TYPES]; -}; - -/** - * struct mic_irq_info - OS specific irq information - * - * @next_avail_src: next available doorbell that can be assigned. - * @msix_entries: msix entries allocated while setting up MSI-x - * @mic_msi_map: The MSI/MSI-x mapping information. - * @num_vectors: The number of MSI/MSI-x vectors that have been allocated. - * @cb_ida: callback ID allocator to track the callbacks registered. - * @mic_intr_lock: spinlock to protect the interrupt callback list. - * @mic_thread_lock: spinlock to protect the thread callback list. - * This lock is used to protect against thread_fn while - * mic_intr_lock is used to protect against interrupt handler. - * @cb_list: Array of callback lists one for each source. - * @mask: Mask used by the main thread fn to call the underlying thread fns. - */ -struct mic_irq_info { - int next_avail_src; - struct msix_entry *msix_entries; - u32 *mic_msi_map; - u16 num_vectors; - struct ida cb_ida; - spinlock_t mic_intr_lock; - spinlock_t mic_thread_lock; - struct list_head *cb_list; - unsigned long mask; -}; - -/** - * struct mic_intr_cb - Interrupt callback structure. - * - * @handler: The callback function - * @thread_fn: The thread_fn. - * @data: Private data of the requester. - * @cb_id: The callback id. Identifies this callback. - * @list: list head pointing to the next callback structure. 
- */ -struct mic_intr_cb { - irq_handler_t handler; - irq_handler_t thread_fn; - void *data; - int cb_id; - struct list_head list; -}; - -/** - * struct mic_irq - opaque pointer used as cookie - */ -struct mic_irq; - -/* Forward declaration */ -struct mic_device; - -/** - * struct mic_hw_intr_ops: MIC HW specific interrupt operations - * @intr_init: Initialize H/W specific interrupt information. - * @enable_interrupts: Enable interrupts from the hardware. - * @disable_interrupts: Disable interrupts from the hardware. - * @program_msi_to_src_map: Update MSI mapping registers with - * irq information. - * @read_msi_to_src_map: Read MSI mapping registers containing - * irq information. - */ -struct mic_hw_intr_ops { - void (*intr_init)(struct mic_device *mdev); - void (*enable_interrupts)(struct mic_device *mdev); - void (*disable_interrupts)(struct mic_device *mdev); - void (*program_msi_to_src_map) (struct mic_device *mdev, - int idx, int intr_src, bool set); - u32 (*read_msi_to_src_map) (struct mic_device *mdev, - int idx); -}; - -int mic_next_db(struct mic_device *mdev); -struct mic_irq * -mic_request_threaded_irq(struct mic_device *mdev, - irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int intr_src, - enum mic_intr_type type); -void mic_free_irq(struct mic_device *mdev, - struct mic_irq *cookie, void *data); -int mic_setup_interrupts(struct mic_device *mdev, struct pci_dev *pdev); -void mic_free_interrupts(struct mic_device *mdev, struct pci_dev *pdev); -void mic_intr_restore(struct mic_device *mdev); -#endif diff --git a/drivers/misc/mic/host/mic_main.c b/drivers/misc/mic/host/mic_main.c deleted file mode 100644 index ea4608527ea0..000000000000 --- a/drivers/misc/mic/host/mic_main.c +++ /dev/null @@ -1,335 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. 
- */ -#include -#include -#include -#include - -#include -#include "../common/mic_dev.h" -#include "mic_device.h" -#include "mic_x100.h" -#include "mic_smpt.h" - -static const char mic_driver_name[] = "mic"; - -static const struct pci_device_id mic_pci_tbl[] = { - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2250)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2251)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2252)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2253)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2254)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2255)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2256)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2257)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2258)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_2259)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_225a)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_225b)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_225c)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_225d)}, - {PCI_DEVICE(PCI_VENDOR_ID_INTEL, MIC_X100_PCI_DEVICE_225e)}, - - /* required last entry */ - { 0, } -}; - -MODULE_DEVICE_TABLE(pci, mic_pci_tbl); - -/* ID allocator for MIC devices */ -static struct ida g_mic_ida; - -/* Initialize the device page */ -static int mic_dp_init(struct mic_device *mdev) -{ - mdev->dp = kzalloc(MIC_DP_SIZE, GFP_KERNEL); - if (!mdev->dp) - return -ENOMEM; - - mdev->dp_dma_addr = mic_map_single(mdev, - mdev->dp, MIC_DP_SIZE); - if (mic_map_error(mdev->dp_dma_addr)) { - kfree(mdev->dp); - dev_err(&mdev->pdev->dev, "%s %d err %d\n", - __func__, __LINE__, -ENOMEM); - return -ENOMEM; - } - mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr); - mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32); - return 0; -} - -/* Uninitialize the device page */ -static void mic_dp_uninit(struct mic_device *mdev) -{ - mic_unmap_single(mdev, mdev->dp_dma_addr, MIC_DP_SIZE); - kfree(mdev->dp); -} - -/** - * mic_ops_init: Initialize HW specific operation tables. - * - * @mdev: pointer to mic_device instance - * - * returns none. - */ -static void mic_ops_init(struct mic_device *mdev) -{ - switch (mdev->family) { - case MIC_FAMILY_X100: - mdev->ops = &mic_x100_ops; - mdev->intr_ops = &mic_x100_intr_ops; - mdev->smpt_ops = &mic_x100_smpt_ops; - break; - default: - break; - } -} - -/** - * mic_get_family - Determine hardware family to which this MIC belongs. - * - * @pdev: The pci device structure - * - * returns family. - */ -static enum mic_hw_family mic_get_family(struct pci_dev *pdev) -{ - enum mic_hw_family family; - - switch (pdev->device) { - case MIC_X100_PCI_DEVICE_2250: - case MIC_X100_PCI_DEVICE_2251: - case MIC_X100_PCI_DEVICE_2252: - case MIC_X100_PCI_DEVICE_2253: - case MIC_X100_PCI_DEVICE_2254: - case MIC_X100_PCI_DEVICE_2255: - case MIC_X100_PCI_DEVICE_2256: - case MIC_X100_PCI_DEVICE_2257: - case MIC_X100_PCI_DEVICE_2258: - case MIC_X100_PCI_DEVICE_2259: - case MIC_X100_PCI_DEVICE_225a: - case MIC_X100_PCI_DEVICE_225b: - case MIC_X100_PCI_DEVICE_225c: - case MIC_X100_PCI_DEVICE_225d: - case MIC_X100_PCI_DEVICE_225e: - family = MIC_FAMILY_X100; - break; - default: - family = MIC_FAMILY_UNKNOWN; - break; - } - return family; -} - -/** - * mic_device_init - Allocates and initializes the MIC device structure - * - * @mdev: pointer to mic_device instance - * @pdev: The pci device structure - * - * returns none. 
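
mic_dp_init() above publishes the 64-bit device-page DMA address to the card by splitting it across two 32-bit scratchpad registers. A standalone illustration of the split and of the reassembly the card side would perform; spad[] is an illustrative stand-in for the MIC_DPLO_SPAD/MIC_DPHI_SPAD pair.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dp_dma_addr = 0x0000001234567890ULL;   /* made-up address */
	uint32_t spad[2];

	spad[0] = (uint32_t)dp_dma_addr;           /* write_spad(MIC_DPLO_SPAD, ...) */
	spad[1] = (uint32_t)(dp_dma_addr >> 32);   /* write_spad(MIC_DPHI_SPAD, ...) */

	/* the card firmware reassembles the address the same way */
	uint64_t rebuilt = (uint64_t)spad[1] << 32 | spad[0];
	printf("match: %d\n", rebuilt == dp_dma_addr);   /* match: 1 */
	return 0;
}
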
- */ -static void -mic_device_init(struct mic_device *mdev, struct pci_dev *pdev) -{ - mdev->pdev = pdev; - mdev->family = mic_get_family(pdev); - mdev->stepping = pdev->revision; - mic_ops_init(mdev); - mutex_init(&mdev->mic_mutex); - mdev->irq_info.next_avail_src = 0; -} - -/** - * mic_probe - Device Initialization Routine - * - * @pdev: PCI device structure - * @ent: entry in mic_pci_tbl - * - * returns 0 on success, < 0 on failure. - */ -static int mic_probe(struct pci_dev *pdev, - const struct pci_device_id *ent) -{ - int rc; - struct mic_device *mdev; - - mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); - if (!mdev) { - rc = -ENOMEM; - goto mdev_alloc_fail; - } - mdev->id = ida_simple_get(&g_mic_ida, 0, MIC_MAX_NUM_DEVS, GFP_KERNEL); - if (mdev->id < 0) { - rc = mdev->id; - dev_err(&pdev->dev, "ida_simple_get failed rc %d\n", rc); - goto ida_fail; - } - - mic_device_init(mdev, pdev); - - rc = pci_enable_device(pdev); - if (rc) { - dev_err(&pdev->dev, "failed to enable pci device.\n"); - goto ida_remove; - } - - pci_set_master(pdev); - - rc = pci_request_regions(pdev, mic_driver_name); - if (rc) { - dev_err(&pdev->dev, "failed to get pci regions.\n"); - goto disable_device; - } - - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); - if (rc) { - dev_err(&pdev->dev, "Cannot set DMA mask\n"); - goto release_regions; - } - - mdev->mmio.pa = pci_resource_start(pdev, mdev->ops->mmio_bar); - mdev->mmio.len = pci_resource_len(pdev, mdev->ops->mmio_bar); - mdev->mmio.va = pci_ioremap_bar(pdev, mdev->ops->mmio_bar); - if (!mdev->mmio.va) { - dev_err(&pdev->dev, "Cannot remap MMIO BAR\n"); - rc = -EIO; - goto release_regions; - } - - mdev->aper.pa = pci_resource_start(pdev, mdev->ops->aper_bar); - mdev->aper.len = pci_resource_len(pdev, mdev->ops->aper_bar); - mdev->aper.va = ioremap_wc(mdev->aper.pa, mdev->aper.len); - if (!mdev->aper.va) { - dev_err(&pdev->dev, "Cannot remap Aperture BAR\n"); - rc = -EIO; - goto unmap_mmio; - } - - mdev->intr_ops->intr_init(mdev); - rc = mic_setup_interrupts(mdev, pdev); - if (rc) { - dev_err(&pdev->dev, "mic_setup_interrupts failed %d\n", rc); - goto unmap_aper; - } - rc = mic_smpt_init(mdev); - if (rc) { - dev_err(&pdev->dev, "smpt_init failed %d\n", rc); - goto free_interrupts; - } - - pci_set_drvdata(pdev, mdev); - - rc = mic_dp_init(mdev); - if (rc) { - dev_err(&pdev->dev, "mic_dp_init failed rc %d\n", rc); - goto smpt_uninit; - } - mic_bootparam_init(mdev); - mic_create_debug_dir(mdev); - - mdev->cosm_dev = cosm_register_device(&mdev->pdev->dev, &cosm_hw_ops); - if (IS_ERR(mdev->cosm_dev)) { - rc = PTR_ERR(mdev->cosm_dev); - dev_err(&pdev->dev, "cosm_add_device failed rc %d\n", rc); - goto cleanup_debug_dir; - } - return 0; -cleanup_debug_dir: - mic_delete_debug_dir(mdev); - mic_dp_uninit(mdev); -smpt_uninit: - mic_smpt_uninit(mdev); -free_interrupts: - mic_free_interrupts(mdev, pdev); -unmap_aper: - iounmap(mdev->aper.va); -unmap_mmio: - iounmap(mdev->mmio.va); -release_regions: - pci_release_regions(pdev); -disable_device: - pci_disable_device(pdev); -ida_remove: - ida_simple_remove(&g_mic_ida, mdev->id); -ida_fail: - kfree(mdev); -mdev_alloc_fail: - dev_err(&pdev->dev, "Probe failed rc %d\n", rc); - return rc; -} - -/** - * mic_remove - Device Removal Routine - * mic_remove is called by the PCI subsystem to alert the driver - * that it should release a PCI device. 
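
mic_probe() above is a textbook instance of the kernel's goto-unwind idiom: each failure jumps to a label that releases exactly what was acquired before the failing step, in reverse order, so there is a single teardown path. A minimal self-contained sketch of the shape; the resource names are illustrative.

#include <stdio.h>

static int acquire(const char *what, int ok)
{
	printf("acquire %s: %s\n", what, ok ? "ok" : "fail");
	return ok ? 0 : -1;
}

static int probe(int fail_at)
{
	int rc;

	rc = acquire("regions", fail_at != 1);
	if (rc)
		goto out;
	rc = acquire("mmio", fail_at != 2);
	if (rc)
		goto release_regions;
	rc = acquire("interrupts", fail_at != 3);
	if (rc)
		goto unmap_mmio;
	return 0;

unmap_mmio:
	printf("release mmio\n");
release_regions:
	printf("release regions\n");
out:
	return rc;
}

int main(void) { return probe(3) ? 1 : 0; }
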
- * - * @pdev: PCI device structure - */ -static void mic_remove(struct pci_dev *pdev) -{ - struct mic_device *mdev; - - mdev = pci_get_drvdata(pdev); - if (!mdev) - return; - - cosm_unregister_device(mdev->cosm_dev); - mic_delete_debug_dir(mdev); - mic_dp_uninit(mdev); - mic_smpt_uninit(mdev); - mic_free_interrupts(mdev, pdev); - iounmap(mdev->aper.va); - iounmap(mdev->mmio.va); - pci_release_regions(pdev); - pci_disable_device(pdev); - ida_simple_remove(&g_mic_ida, mdev->id); - kfree(mdev); -} - -static struct pci_driver mic_driver = { - .name = mic_driver_name, - .id_table = mic_pci_tbl, - .probe = mic_probe, - .remove = mic_remove -}; - -static int __init mic_init(void) -{ - int ret; - - request_module("mic_x100_dma"); - mic_init_debugfs(); - ida_init(&g_mic_ida); - ret = pci_register_driver(&mic_driver); - if (ret) { - pr_err("pci_register_driver failed ret %d\n", ret); - goto cleanup_debugfs; - } - return 0; -cleanup_debugfs: - ida_destroy(&g_mic_ida); - mic_exit_debugfs(); - return ret; -} - -static void __exit mic_exit(void) -{ - pci_unregister_driver(&mic_driver); - ida_destroy(&g_mic_ida); - mic_exit_debugfs(); -} - -module_init(mic_init); -module_exit(mic_exit); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) MIC X100 Host driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/host/mic_smpt.c b/drivers/misc/mic/host/mic_smpt.c deleted file mode 100644 index 50d1bebecd54..000000000000 --- a/drivers/misc/mic/host/mic_smpt.c +++ /dev/null @@ -1,427 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#include - -#include "../common/mic_dev.h" -#include "mic_device.h" -#include "mic_smpt.h" - -static inline u64 mic_system_page_mask(struct mic_device *mdev) -{ - return (1ULL << mdev->smpt->info.page_shift) - 1ULL; -} - -static inline u8 mic_sys_addr_to_smpt(struct mic_device *mdev, dma_addr_t pa) -{ - return (pa - mdev->smpt->info.base) >> mdev->smpt->info.page_shift; -} - -static inline u64 mic_smpt_to_pa(struct mic_device *mdev, u8 index) -{ - return mdev->smpt->info.base + (index * mdev->smpt->info.page_size); -} - -static inline u64 mic_smpt_offset(struct mic_device *mdev, dma_addr_t pa) -{ - return pa & mic_system_page_mask(mdev); -} - -static inline u64 mic_smpt_align_low(struct mic_device *mdev, dma_addr_t pa) -{ - return ALIGN(pa - mic_system_page_mask(mdev), - mdev->smpt->info.page_size); -} - -static inline u64 mic_smpt_align_high(struct mic_device *mdev, dma_addr_t pa) -{ - return ALIGN(pa, mdev->smpt->info.page_size); -} - -/* Total Cumulative system memory accessible by MIC across all SMPT entries */ -static inline u64 mic_max_system_memory(struct mic_device *mdev) -{ - return mdev->smpt->info.num_reg * mdev->smpt->info.page_size; -} - -/* Maximum system memory address accessible by MIC */ -static inline u64 mic_max_system_addr(struct mic_device *mdev) -{ - return mdev->smpt->info.base + mic_max_system_memory(mdev) - 1ULL; -} - -/* Check if the DMA address is a MIC system memory address */ -static inline bool -mic_is_system_addr(struct mic_device *mdev, dma_addr_t pa) -{ - return pa >= mdev->smpt->info.base && pa <= mic_max_system_addr(mdev); -} - -/* Populate an SMPT entry and update the reference counts. 
*/ -static void mic_add_smpt_entry(int spt, s64 *ref, u64 addr, - int entries, struct mic_device *mdev) -{ - struct mic_smpt_info *smpt_info = mdev->smpt; - int i; - - for (i = spt; i < spt + entries; i++, - addr += smpt_info->info.page_size) { - if (!smpt_info->entry[i].ref_count && - (smpt_info->entry[i].dma_addr != addr)) { - mdev->smpt_ops->set(mdev, addr, i); - smpt_info->entry[i].dma_addr = addr; - } - smpt_info->entry[i].ref_count += ref[i - spt]; - } -} - -/* - * Find an available MIC address in MIC SMPT address space - * for a given DMA address and size. - */ -static dma_addr_t mic_smpt_op(struct mic_device *mdev, u64 dma_addr, - int entries, s64 *ref, size_t size) -{ - int spt; - int ae = 0; - int i; - unsigned long flags; - dma_addr_t mic_addr = 0; - dma_addr_t addr = dma_addr; - struct mic_smpt_info *smpt_info = mdev->smpt; - - spin_lock_irqsave(&smpt_info->smpt_lock, flags); - - /* find existing entries */ - for (i = 0; i < smpt_info->info.num_reg; i++) { - if (smpt_info->entry[i].dma_addr == addr) { - ae++; - addr += smpt_info->info.page_size; - } else if (ae) /* cannot find contiguous entries */ - goto not_found; - - if (ae == entries) - goto found; - } - - /* find free entry */ - for (ae = 0, i = 0; i < smpt_info->info.num_reg; i++) { - ae = (smpt_info->entry[i].ref_count == 0) ? ae + 1 : 0; - if (ae == entries) - goto found; - } - -not_found: - spin_unlock_irqrestore(&smpt_info->smpt_lock, flags); - return mic_addr; - -found: - spt = i - entries + 1; - mic_addr = mic_smpt_to_pa(mdev, spt); - mic_add_smpt_entry(spt, ref, dma_addr, entries, mdev); - smpt_info->map_count++; - smpt_info->ref_count += (s64)size; - spin_unlock_irqrestore(&smpt_info->smpt_lock, flags); - return mic_addr; -} - -/* - * Returns number of smpt entries needed for dma_addr to dma_addr + size - * also returns the reference count array for each of those entries - * and the starting smpt address - */ -static int mic_get_smpt_ref_count(struct mic_device *mdev, dma_addr_t dma_addr, - size_t size, s64 *ref, u64 *smpt_start) -{ - u64 start = dma_addr; - u64 end = dma_addr + size; - int i = 0; - - while (start < end) { - ref[i++] = min(mic_smpt_align_high(mdev, start + 1), - end) - start; - start = mic_smpt_align_high(mdev, start + 1); - } - - if (smpt_start) - *smpt_start = mic_smpt_align_low(mdev, dma_addr); - - return i; -} - -/* - * mic_to_dma_addr - Converts a MIC address to a DMA address. - * - * @mdev: pointer to mic_device instance. - * @mic_addr: MIC address. - * - * returns a DMA address. - */ -dma_addr_t mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr) -{ - struct mic_smpt_info *smpt_info = mdev->smpt; - int spt; - dma_addr_t dma_addr; - - if (!mic_is_system_addr(mdev, mic_addr)) { - dev_err(&mdev->pdev->dev, - "mic_addr is invalid. mic_addr = 0x%llx\n", mic_addr); - return -EINVAL; - } - spt = mic_sys_addr_to_smpt(mdev, mic_addr); - dma_addr = smpt_info->entry[spt].dma_addr + - mic_smpt_offset(mdev, mic_addr); - return dma_addr; -} - -/** - * mic_map - Maps a DMA address to a MIC physical address. - * - * @mdev: pointer to mic_device instance. - * @dma_addr: DMA address. - * @size: Size of the region to be mapped. - * - * This API converts the DMA address provided to a DMA address understood - * by MIC. Caller should check for errors by calling mic_map_error(..). - * - * returns DMA address as required by MIC. 
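
The helpers above carve a DMA range into system pages: mic_get_smpt_ref_count() walks from dma_addr to dma_addr + size and emits one byte count per SMPT page touched. A standalone reimplementation of that arithmetic using the X100 geometry (page_shift = 34, i.e. 16 GB pages); the sample range is made up and deliberately crosses one page boundary.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 34
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define ALIGN_UP(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	uint64_t addr = PAGE_SIZE - (256ULL << 20);  /* 256 MB below a boundary */
	uint64_t size = 512ULL << 20;                /* 512 MB mapping */
	uint64_t start = addr, end = addr + size;
	int entries = 0;

	while (start < end) {
		uint64_t next = ALIGN_UP(start + 1);   /* mic_smpt_align_high() */
		uint64_t ref = (next < end ? next : end) - start;
		printf("entry %d covers 0x%llx bytes\n", entries,
		       (unsigned long long)ref);
		entries++;
		start = next;
	}
	printf("entries needed: %d\n", entries);     /* 2: range crosses a boundary */
	return 0;
}
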
- */ -dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size) -{ - dma_addr_t mic_addr = 0; - int num_entries; - s64 *ref; - u64 smpt_start; - - if (!size || size > mic_max_system_memory(mdev)) - return mic_addr; - - ref = kmalloc_array(mdev->smpt->info.num_reg, sizeof(s64), GFP_ATOMIC); - if (!ref) - return mic_addr; - - num_entries = mic_get_smpt_ref_count(mdev, dma_addr, size, - ref, &smpt_start); - - /* Set the smpt table appropriately and get 16G aligned mic address */ - mic_addr = mic_smpt_op(mdev, smpt_start, num_entries, ref, size); - - kfree(ref); - - /* - * If mic_addr is zero then its an error case - * since mic_addr can never be zero. - * else generate mic_addr by adding the 16G offset in dma_addr - */ - if (!mic_addr && MIC_FAMILY_X100 == mdev->family) { - dev_err(&mdev->pdev->dev, - "mic_map failed dma_addr 0x%llx size 0x%lx\n", - dma_addr, size); - return mic_addr; - } else { - return mic_addr + mic_smpt_offset(mdev, dma_addr); - } -} - -/** - * mic_unmap - Unmaps a MIC physical address. - * - * @mdev: pointer to mic_device instance. - * @mic_addr: MIC physical address. - * @size: Size of the region to be unmapped. - * - * This API unmaps the mappings created by mic_map(..). - * - * returns None. - */ -void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) -{ - struct mic_smpt_info *smpt_info = mdev->smpt; - s64 *ref; - int num_smpt; - int spt; - int i; - unsigned long flags; - - if (!size) - return; - - if (!mic_is_system_addr(mdev, mic_addr)) { - dev_err(&mdev->pdev->dev, - "invalid address: 0x%llx\n", mic_addr); - return; - } - - spt = mic_sys_addr_to_smpt(mdev, mic_addr); - ref = kmalloc_array(mdev->smpt->info.num_reg, sizeof(s64), GFP_ATOMIC); - if (!ref) - return; - - /* Get number of smpt entries to be mapped, ref count array */ - num_smpt = mic_get_smpt_ref_count(mdev, mic_addr, size, ref, NULL); - - spin_lock_irqsave(&smpt_info->smpt_lock, flags); - smpt_info->unmap_count++; - smpt_info->ref_count -= (s64)size; - - for (i = spt; i < spt + num_smpt; i++) { - smpt_info->entry[i].ref_count -= ref[i - spt]; - if (smpt_info->entry[i].ref_count < 0) - dev_warn(&mdev->pdev->dev, - "ref count for entry %d is negative\n", i); - } - spin_unlock_irqrestore(&smpt_info->smpt_lock, flags); - kfree(ref); -} - -/** - * mic_map_single - Maps a virtual address to a MIC physical address. - * - * @mdev: pointer to mic_device instance. - * @va: Kernel direct mapped virtual address. - * @size: Size of the region to be mapped. - * - * This API calls pci_map_single(..) for the direct mapped virtual address - * and then converts the DMA address provided to a DMA address understood - * by MIC. Caller should check for errors by calling mic_map_error(..). - * - * returns DMA address as required by MIC. - */ -dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size) -{ - dma_addr_t mic_addr = 0; - struct pci_dev *pdev = mdev->pdev; - dma_addr_t dma_addr = - pci_map_single(pdev, va, size, PCI_DMA_BIDIRECTIONAL); - - if (!pci_dma_mapping_error(pdev, dma_addr)) { - mic_addr = mic_map(mdev, dma_addr, size); - if (!mic_addr) { - dev_err(&mdev->pdev->dev, - "mic_map failed dma_addr 0x%llx size 0x%lx\n", - dma_addr, size); - pci_unmap_single(pdev, dma_addr, - size, PCI_DMA_BIDIRECTIONAL); - } - } - return mic_addr; -} - -/** - * mic_unmap_single - Unmaps a MIC physical address. - * - * @mdev: pointer to mic_device instance. - * @mic_addr: MIC physical address. - * @size: Size of the region to be unmapped. 
- * - * This API unmaps the mappings created by mic_map_single(..). - * - * returns None. - */ -void -mic_unmap_single(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) -{ - struct pci_dev *pdev = mdev->pdev; - dma_addr_t dma_addr = mic_to_dma_addr(mdev, mic_addr); - mic_unmap(mdev, mic_addr, size); - pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); -} - -/** - * mic_smpt_init - Initialize MIC System Memory Page Tables. - * - * @mdev: pointer to mic_device instance. - * - * returns 0 for success and -errno for error. - */ -int mic_smpt_init(struct mic_device *mdev) -{ - int i, err = 0; - dma_addr_t dma_addr; - struct mic_smpt_info *smpt_info; - - mdev->smpt = kmalloc(sizeof(*mdev->smpt), GFP_KERNEL); - if (!mdev->smpt) - return -ENOMEM; - - smpt_info = mdev->smpt; - mdev->smpt_ops->init(mdev); - smpt_info->entry = kmalloc_array(smpt_info->info.num_reg, - sizeof(*smpt_info->entry), GFP_KERNEL); - if (!smpt_info->entry) { - err = -ENOMEM; - goto free_smpt; - } - spin_lock_init(&smpt_info->smpt_lock); - for (i = 0; i < smpt_info->info.num_reg; i++) { - dma_addr = i * smpt_info->info.page_size; - smpt_info->entry[i].dma_addr = dma_addr; - smpt_info->entry[i].ref_count = 0; - mdev->smpt_ops->set(mdev, dma_addr, i); - } - smpt_info->ref_count = 0; - smpt_info->map_count = 0; - smpt_info->unmap_count = 0; - return 0; -free_smpt: - kfree(smpt_info); - return err; -} - -/** - * mic_smpt_uninit - UnInitialize MIC System Memory Page Tables. - * - * @mdev: pointer to mic_device instance. - * - * returns None. - */ -void mic_smpt_uninit(struct mic_device *mdev) -{ - struct mic_smpt_info *smpt_info = mdev->smpt; - int i; - - dev_dbg(&mdev->pdev->dev, - "nodeid %d SMPT ref count %lld map %lld unmap %lld\n", - mdev->id, smpt_info->ref_count, - smpt_info->map_count, smpt_info->unmap_count); - - for (i = 0; i < smpt_info->info.num_reg; i++) { - dev_dbg(&mdev->pdev->dev, - "SMPT entry[%d] dma_addr = 0x%llx ref_count = %lld\n", - i, smpt_info->entry[i].dma_addr, - smpt_info->entry[i].ref_count); - if (smpt_info->entry[i].ref_count) - dev_warn(&mdev->pdev->dev, - "ref count for entry %d is not zero\n", i); - } - kfree(smpt_info->entry); - kfree(smpt_info); -} - -/** - * mic_smpt_restore - Restore MIC System Memory Page Tables. - * - * @mdev: pointer to mic_device instance. - * - * Restore the SMPT registers to values previously stored in the - * SW data structures. Some MIC steppings lose register state - * across resets and this API should be called for performing - * a restore operation if required. - * - * returns None. - */ -void mic_smpt_restore(struct mic_device *mdev) -{ - int i; - dma_addr_t dma_addr; - - for (i = 0; i < mdev->smpt->info.num_reg; i++) { - dma_addr = mdev->smpt->entry[i].dma_addr; - mdev->smpt_ops->set(mdev, dma_addr, i); - } -} diff --git a/drivers/misc/mic/host/mic_smpt.h b/drivers/misc/mic/host/mic_smpt.h deleted file mode 100644 index 3b1ec14a9d81..000000000000 --- a/drivers/misc/mic/host/mic_smpt.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#ifndef MIC_SMPT_H -#define MIC_SMPT_H -/** - * struct mic_smpt_ops - MIC HW specific SMPT operations. - * @init: Initialize hardware specific SMPT information in mic_smpt_hw_info. - * @set: Set the value for a particular SMPT entry. 
- */ -struct mic_smpt_ops { - void (*init)(struct mic_device *mdev); - void (*set)(struct mic_device *mdev, dma_addr_t dma_addr, u8 index); -}; - -/** - * struct mic_smpt - MIC SMPT entry information. - * @dma_addr: Base DMA address for this SMPT entry. - * @ref_count: Number of active mappings for this SMPT entry in bytes. - */ -struct mic_smpt { - dma_addr_t dma_addr; - s64 ref_count; -}; - -/** - * struct mic_smpt_hw_info - MIC SMPT hardware specific information. - * @num_reg: Number of SMPT registers. - * @page_shift: System memory page shift. - * @page_size: System memory page size. - * @base: System address base. - */ -struct mic_smpt_hw_info { - u8 num_reg; - u8 page_shift; - u64 page_size; - u64 base; -}; - -/** - * struct mic_smpt_info - MIC SMPT information. - * @entry: Array of SMPT entries. - * @smpt_lock: Spin lock protecting access to SMPT data structures. - * @info: Hardware specific SMPT information. - * @ref_count: Number of active SMPT mappings (for debug). - * @map_count: Number of SMPT mappings created (for debug). - * @unmap_count: Number of SMPT mappings destroyed (for debug). - */ -struct mic_smpt_info { - struct mic_smpt *entry; - spinlock_t smpt_lock; - struct mic_smpt_hw_info info; - s64 ref_count; - s64 map_count; - s64 unmap_count; -}; - -dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size); -void mic_unmap_single(struct mic_device *mdev, - dma_addr_t mic_addr, size_t size); -dma_addr_t mic_map(struct mic_device *mdev, - dma_addr_t dma_addr, size_t size); -void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size); -dma_addr_t mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr); - -/** - * mic_map_error - Check a MIC address for errors. - * - * @mdev: pointer to mic_device instance. - * - * returns Whether there was an error during mic_map..(..) APIs. - */ -static inline bool mic_map_error(dma_addr_t mic_addr) -{ - return !mic_addr; -} - -int mic_smpt_init(struct mic_device *mdev); -void mic_smpt_uninit(struct mic_device *mdev); -void mic_smpt_restore(struct mic_device *mdev); - -#endif diff --git a/drivers/misc/mic/host/mic_x100.c b/drivers/misc/mic/host/mic_x100.c deleted file mode 100644 index f5536c1ad607..000000000000 --- a/drivers/misc/mic/host/mic_x100.c +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. - */ -#include -#include -#include -#include -#include - -#include "../common/mic_dev.h" -#include "mic_device.h" -#include "mic_x100.h" -#include "mic_smpt.h" - -static const u16 mic_x100_intr_init[] = { - MIC_X100_DOORBELL_IDX_START, - MIC_X100_DMA_IDX_START, - MIC_X100_ERR_IDX_START, - MIC_X100_NUM_DOORBELL, - MIC_X100_NUM_DMA, - MIC_X100_NUM_ERR, -}; - -/** - * mic_x100_write_spad - write to the scratchpad register - * @mdev: pointer to mic_device instance - * @idx: index to the scratchpad register, 0 based - * @val: the data value to put into the register - * - * This function allows writing of a 32bit value to the indexed scratchpad - * register. - * - * RETURNS: none. 
- */ -static void -mic_x100_write_spad(struct mic_device *mdev, unsigned int idx, u32 val) -{ - dev_dbg(&mdev->pdev->dev, "Writing 0x%x to scratch pad index %d\n", - val, idx); - mic_mmio_write(&mdev->mmio, val, - MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_SPAD0 + idx * 4); -} - -/** - * mic_x100_read_spad - read from the scratchpad register - * @mdev: pointer to mic_device instance - * @idx: index to scratchpad register, 0 based - * - * This function allows reading of the 32bit scratchpad register. - * - * RETURNS: The value in the register. - */ -static u32 -mic_x100_read_spad(struct mic_device *mdev, unsigned int idx) -{ - u32 val = mic_mmio_read(&mdev->mmio, - MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_SPAD0 + idx * 4); - - dev_dbg(&mdev->pdev->dev, - "Reading 0x%x from scratch pad index %d\n", val, idx); - return val; -} - -/** - * mic_x100_enable_interrupts - Enable interrupts. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_enable_interrupts(struct mic_device *mdev) -{ - u32 reg; - struct mic_mw *mw = &mdev->mmio; - u32 sice0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SICE0; - u32 siac0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SIAC0; - - reg = mic_mmio_read(mw, sice0); - reg |= MIC_X100_SBOX_DBR_BITS(0xf) | MIC_X100_SBOX_DMA_BITS(0xff); - mic_mmio_write(mw, reg, sice0); - - /* - * Enable auto-clear when enabling interrupts. Applicable only for - * MSI-x. Legacy and MSI mode cannot have auto-clear enabled. - */ - if (mdev->irq_info.num_vectors > 1) { - reg = mic_mmio_read(mw, siac0); - reg |= MIC_X100_SBOX_DBR_BITS(0xf) | - MIC_X100_SBOX_DMA_BITS(0xff); - mic_mmio_write(mw, reg, siac0); - } -} - -/** - * mic_x100_disable_interrupts - Disable interrupts. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_disable_interrupts(struct mic_device *mdev) -{ - u32 reg; - struct mic_mw *mw = &mdev->mmio; - u32 sice0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SICE0; - u32 siac0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SIAC0; - u32 sicc0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SICC0; - - reg = mic_mmio_read(mw, sice0); - mic_mmio_write(mw, reg, sicc0); - - if (mdev->irq_info.num_vectors > 1) { - reg = mic_mmio_read(mw, siac0); - reg &= ~(MIC_X100_SBOX_DBR_BITS(0xf) | - MIC_X100_SBOX_DMA_BITS(0xff)); - mic_mmio_write(mw, reg, siac0); - } -} - -/** - * mic_x100_send_sbox_intr - Send an MIC_X100_SBOX interrupt to MIC. - * @mdev: pointer to mic_device instance - * @doorbell: doorbell number - */ -static void mic_x100_send_sbox_intr(struct mic_device *mdev, - int doorbell) -{ - struct mic_mw *mw = &mdev->mmio; - u64 apic_icr_offset = MIC_X100_SBOX_APICICR0 + doorbell * 8; - u32 apicicr_low = mic_mmio_read(mw, MIC_X100_SBOX_BASE_ADDRESS + - apic_icr_offset); - - /* for MIC we need to make sure we "hit" the send_icr bit (13) */ - apicicr_low = (apicicr_low | (1 << 13)); - - /* Ensure that the interrupt is ordered w.r.t. previous stores. */ - wmb(); - mic_mmio_write(mw, apicicr_low, - MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset); -} - -/** - * mic_x100_send_rdmasr_intr - Send an RDMASR interrupt to MIC. - * @mdev: pointer to mic_device instance - * @doorbell: doorbell number - */ -static void mic_x100_send_rdmasr_intr(struct mic_device *mdev, - int doorbell) -{ - int rdmasr_offset = MIC_X100_SBOX_RDMASR0 + (doorbell << 2); - /* Ensure that the interrupt is ordered w.r.t. previous stores. 
*/ - wmb(); - mic_mmio_write(&mdev->mmio, 0, - MIC_X100_SBOX_BASE_ADDRESS + rdmasr_offset); -} - -/** - * mic_x100_send_intr - Send interrupt to MIC. - * @mdev: pointer to mic_device instance - * @doorbell: doorbell number. - */ -static void mic_x100_send_intr(struct mic_device *mdev, int doorbell) -{ - int rdmasr_db; - if (doorbell < MIC_X100_NUM_SBOX_IRQ) { - mic_x100_send_sbox_intr(mdev, doorbell); - } else { - rdmasr_db = doorbell - MIC_X100_NUM_SBOX_IRQ; - mic_x100_send_rdmasr_intr(mdev, rdmasr_db); - } -} - -/** - * mic_x100_ack_interrupt - Read the interrupt sources register and - * clear it. This function will be called in the MSI/INTx case. - * @mdev: Pointer to mic_device instance. - * - * Returns: bitmask of interrupt sources triggered. - */ -static u32 mic_x100_ack_interrupt(struct mic_device *mdev) -{ - u32 sicr0 = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SICR0; - u32 reg = mic_mmio_read(&mdev->mmio, sicr0); - mic_mmio_write(&mdev->mmio, reg, sicr0); - return reg; -} - -/** - * mic_x100_intr_workarounds - These hardware specific workarounds are - * to be invoked every time an interrupt is handled. - * @mdev: Pointer to mic_device instance. - * - * Returns: none - */ -static void mic_x100_intr_workarounds(struct mic_device *mdev) -{ - struct mic_mw *mw = &mdev->mmio; - - /* Clear pending bit array. */ - if (MIC_A0_STEP == mdev->stepping) - mic_mmio_write(mw, 1, MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_MSIXPBACR); - - if (mdev->stepping >= MIC_B0_STEP) - mdev->intr_ops->enable_interrupts(mdev); -} - -/** - * mic_x100_hw_intr_init - Initialize h/w specific interrupt - * information. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_hw_intr_init(struct mic_device *mdev) -{ - mdev->intr_info = (struct mic_intr_info *)mic_x100_intr_init; -} - -/** - * mic_x100_read_msi_to_src_map - read from the MSI mapping registers - * @mdev: pointer to mic_device instance - * @idx: index to the mapping register, 0 based - * - * This function allows reading of the 32bit MSI mapping register. - * - * RETURNS: The value in the register. - */ -static u32 -mic_x100_read_msi_to_src_map(struct mic_device *mdev, int idx) -{ - return mic_mmio_read(&mdev->mmio, - MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_MXAR0 + idx * 4); -} - -/** - * mic_x100_program_msi_to_src_map - program the MSI mapping registers - * @mdev: pointer to mic_device instance - * @idx: index to the mapping register, 0 based - * @offset: The bit offset in the register that needs to be updated. - * @set: boolean specifying if the bit in the specified offset needs - * to be set or cleared. - * - * RETURNS: None. - */ -static void -mic_x100_program_msi_to_src_map(struct mic_device *mdev, - int idx, int offset, bool set) -{ - unsigned long reg; - struct mic_mw *mw = &mdev->mmio; - u32 mxar = MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_MXAR0 + idx * 4; - - reg = mic_mmio_read(mw, mxar); - if (set) - __set_bit(offset, &reg); - else - __clear_bit(offset, &reg); - mic_mmio_write(mw, reg, mxar); -} - -/* - * mic_x100_reset_fw_ready - Reset Firmware ready status field. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_reset_fw_ready(struct mic_device *mdev) -{ - mdev->ops->write_spad(mdev, MIC_X100_DOWNLOAD_INFO, 0); -} - -/* - * mic_x100_is_fw_ready - Check if firmware is ready. - * @mdev: pointer to mic_device instance - */ -static bool mic_x100_is_fw_ready(struct mic_device *mdev) -{ - u32 scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO); - return MIC_X100_SPAD2_DOWNLOAD_STATUS(scratch2) ? 
true : false; -} - -/** - * mic_x100_get_apic_id - Get bootstrap APIC ID. - * @mdev: pointer to mic_device instance - */ -static u32 mic_x100_get_apic_id(struct mic_device *mdev) -{ - u32 scratch2 = 0; - - scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO); - return MIC_X100_SPAD2_APIC_ID(scratch2); -} - -/** - * mic_x100_send_firmware_intr - Send an interrupt to the firmware on MIC. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_send_firmware_intr(struct mic_device *mdev) -{ - u32 apicicr_low; - u64 apic_icr_offset = MIC_X100_SBOX_APICICR7; - int vector = MIC_X100_BSP_INTERRUPT_VECTOR; - struct mic_mw *mw = &mdev->mmio; - - /* - * For MIC we need to make sure we "hit" - * the send_icr bit (13). - */ - apicicr_low = (vector | (1 << 13)); - - mic_mmio_write(mw, mic_x100_get_apic_id(mdev), - MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset + 4); - - /* Ensure that the interrupt is ordered w.r.t. previous stores. */ - wmb(); - mic_mmio_write(mw, apicicr_low, - MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset); -} - -/** - * mic_x100_hw_reset - Reset the MIC device. - * @mdev: pointer to mic_device instance - */ -static void mic_x100_hw_reset(struct mic_device *mdev) -{ - u32 reset_reg; - u32 rgcr = MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_RGCR; - struct mic_mw *mw = &mdev->mmio; - - /* Ensure that the reset is ordered w.r.t. previous loads and stores */ - mb(); - /* Trigger reset */ - reset_reg = mic_mmio_read(mw, rgcr); - reset_reg |= 0x1; - mic_mmio_write(mw, reset_reg, rgcr); - /* - * It seems we really want to delay at least 1 second - * after touching reset to prevent a lot of problems. - */ - msleep(1000); -} - -/** - * mic_x100_load_command_line - Load command line to MIC. - * @mdev: pointer to mic_device instance - * @fw: the firmware image - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int -mic_x100_load_command_line(struct mic_device *mdev, const struct firmware *fw) -{ - u32 len = 0; - u32 boot_mem; - char *buf; - void __iomem *cmd_line_va = mdev->aper.va + mdev->bootaddr + fw->size; -#define CMDLINE_SIZE 2048 - - boot_mem = mdev->aper.len >> 20; - buf = kzalloc(CMDLINE_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - len += scnprintf(buf, CMDLINE_SIZE - len, - " mem=%dM", boot_mem); - if (mdev->cosm_dev->cmdline) - scnprintf(buf + len, CMDLINE_SIZE - len, " %s", - mdev->cosm_dev->cmdline); - memcpy_toio(cmd_line_va, buf, strlen(buf) + 1); - kfree(buf); - return 0; -} - -/** - * mic_x100_load_ramdisk - Load ramdisk to MIC. - * @mdev: pointer to mic_device instance - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int -mic_x100_load_ramdisk(struct mic_device *mdev) -{ - const struct firmware *fw; - int rc; - struct boot_params __iomem *bp = mdev->aper.va + mdev->bootaddr; - - rc = request_firmware(&fw, mdev->cosm_dev->ramdisk, &mdev->pdev->dev); - if (rc < 0) { - dev_err(&mdev->pdev->dev, - "ramdisk request_firmware failed: %d %s\n", - rc, mdev->cosm_dev->ramdisk); - goto error; - } - /* - * Typically the bootaddr for card OS is 64M - * so copy over the ramdisk @ 128M. - */ - memcpy_toio(mdev->aper.va + (mdev->bootaddr << 1), fw->data, fw->size); - iowrite32(mdev->bootaddr << 1, &bp->hdr.ramdisk_image); - iowrite32(fw->size, &bp->hdr.ramdisk_size); - release_firmware(fw); -error: - return rc; -} - -/** - * mic_x100_get_boot_addr - Get MIC boot address. 
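
The firmware handshake above all runs through scratchpad 2: one 32-bit register carries the download-ready bit, the bootstrap APIC ID, and the 4 KB-aligned download address, decoded by the MIC_X100_SPAD2_* macros defined in mic_x100.h further below. A standalone decode; the macros are copied from the header, the sample register value is made up.

#include <stdio.h>

#define MIC_X100_SPAD2_DOWNLOAD_STATUS(x) ((x) & 0x1)
#define MIC_X100_SPAD2_APIC_ID(x)        (((x) >> 1) & 0x1ff)
#define MIC_X100_SPAD2_DOWNLOAD_ADDR(x)  ((x) & 0xfffff000)

int main(void)
{
	unsigned int scratch2 = 0x04000000 | (5u << 1) | 1u;   /* made up */

	printf("fw ready:  %u\n", MIC_X100_SPAD2_DOWNLOAD_STATUS(scratch2)); /* 1 */
	printf("apic id:   %u\n", MIC_X100_SPAD2_APIC_ID(scratch2));         /* 5 */
	printf("boot addr: 0x%x\n", MIC_X100_SPAD2_DOWNLOAD_ADDR(scratch2)); /* 0x4000000 = 64M */
	return 0;
}
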
- * @mdev: pointer to mic_device instance - * - * This function is called during firmware load to determine - * the address at which the OS should be downloaded in card - * memory i.e. GDDR. - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int -mic_x100_get_boot_addr(struct mic_device *mdev) -{ - u32 scratch2, boot_addr; - int rc = 0; - - scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO); - boot_addr = MIC_X100_SPAD2_DOWNLOAD_ADDR(scratch2); - dev_dbg(&mdev->pdev->dev, "%s %d boot_addr 0x%x\n", - __func__, __LINE__, boot_addr); - if (boot_addr > (1 << 31)) { - dev_err(&mdev->pdev->dev, - "incorrect bootaddr 0x%x\n", - boot_addr); - rc = -EINVAL; - goto error; - } - mdev->bootaddr = boot_addr; -error: - return rc; -} - -/** - * mic_x100_load_firmware - Load firmware to MIC. - * @mdev: pointer to mic_device instance - * @buf: buffer containing boot string including firmware/ramdisk path. - * - * RETURNS: An appropriate -ERRNO error value on error, or zero for success. - */ -static int -mic_x100_load_firmware(struct mic_device *mdev, const char *buf) -{ - int rc; - const struct firmware *fw; - - rc = mic_x100_get_boot_addr(mdev); - if (rc) - return rc; - /* load OS */ - rc = request_firmware(&fw, mdev->cosm_dev->firmware, &mdev->pdev->dev); - if (rc < 0) { - dev_err(&mdev->pdev->dev, - "ramdisk request_firmware failed: %d %s\n", - rc, mdev->cosm_dev->firmware); - return rc; - } - if (mdev->bootaddr > mdev->aper.len - fw->size) { - rc = -EINVAL; - dev_err(&mdev->pdev->dev, "%s %d rc %d bootaddr 0x%x\n", - __func__, __LINE__, rc, mdev->bootaddr); - goto error; - } - memcpy_toio(mdev->aper.va + mdev->bootaddr, fw->data, fw->size); - mdev->ops->write_spad(mdev, MIC_X100_FW_SIZE, fw->size); - if (!strcmp(mdev->cosm_dev->bootmode, "flash")) { - rc = -EINVAL; - dev_err(&mdev->pdev->dev, "%s %d rc %d\n", - __func__, __LINE__, rc); - goto error; - } - /* load command line */ - rc = mic_x100_load_command_line(mdev, fw); - if (rc) { - dev_err(&mdev->pdev->dev, "%s %d rc %d\n", - __func__, __LINE__, rc); - goto error; - } - release_firmware(fw); - /* load ramdisk */ - if (mdev->cosm_dev->ramdisk) - rc = mic_x100_load_ramdisk(mdev); - - return rc; - -error: - release_firmware(fw); - return rc; -} - -/** - * mic_x100_get_postcode - Get postcode status from firmware. - * @mdev: pointer to mic_device instance - * - * RETURNS: postcode. - */ -static u32 mic_x100_get_postcode(struct mic_device *mdev) -{ - return mic_mmio_read(&mdev->mmio, MIC_X100_POSTCODE); -} - -/** - * mic_x100_smpt_set - Update an SMPT entry with a DMA address. - * @mdev: pointer to mic_device instance - * @dma_addr: DMA address to use - * @index: entry to write to - * - * RETURNS: none. - */ -static void -mic_x100_smpt_set(struct mic_device *mdev, dma_addr_t dma_addr, u8 index) -{ -#define SNOOP_ON (0 << 0) -#define SNOOP_OFF (1 << 0) -/* - * Sbox Smpt Reg Bits: - * Bits 31:2 Host address - * Bits 1 RSVD - * Bits 0 No snoop - */ -#define BUILD_SMPT(NO_SNOOP, HOST_ADDR) \ - (u32)(((HOST_ADDR) << 2) | ((NO_SNOOP) & 0x01)) - - uint32_t smpt_reg_val = BUILD_SMPT(SNOOP_ON, - dma_addr >> mdev->smpt->info.page_shift); - mic_mmio_write(&mdev->mmio, smpt_reg_val, - MIC_X100_SBOX_BASE_ADDRESS + - MIC_X100_SBOX_SMPT00 + (4 * index)); -} - -/** - * mic_x100_smpt_hw_init - Initialize SMPT X100 specific fields. - * @mdev: pointer to mic_device instance - * - * RETURNS: none. 
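
The BUILD_SMPT() macro in mic_x100_smpt_set() above packs the 16 GB-aligned host page index into bits 31:2 of the SMPT register, with bit 0 selecting no-snoop. A standalone computation of one register value; the macro is copied with u32 spelled as uint32_t, and the sample address is made up.

#include <stdint.h>
#include <stdio.h>

#define SNOOP_ON  (0 << 0)
#define SNOOP_OFF (1 << 0)
#define PAGE_SHIFT 34
#define BUILD_SMPT(NO_SNOOP, HOST_ADDR) \
	(uint32_t)(((HOST_ADDR) << 2) | ((NO_SNOOP) & 0x01))

int main(void)
{
	uint64_t dma_addr = 0x1000000000ULL;          /* 64 GB, 16 GB aligned */
	uint32_t reg = BUILD_SMPT(SNOOP_ON, dma_addr >> PAGE_SHIFT);

	printf("SMPT register value: 0x%x\n", reg);   /* 0x10: page index 4 */
	return 0;
}
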
- */ -static void mic_x100_smpt_hw_init(struct mic_device *mdev) -{ - struct mic_smpt_hw_info *info = &mdev->smpt->info; - - info->num_reg = 32; - info->page_shift = 34; - info->page_size = (1ULL << info->page_shift); - info->base = 0x8000000000ULL; -} - -struct mic_smpt_ops mic_x100_smpt_ops = { - .init = mic_x100_smpt_hw_init, - .set = mic_x100_smpt_set, -}; - -static bool mic_x100_dma_filter(struct dma_chan *chan, void *param) -{ - if (chan->device->dev->parent == (struct device *)param) - return true; - return false; -} - -struct mic_hw_ops mic_x100_ops = { - .aper_bar = MIC_X100_APER_BAR, - .mmio_bar = MIC_X100_MMIO_BAR, - .read_spad = mic_x100_read_spad, - .write_spad = mic_x100_write_spad, - .send_intr = mic_x100_send_intr, - .ack_interrupt = mic_x100_ack_interrupt, - .intr_workarounds = mic_x100_intr_workarounds, - .reset = mic_x100_hw_reset, - .reset_fw_ready = mic_x100_reset_fw_ready, - .is_fw_ready = mic_x100_is_fw_ready, - .send_firmware_intr = mic_x100_send_firmware_intr, - .load_mic_fw = mic_x100_load_firmware, - .get_postcode = mic_x100_get_postcode, - .dma_filter = mic_x100_dma_filter, -}; - -struct mic_hw_intr_ops mic_x100_intr_ops = { - .intr_init = mic_x100_hw_intr_init, - .enable_interrupts = mic_x100_enable_interrupts, - .disable_interrupts = mic_x100_disable_interrupts, - .program_msi_to_src_map = mic_x100_program_msi_to_src_map, - .read_msi_to_src_map = mic_x100_read_msi_to_src_map, -}; diff --git a/drivers/misc/mic/host/mic_x100.h b/drivers/misc/mic/host/mic_x100.h deleted file mode 100644 index aebcaed6fa72..000000000000 --- a/drivers/misc/mic/host/mic_x100.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC Host driver. 
- */ -#ifndef _MIC_X100_HW_H_ -#define _MIC_X100_HW_H_ - -#define MIC_X100_PCI_DEVICE_2250 0x2250 -#define MIC_X100_PCI_DEVICE_2251 0x2251 -#define MIC_X100_PCI_DEVICE_2252 0x2252 -#define MIC_X100_PCI_DEVICE_2253 0x2253 -#define MIC_X100_PCI_DEVICE_2254 0x2254 -#define MIC_X100_PCI_DEVICE_2255 0x2255 -#define MIC_X100_PCI_DEVICE_2256 0x2256 -#define MIC_X100_PCI_DEVICE_2257 0x2257 -#define MIC_X100_PCI_DEVICE_2258 0x2258 -#define MIC_X100_PCI_DEVICE_2259 0x2259 -#define MIC_X100_PCI_DEVICE_225a 0x225a -#define MIC_X100_PCI_DEVICE_225b 0x225b -#define MIC_X100_PCI_DEVICE_225c 0x225c -#define MIC_X100_PCI_DEVICE_225d 0x225d -#define MIC_X100_PCI_DEVICE_225e 0x225e - -#define MIC_X100_APER_BAR 0 -#define MIC_X100_MMIO_BAR 4 - -#define MIC_X100_SBOX_BASE_ADDRESS 0x00010000 -#define MIC_X100_SBOX_SPAD0 0x0000AB20 -#define MIC_X100_SBOX_SICR0_DBR(x) ((x) & 0xf) -#define MIC_X100_SBOX_SICR0_DMA(x) (((x) >> 8) & 0xff) -#define MIC_X100_SBOX_SICE0_DBR(x) ((x) & 0xf) -#define MIC_X100_SBOX_DBR_BITS(x) ((x) & 0xf) -#define MIC_X100_SBOX_SICE0_DMA(x) (((x) >> 8) & 0xff) -#define MIC_X100_SBOX_DMA_BITS(x) (((x) & 0xff) << 8) - -#define MIC_X100_SBOX_APICICR0 0x0000A9D0 -#define MIC_X100_SBOX_SICR0 0x00009004 -#define MIC_X100_SBOX_SICE0 0x0000900C -#define MIC_X100_SBOX_SICC0 0x00009010 -#define MIC_X100_SBOX_SIAC0 0x00009014 -#define MIC_X100_SBOX_MSIXPBACR 0x00009084 -#define MIC_X100_SBOX_MXAR0 0x00009044 -#define MIC_X100_SBOX_SMPT00 0x00003100 -#define MIC_X100_SBOX_RDMASR0 0x0000B180 - -#define MIC_X100_DOORBELL_IDX_START 0 -#define MIC_X100_NUM_DOORBELL 4 -#define MIC_X100_DMA_IDX_START 8 -#define MIC_X100_NUM_DMA 8 -#define MIC_X100_ERR_IDX_START 30 -#define MIC_X100_NUM_ERR 1 - -#define MIC_X100_NUM_SBOX_IRQ 8 -#define MIC_X100_NUM_RDMASR_IRQ 8 -#define MIC_X100_RDMASR_IRQ_BASE 17 -#define MIC_X100_SPAD2_DOWNLOAD_STATUS(x) ((x) & 0x1) -#define MIC_X100_SPAD2_APIC_ID(x) (((x) >> 1) & 0x1ff) -#define MIC_X100_SPAD2_DOWNLOAD_ADDR(x) ((x) & 0xfffff000) -#define MIC_X100_SBOX_APICICR7 0x0000AA08 -#define MIC_X100_SBOX_RGCR 0x00004010 -#define MIC_X100_SBOX_SDBIC0 0x0000CC90 -#define MIC_X100_DOWNLOAD_INFO 2 -#define MIC_X100_FW_SIZE 5 -#define MIC_X100_POSTCODE 0x242c - -/* Host->Card(bootstrap) Interrupt Vector */ -#define MIC_X100_BSP_INTERRUPT_VECTOR 229 - -extern struct mic_hw_ops mic_x100_ops; -extern struct mic_smpt_ops mic_x100_smpt_ops; -extern struct mic_hw_intr_ops mic_x100_intr_ops; - -#endif diff --git a/drivers/misc/mic/scif/Makefile b/drivers/misc/mic/scif/Makefile deleted file mode 100644 index ff372555d118..000000000000 --- a/drivers/misc/mic/scif/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile - SCIF driver. -# Copyright(c) 2014, Intel Corporation. -# -obj-$(CONFIG_SCIF) += scif.o -scif-objs := scif_main.o -scif-objs += scif_peer_bus.o -scif-objs += scif_ports.o -scif-objs += scif_debugfs.o -scif-objs += scif_fd.o -scif-objs += scif_api.o -scif-objs += scif_epd.o -scif-objs += scif_rb.o -scif-objs += scif_nodeqp.o -scif-objs += scif_nm.o -scif-objs += scif_dma.o -scif-objs += scif_fence.o -scif-objs += scif_mmap.o -scif-objs += scif_rma.o -scif-objs += scif_rma_list.o diff --git a/drivers/misc/mic/scif/scif_api.c b/drivers/misc/mic/scif/scif_api.c deleted file mode 100644 index 304d6c833712..000000000000 --- a/drivers/misc/mic/scif/scif_api.c +++ /dev/null @@ -1,1485 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. 
- */
-#include <linux/scif.h>
-#include "scif_main.h"
-#include "scif_map.h"
-
-static const char * const scif_ep_states[] = {
-	"Unbound",
-	"Bound",
-	"Listening",
-	"Connected",
-	"Connecting",
-	"Mapping",
-	"Closing",
-	"Close Listening",
-	"Disconnected",
-	"Zombie"};
-
-enum conn_async_state {
-	ASYNC_CONN_IDLE = 1,	/* ep setup for async connect */
-	ASYNC_CONN_INPROGRESS,	/* async connect in progress */
-	ASYNC_CONN_FLUSH_WORK	/* async work flush in progress */
-};
-
-/*
- * File operations for anonymous inode file associated with a SCIF endpoint,
- * used in kernel mode SCIF poll. Kernel mode SCIF poll calls portions of the
- * poll API in the kernel and these take in a struct file *. Since a struct
- * file is not available to kernel mode SCIF, it uses an anonymous file for
- * this purpose.
- */
-const struct file_operations scif_anon_fops = {
-	.owner = THIS_MODULE,
-};
-
-scif_epd_t scif_open(void)
-{
-	struct scif_endpt *ep;
-	int err;
-
-	might_sleep();
-	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
-	if (!ep)
-		goto err_ep_alloc;
-
-	ep->qp_info.qp = kzalloc(sizeof(*ep->qp_info.qp), GFP_KERNEL);
-	if (!ep->qp_info.qp)
-		goto err_qp_alloc;
-
-	err = scif_anon_inode_getfile(ep);
-	if (err)
-		goto err_anon_inode;
-
-	spin_lock_init(&ep->lock);
-	mutex_init(&ep->sendlock);
-	mutex_init(&ep->recvlock);
-
-	scif_rma_ep_init(ep);
-	ep->state = SCIFEP_UNBOUND;
-	dev_dbg(scif_info.mdev.this_device,
-		"SCIFAPI open: ep %p success\n", ep);
-	return ep;
-
-err_anon_inode:
-	kfree(ep->qp_info.qp);
-err_qp_alloc:
-	kfree(ep);
-err_ep_alloc:
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(scif_open);
-
-/*
- * scif_disconnect_ep - Disconnects the endpoint if found
- * @epd: The end point returned from scif_open()
- */
-static struct scif_endpt *scif_disconnect_ep(struct scif_endpt *ep)
-{
-	struct scifmsg msg;
-	struct scif_endpt *fep = NULL;
-	struct scif_endpt *tmpep;
-	struct list_head *pos, *tmpq;
-	int err;
-
-	/*
-	 * Wake up any threads blocked in send()/recv() before closing
-	 * out the connection. Grabbing and releasing the send/recv lock
-	 * will ensure that any blocked senders/receivers have exited for
-	 * Ring 0 endpoints. It is a Ring 0 bug to call send/recv after
-	 * close. Ring 3 endpoints are not affected since close will not
-	 * be called while there are IOCTLs executing.
-	 */
-	wake_up_interruptible(&ep->sendwq);
-	wake_up_interruptible(&ep->recvwq);
-	mutex_lock(&ep->sendlock);
-	mutex_unlock(&ep->sendlock);
-	mutex_lock(&ep->recvlock);
-	mutex_unlock(&ep->recvlock);
-
-	/* Remove from the connected list */
-	mutex_lock(&scif_info.connlock);
-	list_for_each_safe(pos, tmpq, &scif_info.connected) {
-		tmpep = list_entry(pos, struct scif_endpt, list);
-		if (tmpep == ep) {
-			list_del(pos);
-			fep = tmpep;
-			spin_lock(&ep->lock);
-			break;
-		}
-	}
-
-	if (!fep) {
-		/*
-		 * The other side has completed the disconnect before
-		 * the end point could be removed from the list. Therefore
-		 * the ep lock is not taken; traverse the disconnected
-		 * list to find the endpoint and release the conn lock.
- */ - list_for_each_safe(pos, tmpq, &scif_info.disconnected) { - tmpep = list_entry(pos, struct scif_endpt, list); - if (tmpep == ep) { - list_del(pos); - break; - } - } - mutex_unlock(&scif_info.connlock); - return NULL; - } - - init_completion(&ep->discon); - msg.uop = SCIF_DISCNCT; - msg.src = ep->port; - msg.dst = ep->peer; - msg.payload[0] = (u64)ep; - msg.payload[1] = ep->remote_ep; - - err = scif_nodeqp_send(ep->remote_dev, &msg); - spin_unlock(&ep->lock); - mutex_unlock(&scif_info.connlock); - - if (!err) - /* Wait for the remote node to respond with SCIF_DISCNT_ACK */ - wait_for_completion_timeout(&ep->discon, - SCIF_NODE_ALIVE_TIMEOUT); - return ep; -} - -int scif_close(scif_epd_t epd) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scif_endpt *tmpep; - struct list_head *pos, *tmpq; - enum scif_epd_state oldstate; - bool flush_conn; - - dev_dbg(scif_info.mdev.this_device, "SCIFAPI close: ep %p %s\n", - ep, scif_ep_states[ep->state]); - might_sleep(); - spin_lock(&ep->lock); - flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS); - spin_unlock(&ep->lock); - - if (flush_conn) - flush_work(&scif_info.conn_work); - - spin_lock(&ep->lock); - oldstate = ep->state; - - ep->state = SCIFEP_CLOSING; - - switch (oldstate) { - case SCIFEP_ZOMBIE: - dev_err(scif_info.mdev.this_device, - "SCIFAPI close: zombie state unexpected\n"); - fallthrough; - case SCIFEP_DISCONNECTED: - spin_unlock(&ep->lock); - scif_unregister_all_windows(epd); - /* Remove from the disconnected list */ - mutex_lock(&scif_info.connlock); - list_for_each_safe(pos, tmpq, &scif_info.disconnected) { - tmpep = list_entry(pos, struct scif_endpt, list); - if (tmpep == ep) { - list_del(pos); - break; - } - } - mutex_unlock(&scif_info.connlock); - break; - case SCIFEP_UNBOUND: - case SCIFEP_BOUND: - case SCIFEP_CONNECTING: - spin_unlock(&ep->lock); - break; - case SCIFEP_MAPPING: - case SCIFEP_CONNECTED: - case SCIFEP_CLOSING: - { - spin_unlock(&ep->lock); - scif_unregister_all_windows(epd); - scif_disconnect_ep(ep); - break; - } - case SCIFEP_LISTENING: - case SCIFEP_CLLISTEN: - { - struct scif_conreq *conreq; - struct scifmsg msg; - struct scif_endpt *aep; - - spin_unlock(&ep->lock); - mutex_lock(&scif_info.eplock); - - /* remove from listen list */ - list_for_each_safe(pos, tmpq, &scif_info.listen) { - tmpep = list_entry(pos, struct scif_endpt, list); - if (tmpep == ep) - list_del(pos); - } - /* Remove any dangling accepts */ - while (ep->acceptcnt) { - aep = list_first_entry(&ep->li_accept, - struct scif_endpt, liacceptlist); - list_del(&aep->liacceptlist); - scif_put_port(aep->port.port); - list_for_each_safe(pos, tmpq, &scif_info.uaccept) { - tmpep = list_entry(pos, struct scif_endpt, - miacceptlist); - if (tmpep == aep) { - list_del(pos); - break; - } - } - mutex_unlock(&scif_info.eplock); - mutex_lock(&scif_info.connlock); - list_for_each_safe(pos, tmpq, &scif_info.connected) { - tmpep = list_entry(pos, - struct scif_endpt, list); - if (tmpep == aep) { - list_del(pos); - break; - } - } - list_for_each_safe(pos, tmpq, &scif_info.disconnected) { - tmpep = list_entry(pos, - struct scif_endpt, list); - if (tmpep == aep) { - list_del(pos); - break; - } - } - mutex_unlock(&scif_info.connlock); - scif_teardown_ep(aep); - mutex_lock(&scif_info.eplock); - scif_add_epd_to_zombie_list(aep, SCIF_EPLOCK_HELD); - ep->acceptcnt--; - } - - spin_lock(&ep->lock); - mutex_unlock(&scif_info.eplock); - - /* Remove and reject any pending connection requests. 
*/ - while (ep->conreqcnt) { - conreq = list_first_entry(&ep->conlist, - struct scif_conreq, list); - list_del(&conreq->list); - - msg.uop = SCIF_CNCT_REJ; - msg.dst.node = conreq->msg.src.node; - msg.dst.port = conreq->msg.src.port; - msg.payload[0] = conreq->msg.payload[0]; - msg.payload[1] = conreq->msg.payload[1]; - /* - * No Error Handling on purpose for scif_nodeqp_send(). - * If the remote node is lost we still want free the - * connection requests on the self node. - */ - scif_nodeqp_send(&scif_dev[conreq->msg.src.node], - &msg); - ep->conreqcnt--; - kfree(conreq); - } - - spin_unlock(&ep->lock); - /* If a kSCIF accept is waiting wake it up */ - wake_up_interruptible(&ep->conwq); - break; - } - } - scif_put_port(ep->port.port); - scif_anon_inode_fput(ep); - scif_teardown_ep(ep); - scif_add_epd_to_zombie_list(ep, !SCIF_EPLOCK_HELD); - return 0; -} -EXPORT_SYMBOL_GPL(scif_close); - -/** - * scif_flush() - Wakes up any blocking accepts. The endpoint will no longer - * accept new connections. - * @epd: The end point returned from scif_open() - */ -int __scif_flush(scif_epd_t epd) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - - switch (ep->state) { - case SCIFEP_LISTENING: - { - ep->state = SCIFEP_CLLISTEN; - - /* If an accept is waiting wake it up */ - wake_up_interruptible(&ep->conwq); - break; - } - default: - break; - } - return 0; -} - -int scif_bind(scif_epd_t epd, u16 pn) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int ret = 0; - int tmp; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI bind: ep %p %s requested port number %d\n", - ep, scif_ep_states[ep->state], pn); - if (pn) { - /* - * Similar to IETF RFC 1700, SCIF ports below - * SCIF_ADMIN_PORT_END can only be bound by system (or root) - * processes or by processes executed by privileged users. 
- */ - if (pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) { - ret = -EACCES; - goto scif_bind_admin_exit; - } - } - - spin_lock(&ep->lock); - if (ep->state == SCIFEP_BOUND) { - ret = -EINVAL; - goto scif_bind_exit; - } else if (ep->state != SCIFEP_UNBOUND) { - ret = -EISCONN; - goto scif_bind_exit; - } - - if (pn) { - tmp = scif_rsrv_port(pn); - if (tmp != pn) { - ret = -EINVAL; - goto scif_bind_exit; - } - } else { - ret = scif_get_new_port(); - if (ret < 0) - goto scif_bind_exit; - pn = ret; - } - - ep->state = SCIFEP_BOUND; - ep->port.node = scif_info.nodeid; - ep->port.port = pn; - ep->conn_async_state = ASYNC_CONN_IDLE; - ret = pn; - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI bind: bound to port number %d\n", pn); -scif_bind_exit: - spin_unlock(&ep->lock); -scif_bind_admin_exit: - return ret; -} -EXPORT_SYMBOL_GPL(scif_bind); - -int scif_listen(scif_epd_t epd, int backlog) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]); - spin_lock(&ep->lock); - switch (ep->state) { - case SCIFEP_ZOMBIE: - case SCIFEP_CLOSING: - case SCIFEP_CLLISTEN: - case SCIFEP_UNBOUND: - case SCIFEP_DISCONNECTED: - spin_unlock(&ep->lock); - return -EINVAL; - case SCIFEP_LISTENING: - case SCIFEP_CONNECTED: - case SCIFEP_CONNECTING: - case SCIFEP_MAPPING: - spin_unlock(&ep->lock); - return -EISCONN; - case SCIFEP_BOUND: - break; - } - - ep->state = SCIFEP_LISTENING; - ep->backlog = backlog; - - ep->conreqcnt = 0; - ep->acceptcnt = 0; - INIT_LIST_HEAD(&ep->conlist); - init_waitqueue_head(&ep->conwq); - INIT_LIST_HEAD(&ep->li_accept); - spin_unlock(&ep->lock); - - /* - * Listen status is complete so delete the qp information not needed - * on a listen before placing on the list of listening ep's - */ - scif_teardown_ep(ep); - ep->qp_info.qp = NULL; - - mutex_lock(&scif_info.eplock); - list_add_tail(&ep->list, &scif_info.listen); - mutex_unlock(&scif_info.eplock); - return 0; -} -EXPORT_SYMBOL_GPL(scif_listen); - -/* - ************************************************************************ - * SCIF connection flow: - * - * 1) A SCIF listening endpoint can call scif_accept(..) to wait for SCIF - * connections via a SCIF_CNCT_REQ message - * 2) A SCIF endpoint can initiate a SCIF connection by calling - * scif_connect(..) which calls scif_setup_qp_connect(..) which - * allocates the local qp for the endpoint ring buffer and then sends - * a SCIF_CNCT_REQ to the remote node and waits for a SCIF_CNCT_GNT or - * a SCIF_CNCT_REJ message - * 3) The peer node handles a SCIF_CNCT_REQ via scif_cnctreq_resp(..) which - * wakes up any threads blocked in step 1 or sends a SCIF_CNCT_REJ - * message otherwise - * 4) A thread blocked waiting for incoming connections allocates its local - * endpoint QP and ring buffer following which it sends a SCIF_CNCT_GNT - * and waits for a SCIF_CNCT_GNT(N)ACK. If the allocation fails then - * the node sends a SCIF_CNCT_REJ message - * 5) Upon receipt of a SCIF_CNCT_GNT or a SCIF_CNCT_REJ message the - * connecting endpoint is woken up as part of handling - * scif_cnctgnt_resp(..) following which it maps the remote endpoints' - * QP, updates its outbound QP and sends a SCIF_CNCT_GNTACK message on - * success or a SCIF_CNCT_GNTNACK message on failure and completes - * the scif_connect(..) API - * 6) Upon receipt of a SCIF_CNCT_GNT(N)ACK the accepting endpoint blocked - * in step 4 is woken up and completes the scif_accept(..) 
API - * 7) The SCIF connection is now established between the two SCIF endpoints. - */ -static int scif_conn_func(struct scif_endpt *ep) -{ - int err = 0; - struct scifmsg msg; - struct device *spdev; - - err = scif_reserve_dma_chan(ep); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - ep->state = SCIFEP_BOUND; - goto connect_error_simple; - } - /* Initiate the first part of the endpoint QP setup */ - err = scif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset, - SCIF_ENDPT_QP_SIZE, ep->remote_dev); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s err %d qp_offset 0x%llx\n", - __func__, err, ep->qp_info.qp_offset); - ep->state = SCIFEP_BOUND; - goto connect_error_simple; - } - - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - goto cleanup_qp; - } - /* Format connect message and send it */ - msg.src = ep->port; - msg.dst = ep->conn_port; - msg.uop = SCIF_CNCT_REQ; - msg.payload[0] = (u64)ep; - msg.payload[1] = ep->qp_info.qp_offset; - err = _scif_nodeqp_send(ep->remote_dev, &msg); - if (err) - goto connect_error_dec; - scif_put_peer_dev(spdev); - /* - * Wait for the remote node to respond with SCIF_CNCT_GNT or - * SCIF_CNCT_REJ message. - */ - err = wait_event_timeout(ep->conwq, ep->state != SCIFEP_CONNECTING, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d timeout\n", __func__, __LINE__); - ep->state = SCIFEP_BOUND; - } - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - goto cleanup_qp; - } - if (ep->state == SCIFEP_MAPPING) { - err = scif_setup_qp_connect_response(ep->remote_dev, - ep->qp_info.qp, - ep->qp_info.gnt_pld); - /* - * If the resource to map the queue are not available then - * we need to tell the other side to terminate the accept - */ - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - msg.uop = SCIF_CNCT_GNTNACK; - msg.payload[0] = ep->remote_ep; - _scif_nodeqp_send(ep->remote_dev, &msg); - ep->state = SCIFEP_BOUND; - goto connect_error_dec; - } - - msg.uop = SCIF_CNCT_GNTACK; - msg.payload[0] = ep->remote_ep; - err = _scif_nodeqp_send(ep->remote_dev, &msg); - if (err) { - ep->state = SCIFEP_BOUND; - goto connect_error_dec; - } - ep->state = SCIFEP_CONNECTED; - mutex_lock(&scif_info.connlock); - list_add_tail(&ep->list, &scif_info.connected); - mutex_unlock(&scif_info.connlock); - dev_dbg(&ep->remote_dev->sdev->dev, - "SCIFAPI connect: ep %p connected\n", ep); - } else if (ep->state == SCIFEP_BOUND) { - dev_dbg(&ep->remote_dev->sdev->dev, - "SCIFAPI connect: ep %p connection refused\n", ep); - err = -ECONNREFUSED; - goto connect_error_dec; - } - scif_put_peer_dev(spdev); - return err; -connect_error_dec: - scif_put_peer_dev(spdev); -cleanup_qp: - scif_cleanup_ep_qp(ep); -connect_error_simple: - return err; -} - -/* - * scif_conn_handler: - * - * Workqueue handler for servicing non-blocking SCIF connect - * - */ -void scif_conn_handler(struct work_struct *work) -{ - struct scif_endpt *ep; - - do { - ep = NULL; - spin_lock(&scif_info.nb_connect_lock); - if (!list_empty(&scif_info.nb_connect_list)) { - ep = list_first_entry(&scif_info.nb_connect_list, - struct scif_endpt, conn_list); - list_del(&ep->conn_list); - } - spin_unlock(&scif_info.nb_connect_lock); - if (ep) { - ep->conn_err = scif_conn_func(ep); - wake_up_interruptible(&ep->conn_pend_wq); - } - } while (ep); -} - -int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool 
non_block)
-{
-	struct scif_endpt *ep = (struct scif_endpt *)epd;
-	int err = 0;
-	struct scif_dev *remote_dev;
-	struct device *spdev;
-
-	dev_dbg(scif_info.mdev.this_device, "SCIFAPI connect: ep %p %s\n", ep,
-		scif_ep_states[ep->state]);
-
-	if (!scif_dev || dst->node > scif_info.maxid)
-		return -ENODEV;
-
-	might_sleep();
-
-	remote_dev = &scif_dev[dst->node];
-	spdev = scif_get_peer_dev(remote_dev);
-	if (IS_ERR(spdev)) {
-		err = PTR_ERR(spdev);
-		return err;
-	}
-
-	spin_lock(&ep->lock);
-	switch (ep->state) {
-	case SCIFEP_ZOMBIE:
-	case SCIFEP_CLOSING:
-		err = -EINVAL;
-		break;
-	case SCIFEP_DISCONNECTED:
-		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
-			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
-		else
-			err = -EINVAL;
-		break;
-	case SCIFEP_LISTENING:
-	case SCIFEP_CLLISTEN:
-		err = -EOPNOTSUPP;
-		break;
-	case SCIFEP_CONNECTING:
-	case SCIFEP_MAPPING:
-		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
-			err = -EINPROGRESS;
-		else
-			err = -EISCONN;
-		break;
-	case SCIFEP_CONNECTED:
-		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS)
-			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
-		else
-			err = -EISCONN;
-		break;
-	case SCIFEP_UNBOUND:
-		err = scif_get_new_port();
-		if (err < 0)
-			break;
-		ep->port.port = err;
-		ep->port.node = scif_info.nodeid;
-		ep->conn_async_state = ASYNC_CONN_IDLE;
-		fallthrough;
-	case SCIFEP_BOUND:
-		/*
-		 * If a non-blocking connect has already been initiated
-		 * (conn_async_state is either ASYNC_CONN_INPROGRESS or
-		 * ASYNC_CONN_FLUSH_WORK), the end point could end up in
-		 * SCIFEP_BOUND due to an error in the connection process
-		 * (e.g., connection refused). If conn_async_state is
-		 * ASYNC_CONN_INPROGRESS, transition to ASYNC_CONN_FLUSH_WORK
-		 * so that the error status can be collected. If the state is
-		 * already ASYNC_CONN_FLUSH_WORK, set the error to
-		 * EINPROGRESS since some other thread is waiting to collect
-		 * the error status.
-		 */
-		if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
-			ep->conn_async_state = ASYNC_CONN_FLUSH_WORK;
-		} else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
-			err = -EINPROGRESS;
-		} else {
-			ep->conn_port = *dst;
-			init_waitqueue_head(&ep->sendwq);
-			init_waitqueue_head(&ep->recvwq);
-			init_waitqueue_head(&ep->conwq);
-			ep->conn_async_state = 0;
-
-			if (unlikely(non_block))
-				ep->conn_async_state = ASYNC_CONN_INPROGRESS;
-		}
-		break;
-	}
-
-	if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK)
-		goto connect_simple_unlock1;
-
-	ep->state = SCIFEP_CONNECTING;
-	ep->remote_dev = &scif_dev[dst->node];
-	ep->qp_info.qp->magic = SCIFEP_MAGIC;
-	if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) {
-		init_waitqueue_head(&ep->conn_pend_wq);
-		spin_lock(&scif_info.nb_connect_lock);
-		list_add_tail(&ep->conn_list, &scif_info.nb_connect_list);
-		spin_unlock(&scif_info.nb_connect_lock);
-		err = -EINPROGRESS;
-		schedule_work(&scif_info.conn_work);
-	}
-connect_simple_unlock1:
-	spin_unlock(&ep->lock);
-	scif_put_peer_dev(spdev);
-	if (err) {
-		return err;
-	} else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) {
-		flush_work(&scif_info.conn_work);
-		err = ep->conn_err;
-		spin_lock(&ep->lock);
-		ep->conn_async_state = ASYNC_CONN_IDLE;
-		spin_unlock(&ep->lock);
-	} else {
-		err = scif_conn_func(ep);
-	}
-	return err;
-}
-
-int scif_connect(scif_epd_t epd, struct scif_port_id *dst)
-{
-	return __scif_connect(epd, dst, false);
-}
-EXPORT_SYMBOL_GPL(scif_connect);
-
-/*
- * scif_accept() - Accept a connection request from the remote node
- *
- * The function accepts a connection request from the remote node. Successful
- * completion is indicated by a new end point being created and passed back
- * to the caller for future reference.
- *
- * Upon successful completion, zero will be returned and the peer information
- * will be filled in.
- *
- * If the end point is not in the listening state, -EINVAL will be returned.
- *
- * If resource allocation fails during the connection sequence, -ENOMEM
- * will be returned.
- *
- * If the function is called with the ASYNC flag set and no connection
- * requests are pending, it will return -EAGAIN.
- *
- * If the remote side is not sending any connection requests, the caller may
- * terminate this function with a signal. If so, -EINTR will be returned.
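- *
- * A minimal sketch of the blocking usage this implies (hypothetical
- * kernel-mode caller; the port number is arbitrary, error handling
- * is elided):
- *
- *	scif_epd_t lep, nep;
- *	struct scif_port_id peer;
- *
- *	lep = scif_open();
- *	scif_bind(lep, 2050);
- *	scif_listen(lep, 16);
- *	scif_accept(lep, &peer, &nep, SCIF_ACCEPT_SYNC);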
- */ -int scif_accept(scif_epd_t epd, struct scif_port_id *peer, - scif_epd_t *newepd, int flags) -{ - struct scif_endpt *lep = (struct scif_endpt *)epd; - struct scif_endpt *cep; - struct scif_conreq *conreq; - struct scifmsg msg; - int err; - struct device *spdev; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]); - - if (flags & ~SCIF_ACCEPT_SYNC) - return -EINVAL; - - if (!peer || !newepd) - return -EINVAL; - - might_sleep(); - spin_lock(&lep->lock); - if (lep->state != SCIFEP_LISTENING) { - spin_unlock(&lep->lock); - return -EINVAL; - } - - if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) { - /* No connection request present and we do not want to wait */ - spin_unlock(&lep->lock); - return -EAGAIN; - } - - lep->files = current->files; -retry_connection: - spin_unlock(&lep->lock); - /* Wait for the remote node to send us a SCIF_CNCT_REQ */ - err = wait_event_interruptible(lep->conwq, - (lep->conreqcnt || - (lep->state != SCIFEP_LISTENING))); - if (err) - return err; - - if (lep->state != SCIFEP_LISTENING) - return -EINTR; - - spin_lock(&lep->lock); - - if (!lep->conreqcnt) - goto retry_connection; - - /* Get the first connect request off the list */ - conreq = list_first_entry(&lep->conlist, struct scif_conreq, list); - list_del(&conreq->list); - lep->conreqcnt--; - spin_unlock(&lep->lock); - - /* Fill in the peer information */ - peer->node = conreq->msg.src.node; - peer->port = conreq->msg.src.port; - - cep = kzalloc(sizeof(*cep), GFP_KERNEL); - if (!cep) { - err = -ENOMEM; - goto scif_accept_error_epalloc; - } - spin_lock_init(&cep->lock); - mutex_init(&cep->sendlock); - mutex_init(&cep->recvlock); - cep->state = SCIFEP_CONNECTING; - cep->remote_dev = &scif_dev[peer->node]; - cep->remote_ep = conreq->msg.payload[0]; - - scif_rma_ep_init(cep); - - err = scif_reserve_dma_chan(cep); - if (err) { - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - goto scif_accept_error_qpalloc; - } - - cep->qp_info.qp = kzalloc(sizeof(*cep->qp_info.qp), GFP_KERNEL); - if (!cep->qp_info.qp) { - err = -ENOMEM; - goto scif_accept_error_qpalloc; - } - - err = scif_anon_inode_getfile(cep); - if (err) - goto scif_accept_error_anon_inode; - - cep->qp_info.qp->magic = SCIFEP_MAGIC; - spdev = scif_get_peer_dev(cep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - goto scif_accept_error_map; - } - err = scif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset, - conreq->msg.payload[1], SCIF_ENDPT_QP_SIZE, - cep->remote_dev); - if (err) { - dev_dbg(&cep->remote_dev->sdev->dev, - "SCIFAPI accept: ep %p new %p scif_setup_qp_accept %d qp_offset 0x%llx\n", - lep, cep, err, cep->qp_info.qp_offset); - scif_put_peer_dev(spdev); - goto scif_accept_error_map; - } - - cep->port.node = lep->port.node; - cep->port.port = lep->port.port; - cep->peer.node = peer->node; - cep->peer.port = peer->port; - init_waitqueue_head(&cep->sendwq); - init_waitqueue_head(&cep->recvwq); - init_waitqueue_head(&cep->conwq); - - msg.uop = SCIF_CNCT_GNT; - msg.src = cep->port; - msg.payload[0] = cep->remote_ep; - msg.payload[1] = cep->qp_info.qp_offset; - msg.payload[2] = (u64)cep; - - err = _scif_nodeqp_send(cep->remote_dev, &msg); - scif_put_peer_dev(spdev); - if (err) - goto scif_accept_error_map; -retry: - /* Wait for the remote node to respond with SCIF_CNCT_GNT(N)ACK */ - err = wait_event_timeout(cep->conwq, cep->state != SCIFEP_CONNECTING, - SCIF_NODE_ACCEPT_TIMEOUT); - if (!err && scifdev_alive(cep)) - goto retry; - err = !err ? 
-ENODEV : 0; - if (err) - goto scif_accept_error_map; - kfree(conreq); - - spin_lock(&cep->lock); - - if (cep->state == SCIFEP_CLOSING) { - /* - * Remote failed to allocate resources and NAKed the grant. - * There is at this point nothing referencing the new end point. - */ - spin_unlock(&cep->lock); - scif_teardown_ep(cep); - kfree(cep); - - /* If call with sync flag then go back and wait. */ - if (flags & SCIF_ACCEPT_SYNC) { - spin_lock(&lep->lock); - goto retry_connection; - } - return -EAGAIN; - } - - scif_get_port(cep->port.port); - *newepd = (scif_epd_t)cep; - spin_unlock(&cep->lock); - return 0; -scif_accept_error_map: - scif_anon_inode_fput(cep); -scif_accept_error_anon_inode: - scif_teardown_ep(cep); -scif_accept_error_qpalloc: - kfree(cep); -scif_accept_error_epalloc: - msg.uop = SCIF_CNCT_REJ; - msg.dst.node = conreq->msg.src.node; - msg.dst.port = conreq->msg.src.port; - msg.payload[0] = conreq->msg.payload[0]; - msg.payload[1] = conreq->msg.payload[1]; - scif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg); - kfree(conreq); - return err; -} -EXPORT_SYMBOL_GPL(scif_accept); - -/* - * scif_msg_param_check: - * @epd: The end point returned from scif_open() - * @len: Length to receive - * @flags: blocking or non blocking - * - * Validate parameters for messaging APIs scif_send(..)/scif_recv(..). - */ -static inline int scif_msg_param_check(scif_epd_t epd, int len, int flags) -{ - int ret = -EINVAL; - - if (len < 0) - goto err_ret; - if (flags && (!(flags & SCIF_RECV_BLOCK))) - goto err_ret; - ret = 0; -err_ret: - return ret; -} - -static int _scif_send(scif_epd_t epd, void *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scifmsg notif_msg; - int curr_xfer_len = 0, sent_len = 0, write_count; - int ret = 0; - struct scif_qp *qp = ep->qp_info.qp; - - if (flags & SCIF_SEND_BLOCK) - might_sleep(); - - spin_lock(&ep->lock); - while (sent_len != len && SCIFEP_CONNECTED == ep->state) { - write_count = scif_rb_space(&qp->outbound_q); - if (write_count) { - /* Best effort to send as much data as possible */ - curr_xfer_len = min(len - sent_len, write_count); - ret = scif_rb_write(&qp->outbound_q, msg, - curr_xfer_len); - if (ret < 0) - break; - /* Success. Update write pointer */ - scif_rb_commit(&qp->outbound_q); - /* - * Send a notification to the peer about the - * produced data message. - */ - notif_msg.src = ep->port; - notif_msg.uop = SCIF_CLIENT_SENT; - notif_msg.payload[0] = ep->remote_ep; - ret = _scif_nodeqp_send(ep->remote_dev, ¬if_msg); - if (ret) - break; - sent_len += curr_xfer_len; - msg = msg + curr_xfer_len; - continue; - } - curr_xfer_len = min(len - sent_len, SCIF_ENDPT_QP_SIZE - 1); - /* Not enough RB space. return for the Non Blocking case */ - if (!(flags & SCIF_SEND_BLOCK)) - break; - - spin_unlock(&ep->lock); - /* Wait for a SCIF_CLIENT_RCVD message in the Blocking case */ - ret = - wait_event_interruptible(ep->sendwq, - (SCIFEP_CONNECTED != ep->state) || - (scif_rb_space(&qp->outbound_q) >= - curr_xfer_len)); - spin_lock(&ep->lock); - if (ret) - break; - } - if (sent_len) - ret = sent_len; - else if (!ret && SCIFEP_CONNECTED != ep->state) - ret = SCIFEP_DISCONNECTED == ep->state ? 
- -ECONNRESET : -ENOTCONN; - spin_unlock(&ep->lock); - return ret; -} - -static int _scif_recv(scif_epd_t epd, void *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scifmsg notif_msg; - int curr_recv_len = 0, remaining_len = len, read_count; - int ret = 0; - struct scif_qp *qp = ep->qp_info.qp; - - if (flags & SCIF_RECV_BLOCK) - might_sleep(); - spin_lock(&ep->lock); - while (remaining_len && (SCIFEP_CONNECTED == ep->state || - SCIFEP_DISCONNECTED == ep->state)) { - read_count = scif_rb_count(&qp->inbound_q, remaining_len); - if (read_count) { - /* - * Best effort to recv as much data as there - * are bytes to read in the RB particularly - * important for the Non Blocking case. - */ - curr_recv_len = min(remaining_len, read_count); - scif_rb_get_next(&qp->inbound_q, msg, curr_recv_len); - if (ep->state == SCIFEP_CONNECTED) { - /* - * Update the read pointer only if the endpoint - * is still connected else the read pointer - * might no longer exist since the peer has - * freed resources! - */ - scif_rb_update_read_ptr(&qp->inbound_q); - /* - * Send a notification to the peer about the - * consumed data message only if the EP is in - * SCIFEP_CONNECTED state. - */ - notif_msg.src = ep->port; - notif_msg.uop = SCIF_CLIENT_RCVD; - notif_msg.payload[0] = ep->remote_ep; - ret = _scif_nodeqp_send(ep->remote_dev, - ¬if_msg); - if (ret) - break; - } - remaining_len -= curr_recv_len; - msg = msg + curr_recv_len; - continue; - } - /* - * Bail out now if the EP is in SCIFEP_DISCONNECTED state else - * we will keep looping forever. - */ - if (ep->state == SCIFEP_DISCONNECTED) - break; - /* - * Return in the Non Blocking case if there is no data - * to read in this iteration. - */ - if (!(flags & SCIF_RECV_BLOCK)) - break; - curr_recv_len = min(remaining_len, SCIF_ENDPT_QP_SIZE - 1); - spin_unlock(&ep->lock); - /* - * Wait for a SCIF_CLIENT_SEND message in the blocking case - * or until other side disconnects. - */ - ret = - wait_event_interruptible(ep->recvwq, - SCIFEP_CONNECTED != ep->state || - scif_rb_count(&qp->inbound_q, - curr_recv_len) - >= curr_recv_len); - spin_lock(&ep->lock); - if (ret) - break; - } - if (len - remaining_len) - ret = len - remaining_len; - else if (!ret && ep->state != SCIFEP_CONNECTED) - ret = ep->state == SCIFEP_DISCONNECTED ? - -ECONNRESET : -ENOTCONN; - spin_unlock(&ep->lock); - return ret; -} - -/** - * scif_user_send() - Send data to connection queue - * @epd: The end point returned from scif_open() - * @msg: Address to place data - * @len: Length to receive - * @flags: blocking or non blocking - * - * This function is called from the driver IOCTL entry point - * only and is a wrapper for _scif_send(). - */ -int scif_user_send(scif_epd_t epd, void __user *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err = 0; - int sent_len = 0; - char *tmp; - int loop_len; - int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1))); - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]); - if (!len) - return 0; - - err = scif_msg_param_check(epd, len, flags); - if (err) - goto send_err; - - tmp = kmalloc(chunk_len, GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto send_err; - } - /* - * Grabbing the lock before breaking up the transfer in - * multiple chunks is required to ensure that messages do - * not get fragmented and reordered. 
- */ - mutex_lock(&ep->sendlock); - while (sent_len != len) { - loop_len = len - sent_len; - loop_len = min(chunk_len, loop_len); - if (copy_from_user(tmp, msg, loop_len)) { - err = -EFAULT; - goto send_free_err; - } - err = _scif_send(epd, tmp, loop_len, flags); - if (err < 0) - goto send_free_err; - sent_len += err; - msg += err; - if (err != loop_len) - goto send_free_err; - } -send_free_err: - mutex_unlock(&ep->sendlock); - kfree(tmp); -send_err: - return err < 0 ? err : sent_len; -} - -/** - * scif_user_recv() - Receive data from connection queue - * @epd: The end point returned from scif_open() - * @msg: Address to place data - * @len: Length to receive - * @flags: blocking or non blocking - * - * This function is called from the driver IOCTL entry point - * only and is a wrapper for _scif_recv(). - */ -int scif_user_recv(scif_epd_t epd, void __user *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err = 0; - int recv_len = 0; - char *tmp; - int loop_len; - int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1))); - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]); - if (!len) - return 0; - - err = scif_msg_param_check(epd, len, flags); - if (err) - goto recv_err; - - tmp = kmalloc(chunk_len, GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto recv_err; - } - /* - * Grabbing the lock before breaking up the transfer in - * multiple chunks is required to ensure that messages do - * not get fragmented and reordered. - */ - mutex_lock(&ep->recvlock); - while (recv_len != len) { - loop_len = len - recv_len; - loop_len = min(chunk_len, loop_len); - err = _scif_recv(epd, tmp, loop_len, flags); - if (err < 0) - goto recv_free_err; - if (copy_to_user(msg, tmp, err)) { - err = -EFAULT; - goto recv_free_err; - } - recv_len += err; - msg += err; - if (err != loop_len) - goto recv_free_err; - } -recv_free_err: - mutex_unlock(&ep->recvlock); - kfree(tmp); -recv_err: - return err < 0 ? err : recv_len; -} - -/** - * scif_send() - Send data to connection queue - * @epd: The end point returned from scif_open() - * @msg: Address to place data - * @len: Length to receive - * @flags: blocking or non blocking - * - * This function is called from the kernel mode only and is - * a wrapper for _scif_send(). - */ -int scif_send(scif_epd_t epd, void *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int ret; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]); - if (!len) - return 0; - - ret = scif_msg_param_check(epd, len, flags); - if (ret) - return ret; - if (!ep->remote_dev) - return -ENOTCONN; - /* - * Grab the mutex lock in the blocking case only - * to ensure messages do not get fragmented/reordered. - * The non blocking mode is protected using spin locks - * in _scif_send(). - */ - if (flags & SCIF_SEND_BLOCK) - mutex_lock(&ep->sendlock); - - ret = _scif_send(epd, msg, len, flags); - - if (flags & SCIF_SEND_BLOCK) - mutex_unlock(&ep->sendlock); - return ret; -} -EXPORT_SYMBOL_GPL(scif_send); - -/** - * scif_recv() - Receive data from connection queue - * @epd: The end point returned from scif_open() - * @msg: Address to place data - * @len: Length to receive - * @flags: blocking or non blocking - * - * This function is called from the kernel mode only and is - * a wrapper for _scif_recv(). 
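- *
- * A minimal sketch of a blocking exchange on a connected endpoint
- * (hypothetical kernel-mode caller, error handling elided; both calls
- * return the number of bytes transferred or -errno):
- *
- *	char buf[64];
- *
- *	scif_send(epd, "ping", 5, SCIF_SEND_BLOCK);
- *	scif_recv(epd, buf, sizeof(buf), SCIF_RECV_BLOCK);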
- */ -int scif_recv(scif_epd_t epd, void *msg, int len, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int ret; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]); - if (!len) - return 0; - - ret = scif_msg_param_check(epd, len, flags); - if (ret) - return ret; - /* - * Grab the mutex lock in the blocking case only - * to ensure messages do not get fragmented/reordered. - * The non blocking mode is protected using spin locks - * in _scif_send(). - */ - if (flags & SCIF_RECV_BLOCK) - mutex_lock(&ep->recvlock); - - ret = _scif_recv(epd, msg, len, flags); - - if (flags & SCIF_RECV_BLOCK) - mutex_unlock(&ep->recvlock); - - return ret; -} -EXPORT_SYMBOL_GPL(scif_recv); - -static inline void _scif_poll_wait(struct file *f, wait_queue_head_t *wq, - poll_table *p, struct scif_endpt *ep) -{ - /* - * Because poll_wait makes a GFP_KERNEL allocation, give up the lock - * and regrab it afterwards. Because the endpoint state might have - * changed while the lock was given up, the state must be checked - * again after re-acquiring the lock. The code in __scif_pollfd(..) - * does this. - */ - spin_unlock(&ep->lock); - poll_wait(f, wq, p); - spin_lock(&ep->lock); -} - -__poll_t -__scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep) -{ - __poll_t mask = 0; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]); - - spin_lock(&ep->lock); - - /* Endpoint is waiting for a non-blocking connect to complete */ - if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { - _scif_poll_wait(f, &ep->conn_pend_wq, wait, ep); - if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { - if (ep->state == SCIFEP_CONNECTED || - ep->state == SCIFEP_DISCONNECTED || - ep->conn_err) - mask |= EPOLLOUT; - goto exit; - } - } - - /* Endpoint is listening for incoming connection requests */ - if (ep->state == SCIFEP_LISTENING) { - _scif_poll_wait(f, &ep->conwq, wait, ep); - if (ep->state == SCIFEP_LISTENING) { - if (ep->conreqcnt) - mask |= EPOLLIN; - goto exit; - } - } - - /* Endpoint is connected or disconnected */ - if (ep->state == SCIFEP_CONNECTED || ep->state == SCIFEP_DISCONNECTED) { - if (poll_requested_events(wait) & EPOLLIN) - _scif_poll_wait(f, &ep->recvwq, wait, ep); - if (poll_requested_events(wait) & EPOLLOUT) - _scif_poll_wait(f, &ep->sendwq, wait, ep); - if (ep->state == SCIFEP_CONNECTED || - ep->state == SCIFEP_DISCONNECTED) { - /* Data can be read without blocking */ - if (scif_rb_count(&ep->qp_info.qp->inbound_q, 1)) - mask |= EPOLLIN; - /* Data can be written without blocking */ - if (scif_rb_space(&ep->qp_info.qp->outbound_q)) - mask |= EPOLLOUT; - /* Return EPOLLHUP if endpoint is disconnected */ - if (ep->state == SCIFEP_DISCONNECTED) - mask |= EPOLLHUP; - goto exit; - } - } - - /* Return EPOLLERR if the endpoint is in none of the above states */ - mask |= EPOLLERR; -exit: - spin_unlock(&ep->lock); - return mask; -} - -/** - * scif_poll() - Kernel mode SCIF poll - * @ufds: Array of scif_pollepd structures containing the end points - * and events to poll on - * @nfds: Size of the ufds array - * @timeout_msecs: Timeout in msecs, -ve implies infinite timeout - * - * The code flow in this function is based on do_poll(..) in select.c - * - * Returns the number of endpoints which have pending events or 0 in - * the event of a timeout. If a signal is used for wake up, -EINTR is - * returned. 
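- *
- * A minimal sketch of polling a single endpoint for readability
- * (hypothetical caller; the scif_pollepd fields are the ones consumed
- * below):
- *
- *	char buf[64];
- *	struct scif_pollepd p = { .epd = epd, .events = EPOLLIN };
- *
- *	if (scif_poll(&p, 1, 2000) > 0 && (p.revents & EPOLLIN))
- *		scif_recv(epd, buf, sizeof(buf), 0);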
- */ -int -scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs) -{ - struct poll_wqueues table; - poll_table *pt; - int i, count = 0, timed_out = timeout_msecs == 0; - __poll_t mask; - u64 timeout = timeout_msecs < 0 ? MAX_SCHEDULE_TIMEOUT - : msecs_to_jiffies(timeout_msecs); - - poll_initwait(&table); - pt = &table.pt; - while (1) { - for (i = 0; i < nfds; i++) { - pt->_key = ufds[i].events | EPOLLERR | EPOLLHUP; - mask = __scif_pollfd(ufds[i].epd->anon, - pt, ufds[i].epd); - mask &= ufds[i].events | EPOLLERR | EPOLLHUP; - if (mask) { - count++; - pt->_qproc = NULL; - } - ufds[i].revents = mask; - } - pt->_qproc = NULL; - if (!count) { - count = table.error; - if (signal_pending(current)) - count = -EINTR; - } - if (count || timed_out) - break; - - if (!schedule_timeout_interruptible(timeout)) - timed_out = 1; - } - poll_freewait(&table); - return count; -} -EXPORT_SYMBOL_GPL(scif_poll); - -int scif_get_node_ids(u16 *nodes, int len, u16 *self) -{ - int online = 0; - int offset = 0; - int node; - - if (!scif_is_mgmt_node()) - scif_get_node_info(); - - *self = scif_info.nodeid; - mutex_lock(&scif_info.conflock); - len = min_t(int, len, scif_info.total); - for (node = 0; node <= scif_info.maxid; node++) { - if (_scifdev_alive(&scif_dev[node])) { - online++; - if (offset < len) - nodes[offset++] = node; - } - } - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI get_node_ids total %d online %d filled in %d nodes\n", - scif_info.total, online, offset); - mutex_unlock(&scif_info.conflock); - - return online; -} -EXPORT_SYMBOL_GPL(scif_get_node_ids); - -static int scif_add_client_dev(struct device *dev, struct subsys_interface *si) -{ - struct scif_client *client = - container_of(si, struct scif_client, si); - struct scif_peer_dev *spdev = - container_of(dev, struct scif_peer_dev, dev); - - if (client->probe) - client->probe(spdev); - return 0; -} - -static void scif_remove_client_dev(struct device *dev, - struct subsys_interface *si) -{ - struct scif_client *client = - container_of(si, struct scif_client, si); - struct scif_peer_dev *spdev = - container_of(dev, struct scif_peer_dev, dev); - - if (client->remove) - client->remove(spdev); -} - -void scif_client_unregister(struct scif_client *client) -{ - subsys_interface_unregister(&client->si); -} -EXPORT_SYMBOL_GPL(scif_client_unregister); - -int scif_client_register(struct scif_client *client) -{ - struct subsys_interface *si = &client->si; - - si->name = client->name; - si->subsys = &scif_peer_bus; - si->add_dev = scif_add_client_dev; - si->remove_dev = scif_remove_client_dev; - - return subsys_interface_register(&client->si); -} -EXPORT_SYMBOL_GPL(scif_client_register); diff --git a/drivers/misc/mic/scif/scif_debugfs.c b/drivers/misc/mic/scif/scif_debugfs.c deleted file mode 100644 index 8fe38e7ca6e6..000000000000 --- a/drivers/misc/mic/scif/scif_debugfs.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. 
- */
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-
-#include "../common/mic_dev.h"
-#include "scif_main.h"
-
-/* Debugfs parent dir */
-static struct dentry *scif_dbg;
-
-static int scif_dev_show(struct seq_file *s, void *unused)
-{
-	int node;
-
-	seq_printf(s, "Total Nodes %d Self Node Id %d Maxid %d\n",
-		   scif_info.total, scif_info.nodeid,
-		   scif_info.maxid);
-
-	if (!scif_dev)
-		return 0;
-
-	seq_printf(s, "%-16s\t%-16s\n", "node_id", "state");
-
-	for (node = 0; node <= scif_info.maxid; node++)
-		seq_printf(s, "%-16d\t%-16s\n", scif_dev[node].node,
-			   _scifdev_alive(&scif_dev[node]) ?
-			   "Running" : "Offline");
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(scif_dev);
-
-static void scif_display_window(struct scif_window *window, struct seq_file *s)
-{
-	int j;
-	struct scatterlist *sg;
-	scif_pinned_pages_t pin = window->pinned_pages;
-
-	seq_printf(s, "window %p type %d temp %d offset 0x%llx ",
-		   window, window->type, window->temp, window->offset);
-	seq_printf(s, "nr_pages 0x%llx nr_contig_chunks 0x%x prot %d ",
-		   window->nr_pages, window->nr_contig_chunks, window->prot);
-	seq_printf(s, "ref_count %d magic 0x%llx peer_window 0x%llx ",
-		   window->ref_count, window->magic, window->peer_window);
-	seq_printf(s, "unreg_state 0x%x va_for_temp 0x%lx\n",
-		   window->unreg_state, window->va_for_temp);
-
-	for (j = 0; j < window->nr_contig_chunks; j++)
-		seq_printf(s, "page[%d] dma_addr 0x%llx num_pages 0x%llx\n", j,
-			   window->dma_addr[j], window->num_pages[j]);
-
-	if (window->type == SCIF_WINDOW_SELF && pin)
-		for (j = 0; j < window->nr_pages; j++)
-			seq_printf(s, "page[%d] = pinned_pages %p address %p\n",
-				   j, pin->pages[j],
-				   page_address(pin->pages[j]));
-
-	if (window->st)
-		for_each_sg(window->st->sgl, sg, window->st->nents, j)
-			seq_printf(s, "sg[%d] dma addr 0x%llx length 0x%x\n",
-				   j, sg_dma_address(sg), sg_dma_len(sg));
-}
-
-static void scif_display_all_windows(struct list_head *head, struct seq_file *s)
-{
-	struct list_head *item;
-	struct scif_window *window;
-
-	list_for_each(item, head) {
-		window = list_entry(item, struct scif_window, list);
-		scif_display_window(window, s);
-	}
-}
-
-static int scif_rma_show(struct seq_file *s, void *unused)
-{
-	struct scif_endpt *ep;
-	struct list_head *pos;
-
-	mutex_lock(&scif_info.connlock);
-	list_for_each(pos, &scif_info.connected) {
-		ep = list_entry(pos, struct scif_endpt, list);
-		seq_printf(s, "ep %p self windows\n", ep);
-		mutex_lock(&ep->rma_info.rma_lock);
-		scif_display_all_windows(&ep->rma_info.reg_list, s);
-		seq_printf(s, "ep %p remote windows\n", ep);
-		scif_display_all_windows(&ep->rma_info.remote_reg_list, s);
-		mutex_unlock(&ep->rma_info.rma_lock);
-	}
-	mutex_unlock(&scif_info.connlock);
-	return 0;
-}
-
-DEFINE_SHOW_ATTRIBUTE(scif_rma);
-
-void __init scif_init_debugfs(void)
-{
-	scif_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL);
-
-	debugfs_create_file("scif_dev", 0444, scif_dbg, NULL, &scif_dev_fops);
-	debugfs_create_file("scif_rma", 0444, scif_dbg, NULL, &scif_rma_fops);
-	debugfs_create_u8("en_msg_log", 0666, scif_dbg, &scif_info.en_msg_log);
-	debugfs_create_u8("p2p_enable", 0666, scif_dbg, &scif_info.p2p_enable);
-}
-
-void scif_exit_debugfs(void)
-{
-	debugfs_remove_recursive(scif_dbg);
-}
diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
deleted file mode 100644
index 401b98e5ad79..000000000000
--- a/drivers/misc/mic/scif/scif_dma.c
+++ /dev/null
@@ -1,1940 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel MIC Platform Software Stack (MPSS)
- *
- * Copyright(c) 2015 Intel Corporation.
- * - * Intel SCIF driver. - */ -#include "scif_main.h" -#include "scif_map.h" - -/* - * struct scif_dma_comp_cb - SCIF DMA completion callback - * - * @dma_completion_func: DMA completion callback - * @cb_cookie: DMA completion callback cookie - * @temp_buf: Temporary buffer - * @temp_buf_to_free: Temporary buffer to be freed - * @is_cache: Is a kmem_cache allocated buffer - * @dst_offset: Destination registration offset - * @dst_window: Destination registration window - * @len: Length of the temp buffer - * @temp_phys: DMA address of the temp buffer - * @sdev: The SCIF device - * @header_padding: padding for cache line alignment - */ -struct scif_dma_comp_cb { - void (*dma_completion_func)(void *cookie); - void *cb_cookie; - u8 *temp_buf; - u8 *temp_buf_to_free; - bool is_cache; - s64 dst_offset; - struct scif_window *dst_window; - size_t len; - dma_addr_t temp_phys; - struct scif_dev *sdev; - int header_padding; -}; - -/** - * struct scif_copy_work - Work for DMA copy - * - * @src_offset: Starting source offset - * @dst_offset: Starting destination offset - * @src_window: Starting src registered window - * @dst_window: Starting dst registered window - * @loopback: true if this is a loopback DMA transfer - * @len: Length of the transfer - * @comp_cb: DMA copy completion callback - * @remote_dev: The remote SCIF peer device - * @fence_type: polling or interrupt based - * @ordered: is this a tail byte ordered DMA transfer - */ -struct scif_copy_work { - s64 src_offset; - s64 dst_offset; - struct scif_window *src_window; - struct scif_window *dst_window; - int loopback; - size_t len; - struct scif_dma_comp_cb *comp_cb; - struct scif_dev *remote_dev; - int fence_type; - bool ordered; -}; - -/** - * scif_reserve_dma_chan: - * @ep: Endpoint Descriptor. - * - * This routine reserves a DMA channel for a particular - * endpoint. All DMA transfers for an endpoint are always - * programmed on the same DMA channel. 
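- *
- * Channels are handed out round-robin, so with e.g. sdev->num_dma_ch == 2
- * successive reservations alternate between dma_ch[0] and dma_ch[1],
- * mirroring the assignment in the function body below:
- *
- *	chan = sdev->dma_ch[scifdev->dma_ch_idx];
- *	scifdev->dma_ch_idx = (scifdev->dma_ch_idx + 1) % sdev->num_dma_ch;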
- */ -int scif_reserve_dma_chan(struct scif_endpt *ep) -{ - int err = 0; - struct scif_dev *scifdev; - struct scif_hw_dev *sdev; - struct dma_chan *chan; - - /* Loopback DMAs are not supported on the management node */ - if (!scif_info.nodeid && scifdev_self(ep->remote_dev)) - return 0; - if (scif_info.nodeid) - scifdev = &scif_dev[0]; - else - scifdev = ep->remote_dev; - sdev = scifdev->sdev; - if (!sdev->num_dma_ch) - return -ENODEV; - chan = sdev->dma_ch[scifdev->dma_ch_idx]; - scifdev->dma_ch_idx = (scifdev->dma_ch_idx + 1) % sdev->num_dma_ch; - mutex_lock(&ep->rma_info.rma_lock); - ep->rma_info.dma_chan = chan; - mutex_unlock(&ep->rma_info.rma_lock); - return err; -} - -#ifdef CONFIG_MMU_NOTIFIER -/* - * scif_rma_destroy_tcw: - * - * This routine destroys temporary cached windows - */ -static -void __scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, - u64 start, u64 len) -{ - struct list_head *item, *tmp; - struct scif_window *window; - u64 start_va, end_va; - u64 end = start + len; - - if (end <= start) - return; - - list_for_each_safe(item, tmp, &mmn->tc_reg_list) { - window = list_entry(item, struct scif_window, list); - if (!len) - break; - start_va = window->va_for_temp; - end_va = start_va + (window->nr_pages << PAGE_SHIFT); - if (start < start_va && end <= start_va) - break; - if (start >= end_va) - continue; - __scif_rma_destroy_tcw_helper(window); - } -} - -static void scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, u64 start, u64 len) -{ - struct scif_endpt *ep = mmn->ep; - - spin_lock(&ep->rma_info.tc_lock); - __scif_rma_destroy_tcw(mmn, start, len); - spin_unlock(&ep->rma_info.tc_lock); -} - -static void scif_rma_destroy_tcw_ep(struct scif_endpt *ep) -{ - struct list_head *item, *tmp; - struct scif_mmu_notif *mmn; - - list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { - mmn = list_entry(item, struct scif_mmu_notif, list); - scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); - } -} - -static void __scif_rma_destroy_tcw_ep(struct scif_endpt *ep) -{ - struct list_head *item, *tmp; - struct scif_mmu_notif *mmn; - - spin_lock(&ep->rma_info.tc_lock); - list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { - mmn = list_entry(item, struct scif_mmu_notif, list); - __scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); - } - spin_unlock(&ep->rma_info.tc_lock); -} - -static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) -{ - if ((cur_bytes >> PAGE_SHIFT) > scif_info.rma_tc_limit) - return false; - if ((atomic_read(&ep->rma_info.tcw_total_pages) - + (cur_bytes >> PAGE_SHIFT)) > - scif_info.rma_tc_limit) { - dev_info(scif_info.mdev.this_device, - "%s %d total=%d, current=%zu reached max\n", - __func__, __LINE__, - atomic_read(&ep->rma_info.tcw_total_pages), - (1 + (cur_bytes >> PAGE_SHIFT))); - scif_rma_destroy_tcw_invalid(); - __scif_rma_destroy_tcw_ep(ep); - } - return true; -} - -static void scif_mmu_notifier_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - struct scif_mmu_notif *mmn; - - mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); - scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); - schedule_work(&scif_info.misc_work); -} - -static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) -{ - struct scif_mmu_notif *mmn; - - mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); - scif_rma_destroy_tcw(mmn, range->start, range->end - range->start); - - return 0; -} - -static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) -{ - 
/* - * Nothing to do here, everything needed was done in - * invalidate_range_start. - */ -} - -static const struct mmu_notifier_ops scif_mmu_notifier_ops = { - .release = scif_mmu_notifier_release, - .clear_flush_young = NULL, - .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, - .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; - -static void scif_ep_unregister_mmu_notifier(struct scif_endpt *ep) -{ - struct scif_endpt_rma_info *rma = &ep->rma_info; - struct scif_mmu_notif *mmn = NULL; - struct list_head *item, *tmp; - - mutex_lock(&ep->rma_info.mmn_lock); - list_for_each_safe(item, tmp, &rma->mmn_list) { - mmn = list_entry(item, struct scif_mmu_notif, list); - mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm); - list_del(item); - kfree(mmn); - } - mutex_unlock(&ep->rma_info.mmn_lock); -} - -static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn, - struct mm_struct *mm, struct scif_endpt *ep) -{ - mmn->ep = ep; - mmn->mm = mm; - mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops; - INIT_LIST_HEAD(&mmn->list); - INIT_LIST_HEAD(&mmn->tc_reg_list); -} - -static struct scif_mmu_notif * -scif_find_mmu_notifier(struct mm_struct *mm, struct scif_endpt_rma_info *rma) -{ - struct scif_mmu_notif *mmn; - - list_for_each_entry(mmn, &rma->mmn_list, list) - if (mmn->mm == mm) - return mmn; - return NULL; -} - -static struct scif_mmu_notif * -scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) -{ - struct scif_mmu_notif *mmn - = kzalloc(sizeof(*mmn), GFP_KERNEL); - - if (!mmn) - return ERR_PTR(-ENOMEM); - - scif_init_mmu_notifier(mmn, current->mm, ep); - if (mmu_notifier_register(&mmn->ep_mmu_notifier, current->mm)) { - kfree(mmn); - return ERR_PTR(-EBUSY); - } - list_add(&mmn->list, &ep->rma_info.mmn_list); - return mmn; -} - -/* - * Called from the misc thread to destroy temporary cached windows and - * unregister the MMU notifier for the SCIF endpoint. - */ -void scif_mmu_notif_handler(struct work_struct *work) -{ - struct list_head *pos, *tmpq; - struct scif_endpt *ep; -restart: - scif_rma_destroy_tcw_invalid(); - spin_lock(&scif_info.rmalock); - list_for_each_safe(pos, tmpq, &scif_info.mmu_notif_cleanup) { - ep = list_entry(pos, struct scif_endpt, mmu_list); - list_del(&ep->mmu_list); - spin_unlock(&scif_info.rmalock); - scif_rma_destroy_tcw_ep(ep); - scif_ep_unregister_mmu_notifier(ep); - goto restart; - } - spin_unlock(&scif_info.rmalock); -} - -static bool scif_is_set_reg_cache(int flags) -{ - return !!(flags & SCIF_RMA_USECACHE); -} -#else -static struct scif_mmu_notif * -scif_find_mmu_notifier(struct mm_struct *mm, - struct scif_endpt_rma_info *rma) -{ - return NULL; -} - -static struct scif_mmu_notif * -scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) -{ - return NULL; -} - -void scif_mmu_notif_handler(struct work_struct *work) -{ -} - -static bool scif_is_set_reg_cache(int flags) -{ - return false; -} - -static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) -{ - return false; -} -#endif - -/** - * scif_register_temp: - * @epd: End Point Descriptor. - * @addr: virtual address to/from which to copy - * @len: length of range to copy - * @prot: read/write protection - * @out_offset: computed offset returned by reference. - * @out_window: allocated registered window returned by reference. - * - * Create a temporary registered window. The peer will not know about this - * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's. 
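- *
- * As a worked illustration: registering addr = 0x7f0000001200 with
- * len = 0x100 pins the single page at 0x7f0000001000 (aligned_len =
- * 0x1000) and, after mapping, ORs the intra-page offset back into the
- * returned window offset: *out_offset |= 0x200.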
- */ -static int -scif_register_temp(scif_epd_t epd, unsigned long addr, size_t len, int prot, - off_t *out_offset, struct scif_window **out_window) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err; - scif_pinned_pages_t pinned_pages; - size_t aligned_len; - - aligned_len = ALIGN(len, PAGE_SIZE); - - err = __scif_pin_pages((void *)(addr & PAGE_MASK), - aligned_len, &prot, 0, &pinned_pages); - if (err) - return err; - - pinned_pages->prot = prot; - - /* Compute the offset for this registration */ - err = scif_get_window_offset(ep, 0, 0, - aligned_len >> PAGE_SHIFT, - (s64 *)out_offset); - if (err) - goto error_unpin; - - /* Allocate and prepare self registration window */ - *out_window = scif_create_window(ep, aligned_len >> PAGE_SHIFT, - *out_offset, true); - if (!*out_window) { - scif_free_window_offset(ep, NULL, *out_offset); - err = -ENOMEM; - goto error_unpin; - } - - (*out_window)->pinned_pages = pinned_pages; - (*out_window)->nr_pages = pinned_pages->nr_pages; - (*out_window)->prot = pinned_pages->prot; - - (*out_window)->va_for_temp = addr & PAGE_MASK; - err = scif_map_window(ep->remote_dev, *out_window); - if (err) { - /* Something went wrong! Rollback */ - scif_destroy_window(ep, *out_window); - *out_window = NULL; - } else { - *out_offset |= (addr - (*out_window)->va_for_temp); - } - return err; -error_unpin: - if (err) - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - scif_unpin_pages(pinned_pages); - return err; -} - -#define SCIF_DMA_TO (3 * HZ) - -/* - * scif_sync_dma - Program a DMA without an interrupt descriptor - * - * @dev - The address of the pointer to the device instance used - * for DMA registration. - * @chan - DMA channel to be used. - * @sync_wait: Wait for DMA to complete? - * - * Return 0 on success and -errno on error. - */ -static int scif_sync_dma(struct scif_hw_dev *sdev, struct dma_chan *chan, - bool sync_wait) -{ - int err = 0; - struct dma_async_tx_descriptor *tx = NULL; - enum dma_ctrl_flags flags = DMA_PREP_FENCE; - dma_cookie_t cookie; - struct dma_device *ddev; - - if (!chan) { - err = -EIO; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - ddev = chan->device; - - tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); - if (!tx) { - err = -ENOMEM; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } - cookie = tx->tx_submit(tx); - - if (dma_submit_error(cookie)) { - err = -ENOMEM; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } - if (!sync_wait) { - dma_async_issue_pending(chan); - } else { - if (dma_sync_wait(chan, cookie) == DMA_COMPLETE) { - err = 0; - } else { - err = -EIO; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - } - } -release: - return err; -} - -static void scif_dma_callback(void *arg) -{ - struct completion *done = (struct completion *)arg; - - complete(done); -} - -#define SCIF_DMA_SYNC_WAIT true -#define SCIF_DMA_POLL BIT(0) -#define SCIF_DMA_INTR BIT(1) - -/* - * scif_async_dma - Program a DMA with an interrupt descriptor - * - * @dev - The address of the pointer to the device instance used - * for DMA registration. - * @chan - DMA channel to be used. - * Return 0 on success and -errno on error. 
- */ -static int scif_async_dma(struct scif_hw_dev *sdev, struct dma_chan *chan) -{ - int err = 0; - struct dma_device *ddev; - struct dma_async_tx_descriptor *tx = NULL; - enum dma_ctrl_flags flags = DMA_PREP_INTERRUPT | DMA_PREP_FENCE; - DECLARE_COMPLETION_ONSTACK(done_wait); - dma_cookie_t cookie; - enum dma_status status; - - if (!chan) { - err = -EIO; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - ddev = chan->device; - - tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); - if (!tx) { - err = -ENOMEM; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } - reinit_completion(&done_wait); - tx->callback = scif_dma_callback; - tx->callback_param = &done_wait; - cookie = tx->tx_submit(tx); - - if (dma_submit_error(cookie)) { - err = -ENOMEM; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } - dma_async_issue_pending(chan); - - err = wait_for_completion_timeout(&done_wait, SCIF_DMA_TO); - if (!err) { - err = -EIO; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } - err = 0; - status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); - if (status != DMA_COMPLETE) { - err = -EIO; - dev_err(&sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto release; - } -release: - return err; -} - -/* - * scif_drain_dma_poll - Drain all outstanding DMA operations for a particular - * DMA channel via polling. - * - * @sdev - The SCIF device - * @chan - DMA channel - * Return 0 on success and -errno on error. - */ -static int scif_drain_dma_poll(struct scif_hw_dev *sdev, struct dma_chan *chan) -{ - if (!chan) - return -EINVAL; - return scif_sync_dma(sdev, chan, SCIF_DMA_SYNC_WAIT); -} - -/* - * scif_drain_dma_intr - Drain all outstanding DMA operations for a particular - * DMA channel via interrupt based blocking wait. - * - * @sdev - The SCIF device - * @chan - DMA channel - * Return 0 on success and -errno on error. - */ -int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan) -{ - if (!chan) - return -EINVAL; - return scif_async_dma(sdev, chan); -} - -/** - * scif_rma_destroy_windows: - * - * This routine destroys all windows queued for cleanup - */ -void scif_rma_destroy_windows(void) -{ - struct list_head *item, *tmp; - struct scif_window *window; - struct scif_endpt *ep; - struct dma_chan *chan; - - might_sleep(); -restart: - spin_lock(&scif_info.rmalock); - list_for_each_safe(item, tmp, &scif_info.rma) { - window = list_entry(item, struct scif_window, - list); - ep = (struct scif_endpt *)window->ep; - chan = ep->rma_info.dma_chan; - - list_del_init(&window->list); - spin_unlock(&scif_info.rmalock); - if (!chan || !scifdev_alive(ep) || - !scif_drain_dma_intr(ep->remote_dev->sdev, - ep->rma_info.dma_chan)) - /* Remove window from global list */ - window->unreg_state = OP_COMPLETED; - else - dev_warn(&ep->remote_dev->sdev->dev, - "DMA engine hung?\n"); - if (window->unreg_state == OP_COMPLETED) { - if (window->type == SCIF_WINDOW_SELF) - scif_destroy_window(ep, window); - else - scif_destroy_remote_window(window); - atomic_dec(&ep->rma_info.tw_refcount); - } - goto restart; - } - spin_unlock(&scif_info.rmalock); -} - -/** - * scif_rma_destroy_tcw: - * - * This routine destroys temporary cached registered windows - * which have been queued for cleanup. 
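/*
 * Illustrative sketch, not from the original source: both drain helpers
 * above queue a zero-byte memcpy descriptor purely as a fence and wait
 * for its cookie, either by polling (scif_sync_dma) or via a completion
 * interrupt (scif_async_dma). A caller typically quiesces the channel
 * before tearing a window down, as scif_rma_destroy_windows() does:
 */
static int scif_quiesce_dma(struct scif_endpt *ep)
{
	struct dma_chan *chan = ep->rma_info.dma_chan;

	if (!chan || !scifdev_alive(ep))
		return 0;	/* no channel or device gone: nothing to drain */
	return scif_drain_dma_intr(ep->remote_dev->sdev, chan);
}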
- */ -void scif_rma_destroy_tcw_invalid(void) -{ - struct list_head *item, *tmp; - struct scif_window *window; - struct scif_endpt *ep; - struct dma_chan *chan; - - might_sleep(); -restart: - spin_lock(&scif_info.rmalock); - list_for_each_safe(item, tmp, &scif_info.rma_tc) { - window = list_entry(item, struct scif_window, list); - ep = (struct scif_endpt *)window->ep; - chan = ep->rma_info.dma_chan; - list_del_init(&window->list); - spin_unlock(&scif_info.rmalock); - mutex_lock(&ep->rma_info.rma_lock); - if (!chan || !scifdev_alive(ep) || - !scif_drain_dma_intr(ep->remote_dev->sdev, - ep->rma_info.dma_chan)) { - atomic_sub(window->nr_pages, - &ep->rma_info.tcw_total_pages); - scif_destroy_window(ep, window); - atomic_dec(&ep->rma_info.tcw_refcount); - } else { - dev_warn(&ep->remote_dev->sdev->dev, - "DMA engine hung?\n"); - } - mutex_unlock(&ep->rma_info.rma_lock); - goto restart; - } - spin_unlock(&scif_info.rmalock); -} - -static inline -void *_get_local_va(off_t off, struct scif_window *window, size_t len) -{ - int page_nr = (off - window->offset) >> PAGE_SHIFT; - off_t page_off = off & ~PAGE_MASK; - void *va = NULL; - - if (window->type == SCIF_WINDOW_SELF) { - struct page **pages = window->pinned_pages->pages; - - va = page_address(pages[page_nr]) + page_off; - } - return va; -} - -static inline -void *ioremap_remote(off_t off, struct scif_window *window, - size_t len, struct scif_dev *dev, - struct scif_window_iter *iter) -{ - dma_addr_t phys = scif_off_to_dma_addr(window, off, NULL, iter); - - /* - * If the DMA address is not card relative then we need the DMA - * addresses to be an offset into the bar. The aperture base was already - * added so subtract it here since scif_ioremap is going to add it again - */ - if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && - dev->sdev->aper && !dev->sdev->card_rel_da) - phys = phys - dev->sdev->aper->pa; - return scif_ioremap(phys, len, dev); -} - -static inline void -iounmap_remote(void *virt, size_t size, struct scif_copy_work *work) -{ - scif_iounmap(virt, size, work->remote_dev); -} - -/* - * Takes care of ordering issue caused by - * 1. Hardware: Only in the case of cpu copy from mgmt node to card - * because of WC memory. - * 2. Software: If memcpy reorders copy instructions for optimization. - * This could happen at both mgmt node and card. - */ -static inline void -scif_ordered_memcpy_toio(char *dst, const char *src, size_t count) -{ - if (!count) - return; - - memcpy_toio((void __iomem __force *)dst, src, --count); - /* Order the last byte with the previous stores */ - wmb(); - *(dst + count) = *(src + count); -} - -static inline void scif_unaligned_cpy_toio(char *dst, const char *src, - size_t count, bool ordered) -{ - if (ordered) - scif_ordered_memcpy_toio(dst, src, count); - else - memcpy_toio((void __iomem __force *)dst, src, count); -} - -static inline -void scif_ordered_memcpy_fromio(char *dst, const char *src, size_t count) -{ - if (!count) - return; - - memcpy_fromio(dst, (void __iomem __force *)src, --count); - /* Order the last byte with the previous loads */ - rmb(); - *(dst + count) = *(src + count); -} - -static inline void scif_unaligned_cpy_fromio(char *dst, const char *src, - size_t count, bool ordered) -{ - if (ordered) - scif_ordered_memcpy_fromio(dst, src, count); - else - memcpy_fromio(dst, (void __iomem __force *)src, count); -} - -#define SCIF_RMA_ERROR_CODE (~(dma_addr_t)0x0) - -/* - * scif_off_to_dma_addr: - * Obtain the dma_addr given the window and the offset. - * @window: Registered window. 
- * @off: Window offset. - * @nr_bytes: Return the number of contiguous bytes till next DMA addr index. - * @index: Return the index of the dma_addr array found. - * @start_off: start offset of index of the dma addr array found. - * The nr_bytes provides the callee an estimate of the maximum possible - * DMA xfer possible while the index/start_off provide faster lookups - * for the next iteration. - */ -dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off, - size_t *nr_bytes, struct scif_window_iter *iter) -{ - int i, page_nr; - s64 start, end; - off_t page_off; - - if (window->nr_pages == window->nr_contig_chunks) { - page_nr = (off - window->offset) >> PAGE_SHIFT; - page_off = off & ~PAGE_MASK; - - if (nr_bytes) - *nr_bytes = PAGE_SIZE - page_off; - return window->dma_addr[page_nr] | page_off; - } - if (iter) { - i = iter->index; - start = iter->offset; - } else { - i = 0; - start = window->offset; - } - for (; i < window->nr_contig_chunks; i++) { - end = start + (window->num_pages[i] << PAGE_SHIFT); - if (off >= start && off < end) { - if (iter) { - iter->index = i; - iter->offset = start; - } - if (nr_bytes) - *nr_bytes = end - off; - return (window->dma_addr[i] + (off - start)); - } - start += (window->num_pages[i] << PAGE_SHIFT); - } - dev_err(scif_info.mdev.this_device, - "%s %d BUG. Addr not found? window %p off 0x%llx\n", - __func__, __LINE__, window, off); - return SCIF_RMA_ERROR_CODE; -} - -/* - * Copy between rma window and temporary buffer - */ -static void scif_rma_local_cpu_copy(s64 offset, struct scif_window *window, - u8 *temp, size_t rem_len, bool to_temp) -{ - void *window_virt; - size_t loop_len; - int offset_in_page; - s64 end_offset; - - offset_in_page = offset & ~PAGE_MASK; - loop_len = PAGE_SIZE - offset_in_page; - - if (rem_len < loop_len) - loop_len = rem_len; - - window_virt = _get_local_va(offset, window, loop_len); - if (!window_virt) - return; - if (to_temp) - memcpy(temp, window_virt, loop_len); - else - memcpy(window_virt, temp, loop_len); - - offset += loop_len; - temp += loop_len; - rem_len -= loop_len; - - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - while (rem_len) { - if (offset == end_offset) { - window = list_next_entry(window, list); - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - } - loop_len = min(PAGE_SIZE, rem_len); - window_virt = _get_local_va(offset, window, loop_len); - if (!window_virt) - return; - if (to_temp) - memcpy(temp, window_virt, loop_len); - else - memcpy(window_virt, temp, loop_len); - offset += loop_len; - temp += loop_len; - rem_len -= loop_len; - } -} - -/** - * scif_rma_completion_cb: - * @data: RMA cookie - * - * RMA interrupt completion callback. - */ -static void scif_rma_completion_cb(void *data) -{ - struct scif_dma_comp_cb *comp_cb = data; - - /* Free DMA Completion CB. 
*/ - if (comp_cb->dst_window) - scif_rma_local_cpu_copy(comp_cb->dst_offset, - comp_cb->dst_window, - comp_cb->temp_buf + - comp_cb->header_padding, - comp_cb->len, false); - scif_unmap_single(comp_cb->temp_phys, comp_cb->sdev, - SCIF_KMEM_UNALIGNED_BUF_SIZE); - if (comp_cb->is_cache) - kmem_cache_free(unaligned_cache, - comp_cb->temp_buf_to_free); - else - kfree(comp_cb->temp_buf_to_free); -} - -/* Copies between temporary buffer and offsets provided in work */ -static int -scif_rma_list_dma_copy_unaligned(struct scif_copy_work *work, - u8 *temp, struct dma_chan *chan, - bool src_local) -{ - struct scif_dma_comp_cb *comp_cb = work->comp_cb; - dma_addr_t window_dma_addr, temp_dma_addr; - dma_addr_t temp_phys = comp_cb->temp_phys; - size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len; - int offset_in_ca, ret = 0; - s64 end_offset, offset; - struct scif_window *window; - void *window_virt_addr; - size_t tail_len; - struct dma_async_tx_descriptor *tx; - struct dma_device *dev = chan->device; - dma_cookie_t cookie; - - if (src_local) { - offset = work->dst_offset; - window = work->dst_window; - } else { - offset = work->src_offset; - window = work->src_window; - } - - offset_in_ca = offset & (L1_CACHE_BYTES - 1); - if (offset_in_ca) { - loop_len = L1_CACHE_BYTES - offset_in_ca; - loop_len = min(loop_len, remaining_len); - window_virt_addr = ioremap_remote(offset, window, - loop_len, - work->remote_dev, - NULL); - if (!window_virt_addr) - return -ENOMEM; - if (src_local) - scif_unaligned_cpy_toio(window_virt_addr, temp, - loop_len, - work->ordered && - !(remaining_len - loop_len)); - else - scif_unaligned_cpy_fromio(temp, window_virt_addr, - loop_len, work->ordered && - !(remaining_len - loop_len)); - iounmap_remote(window_virt_addr, loop_len, work); - - offset += loop_len; - temp += loop_len; - temp_phys += loop_len; - remaining_len -= loop_len; - } - - offset_in_ca = offset & ~PAGE_MASK; - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - - tail_len = remaining_len & (L1_CACHE_BYTES - 1); - remaining_len -= tail_len; - while (remaining_len) { - if (offset == end_offset) { - window = list_next_entry(window, list); - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - } - if (scif_is_mgmt_node()) - temp_dma_addr = temp_phys; - else - /* Fix if we ever enable IOMMU on the card */ - temp_dma_addr = (dma_addr_t)virt_to_phys(temp); - window_dma_addr = scif_off_to_dma_addr(window, offset, - &nr_contig_bytes, - NULL); - loop_len = min(nr_contig_bytes, remaining_len); - if (src_local) { - if (work->ordered && !tail_len && - !(remaining_len - loop_len) && - loop_len != L1_CACHE_BYTES) { - /* - * Break up the last chunk of the transfer into - * two steps if there is no tail, to guarantee - * DMA ordering. SCIF_DMA_POLLING inserts - * a status update descriptor in step 1 which - * acts as a double sided synchronization fence - * for the DMA engine to ensure that the last - * cache line in step 2 is updated last. - */ - /* Step 1) DMA: Body Length - L1_CACHE_BYTES.

*/ - tx = - dev->device_prep_dma_memcpy(chan, - window_dma_addr, - temp_dma_addr, - loop_len - - L1_CACHE_BYTES, - DMA_PREP_FENCE); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - offset += (loop_len - L1_CACHE_BYTES); - temp_dma_addr += (loop_len - L1_CACHE_BYTES); - window_dma_addr += (loop_len - L1_CACHE_BYTES); - remaining_len -= (loop_len - L1_CACHE_BYTES); - loop_len = remaining_len; - - /* Step 2) DMA: L1_CACHE_BYTES */ - tx = - dev->device_prep_dma_memcpy(chan, - window_dma_addr, - temp_dma_addr, - loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } else { - tx = - dev->device_prep_dma_memcpy(chan, - window_dma_addr, - temp_dma_addr, - loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } - } else { - tx = dev->device_prep_dma_memcpy(chan, temp_dma_addr, - window_dma_addr, loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } - offset += loop_len; - temp += loop_len; - temp_phys += loop_len; - remaining_len -= loop_len; - offset_in_ca = 0; - } - if (tail_len) { - if (offset == end_offset) { - window = list_next_entry(window, list); - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - } - window_virt_addr = ioremap_remote(offset, window, tail_len, - work->remote_dev, - NULL); - if (!window_virt_addr) - return -ENOMEM; - /* - * The CPU copy for the tail bytes must be initiated only once - * previous DMA transfers for this endpoint have completed - * to guarantee ordering. - */ - if (work->ordered) { - struct scif_dev *rdev = work->remote_dev; - - ret = scif_drain_dma_intr(rdev->sdev, chan); - if (ret) - return ret; - } - if (src_local) - scif_unaligned_cpy_toio(window_virt_addr, temp, - tail_len, work->ordered); - else - scif_unaligned_cpy_fromio(temp, window_virt_addr, - tail_len, work->ordered); - iounmap_remote(window_virt_addr, tail_len, work); - } - tx = dev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_INTERRUPT); - if (!tx) { - ret = -ENOMEM; - return ret; - } - tx->callback = &scif_rma_completion_cb; - tx->callback_param = comp_cb; - cookie = tx->tx_submit(tx); - - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - return ret; - } - dma_async_issue_pending(chan); - return 0; -err: - dev_err(scif_info.mdev.this_device, - "%s %d Desc Prog Failed ret %d\n", - __func__, __LINE__, ret); - return ret; -} - -/* - * _scif_rma_list_dma_copy_aligned: - * - * Traverse all the windows and perform DMA copy. 
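/*
 * A sketch, assuming only the helpers in this file (illustrative, not
 * part of the original driver): walking a registered window in
 * DMA-contiguous chunks with scif_off_to_dma_addr() and a
 * scif_window_iter, the pattern the aligned copy loops below rely on.
 */
static void scif_walk_window_chunks(struct scif_window *window)
{
	struct scif_window_iter iter;
	s64 off = window->offset;
	s64 end = window->offset + (window->nr_pages << PAGE_SHIFT);
	size_t nr_bytes;
	dma_addr_t addr;

	scif_init_window_iter(window, &iter);
	while (off < end) {
		addr = scif_off_to_dma_addr(window, off, &nr_bytes, &iter);
		if (addr == SCIF_RMA_ERROR_CODE)
			break;	/* offset not backed by this window */
		/* [addr, addr + nr_bytes) is contiguous for DMA here */
		off += nr_bytes;
	}
}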
- */ -static int _scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, - struct dma_chan *chan) -{ - dma_addr_t src_dma_addr, dst_dma_addr; - size_t loop_len, remaining_len, src_contig_bytes = 0; - size_t dst_contig_bytes = 0; - struct scif_window_iter src_win_iter; - struct scif_window_iter dst_win_iter; - s64 end_src_offset, end_dst_offset; - struct scif_window *src_window = work->src_window; - struct scif_window *dst_window = work->dst_window; - s64 src_offset = work->src_offset, dst_offset = work->dst_offset; - int ret = 0; - struct dma_async_tx_descriptor *tx; - struct dma_device *dev = chan->device; - dma_cookie_t cookie; - - remaining_len = work->len; - - scif_init_window_iter(src_window, &src_win_iter); - scif_init_window_iter(dst_window, &dst_win_iter); - end_src_offset = src_window->offset + - (src_window->nr_pages << PAGE_SHIFT); - end_dst_offset = dst_window->offset + - (dst_window->nr_pages << PAGE_SHIFT); - while (remaining_len) { - if (src_offset == end_src_offset) { - src_window = list_next_entry(src_window, list); - end_src_offset = src_window->offset + - (src_window->nr_pages << PAGE_SHIFT); - scif_init_window_iter(src_window, &src_win_iter); - } - if (dst_offset == end_dst_offset) { - dst_window = list_next_entry(dst_window, list); - end_dst_offset = dst_window->offset + - (dst_window->nr_pages << PAGE_SHIFT); - scif_init_window_iter(dst_window, &dst_win_iter); - } - - /* compute dma addresses for transfer */ - src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, - &src_contig_bytes, - &src_win_iter); - dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, - &dst_contig_bytes, - &dst_win_iter); - loop_len = min(src_contig_bytes, dst_contig_bytes); - loop_len = min(loop_len, remaining_len); - if (work->ordered && !(remaining_len - loop_len)) { - /* - * Break up the last chunk of the transfer into two - * steps to ensure that the last byte in step 2 is - * updated last. - */ - /* Step 1) DMA: Body Length - 1 */ - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, - loop_len - 1, - DMA_PREP_FENCE); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - src_offset += (loop_len - 1); - dst_offset += (loop_len - 1); - src_dma_addr += (loop_len - 1); - dst_dma_addr += (loop_len - 1); - remaining_len -= (loop_len - 1); - loop_len = remaining_len; - - /* Step 2) DMA: 1 BYTES */ - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } else { - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - } - src_offset += loop_len; - dst_offset += loop_len; - remaining_len -= loop_len; - } - return ret; -err: - dev_err(scif_info.mdev.this_device, - "%s %d Desc Prog Failed ret %d\n", - __func__, __LINE__, ret); - return ret; -} - -/* - * scif_rma_list_dma_copy_aligned: - * - * Traverse all the windows and perform DMA copy. 
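/*
 * The copy loops in this file repeat one prep/submit/issue sequence at
 * every descriptor they queue; a hedged, factored-out restatement (not
 * a function in the original driver):
 */
static int scif_submit_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
				  dma_addr_t src, size_t len,
				  unsigned long flags)
{
	struct dma_async_tx_descriptor *tx;
	dma_cookie_t cookie;

	tx = chan->device->device_prep_dma_memcpy(chan, dst, src, len, flags);
	if (!tx)
		return -ENOMEM;
	cookie = tx->tx_submit(tx);
	if (dma_submit_error(cookie))
		return -ENOMEM;
	/* kick the engine; completion is polled or signalled elsewhere */
	dma_async_issue_pending(chan);
	return 0;
}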
- */ -static int scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, - struct dma_chan *chan) -{ - dma_addr_t src_dma_addr, dst_dma_addr; - size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0; - size_t dst_contig_bytes = 0; - int src_cache_off; - s64 end_src_offset, end_dst_offset; - struct scif_window_iter src_win_iter; - struct scif_window_iter dst_win_iter; - void *src_virt, *dst_virt; - struct scif_window *src_window = work->src_window; - struct scif_window *dst_window = work->dst_window; - s64 src_offset = work->src_offset, dst_offset = work->dst_offset; - int ret = 0; - struct dma_async_tx_descriptor *tx; - struct dma_device *dev = chan->device; - dma_cookie_t cookie; - - remaining_len = work->len; - scif_init_window_iter(src_window, &src_win_iter); - scif_init_window_iter(dst_window, &dst_win_iter); - - src_cache_off = src_offset & (L1_CACHE_BYTES - 1); - if (src_cache_off != 0) { - /* Head */ - loop_len = L1_CACHE_BYTES - src_cache_off; - loop_len = min(loop_len, remaining_len); - src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); - dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); - if (src_window->type == SCIF_WINDOW_SELF) - src_virt = _get_local_va(src_offset, src_window, - loop_len); - else - src_virt = ioremap_remote(src_offset, src_window, - loop_len, - work->remote_dev, NULL); - if (!src_virt) - return -ENOMEM; - if (dst_window->type == SCIF_WINDOW_SELF) - dst_virt = _get_local_va(dst_offset, dst_window, - loop_len); - else - dst_virt = ioremap_remote(dst_offset, dst_window, - loop_len, - work->remote_dev, NULL); - if (!dst_virt) { - if (src_window->type != SCIF_WINDOW_SELF) - iounmap_remote(src_virt, loop_len, work); - return -ENOMEM; - } - if (src_window->type == SCIF_WINDOW_SELF) - scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, - remaining_len == loop_len ? - work->ordered : false); - else - scif_unaligned_cpy_fromio(dst_virt, src_virt, loop_len, - remaining_len == loop_len ? - work->ordered : false); - if (src_window->type != SCIF_WINDOW_SELF) - iounmap_remote(src_virt, loop_len, work); - if (dst_window->type != SCIF_WINDOW_SELF) - iounmap_remote(dst_virt, loop_len, work); - src_offset += loop_len; - dst_offset += loop_len; - remaining_len -= loop_len; - } - - end_src_offset = src_window->offset + - (src_window->nr_pages << PAGE_SHIFT); - end_dst_offset = dst_window->offset + - (dst_window->nr_pages << PAGE_SHIFT); - tail_len = remaining_len & (L1_CACHE_BYTES - 1); - remaining_len -= tail_len; - while (remaining_len) { - if (src_offset == end_src_offset) { - src_window = list_next_entry(src_window, list); - end_src_offset = src_window->offset + - (src_window->nr_pages << PAGE_SHIFT); - scif_init_window_iter(src_window, &src_win_iter); - } - if (dst_offset == end_dst_offset) { - dst_window = list_next_entry(dst_window, list); - end_dst_offset = dst_window->offset + - (dst_window->nr_pages << PAGE_SHIFT); - scif_init_window_iter(dst_window, &dst_win_iter); - } - - /* compute dma addresses for transfer */ - src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, - &src_contig_bytes, - &src_win_iter); - dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, - &dst_contig_bytes, - &dst_win_iter); - loop_len = min(src_contig_bytes, dst_contig_bytes); - loop_len = min(loop_len, remaining_len); - if (work->ordered && !tail_len && - !(remaining_len - loop_len)) { - /* - * Break up the last chunk of the transfer into two - * steps if there is no tail, to guarantee DMA ordering.
- * Passing SCIF_DMA_POLLING inserts a status update - * descriptor in step 1 which acts as a double sided - * synchronization fence for the DMA engine to ensure - * that the last cache line in step 2 is updated last. - */ - /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */ - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, - loop_len - - L1_CACHE_BYTES, - DMA_PREP_FENCE); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - src_offset += (loop_len - L1_CACHE_BYTES); - dst_offset += (loop_len - L1_CACHE_BYTES); - src_dma_addr += (loop_len - L1_CACHE_BYTES); - dst_dma_addr += (loop_len - L1_CACHE_BYTES); - remaining_len -= (loop_len - L1_CACHE_BYTES); - loop_len = remaining_len; - - /* Step 2) DMA: L1_CACHE_BYTES */ - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, - loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } else { - tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, - src_dma_addr, - loop_len, 0); - if (!tx) { - ret = -ENOMEM; - goto err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - ret = -ENOMEM; - goto err; - } - dma_async_issue_pending(chan); - } - src_offset += loop_len; - dst_offset += loop_len; - remaining_len -= loop_len; - } - remaining_len = tail_len; - if (remaining_len) { - loop_len = remaining_len; - if (src_offset == end_src_offset) - src_window = list_next_entry(src_window, list); - if (dst_offset == end_dst_offset) - dst_window = list_next_entry(dst_window, list); - - src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); - dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); - /* - * The CPU copy for the tail bytes must be initiated only once - * previous DMA transfers for this endpoint have completed to - * guarantee ordering. - */ - if (work->ordered) { - struct scif_dev *rdev = work->remote_dev; - - ret = scif_drain_dma_poll(rdev->sdev, chan); - if (ret) - return ret; - } - if (src_window->type == SCIF_WINDOW_SELF) - src_virt = _get_local_va(src_offset, src_window, - loop_len); - else - src_virt = ioremap_remote(src_offset, src_window, - loop_len, - work->remote_dev, NULL); - if (!src_virt) - return -ENOMEM; - - if (dst_window->type == SCIF_WINDOW_SELF) - dst_virt = _get_local_va(dst_offset, dst_window, - loop_len); - else - dst_virt = ioremap_remote(dst_offset, dst_window, - loop_len, - work->remote_dev, NULL); - if (!dst_virt) { - if (src_window->type != SCIF_WINDOW_SELF) - iounmap_remote(src_virt, loop_len, work); - return -ENOMEM; - } - - if (src_window->type == SCIF_WINDOW_SELF) - scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, - work->ordered); - else - scif_unaligned_cpy_fromio(dst_virt, src_virt, - loop_len, work->ordered); - if (src_window->type != SCIF_WINDOW_SELF) - iounmap_remote(src_virt, loop_len, work); - - if (dst_window->type != SCIF_WINDOW_SELF) - iounmap_remote(dst_virt, loop_len, work); - remaining_len -= loop_len; - } - return ret; -err: - dev_err(scif_info.mdev.this_device, - "%s %d Desc Prog Failed ret %d\n", - __func__, __LINE__, ret); - return ret; -} - -/* - * scif_rma_list_cpu_copy: - * - * Traverse all the windows and perform CPU copy. 
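/*
 * A compact restatement (illustrative, not in the original source) of
 * the cache-line split performed by scif_rma_list_dma_copy_aligned()
 * above: the head (up to an L1 cache line boundary) and the tail (the
 * final partial cache line) go through ioremap + CPU copies, and only
 * the cache-line-multiple body is handed to the DMA engine.
 */
static void scif_cacheline_split(s64 offset, size_t len, size_t *head,
				 size_t *body, size_t *tail)
{
	size_t cache_off = offset & (L1_CACHE_BYTES - 1);

	*head = cache_off ? min_t(size_t, L1_CACHE_BYTES - cache_off, len) : 0;
	len -= *head;
	*tail = len & (L1_CACHE_BYTES - 1);
	*body = len - *tail;
}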
- */ -static int scif_rma_list_cpu_copy(struct scif_copy_work *work) -{ - void *src_virt, *dst_virt; - size_t loop_len, remaining_len; - int src_page_off, dst_page_off; - s64 src_offset = work->src_offset, dst_offset = work->dst_offset; - struct scif_window *src_window = work->src_window; - struct scif_window *dst_window = work->dst_window; - s64 end_src_offset, end_dst_offset; - int ret = 0; - struct scif_window_iter src_win_iter; - struct scif_window_iter dst_win_iter; - - remaining_len = work->len; - - scif_init_window_iter(src_window, &src_win_iter); - scif_init_window_iter(dst_window, &dst_win_iter); - while (remaining_len) { - src_page_off = src_offset & ~PAGE_MASK; - dst_page_off = dst_offset & ~PAGE_MASK; - loop_len = min(PAGE_SIZE - - max(src_page_off, dst_page_off), - remaining_len); - - if (src_window->type == SCIF_WINDOW_SELF) - src_virt = _get_local_va(src_offset, src_window, - loop_len); - else - src_virt = ioremap_remote(src_offset, src_window, - loop_len, - work->remote_dev, - &src_win_iter); - if (!src_virt) { - ret = -ENOMEM; - goto error; - } - - if (dst_window->type == SCIF_WINDOW_SELF) - dst_virt = _get_local_va(dst_offset, dst_window, - loop_len); - else - dst_virt = ioremap_remote(dst_offset, dst_window, - loop_len, - work->remote_dev, - &dst_win_iter); - if (!dst_virt) { - if (src_window->type == SCIF_WINDOW_PEER) - iounmap_remote(src_virt, loop_len, work); - ret = -ENOMEM; - goto error; - } - - if (work->loopback) { - memcpy(dst_virt, src_virt, loop_len); - } else { - if (src_window->type == SCIF_WINDOW_SELF) - memcpy_toio((void __iomem __force *)dst_virt, - src_virt, loop_len); - else - memcpy_fromio(dst_virt, - (void __iomem __force *)src_virt, - loop_len); - } - if (src_window->type == SCIF_WINDOW_PEER) - iounmap_remote(src_virt, loop_len, work); - - if (dst_window->type == SCIF_WINDOW_PEER) - iounmap_remote(dst_virt, loop_len, work); - - src_offset += loop_len; - dst_offset += loop_len; - remaining_len -= loop_len; - if (remaining_len) { - end_src_offset = src_window->offset + - (src_window->nr_pages << PAGE_SHIFT); - end_dst_offset = dst_window->offset + - (dst_window->nr_pages << PAGE_SHIFT); - if (src_offset == end_src_offset) { - src_window = list_next_entry(src_window, list); - scif_init_window_iter(src_window, - &src_win_iter); - } - if (dst_offset == end_dst_offset) { - dst_window = list_next_entry(dst_window, list); - scif_init_window_iter(dst_window, - &dst_win_iter); - } - } - } -error: - return ret; -} - -static int scif_rma_list_dma_copy_wrapper(struct scif_endpt *epd, - struct scif_copy_work *work, - struct dma_chan *chan, off_t loffset) -{ - int src_cache_off, dst_cache_off; - s64 src_offset = work->src_offset, dst_offset = work->dst_offset; - u8 *temp = NULL; - bool src_local = true; - struct scif_dma_comp_cb *comp_cb; - int err; - - if (is_dma_copy_aligned(chan->device, 1, 1, 1)) - return _scif_rma_list_dma_copy_aligned(work, chan); - - src_cache_off = src_offset & (L1_CACHE_BYTES - 1); - dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1); - - if (dst_cache_off == src_cache_off) - return scif_rma_list_dma_copy_aligned(work, chan); - - if (work->loopback) - return scif_rma_list_cpu_copy(work); - src_local = work->src_window->type == SCIF_WINDOW_SELF; - - /* Allocate dma_completion cb */ - comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL); - if (!comp_cb) - goto error; - - work->comp_cb = comp_cb; - comp_cb->cb_cookie = comp_cb; - comp_cb->dma_completion_func = &scif_rma_completion_cb; - - if (work->len + (L1_CACHE_BYTES << 1) < 
SCIF_KMEM_UNALIGNED_BUF_SIZE) { - comp_cb->is_cache = false; - /* Allocate padding bytes to align to a cache line */ - temp = kmalloc(work->len + (L1_CACHE_BYTES << 1), - GFP_KERNEL); - if (!temp) - goto free_comp_cb; - comp_cb->temp_buf_to_free = temp; - /* kmalloc(..) does not guarantee cache line alignment */ - if (!IS_ALIGNED((u64)temp, L1_CACHE_BYTES)) - temp = PTR_ALIGN(temp, L1_CACHE_BYTES); - } else { - comp_cb->is_cache = true; - temp = kmem_cache_alloc(unaligned_cache, GFP_KERNEL); - if (!temp) - goto free_comp_cb; - comp_cb->temp_buf_to_free = temp; - } - - if (src_local) { - temp += dst_cache_off; - scif_rma_local_cpu_copy(work->src_offset, work->src_window, - temp, work->len, true); - } else { - comp_cb->dst_window = work->dst_window; - comp_cb->dst_offset = work->dst_offset; - work->src_offset = work->src_offset - src_cache_off; - comp_cb->len = work->len; - work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES); - comp_cb->header_padding = src_cache_off; - } - comp_cb->temp_buf = temp; - - err = scif_map_single(&comp_cb->temp_phys, temp, - work->remote_dev, SCIF_KMEM_UNALIGNED_BUF_SIZE); - if (err) - goto free_temp_buf; - comp_cb->sdev = work->remote_dev; - if (scif_rma_list_dma_copy_unaligned(work, temp, chan, src_local) < 0) - goto free_temp_buf; - if (!src_local) - work->fence_type = SCIF_DMA_INTR; - return 0; -free_temp_buf: - if (comp_cb->is_cache) - kmem_cache_free(unaligned_cache, comp_cb->temp_buf_to_free); - else - kfree(comp_cb->temp_buf_to_free); -free_comp_cb: - kfree(comp_cb); -error: - return -ENOMEM; -} - -/** - * scif_rma_copy: - * @epd: end point descriptor. - * @loffset: offset in local registered address space to/from which to copy - * @addr: user virtual address to/from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to/from which to copy - * @flags: flags - * @dir: LOCAL->REMOTE or vice versa. - * @last_chunk: true if this is the last chunk of a larger transfer - * - * Validate parameters, check if src/dst registered ranges requested for copy - * are valid and initiate either CPU or DMA copy. - */ -static int scif_rma_copy(scif_epd_t epd, off_t loffset, unsigned long addr, - size_t len, off_t roffset, int flags, - enum scif_rma_dir dir, bool last_chunk) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scif_rma_req remote_req; - struct scif_rma_req req; - struct scif_window *local_window = NULL; - struct scif_window *remote_window = NULL; - struct scif_copy_work copy_work; - bool loopback; - int err = 0; - struct dma_chan *chan; - struct scif_mmu_notif *mmn = NULL; - bool cache = false; - struct device *spdev; - - err = scif_verify_epd(ep); - if (err) - return err; - - if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | - SCIF_RMA_SYNC | SCIF_RMA_ORDERED))) - return -EINVAL; - - loopback = scifdev_self(ep->remote_dev) ? true : false; - copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? - SCIF_DMA_POLL : 0; - copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk); - - /* Use CPU for Mgmt node <-> Mgmt node copies */ - if (loopback && scif_is_mgmt_node()) { - flags |= SCIF_RMA_USECPU; - copy_work.fence_type = 0x0; - } - - cache = scif_is_set_reg_cache(flags); - - remote_req.out_window = &remote_window; - remote_req.offset = roffset; - remote_req.nr_bytes = len; - /* - * If transfer is from local to remote then the remote window - * must be writeable and vice versa. - */ - remote_req.prot = dir == SCIF_LOCAL_TO_REMOTE ? 
VM_WRITE : VM_READ; - remote_req.type = SCIF_WINDOW_PARTIAL; - remote_req.head = &ep->rma_info.remote_reg_list; - - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - return err; - } - - if (addr && cache) { - mutex_lock(&ep->rma_info.mmn_lock); - mmn = scif_find_mmu_notifier(current->mm, &ep->rma_info); - if (!mmn) - mmn = scif_add_mmu_notifier(current->mm, ep); - mutex_unlock(&ep->rma_info.mmn_lock); - if (IS_ERR(mmn)) { - scif_put_peer_dev(spdev); - return PTR_ERR(mmn); - } - cache = cache && !scif_rma_tc_can_cache(ep, len); - } - mutex_lock(&ep->rma_info.rma_lock); - if (addr) { - req.out_window = &local_window; - req.nr_bytes = ALIGN(len + (addr & ~PAGE_MASK), - PAGE_SIZE); - req.va_for_temp = addr & PAGE_MASK; - req.prot = (dir == SCIF_LOCAL_TO_REMOTE ? - VM_READ : VM_WRITE | VM_READ); - /* Does a valid local window exist? */ - if (mmn) { - spin_lock(&ep->rma_info.tc_lock); - req.head = &mmn->tc_reg_list; - err = scif_query_tcw(ep, &req); - spin_unlock(&ep->rma_info.tc_lock); - } - if (!mmn || err) { - err = scif_register_temp(epd, req.va_for_temp, - req.nr_bytes, req.prot, - &loffset, &local_window); - if (err) { - mutex_unlock(&ep->rma_info.rma_lock); - goto error; - } - if (!cache) - goto skip_cache; - atomic_inc(&ep->rma_info.tcw_refcount); - atomic_add_return(local_window->nr_pages, - &ep->rma_info.tcw_total_pages); - if (mmn) { - spin_lock(&ep->rma_info.tc_lock); - scif_insert_tcw(local_window, - &mmn->tc_reg_list); - spin_unlock(&ep->rma_info.tc_lock); - } - } -skip_cache: - loffset = local_window->offset + - (addr - local_window->va_for_temp); - } else { - req.out_window = &local_window; - req.offset = loffset; - /* - * If transfer is from local to remote then the self window - * must be readable and vice versa. - */ - req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_READ : VM_WRITE; - req.nr_bytes = len; - req.type = SCIF_WINDOW_PARTIAL; - req.head = &ep->rma_info.reg_list; - /* Does a valid local window exist? */ - err = scif_query_window(&req); - if (err) { - mutex_unlock(&ep->rma_info.rma_lock); - goto error; - } - } - - /* Does a valid remote window exist? */ - err = scif_query_window(&remote_req); - if (err) { - mutex_unlock(&ep->rma_info.rma_lock); - goto error; - } - - /* - * Prepare copy_work for submitting work to the DMA kernel thread - * or CPU copy routine. 
- */ - copy_work.len = len; - copy_work.loopback = loopback; - copy_work.remote_dev = ep->remote_dev; - if (dir == SCIF_LOCAL_TO_REMOTE) { - copy_work.src_offset = loffset; - copy_work.src_window = local_window; - copy_work.dst_offset = roffset; - copy_work.dst_window = remote_window; - } else { - copy_work.src_offset = roffset; - copy_work.src_window = remote_window; - copy_work.dst_offset = loffset; - copy_work.dst_window = local_window; - } - - if (flags & SCIF_RMA_USECPU) { - scif_rma_list_cpu_copy(©_work); - } else { - chan = ep->rma_info.dma_chan; - err = scif_rma_list_dma_copy_wrapper(epd, ©_work, - chan, loffset); - } - if (addr && !cache) - atomic_inc(&ep->rma_info.tw_refcount); - - mutex_unlock(&ep->rma_info.rma_lock); - - if (last_chunk) { - struct scif_dev *rdev = ep->remote_dev; - - if (copy_work.fence_type == SCIF_DMA_POLL) - err = scif_drain_dma_poll(rdev->sdev, - ep->rma_info.dma_chan); - else if (copy_work.fence_type == SCIF_DMA_INTR) - err = scif_drain_dma_intr(rdev->sdev, - ep->rma_info.dma_chan); - } - - if (addr && !cache) - scif_queue_for_cleanup(local_window, &scif_info.rma); - scif_put_peer_dev(spdev); - return err; -error: - if (err) { - if (addr && local_window && !cache) - scif_destroy_window(ep, local_window); - dev_err(scif_info.mdev.this_device, - "%s %d err %d len 0x%lx\n", - __func__, __LINE__, err, len); - } - scif_put_peer_dev(spdev); - return err; -} - -int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, - off_t roffset, int flags) -{ - int err; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx offset 0x%lx flags 0x%x\n", - epd, loffset, len, roffset, flags); - if (scif_unaligned(loffset, roffset)) { - while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { - err = scif_rma_copy(epd, loffset, 0x0, - SCIF_MAX_UNALIGNED_BUF_SIZE, - roffset, flags, - SCIF_REMOTE_TO_LOCAL, false); - if (err) - goto readfrom_err; - loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - len -= SCIF_MAX_UNALIGNED_BUF_SIZE; - } - } - err = scif_rma_copy(epd, loffset, 0x0, len, - roffset, flags, SCIF_REMOTE_TO_LOCAL, true); -readfrom_err: - return err; -} -EXPORT_SYMBOL_GPL(scif_readfrom); - -int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, - off_t roffset, int flags) -{ - int err; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx roffset 0x%lx flags 0x%x\n", - epd, loffset, len, roffset, flags); - if (scif_unaligned(loffset, roffset)) { - while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { - err = scif_rma_copy(epd, loffset, 0x0, - SCIF_MAX_UNALIGNED_BUF_SIZE, - roffset, flags, - SCIF_LOCAL_TO_REMOTE, false); - if (err) - goto writeto_err; - loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - len -= SCIF_MAX_UNALIGNED_BUF_SIZE; - } - } - err = scif_rma_copy(epd, loffset, 0x0, len, - roffset, flags, SCIF_LOCAL_TO_REMOTE, true); -writeto_err: - return err; -} -EXPORT_SYMBOL_GPL(scif_writeto); - -int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, - off_t roffset, int flags) -{ - int err; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI vreadfrom: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", - epd, addr, len, roffset, flags); - if (scif_unaligned((off_t __force)addr, roffset)) { - if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) - flags &= ~SCIF_RMA_USECACHE; - - while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { - err = scif_rma_copy(epd, 0, (u64)addr, - SCIF_MAX_UNALIGNED_BUF_SIZE, - roffset, flags, - SCIF_REMOTE_TO_LOCAL, false); - if (err) 
- goto vreadfrom_err; - addr += SCIF_MAX_UNALIGNED_BUF_SIZE; - roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - len -= SCIF_MAX_UNALIGNED_BUF_SIZE; - } - } - err = scif_rma_copy(epd, 0, (u64)addr, len, - roffset, flags, SCIF_REMOTE_TO_LOCAL, true); -vreadfrom_err: - return err; -} -EXPORT_SYMBOL_GPL(scif_vreadfrom); - -int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, - off_t roffset, int flags) -{ - int err; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI vwriteto: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", - epd, addr, len, roffset, flags); - if (scif_unaligned((off_t __force)addr, roffset)) { - if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) - flags &= ~SCIF_RMA_USECACHE; - - while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { - err = scif_rma_copy(epd, 0, (u64)addr, - SCIF_MAX_UNALIGNED_BUF_SIZE, - roffset, flags, - SCIF_LOCAL_TO_REMOTE, false); - if (err) - goto vwriteto_err; - addr += SCIF_MAX_UNALIGNED_BUF_SIZE; - roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; - len -= SCIF_MAX_UNALIGNED_BUF_SIZE; - } - } - err = scif_rma_copy(epd, 0, (u64)addr, len, - roffset, flags, SCIF_LOCAL_TO_REMOTE, true); -vwriteto_err: - return err; -} -EXPORT_SYMBOL_GPL(scif_vwriteto); diff --git a/drivers/misc/mic/scif/scif_epd.c b/drivers/misc/mic/scif/scif_epd.c deleted file mode 100644 index 426687f6696b..000000000000 --- a/drivers/misc/mic/scif/scif_epd.c +++ /dev/null @@ -1,357 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#include "scif_main.h" -#include "scif_map.h" - -void scif_cleanup_ep_qp(struct scif_endpt *ep) -{ - struct scif_qp *qp = ep->qp_info.qp; - - if (qp->outbound_q.rb_base) { - scif_iounmap((void *)qp->outbound_q.rb_base, - qp->outbound_q.size, ep->remote_dev); - qp->outbound_q.rb_base = NULL; - } - if (qp->remote_qp) { - scif_iounmap((void *)qp->remote_qp, - sizeof(struct scif_qp), ep->remote_dev); - qp->remote_qp = NULL; - } - if (qp->local_qp) { - scif_unmap_single(qp->local_qp, ep->remote_dev, - sizeof(struct scif_qp)); - qp->local_qp = 0x0; - } - if (qp->local_buf) { - scif_unmap_single(qp->local_buf, ep->remote_dev, - SCIF_ENDPT_QP_SIZE); - qp->local_buf = 0; - } -} - -void scif_teardown_ep(void *endpt) -{ - struct scif_endpt *ep = endpt; - struct scif_qp *qp = ep->qp_info.qp; - - if (qp) { - spin_lock(&ep->lock); - scif_cleanup_ep_qp(ep); - spin_unlock(&ep->lock); - kfree(qp->inbound_q.rb_base); - kfree(qp); - } -} - -/* - * Enqueue the endpoint to the zombie list for cleanup. - * The endpoint should not be accessed once this API returns. 
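/*
 * Hedged sketch of the teardown sequence the helpers here are built for
 * (the SCIF_ACCEPTREG path in scif_fd.c uses exactly this pairing);
 * not a function in the original driver.
 */
static void scif_retire_endpoint(struct scif_endpt *ep)
{
	scif_teardown_ep(ep);	/* unmap and free the queue pair */
	/* park on the zombie list; the misc thread reaps it later via
	 * scif_cleanup_zombie_epd() */
	scif_add_epd_to_zombie_list(ep, !SCIF_EPLOCK_HELD);
}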
- */ -void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held) -{ - if (!eplock_held) - mutex_lock(&scif_info.eplock); - spin_lock(&ep->lock); - ep->state = SCIFEP_ZOMBIE; - spin_unlock(&ep->lock); - list_add_tail(&ep->list, &scif_info.zombie); - scif_info.nr_zombies++; - if (!eplock_held) - mutex_unlock(&scif_info.eplock); - schedule_work(&scif_info.misc_work); -} - -static struct scif_endpt *scif_find_listen_ep(u16 port) -{ - struct scif_endpt *ep = NULL; - struct list_head *pos, *tmpq; - - mutex_lock(&scif_info.eplock); - list_for_each_safe(pos, tmpq, &scif_info.listen) { - ep = list_entry(pos, struct scif_endpt, list); - if (ep->port.port == port) { - mutex_unlock(&scif_info.eplock); - return ep; - } - } - mutex_unlock(&scif_info.eplock); - return NULL; -} - -void scif_cleanup_zombie_epd(void) -{ - struct list_head *pos, *tmpq; - struct scif_endpt *ep; - - mutex_lock(&scif_info.eplock); - list_for_each_safe(pos, tmpq, &scif_info.zombie) { - ep = list_entry(pos, struct scif_endpt, list); - if (scif_rma_ep_can_uninit(ep)) { - list_del(pos); - scif_info.nr_zombies--; - put_iova_domain(&ep->rma_info.iovad); - kfree(ep); - } - } - mutex_unlock(&scif_info.eplock); -} - -/** - * scif_cnctreq() - Respond to SCIF_CNCT_REQ interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * This message is initiated by the remote node to request a connection - * to the local node. This function looks for an end point in the - * listen state on the requested port id. - * - * If it finds a listening port it places the connect request on the - * listening end points queue and wakes up any pending accept calls. - * - * If it does not find a listening end point it sends a connection - * reject message to the remote node. - */ -void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = NULL; - struct scif_conreq *conreq; - - conreq = kmalloc(sizeof(*conreq), GFP_KERNEL); - if (!conreq) - /* Lack of resources so reject the request. */ - goto conreq_sendrej; - - ep = scif_find_listen_ep(msg->dst.port); - if (!ep) - /* Send reject due to no listening ports */ - goto conreq_sendrej_free; - else - spin_lock(&ep->lock); - - if (ep->backlog <= ep->conreqcnt) { - /* Send reject due to too many pending requests */ - spin_unlock(&ep->lock); - goto conreq_sendrej_free; - } - - conreq->msg = *msg; - list_add_tail(&conreq->list, &ep->conlist); - ep->conreqcnt++; - wake_up_interruptible(&ep->conwq); - spin_unlock(&ep->lock); - return; - -conreq_sendrej_free: - kfree(conreq); -conreq_sendrej: - msg->uop = SCIF_CNCT_REJ; - scif_nodeqp_send(&scif_dev[msg->src.node], msg); -} - -/** - * scif_cnctgnt() - Respond to SCIF_CNCT_GNT interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * An accept() on the remote node has occurred and sent this message - * to indicate success. Place the end point in the MAPPING state and - * save the remote nodes memory information. Then wake up the connect - * request so it can finish. 
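/*
 * A reading aid rather than driver code: the connect-side state
 * transitions driven by the message handlers below, assuming the
 * SCIF_CNCT_* uop constants named in their kernel-doc comments.
 */
static enum scif_epd_state scif_connect_transition(int uop)
{
	switch (uop) {
	case SCIF_CNCT_GNT:	/* scif_cnctgnt() */
		return SCIFEP_MAPPING;
	case SCIF_CNCT_GNTACK:	/* scif_cnctgnt_ack() */
		return SCIFEP_CONNECTED;
	case SCIF_CNCT_GNTNACK:	/* scif_cnctgnt_nack() */
		return SCIFEP_CLOSING;
	case SCIF_CNCT_REJ:	/* scif_cnctrej() */
		return SCIFEP_BOUND;
	default:
		return SCIFEP_CONNECTING;
	}
}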
- */ -void scif_cnctgnt(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - if (SCIFEP_CONNECTING == ep->state) { - ep->peer.node = msg->src.node; - ep->peer.port = msg->src.port; - ep->qp_info.gnt_pld = msg->payload[1]; - ep->remote_ep = msg->payload[2]; - ep->state = SCIFEP_MAPPING; - - wake_up(&ep->conwq); - } - spin_unlock(&ep->lock); -} - -/** - * scif_cnctgnt_ack() - Respond to SCIF_CNCT_GNTACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The remote connection request has finished mapping the local memory. - * Place the connection in the connected state and wake up the pending - * accept() call. - */ -void scif_cnctgnt_ack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - mutex_lock(&scif_info.connlock); - spin_lock(&ep->lock); - /* New ep is now connected with all resources set. */ - ep->state = SCIFEP_CONNECTED; - list_add_tail(&ep->list, &scif_info.connected); - wake_up(&ep->conwq); - spin_unlock(&ep->lock); - mutex_unlock(&scif_info.connlock); -} - -/** - * scif_cnctgnt_nack() - Respond to SCIF_CNCT_GNTNACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The remote connection request failed to map the local memory it was sent. - * Place the end point in the CLOSING state to indicate it and wake up - * the pending accept(). - */ -void scif_cnctgnt_nack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - ep->state = SCIFEP_CLOSING; - wake_up(&ep->conwq); - spin_unlock(&ep->lock); -} - -/** - * scif_cnctrej() - Respond to SCIF_CNCT_REJ interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The remote end has rejected the connection request. Set the end - * point back to the bound state and wake up the pending connect(). - */ -void scif_cnctrej(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - if (SCIFEP_CONNECTING == ep->state) { - ep->state = SCIFEP_BOUND; - wake_up(&ep->conwq); - } - spin_unlock(&ep->lock); -} - -/** - * scif_discnct() - Respond to SCIF_DISCNCT interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The remote node has indicated close() has been called on its end - * point. Remove the local end point from the connected list, set its - * state to disconnected and ensure accesses to the remote node are - * shut down. - * - * When all accesses to the remote end have completed then send a - * DISCNT_ACK to indicate it can remove its resources and complete - * the close routine. - */ -void scif_discnct(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = NULL; - struct scif_endpt *tmpep; - struct list_head *pos, *tmpq; - - mutex_lock(&scif_info.connlock); - list_for_each_safe(pos, tmpq, &scif_info.connected) { - tmpep = list_entry(pos, struct scif_endpt, list); - /* - * The local ep may have sent a disconnect and been closed - * due to a message response time out.
It may have been - * allocated again and formed a new connection, so we want to - * check if the remote ep matches. - */ - if (((u64)tmpep == msg->payload[1]) && - ((u64)tmpep->remote_ep == msg->payload[0])) { - list_del(pos); - ep = tmpep; - spin_lock(&ep->lock); - break; - } - } - - /* - * If the terminated end is not found then this side started closing - * before the other side sent the disconnect. If so the ep will no - * longer be on the connected list. Regardless the other side - * needs to be acked to let it know close is complete. - */ - if (!ep) { - mutex_unlock(&scif_info.connlock); - goto discnct_ack; - } - - ep->state = SCIFEP_DISCONNECTED; - list_add_tail(&ep->list, &scif_info.disconnected); - - wake_up_interruptible(&ep->sendwq); - wake_up_interruptible(&ep->recvwq); - spin_unlock(&ep->lock); - mutex_unlock(&scif_info.connlock); - -discnct_ack: - msg->uop = SCIF_DISCNT_ACK; - scif_nodeqp_send(&scif_dev[msg->src.node], msg); -} - -/** - * scif_discnt_ack() - Respond to SCIF_DISCNT_ACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remote side has indicated it has no more references to local resources. - */ -void scif_discnt_ack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - ep->state = SCIFEP_DISCONNECTED; - spin_unlock(&ep->lock); - complete(&ep->discon); -} - -/** - * scif_clientsend() - Respond to SCIF_CLIENT_SEND interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remote side is confirming send or receive interrupt handling is complete. - */ -void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - if (SCIFEP_CONNECTED == ep->state) - wake_up_interruptible(&ep->recvwq); - spin_unlock(&ep->lock); -} - -/** - * scif_clientrcvd() - Respond to SCIF_CLIENT_RCVD interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remote side is confirming send or receive interrupt handling is complete. - */ -void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - - spin_lock(&ep->lock); - if (SCIFEP_CONNECTED == ep->state) - wake_up_interruptible(&ep->sendwq); - spin_unlock(&ep->lock); -} diff --git a/drivers/misc/mic/scif/scif_epd.h b/drivers/misc/mic/scif/scif_epd.h deleted file mode 100644 index 0b9dfe1cc06c..000000000000 --- a/drivers/misc/mic/scif/scif_epd.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#ifndef SCIF_EPD_H -#define SCIF_EPD_H - -#include -#include -#include - -#define SCIF_EPLOCK_HELD true - -enum scif_epd_state { - SCIFEP_UNBOUND, - SCIFEP_BOUND, - SCIFEP_LISTENING, - SCIFEP_CONNECTED, - SCIFEP_CONNECTING, - SCIFEP_MAPPING, - SCIFEP_CLOSING, - SCIFEP_CLLISTEN, - SCIFEP_DISCONNECTED, - SCIFEP_ZOMBIE -}; - -/* - * struct scif_conreq - Data structure added to the connection list.
- * - * @msg: connection request message received - * @list: link to list of connection requests - */ -struct scif_conreq { - struct scifmsg msg; - struct list_head list; -}; - -/* Size of the RB for the Endpoint QP */ -#define SCIF_ENDPT_QP_SIZE 0x1000 - -/* - * scif_endpt_qp_info - SCIF endpoint queue pair - * - * @qp - Qpair for this endpoint - * @qp_offset - DMA address of the QP - * @gnt_pld - Payload in a SCIF_CNCT_GNT message containing the - * physical address of the remote_qp. - */ -struct scif_endpt_qp_info { - struct scif_qp *qp; - dma_addr_t qp_offset; - dma_addr_t gnt_pld; -}; - -/* - * struct scif_endpt - The SCIF endpoint data structure - * - * @state: end point state - * @lock: lock synchronizing access to endpoint fields like state etc - * @port: self port information - * @peer: peer port information - * @backlog: maximum pending connection requests - * @qp_info: Endpoint QP information for SCIF messaging - * @remote_dev: scifdev used by this endpt to communicate with remote node. - * @remote_ep: remote endpoint - * @conreqcnt: Keep track of number of connection requests. - * @files: Open file information used to match the id passed in with - * the flush routine. - * @conlist: list of connection requests - * @conwq: waitqueue for connection processing - * @discon: completion used during disconnection - * @sendwq: waitqueue used during sending messages - * @recvwq: waitqueue used during message receipt - * @sendlock: Synchronize ordering of messages sent - * @recvlock: Synchronize ordering of messages received - * @list: link to list of various endpoints like connected, listening etc - * @li_accept: pending ACCEPTREG - * @acceptcnt: pending ACCEPTREG cnt - * @liacceptlist: link to listen accept - * @miacceptlist: link to uaccept - * @listenep: associated listen ep - * @conn_work: Non blocking connect work - * @conn_port: Connection port - * @conn_err: Errors during connection - * @conn_async_state: Async connection - * @conn_pend_wq: Used by poll while waiting for incoming connections - * @conn_list: List of async connection requests - * @rma_info: Information for triggering SCIF RMA and DMA operations - * @mmu_list: link to list of MMU notifier cleanup work - * @anon: anonymous file for use in kernel mode scif poll - */ -struct scif_endpt { - enum scif_epd_state state; - spinlock_t lock; - struct scif_port_id port; - struct scif_port_id peer; - int backlog; - struct scif_endpt_qp_info qp_info; - struct scif_dev *remote_dev; - u64 remote_ep; - int conreqcnt; - struct files_struct *files; - struct list_head conlist; - wait_queue_head_t conwq; - struct completion discon; - wait_queue_head_t sendwq; - wait_queue_head_t recvwq; - struct mutex sendlock; - struct mutex recvlock; - struct list_head list; - struct list_head li_accept; - int acceptcnt; - struct list_head liacceptlist; - struct list_head miacceptlist; - struct scif_endpt *listenep; - struct scif_port_id conn_port; - int conn_err; - int conn_async_state; - wait_queue_head_t conn_pend_wq; - struct list_head conn_list; - struct scif_endpt_rma_info rma_info; - struct list_head mmu_list; - struct file *anon; -}; - -static inline int scifdev_alive(struct scif_endpt *ep) -{ - return _scifdev_alive(ep->remote_dev); -} - -/* - * scif_verify_epd: - * ep: SCIF endpoint - * - * Checks several generic error conditions and returns the - * appropriate error. 
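/*
 * Sketch of the canonical use of scif_verify_epd() below (illustrative
 * only): data-path entry points such as scif_rma_copy() in scif_dma.c
 * open with exactly this guard before touching the endpoint.
 */
static int scif_data_path_entry(struct scif_endpt *ep)
{
	int err = scif_verify_epd(ep);

	if (err)	/* -ECONNRESET, -ENOTCONN or -ENODEV */
		return err;
	/* ... endpoint is connected and the remote device is alive ... */
	return 0;
}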
- */ -static inline int scif_verify_epd(struct scif_endpt *ep) -{ - if (ep->state == SCIFEP_DISCONNECTED) - return -ECONNRESET; - - if (ep->state != SCIFEP_CONNECTED) - return -ENOTCONN; - - if (!scifdev_alive(ep)) - return -ENODEV; - - return 0; -} - -static inline int scif_anon_inode_getfile(scif_epd_t epd) -{ - epd->anon = anon_inode_getfile("scif", &scif_anon_fops, NULL, 0); - - return PTR_ERR_OR_ZERO(epd->anon); -} - -static inline void scif_anon_inode_fput(scif_epd_t epd) -{ - if (epd->anon) { - fput(epd->anon); - epd->anon = NULL; - } -} - -void scif_cleanup_zombie_epd(void); -void scif_teardown_ep(void *endpt); -void scif_cleanup_ep_qp(struct scif_endpt *ep); -void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held); -void scif_get_node_info(void); -void scif_send_acks(struct scif_dev *dev); -void scif_conn_handler(struct work_struct *work); -int scif_rsrv_port(u16 port); -void scif_get_port(u16 port); -int scif_get_new_port(void); -void scif_put_port(u16 port); -int scif_user_send(scif_epd_t epd, void __user *msg, int len, int flags); -int scif_user_recv(scif_epd_t epd, void __user *msg, int len, int flags); -void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_cnctgnt(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_cnctgnt_ack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_cnctgnt_nack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_cnctrej(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_discnct(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_discnt_ack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg); -int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block); -int __scif_flush(scif_epd_t epd); -int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd); -__poll_t __scif_pollfd(struct file *f, poll_table *wait, - struct scif_endpt *ep); -int __scif_pin_pages(void *addr, size_t len, int *out_prot, - int map_flags, scif_pinned_pages_t *pages); -#endif /* SCIF_EPD_H */ diff --git a/drivers/misc/mic/scif/scif_fd.c b/drivers/misc/mic/scif/scif_fd.c deleted file mode 100644 index 3f08646cd78a..000000000000 --- a/drivers/misc/mic/scif/scif_fd.c +++ /dev/null @@ -1,462 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#include "scif_main.h" - -static int scif_fdopen(struct inode *inode, struct file *f) -{ - struct scif_endpt *priv = scif_open(); - - if (!priv) - return -ENOMEM; - f->private_data = priv; - return 0; -} - -static int scif_fdclose(struct inode *inode, struct file *f) -{ - struct scif_endpt *priv = f->private_data; - - return scif_close(priv); -} - -static int scif_fdmmap(struct file *f, struct vm_area_struct *vma) -{ - struct scif_endpt *priv = f->private_data; - - return scif_mmap(vma, priv); -} - -static __poll_t scif_fdpoll(struct file *f, poll_table *wait) -{ - struct scif_endpt *priv = f->private_data; - - return __scif_pollfd(f, wait, priv); -} - -static int scif_fdflush(struct file *f, fl_owner_t id) -{ - struct scif_endpt *ep = f->private_data; - - spin_lock(&ep->lock); - /* - * The listening endpoint stashes the open file information before - * waiting for incoming connections. 
The release callback would never be - * called if the application closed the endpoint, while waiting for - * incoming connections from a separate thread since the file descriptor - * reference count is bumped up in the accept IOCTL. Call the flush - * routine if the id matches the endpoint open file information so that - * the listening endpoint can be woken up and the fd released. - */ - if (ep->files == id) - __scif_flush(ep); - spin_unlock(&ep->lock); - return 0; -} - -static __always_inline void scif_err_debug(int err, const char *str) -{ - /* - * ENOTCONN is a common uninteresting error which is - * flooding debug messages to the console unnecessarily. - */ - if (err < 0 && err != -ENOTCONN) - dev_dbg(scif_info.mdev.this_device, "%s err %d\n", str, err); -} - -static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct scif_endpt *priv = f->private_data; - void __user *argp = (void __user *)arg; - int err = 0; - struct scifioctl_msg request; - bool non_block = false; - - non_block = !!(f->f_flags & O_NONBLOCK); - - switch (cmd) { - case SCIF_BIND: - { - int pn; - - if (copy_from_user(&pn, argp, sizeof(pn))) - return -EFAULT; - - pn = scif_bind(priv, pn); - if (pn < 0) - return pn; - - if (copy_to_user(argp, &pn, sizeof(pn))) - return -EFAULT; - - return 0; - } - case SCIF_LISTEN: - return scif_listen(priv, arg); - case SCIF_CONNECT: - { - struct scifioctl_connect req; - struct scif_endpt *ep = (struct scif_endpt *)priv; - - if (copy_from_user(&req, argp, sizeof(req))) - return -EFAULT; - - err = __scif_connect(priv, &req.peer, non_block); - if (err < 0) - return err; - - req.self.node = ep->port.node; - req.self.port = ep->port.port; - - if (copy_to_user(argp, &req, sizeof(req))) - return -EFAULT; - - return 0; - } - /* - * Accept is done in two halves. The request ioctl does the basic - * functionality of accepting the request and returning the information - * about it including the internal ID of the end point. The register - * is done with the internal ID on a new file descriptor opened by the - * requesting process. - */ - case SCIF_ACCEPTREQ: - { - struct scifioctl_accept request; - scif_epd_t *ep = (scif_epd_t *)&request.endpt; - - if (copy_from_user(&request, argp, sizeof(request))) - return -EFAULT; - - err = scif_accept(priv, &request.peer, ep, request.flags); - if (err < 0) - return err; - - if (copy_to_user(argp, &request, sizeof(request))) { - scif_close(*ep); - return -EFAULT; - } - /* - * Add to the list of user mode eps where the second half - * of the accept is not yet completed. 
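To make the two-halves accept flow concrete, here is a minimal user-space sketch; SCIF_ACCEPTREQ, SCIF_ACCEPTREG and struct scifioctl_accept come from the SCIF uapi header, while demo_accept(), the literal flag value and the error handling are illustrative only:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/scif_ioctl.h>

/* Illustrative sketch of the two-halves accept flow described above. */
static int demo_accept(int listen_fd)
{
	struct scifioctl_accept req = { .flags = 1 /* synchronous accept */ };
	int new_fd;

	/* First half: block until a peer connects; the kernel returns an
	 * opaque endpoint cookie in req.endpt. */
	if (ioctl(listen_fd, SCIF_ACCEPTREQ, &req) < 0)
		return -1;

	/* Second half: open a fresh descriptor and register the cookie
	 * against it; the new fd now owns the accepted endpoint. */
	new_fd = open("/dev/scif", O_RDWR);
	if (new_fd < 0)
		return -1;
	if (ioctl(new_fd, SCIF_ACCEPTREG, &req.endpt) < 0) {
		close(new_fd);
		return -1;
	}
	return new_fd;
}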
- */ - mutex_lock(&scif_info.eplock); - list_add_tail(&((*ep)->miacceptlist), &scif_info.uaccept); - list_add_tail(&((*ep)->liacceptlist), &priv->li_accept); - (*ep)->listenep = priv; - priv->acceptcnt++; - mutex_unlock(&scif_info.eplock); - - return 0; - } - case SCIF_ACCEPTREG: - { - struct scif_endpt *priv = f->private_data; - struct scif_endpt *newep; - struct scif_endpt *lisep; - struct scif_endpt *fep = NULL; - struct scif_endpt *tmpep; - struct list_head *pos, *tmpq; - - /* Finally replace the pointer to the accepted endpoint */ - if (copy_from_user(&newep, argp, sizeof(void *))) - return -EFAULT; - - /* Remove from the user accept queue */ - mutex_lock(&scif_info.eplock); - list_for_each_safe(pos, tmpq, &scif_info.uaccept) { - tmpep = list_entry(pos, - struct scif_endpt, miacceptlist); - if (tmpep == newep) { - list_del(pos); - fep = tmpep; - break; - } - } - - if (!fep) { - mutex_unlock(&scif_info.eplock); - return -ENOENT; - } - - lisep = newep->listenep; - list_for_each_safe(pos, tmpq, &lisep->li_accept) { - tmpep = list_entry(pos, - struct scif_endpt, liacceptlist); - if (tmpep == newep) { - list_del(pos); - lisep->acceptcnt--; - break; - } - } - - mutex_unlock(&scif_info.eplock); - - /* Free the resources automatically created from the open. */ - scif_anon_inode_fput(priv); - scif_teardown_ep(priv); - scif_add_epd_to_zombie_list(priv, !SCIF_EPLOCK_HELD); - f->private_data = newep; - return 0; - } - case SCIF_SEND: - { - struct scif_endpt *priv = f->private_data; - - if (copy_from_user(&request, argp, - sizeof(struct scifioctl_msg))) { - err = -EFAULT; - goto send_err; - } - err = scif_user_send(priv, (void __user *)request.msg, - request.len, request.flags); - if (err < 0) - goto send_err; - if (copy_to_user(& - ((struct scifioctl_msg __user *)argp)->out_len, - &err, sizeof(err))) { - err = -EFAULT; - goto send_err; - } - err = 0; -send_err: - scif_err_debug(err, "scif_send"); - return err; - } - case SCIF_RECV: - { - struct scif_endpt *priv = f->private_data; - - if (copy_from_user(&request, argp, - sizeof(struct scifioctl_msg))) { - err = -EFAULT; - goto recv_err; - } - - err = scif_user_recv(priv, (void __user *)request.msg, - request.len, request.flags); - if (err < 0) - goto recv_err; - - if (copy_to_user(& - ((struct scifioctl_msg __user *)argp)->out_len, - &err, sizeof(err))) { - err = -EFAULT; - goto recv_err; - } - err = 0; -recv_err: - scif_err_debug(err, "scif_recv"); - return err; - } - case SCIF_GET_NODEIDS: - { - struct scifioctl_node_ids node_ids; - int entries; - u16 *nodes; - void __user *unodes, *uself; - u16 self; - - if (copy_from_user(&node_ids, argp, sizeof(node_ids))) { - err = -EFAULT; - goto getnodes_err2; - } - - entries = min_t(int, scif_info.maxid, node_ids.len); - nodes = kmalloc_array(entries, sizeof(u16), GFP_KERNEL); - if (entries && !nodes) { - err = -ENOMEM; - goto getnodes_err2; - } - node_ids.len = scif_get_node_ids(nodes, entries, &self); - - unodes = (void __user *)node_ids.nodes; - if (copy_to_user(unodes, nodes, sizeof(u16) * entries)) { - err = -EFAULT; - goto getnodes_err1; - } - - uself = (void __user *)node_ids.self; - if (copy_to_user(uself, &self, sizeof(u16))) { - err = -EFAULT; - goto getnodes_err1; - } - - if (copy_to_user(argp, &node_ids, sizeof(node_ids))) { - err = -EFAULT; - goto getnodes_err1; - } -getnodes_err1: - kfree(nodes); -getnodes_err2: - return err; - } - case SCIF_REG: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_reg reg; - off_t ret; - - if (copy_from_user(&reg, argp, sizeof(reg))) { - err =
-EFAULT; - goto reg_err; - } - if (reg.flags & SCIF_MAP_KERNEL) { - err = -EINVAL; - goto reg_err; - } - ret = scif_register(priv, (void *)reg.addr, reg.len, - reg.offset, reg.prot, reg.flags); - if (ret < 0) { - err = (int)ret; - goto reg_err; - } - - if (copy_to_user(&((struct scifioctl_reg __user *)argp) - ->out_offset, &ret, sizeof(reg.out_offset))) { - err = -EFAULT; - goto reg_err; - } - err = 0; -reg_err: - scif_err_debug(err, "scif_register"); - return err; - } - case SCIF_UNREG: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_unreg unreg; - - if (copy_from_user(&unreg, argp, sizeof(unreg))) { - err = -EFAULT; - goto unreg_err; - } - err = scif_unregister(priv, unreg.offset, unreg.len); -unreg_err: - scif_err_debug(err, "scif_unregister"); - return err; - } - case SCIF_READFROM: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_copy copy; - - if (copy_from_user(©, argp, sizeof(copy))) { - err = -EFAULT; - goto readfrom_err; - } - err = scif_readfrom(priv, copy.loffset, copy.len, copy.roffset, - copy.flags); -readfrom_err: - scif_err_debug(err, "scif_readfrom"); - return err; - } - case SCIF_WRITETO: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_copy copy; - - if (copy_from_user(©, argp, sizeof(copy))) { - err = -EFAULT; - goto writeto_err; - } - err = scif_writeto(priv, copy.loffset, copy.len, copy.roffset, - copy.flags); -writeto_err: - scif_err_debug(err, "scif_writeto"); - return err; - } - case SCIF_VREADFROM: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_copy copy; - - if (copy_from_user(©, argp, sizeof(copy))) { - err = -EFAULT; - goto vreadfrom_err; - } - err = scif_vreadfrom(priv, (void __force *)copy.addr, copy.len, - copy.roffset, copy.flags); -vreadfrom_err: - scif_err_debug(err, "scif_vreadfrom"); - return err; - } - case SCIF_VWRITETO: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_copy copy; - - if (copy_from_user(©, argp, sizeof(copy))) { - err = -EFAULT; - goto vwriteto_err; - } - err = scif_vwriteto(priv, (void __force *)copy.addr, copy.len, - copy.roffset, copy.flags); -vwriteto_err: - scif_err_debug(err, "scif_vwriteto"); - return err; - } - case SCIF_FENCE_MARK: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_fence_mark mark; - int tmp_mark = 0; - - if (copy_from_user(&mark, argp, sizeof(mark))) { - err = -EFAULT; - goto fence_mark_err; - } - err = scif_fence_mark(priv, mark.flags, &tmp_mark); - if (err) - goto fence_mark_err; - if (copy_to_user((void __user *)mark.mark, &tmp_mark, - sizeof(tmp_mark))) { - err = -EFAULT; - goto fence_mark_err; - } -fence_mark_err: - scif_err_debug(err, "scif_fence_mark"); - return err; - } - case SCIF_FENCE_WAIT: - { - struct scif_endpt *priv = f->private_data; - - err = scif_fence_wait(priv, arg); - scif_err_debug(err, "scif_fence_wait"); - return err; - } - case SCIF_FENCE_SIGNAL: - { - struct scif_endpt *priv = f->private_data; - struct scifioctl_fence_signal signal; - - if (copy_from_user(&signal, argp, sizeof(signal))) { - err = -EFAULT; - goto fence_signal_err; - } - - err = scif_fence_signal(priv, signal.loff, signal.lval, - signal.roff, signal.rval, signal.flags); -fence_signal_err: - scif_err_debug(err, "scif_fence_signal"); - return err; - } - } - return -EINVAL; -} - -const struct file_operations scif_fops = { - .open = scif_fdopen, - .release = scif_fdclose, - .unlocked_ioctl = scif_fdioctl, - .mmap = scif_fdmmap, - .poll = scif_fdpoll, - .flush = scif_fdflush, - .owner = THIS_MODULE, -}; diff 
--git a/drivers/misc/mic/scif/scif_fence.c b/drivers/misc/mic/scif/scif_fence.c deleted file mode 100644 index 4fedf6183951..000000000000 --- a/drivers/misc/mic/scif/scif_fence.c +++ /dev/null @@ -1,783 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel SCIF driver. - */ - -#include "scif_main.h" - -/** - * scif_recv_mark: Handle SCIF_MARK request - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has requested a mark. - */ -void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - int mark = 0; - int err; - - err = _scif_fence_mark(ep, &mark); - if (err) - msg->uop = SCIF_MARK_NACK; - else - msg->uop = SCIF_MARK_ACK; - msg->payload[0] = ep->remote_ep; - msg->payload[2] = mark; - scif_nodeqp_send(ep->remote_dev, msg); -} - -/** - * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages. - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has responded to a SCIF_MARK message. - */ -void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - struct scif_fence_info *fence_req = - (struct scif_fence_info *)msg->payload[1]; - - mutex_lock(&ep->rma_info.rma_lock); - if (msg->uop == SCIF_MARK_ACK) { - fence_req->state = OP_COMPLETED; - fence_req->dma_mark = (int)msg->payload[2]; - } else { - fence_req->state = OP_FAILED; - } - mutex_unlock(&ep->rma_info.rma_lock); - complete(&fence_req->comp); -} - -/** - * scif_recv_wait: Handle SCIF_WAIT request - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has requested waiting on a fence. - */ -void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - struct scif_remote_fence_info *fence; - - /* - * Allocate structure for remote fence information and - * send a NACK if the allocation failed. The peer will - * return ENOMEM upon receiving a NACK. - */ - fence = kmalloc(sizeof(*fence), GFP_KERNEL); - if (!fence) { - msg->payload[0] = ep->remote_ep; - msg->uop = SCIF_WAIT_NACK; - scif_nodeqp_send(ep->remote_dev, msg); - return; - } - - /* Prepare the fence request */ - memcpy(&fence->msg, msg, sizeof(struct scifmsg)); - INIT_LIST_HEAD(&fence->list); - - /* Insert to the global remote fence request list */ - mutex_lock(&scif_info.fencelock); - atomic_inc(&ep->rma_info.fence_refcount); - list_add_tail(&fence->list, &scif_info.fence); - mutex_unlock(&scif_info.fencelock); - - schedule_work(&scif_info.misc_work); -} - -/** - * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages. - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has responded to a SCIF_WAIT message. - */ -void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - struct scif_fence_info *fence_req = - (struct scif_fence_info *)msg->payload[1]; - - mutex_lock(&ep->rma_info.rma_lock); - if (msg->uop == SCIF_WAIT_ACK) - fence_req->state = OP_COMPLETED; - else - fence_req->state = OP_FAILED; - mutex_unlock(&ep->rma_info.rma_lock); - complete(&fence_req->comp); -} - -/** - * scif_recv_sig_local: Handle SCIF_SIG_LOCAL request - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has requested a signal on a local offset. 
- */ -void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - int err; - - err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], - SCIF_WINDOW_SELF); - if (err) - msg->uop = SCIF_SIG_NACK; - else - msg->uop = SCIF_SIG_ACK; - msg->payload[0] = ep->remote_ep; - scif_nodeqp_send(ep->remote_dev, msg); -} - -/** - * scif_recv_sig_remote: Handle SCIF_SIGNAL_REMOTE request - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has requested a signal on a remote offset. - */ -void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - int err; - - err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], - SCIF_WINDOW_PEER); - if (err) - msg->uop = SCIF_SIG_NACK; - else - msg->uop = SCIF_SIG_ACK; - msg->payload[0] = ep->remote_ep; - scif_nodeqp_send(ep->remote_dev, msg); -} - -/** - * scif_recv_sig_resp: Handle SCIF_SIG_(N)ACK messages. - * @scifdev: SCIF device - * @msg: Interrupt message - * - * The peer has responded to a signal request. - */ -void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - struct scif_fence_info *fence_req = - (struct scif_fence_info *)msg->payload[3]; - - mutex_lock(&ep->rma_info.rma_lock); - if (msg->uop == SCIF_SIG_ACK) - fence_req->state = OP_COMPLETED; - else - fence_req->state = OP_FAILED; - mutex_unlock(&ep->rma_info.rma_lock); - complete(&fence_req->comp); -} - -static inline void *scif_get_local_va(off_t off, struct scif_window *window) -{ - struct page **pages = window->pinned_pages->pages; - int page_nr = (off - window->offset) >> PAGE_SHIFT; - off_t page_off = off & ~PAGE_MASK; - - return page_address(pages[page_nr]) + page_off; -} - -static void scif_prog_signal_cb(void *arg) -{ - struct scif_cb_arg *cb_arg = arg; - - dma_pool_free(cb_arg->ep->remote_dev->signal_pool, cb_arg->status, - cb_arg->src_dma_addr); - kfree(cb_arg); -} - -static int _scif_prog_signal(scif_epd_t epd, dma_addr_t dst, u64 val) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct dma_chan *chan = ep->rma_info.dma_chan; - struct dma_device *ddev = chan->device; - bool x100 = !is_dma_copy_aligned(chan->device, 1, 1, 1); - struct dma_async_tx_descriptor *tx; - struct scif_status *status = NULL; - struct scif_cb_arg *cb_arg = NULL; - dma_addr_t src; - dma_cookie_t cookie; - int err; - - tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); - if (!tx) { - err = -ENOMEM; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto alloc_fail; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - err = (int)cookie; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto alloc_fail; - } - dma_async_issue_pending(chan); - if (x100) { - /* - * For X100 use the status descriptor to write the value to - * the destination. 
- */ - tx = ddev->device_prep_dma_imm_data(chan, dst, val, 0); - } else { - status = dma_pool_alloc(ep->remote_dev->signal_pool, GFP_KERNEL, - &src); - if (!status) { - err = -ENOMEM; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto alloc_fail; - } - status->val = val; - status->src_dma_addr = src; - status->ep = ep; - src += offsetof(struct scif_status, val); - tx = ddev->device_prep_dma_memcpy(chan, dst, src, sizeof(val), - DMA_PREP_INTERRUPT); - } - if (!tx) { - err = -ENOMEM; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto dma_fail; - } - if (!x100) { - cb_arg = kmalloc(sizeof(*cb_arg), GFP_KERNEL); - if (!cb_arg) { - err = -ENOMEM; - goto dma_fail; - } - cb_arg->src_dma_addr = src; - cb_arg->status = status; - cb_arg->ep = ep; - tx->callback = scif_prog_signal_cb; - tx->callback_param = cb_arg; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - err = -EIO; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - goto dma_fail; - } - dma_async_issue_pending(chan); - return 0; -dma_fail: - if (!x100) { - dma_pool_free(ep->remote_dev->signal_pool, status, - src - offsetof(struct scif_status, val)); - kfree(cb_arg); - } -alloc_fail: - return err; -} - -/** - * scif_prog_signal: - * @epd: Endpoint Descriptor - * @offset: registered address to write @val to - * @val: Value to be written at @offset - * @type: Type of the window. - * - * Arrange to write a value to the registered offset after ensuring that the - * offset provided is indeed valid. - */ -int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, - enum scif_window_type type) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scif_window *window = NULL; - struct scif_rma_req req; - dma_addr_t dst_dma_addr; - int err; - - mutex_lock(&ep->rma_info.rma_lock); - req.out_window = &window; - req.offset = offset; - req.nr_bytes = sizeof(u64); - req.prot = SCIF_PROT_WRITE; - req.type = SCIF_WINDOW_SINGLE; - if (type == SCIF_WINDOW_SELF) - req.head = &ep->rma_info.reg_list; - else - req.head = &ep->rma_info.remote_reg_list; - /* Does a valid window exist? */ - err = scif_query_window(&req); - if (err) { - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - goto unlock_ret; - } - - if (scif_is_mgmt_node() && scifdev_self(ep->remote_dev)) { - u64 *dst_virt; - - if (type == SCIF_WINDOW_SELF) - dst_virt = scif_get_local_va(offset, window); - else - dst_virt = - scif_get_local_va(offset, (struct scif_window *) - window->peer_window); - *dst_virt = val; - } else { - dst_dma_addr = __scif_off_to_dma_addr(window, offset); - err = _scif_prog_signal(epd, dst_dma_addr, val); - } -unlock_ret: - mutex_unlock(&ep->rma_info.rma_lock); - return err; -} - -static int _scif_fence_wait(scif_epd_t epd, int mark) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - dma_cookie_t cookie = mark & ~SCIF_REMOTE_FENCE; - int err; - - /* Wait for DMA callback in scif_fence_mark_cb(..) */ - err = wait_event_interruptible_timeout(ep->rma_info.markwq, - dma_async_is_tx_complete( - ep->rma_info.dma_chan, - cookie, NULL, NULL) == - DMA_COMPLETE, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err) - err = -ETIMEDOUT; - else if (err > 0) - err = 0; - return err; -} - -/** - * scif_rma_handle_remote_fences: - * - * This routine services remote fence requests. 
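One point worth drawing out before the worker below: a node QP message handler cannot afford to block for up to SCIF_NODE_ALIVE_TIMEOUT on the peer's behalf, so scif_recv_wait() above only queues the request and kicks misc_work, and the long wait happens from process context in scif_rma_handle_remote_fences(). The enqueue half, reduced to its essentials (a sketch mirroring the driver code above, not a new API):

/* Sketch of the enqueue half feeding the remote-fence worker: run from
 * the message handler so the potentially long wait happens elsewhere. */
static void demo_queue_remote_fence(struct scif_endpt *ep,
				    struct scif_remote_fence_info *fence)
{
	mutex_lock(&scif_info.fencelock);
	atomic_inc(&ep->rma_info.fence_refcount);
	list_add_tail(&fence->list, &scif_info.fence);
	mutex_unlock(&scif_info.fencelock);
	schedule_work(&scif_info.misc_work);	/* serviced by the worker */
}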
- */ -void scif_rma_handle_remote_fences(void) -{ - struct list_head *item, *tmp; - struct scif_remote_fence_info *fence; - struct scif_endpt *ep; - int mark, err; - - might_sleep(); - mutex_lock(&scif_info.fencelock); - list_for_each_safe(item, tmp, &scif_info.fence) { - fence = list_entry(item, struct scif_remote_fence_info, - list); - /* Remove fence from global list */ - list_del(&fence->list); - - /* Initiate the fence operation */ - ep = (struct scif_endpt *)fence->msg.payload[0]; - mark = fence->msg.payload[2]; - err = _scif_fence_wait(ep, mark); - if (err) - fence->msg.uop = SCIF_WAIT_NACK; - else - fence->msg.uop = SCIF_WAIT_ACK; - fence->msg.payload[0] = ep->remote_ep; - scif_nodeqp_send(ep->remote_dev, &fence->msg); - kfree(fence); - if (!atomic_sub_return(1, &ep->rma_info.fence_refcount)) - schedule_work(&scif_info.misc_work); - } - mutex_unlock(&scif_info.fencelock); -} - -static int _scif_send_fence(scif_epd_t epd, int uop, int mark, int *out_mark) -{ - int err; - struct scifmsg msg; - struct scif_fence_info *fence_req; - struct scif_endpt *ep = (struct scif_endpt *)epd; - - fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL); - if (!fence_req) { - err = -ENOMEM; - goto error; - } - - fence_req->state = OP_IN_PROGRESS; - init_completion(&fence_req->comp); - - msg.src = ep->port; - msg.uop = uop; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = (u64)fence_req; - if (uop == SCIF_WAIT) - msg.payload[2] = mark; - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) - err = scif_nodeqp_send(ep->remote_dev, &msg); - else - err = -ENOTCONN; - spin_unlock(&ep->lock); - if (err) - goto error_free; -retry: - /* Wait for a SCIF_WAIT_(N)ACK message */ - err = wait_for_completion_timeout(&fence_req->comp, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err && scifdev_alive(ep)) - goto retry; - if (!err) - err = -ENODEV; - if (err > 0) - err = 0; - mutex_lock(&ep->rma_info.rma_lock); - if (err < 0) { - if (fence_req->state == OP_IN_PROGRESS) - fence_req->state = OP_FAILED; - } - if (fence_req->state == OP_FAILED && !err) - err = -ENOMEM; - if (uop == SCIF_MARK && fence_req->state == OP_COMPLETED) - *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark; - mutex_unlock(&ep->rma_info.rma_lock); -error_free: - kfree(fence_req); -error: - return err; -} - -/** - * scif_send_fence_mark: - * @epd: end point descriptor. - * @out_mark: Output DMA mark reported by peer. - * - * Send a remote fence mark request. - */ -static int scif_send_fence_mark(scif_epd_t epd, int *out_mark) -{ - return _scif_send_fence(epd, SCIF_MARK, 0, out_mark); -} - -/** - * scif_send_fence_wait: - * @epd: end point descriptor. - * @mark: DMA mark to wait for. - * - * Send a remote fence wait request. 
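The completion wait inside _scif_send_fence() above is a reusable idiom: time out periodically, keep waiting while the peer node is still alive, and only turn the timeout into -ENODEV once it is not. Stripped of the per-request state bookkeeping (helper name is illustrative):

/* Sketch of the wait-while-peer-alive idiom used by _scif_send_fence()
 * and by _scif_send_fence_signal_wait() below. */
static int demo_wait_while_alive(struct scif_endpt *ep,
				 struct completion *comp)
{
	unsigned long left;

	do {
		left = wait_for_completion_timeout(comp,
						   SCIF_NODE_ALIVE_TIMEOUT);
	} while (!left && scifdev_alive(ep));	/* peer still alive: retry */

	return left ? 0 : -ENODEV;	/* timed out and the peer is gone */
}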
- */ -static int scif_send_fence_wait(scif_epd_t epd, int mark) -{ - return _scif_send_fence(epd, SCIF_WAIT, mark, NULL); -} - -static int _scif_send_fence_signal_wait(struct scif_endpt *ep, - struct scif_fence_info *fence_req) -{ - int err; - -retry: - /* Wait for a SCIF_SIG_(N)ACK message */ - err = wait_for_completion_timeout(&fence_req->comp, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err && scifdev_alive(ep)) - goto retry; - if (!err) - err = -ENODEV; - if (err > 0) - err = 0; - if (err < 0) { - mutex_lock(&ep->rma_info.rma_lock); - if (fence_req->state == OP_IN_PROGRESS) - fence_req->state = OP_FAILED; - mutex_unlock(&ep->rma_info.rma_lock); - } - if (fence_req->state == OP_FAILED && !err) - err = -ENXIO; - return err; -} - -/** - * scif_send_fence_signal: - * @epd: endpoint descriptor - * @roff: remote offset - * @rval: remote value to write to @roff - * @loff: local offset - * @lval: local value to write to @loff - * @flags: SCIF_SIGNAL_LOCAL and/or SCIF_SIGNAL_REMOTE - * - * Sends a remote fence signal request - */ -static int scif_send_fence_signal(scif_epd_t epd, off_t roff, u64 rval, - off_t loff, u64 lval, int flags) -{ - int err = 0; - struct scifmsg msg; - struct scif_fence_info *fence_req; - struct scif_endpt *ep = (struct scif_endpt *)epd; - - fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL); - if (!fence_req) { - err = -ENOMEM; - goto error; - } - - fence_req->state = OP_IN_PROGRESS; - init_completion(&fence_req->comp); - msg.src = ep->port; - if (flags & SCIF_SIGNAL_LOCAL) { - msg.uop = SCIF_SIG_LOCAL; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = roff; - msg.payload[2] = rval; - msg.payload[3] = (u64)fence_req; - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) - err = scif_nodeqp_send(ep->remote_dev, &msg); - else - err = -ENOTCONN; - spin_unlock(&ep->lock); - if (err) - goto error_free; - err = _scif_send_fence_signal_wait(ep, fence_req); - if (err) - goto error_free; - } - fence_req->state = OP_IN_PROGRESS; - - if (flags & SCIF_SIGNAL_REMOTE) { - msg.uop = SCIF_SIG_REMOTE; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = loff; - msg.payload[2] = lval; - msg.payload[3] = (u64)fence_req; - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) - err = scif_nodeqp_send(ep->remote_dev, &msg); - else - err = -ENOTCONN; - spin_unlock(&ep->lock); - if (err) - goto error_free; - err = _scif_send_fence_signal_wait(ep, fence_req); - } -error_free: - kfree(fence_req); -error: - return err; -} - -static void scif_fence_mark_cb(void *arg) -{ - struct scif_endpt *ep = (struct scif_endpt *)arg; - - wake_up_interruptible(&ep->rma_info.markwq); - atomic_dec(&ep->rma_info.fence_refcount); -} - -/** - * _scif_fence_mark: - * @epd: endpoint descriptor - * @mark: DMA mark to set-up - * - * Set up a mark for this endpoint and return the value of the mark.
- */ -int _scif_fence_mark(scif_epd_t epd, int *mark) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct dma_chan *chan = ep->rma_info.dma_chan; - struct dma_device *ddev = chan->device; - struct dma_async_tx_descriptor *tx; - dma_cookie_t cookie; - int err; - - tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); - if (!tx) { - err = -ENOMEM; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - err = (int)cookie; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - dma_async_issue_pending(chan); - tx = ddev->device_prep_dma_interrupt(chan, DMA_PREP_INTERRUPT); - if (!tx) { - err = -ENOMEM; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - tx->callback = scif_fence_mark_cb; - tx->callback_param = ep; - *mark = cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - err = (int)cookie; - dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; - } - atomic_inc(&ep->rma_info.fence_refcount); - dma_async_issue_pending(chan); - return 0; -} - -#define SCIF_LOOPB_MAGIC_MARK 0xdead - -int scif_fence_mark(scif_epd_t epd, int flags, int *mark) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err = 0; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x\n", - ep, flags, *mark); - err = scif_verify_epd(ep); - if (err) - return err; - - /* Invalid flags? */ - if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)) - return -EINVAL; - - /* At least one of init self or peer RMA should be set */ - if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) - return -EINVAL; - - /* Exactly one of init self or peer RMA should be set but not both */ - if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) - return -EINVAL; - - /* - * Management node loopback does not need to use DMA. - * Return a valid mark to be symmetric. - */ - if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { - *mark = SCIF_LOOPB_MAGIC_MARK; - return 0; - } - - if (flags & SCIF_FENCE_INIT_SELF) - err = _scif_fence_mark(epd, mark); - else - err = scif_send_fence_mark(ep, mark); - - if (err) - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x err %d\n", - ep, flags, *mark, err); - return err; -} -EXPORT_SYMBOL_GPL(scif_fence_mark); - -int scif_fence_wait(scif_epd_t epd, int mark) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err = 0; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI fence_wait: ep %p mark 0x%x\n", - ep, mark); - err = scif_verify_epd(ep); - if (err) - return err; - /* - * Management node loopback does not need to use DMA. - * The only valid loopback mark is SCIF_LOOPB_MAGIC_MARK, so simply - * return success if the mark matches it.
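For a kernel-mode client, the exported scif_fence_mark()/scif_fence_wait() pair acts as an RMA barrier; a hedged usage sketch (epd is assumed to be a connected endpoint, error handling trimmed):

/* Usage sketch: fence all RMAs this endpoint has initiated so far,
 * then block until every one of them has completed. */
static int demo_rma_barrier(scif_epd_t epd)
{
	int mark;
	int err;

	err = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);
	if (err)
		return err;
	return scif_fence_wait(epd, mark);
}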
- */ - if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { - if (mark == SCIF_LOOPB_MAGIC_MARK) - return 0; - else - return -EINVAL; - } - if (mark & SCIF_REMOTE_FENCE) - err = scif_send_fence_wait(epd, mark); - else - err = _scif_fence_wait(epd, mark); - if (err < 0) - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - return err; -} -EXPORT_SYMBOL_GPL(scif_fence_wait); - -int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, - off_t roff, u64 rval, int flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - int err = 0; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI fence_signal: ep %p loff 0x%lx lval 0x%llx roff 0x%lx rval 0x%llx flags 0x%x\n", - ep, loff, lval, roff, rval, flags); - err = scif_verify_epd(ep); - if (err) - return err; - - /* Invalid flags? */ - if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER | - SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)) - return -EINVAL; - - /* At least one of init self or peer RMA should be set */ - if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) - return -EINVAL; - - /* Exactly one of init self or peer RMA should be set but not both */ - if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) - return -EINVAL; - - /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */ - if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))) - return -EINVAL; - - /* Only Dword offsets allowed */ - if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(u32) - 1))) - return -EINVAL; - - /* Only Dword aligned offsets allowed */ - if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(u32) - 1))) - return -EINVAL; - - if (flags & SCIF_FENCE_INIT_PEER) { - err = scif_send_fence_signal(epd, roff, rval, loff, - lval, flags); - } else { - /* Local Signal in Local RAS */ - if (flags & SCIF_SIGNAL_LOCAL) { - err = scif_prog_signal(epd, loff, lval, - SCIF_WINDOW_SELF); - if (err) - goto error_ret; - } - - /* Signal in Remote RAS */ - if (flags & SCIF_SIGNAL_REMOTE) - err = scif_prog_signal(epd, roff, - rval, SCIF_WINDOW_PEER); - } -error_ret: - if (err) - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - return err; -} -EXPORT_SYMBOL_GPL(scif_fence_signal); diff --git a/drivers/misc/mic/scif/scif_main.c b/drivers/misc/mic/scif/scif_main.c deleted file mode 100644 index e2278bf9f11d..000000000000 --- a/drivers/misc/mic/scif/scif_main.c +++ /dev/null @@ -1,351 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. 
- */ -#include <linux/module.h> -#include <linux/idr.h> - -#include <linux/mic_common.h> -#include "../common/mic_dev.h" -#include "../bus/scif_bus.h" -#include "scif_peer_bus.h" -#include "scif_main.h" -#include "scif_map.h" - -struct scif_info scif_info = { - .mdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "scif", - .fops = &scif_fops, - } -}; - -struct scif_dev *scif_dev; -struct kmem_cache *unaligned_cache; -static atomic_t g_loopb_cnt; - -/* Runs in the context of intr_wq */ -static void scif_intr_bh_handler(struct work_struct *work) -{ - struct scif_dev *scifdev = - container_of(work, struct scif_dev, intr_bh); - - if (scifdev_self(scifdev)) - scif_loopb_msg_handler(scifdev, scifdev->qpairs); - else - scif_nodeqp_intrhandler(scifdev, scifdev->qpairs); -} - -int scif_setup_intr_wq(struct scif_dev *scifdev) -{ - if (!scifdev->intr_wq) { - snprintf(scifdev->intr_wqname, sizeof(scifdev->intr_wqname), - "SCIF INTR %d", scifdev->node); - scifdev->intr_wq = - alloc_ordered_workqueue(scifdev->intr_wqname, 0); - if (!scifdev->intr_wq) - return -ENOMEM; - INIT_WORK(&scifdev->intr_bh, scif_intr_bh_handler); - } - return 0; -} - -void scif_destroy_intr_wq(struct scif_dev *scifdev) -{ - if (scifdev->intr_wq) { - destroy_workqueue(scifdev->intr_wq); - scifdev->intr_wq = NULL; - } -} - -irqreturn_t scif_intr_handler(int irq, void *data) -{ - struct scif_dev *scifdev = data; - struct scif_hw_dev *sdev = scifdev->sdev; - - sdev->hw_ops->ack_interrupt(sdev, scifdev->db); - queue_work(scifdev->intr_wq, &scifdev->intr_bh); - return IRQ_HANDLED; -} - -static void scif_qp_setup_handler(struct work_struct *work) -{ - struct scif_dev *scifdev = container_of(work, struct scif_dev, - qp_dwork.work); - struct scif_hw_dev *sdev = scifdev->sdev; - dma_addr_t da = 0; - int err; - - if (scif_is_mgmt_node()) { - struct mic_bootparam *bp = sdev->dp; - - da = bp->scif_card_dma_addr; - scifdev->rdb = bp->h2c_scif_db; - } else { - struct mic_bootparam __iomem *bp = sdev->rdp; - - da = readq(&bp->scif_host_dma_addr); - scifdev->rdb = ioread8(&bp->c2h_scif_db); - } - if (da) { - err = scif_qp_response(da, scifdev); - if (err) - dev_err(&scifdev->sdev->dev, - "scif_qp_response err %d\n", err); - } else { - schedule_delayed_work(&scifdev->qp_dwork, - msecs_to_jiffies(1000)); - } -} - -static int scif_setup_scifdev(void) -{ - /* We support a maximum of 129 SCIF nodes including the mgmt node */ -#define MAX_SCIF_NODES 129 - int i; - u8 num_nodes = MAX_SCIF_NODES; - - scif_dev = kcalloc(num_nodes, sizeof(*scif_dev), GFP_KERNEL); - if (!scif_dev) - return -ENOMEM; - for (i = 0; i < num_nodes; i++) { - struct scif_dev *scifdev = &scif_dev[i]; - - scifdev->node = i; - scifdev->exit = OP_IDLE; - init_waitqueue_head(&scifdev->disconn_wq); - mutex_init(&scifdev->lock); - INIT_WORK(&scifdev->peer_add_work, scif_add_peer_device); - INIT_DELAYED_WORK(&scifdev->p2p_dwork, - scif_poll_qp_state); - INIT_DELAYED_WORK(&scifdev->qp_dwork, - scif_qp_setup_handler); - INIT_LIST_HEAD(&scifdev->p2p); - RCU_INIT_POINTER(scifdev->spdev, NULL); - } - return 0; -} - -static void scif_destroy_scifdev(void) -{ - kfree(scif_dev); - scif_dev = NULL; -} - -static int scif_probe(struct scif_hw_dev *sdev) -{ - struct scif_dev *scifdev = &scif_dev[sdev->dnode]; - int rc; - - dev_set_drvdata(&sdev->dev, sdev); - scifdev->sdev = sdev; - - if (1 == atomic_add_return(1, &g_loopb_cnt)) { - struct scif_dev *loopb_dev = &scif_dev[sdev->snode]; - - loopb_dev->sdev = sdev; - rc = scif_setup_loopback_qp(loopb_dev); - if (rc) - goto exit; - } - - rc = scif_setup_intr_wq(scifdev); - if (rc) - goto destroy_loopb; -
rc = scif_setup_qp(scifdev); - if (rc) - goto destroy_intr; - scifdev->db = sdev->hw_ops->next_db(sdev); - scifdev->cookie = sdev->hw_ops->request_irq(sdev, scif_intr_handler, - "SCIF_INTR", scifdev, - scifdev->db); - if (IS_ERR(scifdev->cookie)) { - rc = PTR_ERR(scifdev->cookie); - goto free_qp; - } - if (scif_is_mgmt_node()) { - struct mic_bootparam *bp = sdev->dp; - - bp->c2h_scif_db = scifdev->db; - bp->scif_host_dma_addr = scifdev->qp_dma_addr; - } else { - struct mic_bootparam __iomem *bp = sdev->rdp; - - iowrite8(scifdev->db, &bp->h2c_scif_db); - writeq(scifdev->qp_dma_addr, &bp->scif_card_dma_addr); - } - schedule_delayed_work(&scifdev->qp_dwork, - msecs_to_jiffies(1000)); - return rc; -free_qp: - scif_free_qp(scifdev); -destroy_intr: - scif_destroy_intr_wq(scifdev); -destroy_loopb: - if (atomic_dec_and_test(&g_loopb_cnt)) - scif_destroy_loopback_qp(&scif_dev[sdev->snode]); -exit: - return rc; -} - -void scif_stop(struct scif_dev *scifdev) -{ - struct scif_dev *dev; - int i; - - for (i = scif_info.maxid; i >= 0; i--) { - dev = &scif_dev[i]; - if (scifdev_self(dev)) - continue; - scif_handle_remove_node(i); - } -} - -static void scif_remove(struct scif_hw_dev *sdev) -{ - struct scif_dev *scifdev = &scif_dev[sdev->dnode]; - - if (scif_is_mgmt_node()) { - struct mic_bootparam *bp = sdev->dp; - - bp->c2h_scif_db = -1; - bp->scif_host_dma_addr = 0x0; - } else { - struct mic_bootparam __iomem *bp = sdev->rdp; - - iowrite8(-1, &bp->h2c_scif_db); - writeq(0x0, &bp->scif_card_dma_addr); - } - if (scif_is_mgmt_node()) { - scif_disconnect_node(scifdev->node, true); - } else { - scif_info.card_initiated_exit = true; - scif_stop(scifdev); - } - if (atomic_dec_and_test(&g_loopb_cnt)) - scif_destroy_loopback_qp(&scif_dev[sdev->snode]); - if (scifdev->cookie) { - sdev->hw_ops->free_irq(sdev, scifdev->cookie, scifdev); - scifdev->cookie = NULL; - } - scif_destroy_intr_wq(scifdev); - cancel_delayed_work(&scifdev->qp_dwork); - scif_free_qp(scifdev); - scifdev->rdb = -1; - scifdev->sdev = NULL; -} - -static struct scif_hw_dev_id id_table[] = { - { MIC_SCIF_DEV, SCIF_DEV_ANY_ID }, - { 0 }, -}; - -static struct scif_driver scif_driver = { - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = scif_probe, - .remove = scif_remove, -}; - -static int _scif_init(void) -{ - int rc; - - mutex_init(&scif_info.eplock); - spin_lock_init(&scif_info.rmalock); - spin_lock_init(&scif_info.nb_connect_lock); - spin_lock_init(&scif_info.port_lock); - mutex_init(&scif_info.conflock); - mutex_init(&scif_info.connlock); - mutex_init(&scif_info.fencelock); - INIT_LIST_HEAD(&scif_info.uaccept); - INIT_LIST_HEAD(&scif_info.listen); - INIT_LIST_HEAD(&scif_info.zombie); - INIT_LIST_HEAD(&scif_info.connected); - INIT_LIST_HEAD(&scif_info.disconnected); - INIT_LIST_HEAD(&scif_info.rma); - INIT_LIST_HEAD(&scif_info.rma_tc); - INIT_LIST_HEAD(&scif_info.mmu_notif_cleanup); - INIT_LIST_HEAD(&scif_info.fence); - INIT_LIST_HEAD(&scif_info.nb_connect_list); - init_waitqueue_head(&scif_info.exitwq); - scif_info.rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT; - scif_info.en_msg_log = 0; - scif_info.p2p_enable = 1; - rc = scif_setup_scifdev(); - if (rc) - goto error; - unaligned_cache = kmem_cache_create("Unaligned_DMA", - SCIF_KMEM_UNALIGNED_BUF_SIZE, - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!unaligned_cache) { - rc = -ENOMEM; - goto free_sdev; - } - INIT_WORK(&scif_info.misc_work, scif_misc_handler); - INIT_WORK(&scif_info.mmu_notif_work, scif_mmu_notif_handler); - INIT_WORK(&scif_info.conn_work, 
scif_conn_handler); - idr_init(&scif_ports); - return 0; -free_sdev: - scif_destroy_scifdev(); -error: - return rc; -} - -static void _scif_exit(void) -{ - idr_destroy(&scif_ports); - kmem_cache_destroy(unaligned_cache); - scif_destroy_scifdev(); -} - -static int __init scif_init(void) -{ - struct miscdevice *mdev = &scif_info.mdev; - int rc; - - _scif_init(); - iova_cache_get(); - rc = scif_peer_bus_init(); - if (rc) - goto exit; - rc = scif_register_driver(&scif_driver); - if (rc) - goto peer_bus_exit; - rc = misc_register(mdev); - if (rc) - goto unreg_scif; - scif_init_debugfs(); - return 0; -unreg_scif: - scif_unregister_driver(&scif_driver); -peer_bus_exit: - scif_peer_bus_exit(); -exit: - _scif_exit(); - return rc; -} - -static void __exit scif_exit(void) -{ - scif_exit_debugfs(); - misc_deregister(&scif_info.mdev); - scif_unregister_driver(&scif_driver); - scif_peer_bus_exit(); - iova_cache_put(); - _scif_exit(); -} - -module_init(scif_init); -module_exit(scif_exit); - -MODULE_DEVICE_TABLE(scif, id_table); -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) SCIF driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/scif/scif_main.h b/drivers/misc/mic/scif/scif_main.h deleted file mode 100644 index bb3ab97d5b35..000000000000 --- a/drivers/misc/mic/scif/scif_main.h +++ /dev/null @@ -1,274 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#ifndef SCIF_MAIN_H -#define SCIF_MAIN_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../common/mic_dev.h" - -#define SCIF_MGMT_NODE 0 -#define SCIF_DEFAULT_WATCHDOG_TO 30 -#define SCIF_NODE_ACCEPT_TIMEOUT (3 * HZ) -#define SCIF_NODE_ALIVE_TIMEOUT (SCIF_DEFAULT_WATCHDOG_TO * HZ) -#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000 - -/* - * Generic state used for certain node QP message exchanges - * like Unregister, Alloc etc. 
- */ -enum scif_msg_state { - OP_IDLE = 1, - OP_IN_PROGRESS, - OP_COMPLETED, - OP_FAILED -}; - -/* - * struct scif_info - Global SCIF information - * - * @nodeid: Node ID this node is to others - * @maxid: Max known node ID - * @total: Total number of SCIF nodes - * @nr_zombies: number of zombie endpoints - * @eplock: Lock to synchronize listening, zombie endpoint lists - * @connlock: Lock to synchronize connected and disconnected lists - * @nb_connect_lock: Synchronize non blocking connect operations - * @port_lock: Synchronize access to SCIF ports - * @uaccept: List of user acceptreq waiting for acceptreg - * @listen: List of listening end points - * @zombie: List of zombie end points with pending RMA's - * @connected: List of end points in connected state - * @disconnected: List of end points in disconnected state - * @nb_connect_list: List for non blocking connections - * @misc_work: miscellaneous SCIF tasks - * @conflock: Lock to synchronize SCIF node configuration changes - * @en_msg_log: Enable debug message logging - * @p2p_enable: Enable P2P SCIF network - * @mdev: The MISC device - * @conn_work: Work for workqueue handling all connections - * @exitwq: Wait queue for waiting for an EXIT node QP message response - * @loopb_dev: Dummy SCIF device used for loopback - * @loopb_wq: Workqueue used for handling loopback messages - * @loopb_wqname[16]: Name of loopback workqueue - * @loopb_work: Used for submitting work to loopb_wq - * @loopb_recv_q: List of messages received on the loopb_wq - * @card_initiated_exit: set when the card has initiated the exit - * @rmalock: Synchronize access to RMA operations - * @fencelock: Synchronize access to list of remote fences requested. - * @rma: List of temporary registered windows to be destroyed. - * @rma_tc: List of temporary registered & cached Windows to be destroyed - * @fence: List of remote fence requests - * @mmu_notif_work: Work for registration caching MMU notifier workqueue - * @mmu_notif_cleanup: List of temporary cached windows for reg cache - * @rma_tc_limit: RMA temporary cache limit - */ -struct scif_info { - u8 nodeid; - u8 maxid; - u8 total; - u32 nr_zombies; - struct mutex eplock; - struct mutex connlock; - spinlock_t nb_connect_lock; - spinlock_t port_lock; - struct list_head uaccept; - struct list_head listen; - struct list_head zombie; - struct list_head connected; - struct list_head disconnected; - struct list_head nb_connect_list; - struct work_struct misc_work; - struct mutex conflock; - u8 en_msg_log; - u8 p2p_enable; - struct miscdevice mdev; - struct work_struct conn_work; - wait_queue_head_t exitwq; - struct scif_dev *loopb_dev; - struct workqueue_struct *loopb_wq; - char loopb_wqname[16]; - struct work_struct loopb_work; - struct list_head loopb_recv_q; - bool card_initiated_exit; - spinlock_t rmalock; - struct mutex fencelock; - struct list_head rma; - struct list_head rma_tc; - struct list_head fence; - struct work_struct mmu_notif_work; - struct list_head mmu_notif_cleanup; - unsigned long rma_tc_limit; -}; - -/* - * struct scif_p2p_info - SCIF mapping information used for P2P - * - * @ppi_peer_id - SCIF peer node id - * @ppi_sg - Scatter list for bar information (One for mmio and one for aper) - * @sg_nentries - Number of entries in the scatterlist - * @ppi_da: DMA address for MMIO and APER bars - * @ppi_len: Length of MMIO and APER bars - * @ppi_list: Link in list of mapping information - */ -struct scif_p2p_info { - u8 ppi_peer_id; - struct scatterlist *ppi_sg[2]; - u64 sg_nentries[2]; - dma_addr_t ppi_da[2]; - 
u64 ppi_len[2]; -#define SCIF_PPI_MMIO 0 -#define SCIF_PPI_APER 1 - struct list_head ppi_list; -}; - -/* - * struct scif_dev - SCIF remote device specific fields - * - * @node: Node id - * @p2p: List of P2P mapping information - * @qpairs: The node queue pair for exchanging control messages - * @intr_wq: Workqueue for handling Node QP messages - * @intr_wqname: Name of node QP workqueue for handling interrupts - * @intr_bh: Used for submitting work to intr_wq - * @lock: Lock used for synchronizing access to the scif device - * @sdev: SCIF hardware device on the SCIF hardware bus - * @db: doorbell the peer will trigger to generate an interrupt on self - * @rdb: Doorbell to trigger on the peer to generate an interrupt on the peer - * @cookie: Cookie received while registering the interrupt handler - * @peer_add_work: Work for handling device_add for peer devices - * @p2p_dwork: Delayed work to enable polling for P2P state - * @qp_dwork: Delayed work for enabling polling for remote QP information - * @p2p_retry: Number of times to retry polling of P2P state - * @base_addr: P2P aperture bar base address - * @mic_mw mmio: The peer MMIO information used for P2P - * @spdev: SCIF peer device on the SCIF peer bus - * @node_remove_ack_pending: True if a node_remove_ack is pending - * @exit_ack_pending: true if an exit_ack is pending - * @disconn_wq: Used while waiting for a node remove response - * @disconn_rescnt: Keeps track of number of node remove requests sent - * @exit: Status of exit message - * @qp_dma_addr: Queue pair DMA address passed to the peer - * @dma_ch_idx: Round robin index for DMA channels - * @signal_pool: DMA pool used for scheduling scif_fence_signal DMA's -*/ -struct scif_dev { - u8 node; - struct list_head p2p; - struct scif_qp *qpairs; - struct workqueue_struct *intr_wq; - char intr_wqname[16]; - struct work_struct intr_bh; - struct mutex lock; - struct scif_hw_dev *sdev; - int db; - int rdb; - struct mic_irq *cookie; - struct work_struct peer_add_work; - struct delayed_work p2p_dwork; - struct delayed_work qp_dwork; - int p2p_retry; - dma_addr_t base_addr; - struct mic_mw mmio; - struct scif_peer_dev __rcu *spdev; - bool node_remove_ack_pending; - bool exit_ack_pending; - wait_queue_head_t disconn_wq; - atomic_t disconn_rescnt; - enum scif_msg_state exit; - dma_addr_t qp_dma_addr; - int dma_ch_idx; - struct dma_pool *signal_pool; -}; - -extern bool scif_reg_cache_enable; -extern bool scif_ulimit_check; -extern struct scif_info scif_info; -extern struct idr scif_ports; -extern struct bus_type scif_peer_bus; -extern struct scif_dev *scif_dev; -extern const struct file_operations scif_fops; -extern const struct file_operations scif_anon_fops; - -/* Size of the RB for the Node QP */ -#define SCIF_NODE_QP_SIZE 0x10000 - -#include "scif_nodeqp.h" -#include "scif_rma.h" -#include "scif_rma_list.h" - -/* - * scifdev_self: - * @dev: The remote SCIF Device - * - * Returns true if the SCIF Device passed is the self aka Loopback SCIF device. - */ -static inline int scifdev_self(struct scif_dev *dev) -{ - return dev->node == scif_info.nodeid; -} - -static inline bool scif_is_mgmt_node(void) -{ - return !scif_info.nodeid; -} - -/* - * scifdev_is_p2p: - * @dev: The remote SCIF Device - * - * Returns true if the SCIF Device is a MIC Peer to Peer SCIF device. 
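A detail that recurs throughout the mapping helpers in scif_map.h below: every DMA address exchanged with a peer-to-peer node is rebased by base_addr, the P2P aperture base, so the remote DMA engine lands in the right window, while loopback uses plain physical addresses. As a one-line illustration (helper name is illustrative):

/* Illustrative form of the P2P fixup applied by the scif_map helpers. */
static inline dma_addr_t demo_p2p_rebase(struct scif_dev *scifdev,
					 dma_addr_t da)
{
	return scifdev_is_p2p(scifdev) ? da + scifdev->base_addr : da;
}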
- */ -static inline bool scifdev_is_p2p(struct scif_dev *dev) -{ - if (scif_is_mgmt_node()) - return false; - else - return dev != &scif_dev[SCIF_MGMT_NODE] && - !scifdev_self(dev); -} - -/* - * scifdev_alive: - * @scifdev: The remote SCIF Device - * - * Returns true if the remote SCIF Device is running or sleeping for - * this endpoint. - */ -static inline int _scifdev_alive(struct scif_dev *scifdev) -{ - struct scif_peer_dev *spdev; - - rcu_read_lock(); - spdev = rcu_dereference(scifdev->spdev); - rcu_read_unlock(); - return !!spdev; -} - -#include "scif_epd.h" - -void __init scif_init_debugfs(void); -void scif_exit_debugfs(void); -int scif_setup_intr_wq(struct scif_dev *scifdev); -void scif_destroy_intr_wq(struct scif_dev *scifdev); -void scif_cleanup_scifdev(struct scif_dev *dev); -void scif_handle_remove_node(int node); -void scif_disconnect_node(u32 node_id, bool mgmt_initiated); -void scif_free_qp(struct scif_dev *dev); -void scif_misc_handler(struct work_struct *work); -void scif_stop(struct scif_dev *scifdev); -irqreturn_t scif_intr_handler(int irq, void *data); -#endif /* SCIF_MAIN_H */ diff --git a/drivers/misc/mic/scif/scif_map.h b/drivers/misc/mic/scif/scif_map.h deleted file mode 100644 index 96b760819bfc..000000000000 --- a/drivers/misc/mic/scif/scif_map.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#ifndef SCIF_MAP_H -#define SCIF_MAP_H - -#include "../bus/scif_bus.h" - -static __always_inline void * -scif_alloc_coherent(dma_addr_t *dma_handle, - struct scif_dev *scifdev, size_t size, - gfp_t gfp) -{ - void *va; - - if (scifdev_self(scifdev)) { - va = kmalloc(size, gfp); - if (va) - *dma_handle = virt_to_phys(va); - } else { - va = dma_alloc_coherent(&scifdev->sdev->dev, - size, dma_handle, gfp); - if (va && scifdev_is_p2p(scifdev)) - *dma_handle = *dma_handle + scifdev->base_addr; - } - return va; -} - -static __always_inline void -scif_free_coherent(void *va, dma_addr_t local, - struct scif_dev *scifdev, size_t size) -{ - if (scifdev_self(scifdev)) { - kfree(va); - } else { - if (scifdev_is_p2p(scifdev) && local > scifdev->base_addr) - local = local - scifdev->base_addr; - dma_free_coherent(&scifdev->sdev->dev, - size, va, local); - } -} - -static __always_inline int -scif_map_single(dma_addr_t *dma_handle, - void *local, struct scif_dev *scifdev, size_t size) -{ - int err = 0; - - if (scifdev_self(scifdev)) { - *dma_handle = virt_to_phys((local)); - } else { - *dma_handle = dma_map_single(&scifdev->sdev->dev, - local, size, DMA_BIDIRECTIONAL); - if (dma_mapping_error(&scifdev->sdev->dev, *dma_handle)) - err = -ENOMEM; - else if (scifdev_is_p2p(scifdev)) - *dma_handle = *dma_handle + scifdev->base_addr; - } - if (err) - *dma_handle = 0; - return err; -} - -static __always_inline void -scif_unmap_single(dma_addr_t local, struct scif_dev *scifdev, - size_t size) -{ - if (!scifdev_self(scifdev)) { - if (scifdev_is_p2p(scifdev)) - local = local - scifdev->base_addr; - dma_unmap_single(&scifdev->sdev->dev, local, - size, DMA_BIDIRECTIONAL); - } -} - -static __always_inline void * -scif_ioremap(dma_addr_t phys, size_t size, struct scif_dev *scifdev) -{ - void *out_virt; - struct scif_hw_dev *sdev = scifdev->sdev; - - if (scifdev_self(scifdev)) - out_virt = phys_to_virt(phys); - else - out_virt = (void __force *) - sdev->hw_ops->remap(sdev, phys, size); - return out_virt; -} - -static __always_inline void 
-scif_iounmap(void *virt, size_t len, struct scif_dev *scifdev) -{ - if (!scifdev_self(scifdev)) { - struct scif_hw_dev *sdev = scifdev->sdev; - - sdev->hw_ops->unmap(sdev, (void __force __iomem *)virt); - } -} - -static __always_inline int -scif_map_page(dma_addr_t *dma_handle, struct page *page, - struct scif_dev *scifdev) -{ - int err = 0; - - if (scifdev_self(scifdev)) { - *dma_handle = page_to_phys(page); - } else { - struct scif_hw_dev *sdev = scifdev->sdev; - *dma_handle = dma_map_page(&sdev->dev, - page, 0x0, PAGE_SIZE, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(&sdev->dev, *dma_handle)) - err = -ENOMEM; - else if (scifdev_is_p2p(scifdev)) - *dma_handle = *dma_handle + scifdev->base_addr; - } - if (err) - *dma_handle = 0; - return err; -} -#endif /* SCIF_MAP_H */ diff --git a/drivers/misc/mic/scif/scif_mmap.c b/drivers/misc/mic/scif/scif_mmap.c deleted file mode 100644 index a151d416f39c..000000000000 --- a/drivers/misc/mic/scif/scif_mmap.c +++ /dev/null @@ -1,690 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel SCIF driver. - */ -#include "scif_main.h" - -/* - * struct scif_vma_info - Information about a remote memory mapping - * created via scif_mmap(..) - * @vma: VM area struct - * @list: link to list of active vmas - */ -struct scif_vma_info { - struct vm_area_struct *vma; - struct list_head list; -}; - -void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_rma_req req; - struct scif_window *window = NULL; - struct scif_window *recv_window = - (struct scif_window *)msg->payload[0]; - struct scif_endpt *ep; - - ep = (struct scif_endpt *)recv_window->ep; - req.out_window = &window; - req.offset = recv_window->offset; - req.prot = recv_window->prot; - req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; - req.type = SCIF_WINDOW_FULL; - req.head = &ep->rma_info.reg_list; - msg->payload[0] = ep->remote_ep; - - mutex_lock(&ep->rma_info.rma_lock); - /* Does a valid window exist? */ - if (scif_query_window(&req)) { - dev_err(&scifdev->sdev->dev, - "%s %d -ENXIO\n", __func__, __LINE__); - msg->uop = SCIF_UNREGISTER_ACK; - goto error; - } - - scif_put_window(window, window->nr_pages); - - if (!window->ref_count) { - atomic_inc(&ep->rma_info.tw_refcount); - ep->rma_info.async_list_del = 1; - list_del_init(&window->list); - scif_free_window_offset(ep, window, window->offset); - } -error: - mutex_unlock(&ep->rma_info.rma_lock); - if (window && !window->ref_count) - scif_queue_for_cleanup(window, &scif_info.rma); -} - -/* - * Remove valid remote memory mappings created via scif_mmap(..) 
from the - * process address space since the remote node is lost - */ -static void __scif_zap_mmaps(struct scif_endpt *ep) -{ - struct list_head *item; - struct scif_vma_info *info; - struct vm_area_struct *vma; - unsigned long size; - - spin_lock(&ep->lock); - list_for_each(item, &ep->rma_info.vma_list) { - info = list_entry(item, struct scif_vma_info, list); - vma = info->vma; - size = vma->vm_end - vma->vm_start; - zap_vma_ptes(vma, vma->vm_start, size); - dev_dbg(scif_info.mdev.this_device, - "%s ep %p zap vma %p size 0x%lx\n", - __func__, ep, info->vma, size); - } - spin_unlock(&ep->lock); -} - -/* - * Traverse the list of endpoints for a particular remote node and - * zap valid remote memory mappings since the remote node is lost - */ -static void _scif_zap_mmaps(int node, struct list_head *head) -{ - struct scif_endpt *ep; - struct list_head *item; - - mutex_lock(&scif_info.connlock); - list_for_each(item, head) { - ep = list_entry(item, struct scif_endpt, list); - if (ep->remote_dev->node == node) - __scif_zap_mmaps(ep); - } - mutex_unlock(&scif_info.connlock); -} - -/* - * Wrapper for removing remote memory mappings for a particular node. This API - * is called by peer nodes as part of handling a lost node. - */ -void scif_zap_mmaps(int node) -{ - _scif_zap_mmaps(node, &scif_info.connected); - _scif_zap_mmaps(node, &scif_info.disconnected); -} - -/* - * This API is only called while handling a lost node: - * a) Remote node is dead. - * b) Remote memory mappings have been zapped - * So we can traverse the remote_reg_list without any locks. Since - * the window has not yet been unregistered we can drop the ref count - * and queue it to the cleanup thread. - */ -static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep) -{ - struct list_head *pos, *tmp; - struct scif_window *window; - - list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) { - window = list_entry(pos, struct scif_window, list); - if (window->ref_count) - scif_put_window(window, window->nr_pages); - else - dev_err(scif_info.mdev.this_device, - "%s %d unexpected\n", - __func__, __LINE__); - if (!window->ref_count) { - atomic_inc(&ep->rma_info.tw_refcount); - list_del_init(&window->list); - scif_queue_for_cleanup(window, &scif_info.rma); - } - } -} - -/* Cleanup remote registration lists for zombie endpoints */ -void scif_cleanup_rma_for_zombies(int node) -{ - struct scif_endpt *ep; - struct list_head *item; - - mutex_lock(&scif_info.eplock); - list_for_each(item, &scif_info.zombie) { - ep = list_entry(item, struct scif_endpt, list); - if (ep->remote_dev && ep->remote_dev->node == node) - __scif_cleanup_rma_for_zombies(ep); - } - mutex_unlock(&scif_info.eplock); - flush_work(&scif_info.misc_work); -} - -/* Insert the VMA into the per endpoint VMA list */ -static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma) -{ - struct scif_vma_info *info; - int err = 0; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) { - err = -ENOMEM; - goto done; - } - info->vma = vma; - spin_lock(&ep->lock); - list_add_tail(&info->list, &ep->rma_info.vma_list); - spin_unlock(&ep->lock); -done: - return err; -} - -/* Delete the VMA from the per endpoint VMA list */ -static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma) -{ - struct list_head *item; - struct scif_vma_info *info; - - spin_lock(&ep->lock); - list_for_each(item, &ep->rma_info.vma_list) { - info = list_entry(item, struct scif_vma_info, list); - if (info->vma == vma) { - list_del(&info->list); - kfree(info); - break; 
- } - } - spin_unlock(&ep->lock); -} - -static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep) -{ - struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev; - struct scif_hw_dev *sdev = scifdev->sdev; - phys_addr_t out_phys, apt_base = 0; - - /* - * If the DMA address is card relative then we need to add the - * aperture base for mmap to work correctly - */ - if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da) - apt_base = sdev->aper->pa; - out_phys = apt_base + phys; - return out_phys; -} - -int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, - struct scif_range **pages) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scif_rma_req req; - struct scif_window *window = NULL; - int nr_pages, err, i; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n", - ep, offset, len); - err = scif_verify_epd(ep); - if (err) - return err; - - if (!len || (offset < 0) || - (offset + len < offset) || - (ALIGN(offset, PAGE_SIZE) != offset) || - (ALIGN(len, PAGE_SIZE) != len)) - return -EINVAL; - - nr_pages = len >> PAGE_SHIFT; - - req.out_window = &window; - req.offset = offset; - req.prot = 0; - req.nr_bytes = len; - req.type = SCIF_WINDOW_SINGLE; - req.head = &ep->rma_info.remote_reg_list; - - mutex_lock(&ep->rma_info.rma_lock); - /* Does a valid window exist? */ - err = scif_query_window(&req); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error; - } - - /* Allocate scif_range */ - *pages = kzalloc(sizeof(**pages), GFP_KERNEL); - if (!*pages) { - err = -ENOMEM; - goto error; - } - - /* Allocate phys addr array */ - (*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)); - if (!((*pages)->phys_addr)) { - err = -ENOMEM; - goto error; - } - - if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) { - /* Allocate virtual address array */ - ((*pages)->va = scif_zalloc(nr_pages * sizeof(void *))); - if (!(*pages)->va) { - err = -ENOMEM; - goto error; - } - } - /* Populate the values */ - (*pages)->cookie = window; - (*pages)->nr_pages = nr_pages; - (*pages)->prot_flags = window->prot; - - for (i = 0; i < nr_pages; i++) { - (*pages)->phys_addr[i] = - __scif_off_to_dma_addr(window, offset + - (i * PAGE_SIZE)); - (*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i], - ep); - if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) - (*pages)->va[i] = - ep->remote_dev->sdev->aper->va + - (*pages)->phys_addr[i] - - ep->remote_dev->sdev->aper->pa; - } - - scif_get_window(window, nr_pages); -error: - mutex_unlock(&ep->rma_info.rma_lock); - if (err) { - if (*pages) { - scif_free((*pages)->phys_addr, - nr_pages * sizeof(dma_addr_t)); - scif_free((*pages)->va, - nr_pages * sizeof(void *)); - kfree(*pages); - *pages = NULL; - } - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - } - return err; -} -EXPORT_SYMBOL_GPL(scif_get_pages); - -int scif_put_pages(struct scif_range *pages) -{ - struct scif_endpt *ep; - struct scif_window *window; - struct scifmsg msg; - - if (!pages || !pages->cookie) - return -EINVAL; - - window = pages->cookie; - - if (!window || window->magic != SCIFEP_MAGIC) - return -EINVAL; - - ep = (struct scif_endpt *)window->ep; - /* - * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the - * callee should be allowed to release references to the pages, - * else the endpoint was not connected in the first place, - * hence the ENOTCONN. 
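Taken together, scif_get_pages() and scif_put_pages() let a kernel-mode client temporarily borrow the pages backing a peer's registered window; a hedged usage sketch (offset and length are illustrative and must be page aligned):

/* Usage sketch: look up the pages backing [offset, offset + len) in the
 * peer's registered address space, use them, then drop the reference. */
static int demo_peek_remote(scif_epd_t epd, off_t offset, size_t len)
{
	struct scif_range *pages;
	int err;

	err = scif_get_pages(epd, offset, len, &pages);
	if (err)
		return err;
	/* pages->phys_addr[] (and pages->va[] on the mgmt node) stay
	 * valid until the matching scif_put_pages() below. */
	return scif_put_pages(pages);
}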
- */
-	if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
-		return -ENOTCONN;
-
-	mutex_lock(&ep->rma_info.rma_lock);
-
-	scif_put_window(window, pages->nr_pages);
-
-	/* Initiate window destruction if ref count is zero */
-	if (!window->ref_count) {
-		list_del(&window->list);
-		mutex_unlock(&ep->rma_info.rma_lock);
-		scif_drain_dma_intr(ep->remote_dev->sdev,
-				    ep->rma_info.dma_chan);
-		/* Inform the peer about this window being destroyed. */
-		msg.uop = SCIF_MUNMAP;
-		msg.src = ep->port;
-		msg.payload[0] = window->peer_window;
-		/* No error handling for notification messages */
-		scif_nodeqp_send(ep->remote_dev, &msg);
-		/* Destroy this window from the peer's registered AS */
-		scif_destroy_remote_window(window);
-	} else {
-		mutex_unlock(&ep->rma_info.rma_lock);
-	}
-
-	scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
-	scif_free(pages->va, pages->nr_pages * sizeof(void *));
-	kfree(pages);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(scif_put_pages);
-
-/*
- * scif_rma_list_mmap:
- *
- * Traverse the remote registration list starting from start_window:
- * 1) Create VtoP mappings via remap_pfn_range(..)
- * 2) Once step 1) completes successfully for the entire range, traverse
- *    the windows again and bump the reference count.
- * RMA lock must be held.
- */
-static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
-			      int nr_pages, struct vm_area_struct *vma)
-{
-	s64 end_offset, loop_offset = offset;
-	struct scif_window *window = start_window;
-	int loop_nr_pages, nr_pages_left = nr_pages;
-	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
-	struct list_head *head = &ep->rma_info.remote_reg_list;
-	int i, err = 0;
-	dma_addr_t phys_addr;
-	struct scif_window_iter src_win_iter;
-	size_t contig_bytes = 0;
-
-	might_sleep();
-	list_for_each_entry_from(window, head, list) {
-		end_offset = window->offset +
-			(window->nr_pages << PAGE_SHIFT);
-		loop_nr_pages = min_t(int,
-				      (end_offset - loop_offset) >> PAGE_SHIFT,
-				      nr_pages_left);
-		scif_init_window_iter(window, &src_win_iter);
-		for (i = 0; i < loop_nr_pages; i++) {
-			phys_addr = scif_off_to_dma_addr(window, loop_offset,
-							 &contig_bytes,
-							 &src_win_iter);
-			phys_addr = scif_get_phys(phys_addr, ep);
-			err = remap_pfn_range(vma,
-					      vma->vm_start +
-					      loop_offset - offset,
-					      phys_addr >> PAGE_SHIFT,
-					      PAGE_SIZE,
-					      vma->vm_page_prot);
-			if (err)
-				goto error;
-			loop_offset += PAGE_SIZE;
-		}
-		nr_pages_left -= loop_nr_pages;
-		if (!nr_pages_left)
-			break;
-	}
-	/*
-	 * No more failures expected. Bump up the ref count for all
-	 * the windows. Taking the reference counts only in this second
-	 * traversal means errors from remap_pfn_range(..) in the first
-	 * traversal leave nothing to unwind.
-	 */
-	loop_offset = offset;
-	nr_pages_left = nr_pages;
-	window = start_window;
-	head = &ep->rma_info.remote_reg_list;
-	list_for_each_entry_from(window, head, list) {
-		end_offset = window->offset +
-			(window->nr_pages << PAGE_SHIFT);
-		loop_nr_pages = min_t(int,
-				      (end_offset - loop_offset) >> PAGE_SHIFT,
-				      nr_pages_left);
-		scif_get_window(window, loop_nr_pages);
-		nr_pages_left -= loop_nr_pages;
-		loop_offset += (loop_nr_pages << PAGE_SHIFT);
-		if (!nr_pages_left)
-			break;
-	}
-error:
-	if (err)
-		dev_err(scif_info.mdev.this_device,
-			"%s %d err %d\n", __func__, __LINE__, err);
-	return err;
-}
-
-/*
- * scif_rma_list_munmap:
- *
- * Traverse the remote registration list starting from window:
- * 1) Decrement ref count.
- * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
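scif_rma_list_mmap() above is, at its core, a windowed loop around remap_pfn_range(). Stripped of the window iteration and reference counting, the per-range call reduces to the following minimal sketch (the demo_ name is illustrative, not part of the driver):

    static int demo_mmap_one_range(struct vm_area_struct *vma,
                                   phys_addr_t phys, unsigned long bytes)
    {
            /* Map [phys, phys + bytes) at the start of the VMA. */
            return remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
                                   bytes, vma->vm_page_prot);
    }

The driver instead maps one PAGE_SIZE at a time because consecutive pages of a remote window need not be physically contiguous, which is why it walks the window with scif_off_to_dma_addr().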
- * RMA lock must be held. - */ -static void scif_rma_list_munmap(struct scif_window *start_window, - s64 offset, int nr_pages) -{ - struct scifmsg msg; - s64 loop_offset = offset, end_offset; - int loop_nr_pages, nr_pages_left = nr_pages; - struct scif_endpt *ep = (struct scif_endpt *)start_window->ep; - struct list_head *head = &ep->rma_info.remote_reg_list; - struct scif_window *window = start_window, *_window; - - msg.uop = SCIF_MUNMAP; - msg.src = ep->port; - loop_offset = offset; - nr_pages_left = nr_pages; - list_for_each_entry_safe_from(window, _window, head, list) { - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - loop_nr_pages = min_t(int, - (end_offset - loop_offset) >> PAGE_SHIFT, - nr_pages_left); - scif_put_window(window, loop_nr_pages); - if (!window->ref_count) { - struct scif_dev *rdev = ep->remote_dev; - - scif_drain_dma_intr(rdev->sdev, - ep->rma_info.dma_chan); - /* Inform the peer about this munmap */ - msg.payload[0] = window->peer_window; - /* No error handling for Notification messages. */ - scif_nodeqp_send(ep->remote_dev, &msg); - list_del(&window->list); - /* Destroy this window from the peer's registered AS */ - scif_destroy_remote_window(window); - } - nr_pages_left -= loop_nr_pages; - loop_offset += (loop_nr_pages << PAGE_SHIFT); - if (!nr_pages_left) - break; - } -} - -/* - * The private data field of each VMA used to mmap a remote window - * points to an instance of struct vma_pvt - */ -struct vma_pvt { - struct scif_endpt *ep; /* End point for remote window */ - s64 offset; /* offset within remote window */ - bool valid_offset; /* offset is valid only if the original - * mmap request was for a single page - * else the offset within the vma is - * the correct offset - */ - struct kref ref; -}; - -static void vma_pvt_release(struct kref *ref) -{ - struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref); - - kfree(vmapvt); -} - -/** - * scif_vma_open - VMA open driver callback - * @vma: VMM memory area. - * The open method is called by the kernel to allow the subsystem implementing - * the VMA to initialize the area. This method is invoked any time a new - * reference to the VMA is made (when a process forks, for example). - * The one exception happens when the VMA is first created by mmap; - * in this case, the driver's mmap method is called instead. - * This function is also invoked when an existing VMA is split by the kernel - * due to a call to munmap on a subset of the VMA resulting in two VMAs. - * The kernel invokes this function only on one of the two VMAs. - */ -static void scif_vma_open(struct vm_area_struct *vma) -{ - struct vma_pvt *vmapvt = vma->vm_private_data; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n", - vma->vm_start, vma->vm_end); - scif_insert_vma(vmapvt->ep, vma); - kref_get(&vmapvt->ref); -} - -/** - * scif_munmap - VMA close driver callback. - * @vma: VMM memory area. - * When an area is destroyed, the kernel calls its close operation. - * Note that there's no usage count associated with VMA's; the area - * is opened and closed exactly once by each process that uses it. 
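The open/close semantics described above pair with the kref held in vma_pvt: every VMA that comes to alias the mapping (fork, split) takes a reference, and the private data is freed only on the last close. Condensed to the bare pattern, reusing the driver's own vma_pvt and vma_pvt_release():

    static void demo_vma_open(struct vm_area_struct *vma)
    {
            struct vma_pvt *vmapvt = vma->vm_private_data;

            /* A fork or VMA split creates another user of the mapping. */
            kref_get(&vmapvt->ref);
    }

    static void demo_vma_close(struct vm_area_struct *vma)
    {
            struct vma_pvt *vmapvt = vma->vm_private_data;

            /* The final put frees the private data via vma_pvt_release(). */
            kref_put(&vmapvt->ref, vma_pvt_release);
    }

Note that scif_vma_open() additionally re-inserts the VMA into the endpoint's vma_list so a lost remote node can still zap it.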
- */ -static void scif_munmap(struct vm_area_struct *vma) -{ - struct scif_endpt *ep; - struct vma_pvt *vmapvt = vma->vm_private_data; - int nr_pages = vma_pages(vma); - s64 offset; - struct scif_rma_req req; - struct scif_window *window = NULL; - int err; - - might_sleep(); - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n", - vma->vm_start, vma->vm_end); - ep = vmapvt->ep; - offset = vmapvt->valid_offset ? vmapvt->offset : - (vma->vm_pgoff) << PAGE_SHIFT; - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n", - ep, nr_pages, offset); - req.out_window = &window; - req.offset = offset; - req.nr_bytes = vma->vm_end - vma->vm_start; - req.prot = vma->vm_flags & (VM_READ | VM_WRITE); - req.type = SCIF_WINDOW_PARTIAL; - req.head = &ep->rma_info.remote_reg_list; - - mutex_lock(&ep->rma_info.rma_lock); - - err = scif_query_window(&req); - if (err) - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - else - scif_rma_list_munmap(window, offset, nr_pages); - - mutex_unlock(&ep->rma_info.rma_lock); - /* - * The kernel probably zeroes these out but we still want - * to clean up our own mess just in case. - */ - vma->vm_ops = NULL; - vma->vm_private_data = NULL; - kref_put(&vmapvt->ref, vma_pvt_release); - scif_delete_vma(ep, vma); -} - -static const struct vm_operations_struct scif_vm_ops = { - .open = scif_vma_open, - .close = scif_munmap, -}; - -/** - * scif_mmap - Map pages in virtual address space to a remote window. - * @vma: VMM memory area. - * @epd: endpoint descriptor - * - * Return: Upon successful completion, scif_mmap() returns zero - * else an apt error is returned as documented in scif.h - */ -int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd) -{ - struct scif_rma_req req; - struct scif_window *window = NULL; - struct scif_endpt *ep = (struct scif_endpt *)epd; - s64 start_offset = vma->vm_pgoff << PAGE_SHIFT; - int nr_pages = vma_pages(vma); - int err; - struct vma_pvt *vmapvt; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n", - ep, start_offset, nr_pages); - err = scif_verify_epd(ep); - if (err) - return err; - - might_sleep(); - - err = scif_insert_vma(ep, vma); - if (err) - return err; - - vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL); - if (!vmapvt) { - scif_delete_vma(ep, vma); - return -ENOMEM; - } - - vmapvt->ep = ep; - kref_init(&vmapvt->ref); - - req.out_window = &window; - req.offset = start_offset; - req.nr_bytes = vma->vm_end - vma->vm_start; - req.prot = vma->vm_flags & (VM_READ | VM_WRITE); - req.type = SCIF_WINDOW_PARTIAL; - req.head = &ep->rma_info.remote_reg_list; - - mutex_lock(&ep->rma_info.rma_lock); - /* Does a valid window exist? 
*/ - err = scif_query_window(&req); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error_unlock; - } - - /* Default prot for loopback */ - if (!scifdev_self(ep->remote_dev)) - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - - /* - * VM_DONTCOPY - Do not copy this vma on fork - * VM_DONTEXPAND - Cannot expand with mremap() - * VM_RESERVED - Count as reserved_vm like IO - * VM_PFNMAP - Page-ranges managed without "struct page" - * VM_IO - Memory mapped I/O or similar - * - * We do not want to copy this VMA automatically on a fork(), - * expand this VMA due to mremap() or swap out these pages since - * the VMA is actually backed by physical pages in the remote - * node's physical memory and not via a struct page. - */ - vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; - - if (!scifdev_self(ep->remote_dev)) - vma->vm_flags |= VM_IO | VM_PFNMAP; - - /* Map this range of windows */ - err = scif_rma_list_mmap(window, start_offset, nr_pages, vma); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error_unlock; - } - /* Set up the driver call back */ - vma->vm_ops = &scif_vm_ops; - vma->vm_private_data = vmapvt; -error_unlock: - mutex_unlock(&ep->rma_info.rma_lock); - if (err) { - kfree(vmapvt); - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - scif_delete_vma(ep, vma); - } - return err; -} diff --git a/drivers/misc/mic/scif/scif_nm.c b/drivers/misc/mic/scif/scif_nm.c deleted file mode 100644 index c4d9422082b7..000000000000 --- a/drivers/misc/mic/scif/scif_nm.c +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. 
- */ -#include "scif_peer_bus.h" - -#include "scif_main.h" -#include "scif_map.h" - -/** - * scif_invalidate_ep() - Set state for all connected endpoints - * to disconnected and wake up all send/recv waitqueues - * - * @node: Node to invalidate - */ -static void scif_invalidate_ep(int node) -{ - struct scif_endpt *ep; - struct list_head *pos, *tmpq; - - flush_work(&scif_info.conn_work); - mutex_lock(&scif_info.connlock); - list_for_each_safe(pos, tmpq, &scif_info.disconnected) { - ep = list_entry(pos, struct scif_endpt, list); - if (ep->remote_dev->node == node) { - scif_unmap_all_windows(ep); - spin_lock(&ep->lock); - scif_cleanup_ep_qp(ep); - spin_unlock(&ep->lock); - } - } - list_for_each_safe(pos, tmpq, &scif_info.connected) { - ep = list_entry(pos, struct scif_endpt, list); - if (ep->remote_dev->node == node) { - list_del(pos); - spin_lock(&ep->lock); - ep->state = SCIFEP_DISCONNECTED; - list_add_tail(&ep->list, &scif_info.disconnected); - scif_cleanup_ep_qp(ep); - wake_up_interruptible(&ep->sendwq); - wake_up_interruptible(&ep->recvwq); - spin_unlock(&ep->lock); - scif_unmap_all_windows(ep); - } - } - mutex_unlock(&scif_info.connlock); -} - -void scif_free_qp(struct scif_dev *scifdev) -{ - struct scif_qp *qp = scifdev->qpairs; - - if (!qp) - return; - scif_unmap_single(qp->local_buf, scifdev, qp->inbound_q.size); - kfree(qp->inbound_q.rb_base); - scif_unmap_single(qp->local_qp, scifdev, sizeof(struct scif_qp)); - kfree(scifdev->qpairs); - scifdev->qpairs = NULL; -} - -static void scif_cleanup_qp(struct scif_dev *dev) -{ - struct scif_qp *qp = &dev->qpairs[0]; - - if (!qp) - return; - scif_iounmap((void *)qp->remote_qp, sizeof(struct scif_qp), dev); - scif_iounmap((void *)qp->outbound_q.rb_base, - sizeof(struct scif_qp), dev); - qp->remote_qp = NULL; - qp->local_write = 0; - qp->inbound_q.current_write_offset = 0; - qp->inbound_q.current_read_offset = 0; - if (scifdev_is_p2p(dev)) - scif_free_qp(dev); -} - -void scif_send_acks(struct scif_dev *dev) -{ - struct scifmsg msg; - - if (dev->node_remove_ack_pending) { - msg.uop = SCIF_NODE_REMOVE_ACK; - msg.src.node = scif_info.nodeid; - msg.dst.node = SCIF_MGMT_NODE; - msg.payload[0] = dev->node; - scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg); - dev->node_remove_ack_pending = false; - } - if (dev->exit_ack_pending) { - msg.uop = SCIF_EXIT_ACK; - msg.src.node = scif_info.nodeid; - msg.dst.node = dev->node; - scif_nodeqp_send(dev, &msg); - dev->exit_ack_pending = false; - } -} - -/** - * scif_cleanup_scifdev - Uninitialize SCIF data structures for remote - * SCIF device. - * @dev: Remote SCIF device. 
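scif_send_acks() above shows the message-construction idiom used throughout the driver: fill a struct scifmsg on the stack, address it, and hand it to scif_nodeqp_send(). As a standalone sketch (the demo_ name is illustrative):

    static int demo_ack_node_remove(struct scif_dev *dev)
    {
            struct scifmsg msg;

            msg.uop = SCIF_NODE_REMOVE_ACK;
            msg.src.node = scif_info.nodeid;
            msg.dst.node = SCIF_MGMT_NODE;
            msg.payload[0] = dev->node;     /* node being removed */
            /* ACKs are best effort; callers often ignore the return value. */
            return scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg);
    }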
- */
-void scif_cleanup_scifdev(struct scif_dev *dev)
-{
-	struct scif_hw_dev *sdev = dev->sdev;
-
-	if (!dev->sdev)
-		return;
-	if (scifdev_is_p2p(dev)) {
-		if (dev->cookie) {
-			sdev->hw_ops->free_irq(sdev, dev->cookie, dev);
-			dev->cookie = NULL;
-		}
-		scif_destroy_intr_wq(dev);
-	}
-	flush_work(&scif_info.misc_work);
-	scif_destroy_p2p(dev);
-	scif_invalidate_ep(dev->node);
-	scif_zap_mmaps(dev->node);
-	scif_cleanup_rma_for_zombies(dev->node);
-	flush_work(&scif_info.misc_work);
-	scif_send_acks(dev);
-	if (!dev->node && scif_info.card_initiated_exit) {
-		/*
-		 * Send a SCIF_EXIT message which is the last message from MIC
-		 * to the Host and wait for a SCIF_EXIT_ACK
-		 */
-		scif_send_exit(dev);
-		scif_info.card_initiated_exit = false;
-	}
-	scif_cleanup_qp(dev);
-}
-
-/**
- * scif_handle_remove_node
- *
- * @node: Node to remove
- */
-void scif_handle_remove_node(int node)
-{
-	struct scif_dev *scifdev = &scif_dev[node];
-
-	if (scif_peer_unregister_device(scifdev))
-		scif_send_acks(scifdev);
-}
-
-static int scif_send_rmnode_msg(int node, int remove_node)
-{
-	struct scifmsg notif_msg;
-	struct scif_dev *dev = &scif_dev[node];
-
-	notif_msg.uop = SCIF_NODE_REMOVE;
-	notif_msg.src.node = scif_info.nodeid;
-	notif_msg.dst.node = node;
-	notif_msg.payload[0] = remove_node;
-	return scif_nodeqp_send(dev, &notif_msg);
-}
-
-/**
- * scif_disconnect_node
- *
- * @node_id: source node id [in]
- * @mgmt_initiated: Disconnection initiated from the mgmt node
- *
- * Disconnect a node from the scif network.
- */
-void scif_disconnect_node(u32 node_id, bool mgmt_initiated)
-{
-	int ret;
-	int msg_cnt = 0;
-	u32 i = 0;
-	struct scif_dev *scifdev = &scif_dev[node_id];
-
-	if (!node_id)
-		return;
-
-	atomic_set(&scifdev->disconn_rescnt, 0);
-
-	/* Destroy p2p network */
-	for (i = 1; i <= scif_info.maxid; i++) {
-		if (i == node_id)
-			continue;
-		ret = scif_send_rmnode_msg(i, node_id);
-		if (!ret)
-			msg_cnt++;
-	}
-	/* Wait for the remote nodes to respond with SCIF_NODE_REMOVE_ACK */
-	ret = wait_event_timeout(scifdev->disconn_wq,
-				 (atomic_read(&scifdev->disconn_rescnt)
-				 == msg_cnt), SCIF_NODE_ALIVE_TIMEOUT);
-	/* Tell the card to clean up */
-	if (mgmt_initiated && _scifdev_alive(scifdev))
-		/*
-		 * Send a SCIF_EXIT message which is the last message from Host
-		 * to the MIC and wait for a SCIF_EXIT_ACK
-		 */
-		scif_send_exit(scifdev);
-	atomic_set(&scifdev->disconn_rescnt, 0);
-	/* Tell the mgmt node to clean up */
-	ret = scif_send_rmnode_msg(SCIF_MGMT_NODE, node_id);
-	if (!ret)
-		/* Wait for mgmt node to respond with SCIF_NODE_REMOVE_ACK */
-		wait_event_timeout(scifdev->disconn_wq,
-				   (atomic_read(&scifdev->disconn_rescnt) == 1),
-				   SCIF_NODE_ALIVE_TIMEOUT);
-}
-
-void scif_get_node_info(void)
-{
-	struct scifmsg msg;
-	DECLARE_COMPLETION_ONSTACK(node_info);
-
-	msg.uop = SCIF_GET_NODE_INFO;
-	msg.src.node = scif_info.nodeid;
-	msg.dst.node = SCIF_MGMT_NODE;
-	msg.payload[3] = (u64)&node_info;
-
-	if ((scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg)))
-		return;
-
-	/* Wait for a response with SCIF_GET_NODE_INFO */
-	wait_for_completion(&node_info);
-}
diff --git a/drivers/misc/mic/scif/scif_nodeqp.c b/drivers/misc/mic/scif/scif_nodeqp.c
deleted file mode 100644
index 384ce08fa98a..000000000000
--- a/drivers/misc/mic/scif/scif_nodeqp.c
+++ /dev/null
@@ -1,1349 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Intel MIC Platform Software Stack (MPSS)
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Intel SCIF driver.
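A remark on scif_get_node_info() at the end of scif_nm.c above: it relies on a compact request/response idiom in which a pointer to an on-stack completion rides in payload[3]. Because the mgmt node echoes the payload back to the requesting node, the response handler (scif_get_node_info_resp(), later in this patch) can dereference it safely. The two halves, reduced to a sketch:

    /* Requester, mirroring scif_get_node_info() */
    DECLARE_COMPLETION_ONSTACK(node_info);

    msg.payload[3] = (u64)&node_info;
    if (!scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg))
            wait_for_completion(&node_info);

    /* Responder, mirroring scif_get_node_info_resp() */
    struct completion *node_info = (struct completion *)msg->payload[3];

    complete_all(node_info);

The pointer is only meaningful on the node that created it, which is safe here because the reply is always routed back to that node.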
- */ -#include "../bus/scif_bus.h" -#include "scif_peer_bus.h" -#include "scif_main.h" -#include "scif_nodeqp.h" -#include "scif_map.h" - -/* - ************************************************************************ - * SCIF node Queue Pair (QP) setup flow: - * - * 1) SCIF driver gets probed with a scif_hw_dev via the scif_hw_bus - * 2) scif_setup_qp(..) allocates the local qp and calls - * scif_setup_qp_connect(..) which allocates and maps the local - * buffer for the inbound QP - * 3) The local node updates the device page with the DMA address of the QP - * 4) A delayed work is scheduled (qp_dwork) which periodically reads if - * the peer node has updated its QP DMA address - * 5) Once a valid non zero address is found in the QP DMA address field - * in the device page, the local node maps the remote node's QP, - * updates its outbound QP and sends a SCIF_INIT message to the peer - * 6) The SCIF_INIT message is received by the peer node QP interrupt bottom - * half handler by calling scif_init(..) - * 7) scif_init(..) registers a new SCIF peer node by calling - * scif_peer_register_device(..) which signifies the addition of a new - * SCIF node - * 8) On the mgmt node, P2P network setup/teardown is initiated if all the - * remote nodes are online via scif_p2p_setup(..) - * 9) For P2P setup, the host maps the remote nodes' aperture and memory - * bars and sends a SCIF_NODE_ADD message to both nodes - * 10) As part of scif_nodeadd, both nodes set up their local inbound - * QPs and send a SCIF_NODE_ADD_ACK to the mgmt node - * 11) As part of scif_node_add_ack(..) the mgmt node forwards the - * SCIF_NODE_ADD_ACK to the remote nodes - * 12) As part of scif_node_add_ack(..) the remote nodes update their - * outbound QPs, make sure they can access memory on the remote node - * and then add a new SCIF peer node by calling - * scif_peer_register_device(..) which signifies the addition of a new - * SCIF node. - * 13) The SCIF network is now established across all nodes. - * - ************************************************************************ - * SCIF node QP teardown flow (initiated by non mgmt node): - * - * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus - * 2) The device page QP DMA address field is updated with 0x0 - * 3) A non mgmt node now cleans up all local data structures and sends a - * SCIF_EXIT message to the peer and waits for a SCIF_EXIT_ACK - * 4) As part of scif_exit(..) handling scif_disconnect_node(..) is called - * 5) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the - * peers and waits for a SCIF_NODE_REMOVE_ACK - * 6) As part of scif_node_remove(..) 
a remote node unregisters the peer
- *    node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK
- * 7) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs
- *    it sends itself a node remove message whose handling cleans up local
- *    data structures and unregisters the peer node from the SCIF network
- * 8) The mgmt node sends a SCIF_EXIT_ACK
- * 9) Upon receipt of the SCIF_EXIT_ACK the node initiating the teardown
- *    completes the SCIF remove routine
- * 10) The SCIF network is now torn down for the node initiating the
- *     teardown sequence
- *
- ************************************************************************
- * SCIF node QP teardown flow (initiated by mgmt node):
- *
- * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus
- * 2) The device page QP DMA address field is updated with 0x0
- * 3) The mgmt node calls scif_disconnect_node(..)
- * 4) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the peers
- *    and waits for a SCIF_NODE_REMOVE_ACK
- * 5) As part of scif_node_remove(..) a remote node unregisters the peer
- *    node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK
- * 6) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs
- *    it unregisters the peer node from the SCIF network
- * 7) The mgmt node sends a SCIF_EXIT message and waits for a SCIF_EXIT_ACK.
- * 8) A non mgmt node upon receipt of a SCIF_EXIT message calls scif_stop(..)
- *    which would clean up local data structures for all SCIF nodes and
- *    then send a SCIF_EXIT_ACK back to the mgmt node
- * 9) Upon receipt of the SCIF_EXIT_ACK the mgmt node sends itself a node
- *    remove message whose handling cleans up local data structures and
- *    destroys any P2P mappings.
- * 10) The SCIF hardware device for which a remove callback was received is now
- *     disconnected from the SCIF network.
- */
-/*
- * Initializes "local" data structures for the QP. Allocates the QP
- * ring buffer (rb) and initializes the "inbound" queue.
- */
-int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset,
-			  int local_size, struct scif_dev *scifdev)
-{
-	void *local_q = qp->inbound_q.rb_base;
-	int err = 0;
-	u32 tmp_rd = 0;
-
-	spin_lock_init(&qp->send_lock);
-	spin_lock_init(&qp->recv_lock);
-
-	/* Allocate rb only if not already allocated */
-	if (!local_q) {
-		local_q = kzalloc(local_size, GFP_KERNEL);
-		if (!local_q) {
-			err = -ENOMEM;
-			return err;
-		}
-	}
-
-	err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size);
-	if (err)
-		goto kfree;
-	/*
-	 * To setup the inbound_q, the buffer lives locally, the read pointer
-	 * is remote and the write pointer is local.
-	 */
-	scif_rb_init(&qp->inbound_q,
-		     &tmp_rd,
-		     &qp->local_write,
-		     local_q, get_count_order(local_size));
-	/*
-	 * The read pointer is NULL initially and it is unsafe to use the ring
-	 * buffer until this changes!
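Steps 4) and 5) of the setup flow above amount to polling the device page until the peer publishes a non-zero QP DMA address. A hypothetical sketch of such a poll; the device-page accessor demo_device_page() and its qp_dma_addr field are assumed names, not taken from this patch:

    static void demo_qp_dwork(struct work_struct *work)
    {
            struct scif_dev *scifdev = container_of(to_delayed_work(work),
                                                    struct scif_dev, qp_dwork);
            dma_addr_t remote_qp =
                    READ_ONCE(demo_device_page(scifdev)->qp_dma_addr);

            if (!remote_qp) {
                    /* Peer not ready; keep polling (step 4). */
                    schedule_delayed_work(&scifdev->qp_dwork,
                                          msecs_to_jiffies(100));
                    return;
            }
            /* Step 5: map the peer's QP and send SCIF_INIT. */
    }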
- */
-	qp->inbound_q.read_ptr = NULL;
-	err = scif_map_single(qp_offset, qp,
-			      scifdev, sizeof(struct scif_qp));
-	if (err)
-		goto unmap;
-	qp->local_qp = *qp_offset;
-	return err;
-unmap:
-	scif_unmap_single(qp->local_buf, scifdev, local_size);
-	qp->local_buf = 0;
-kfree:
-	kfree(local_q);
-	return err;
-}
-
-/* When the other side has already done its allocation, this is called */
-int scif_setup_qp_accept(struct scif_qp *qp, dma_addr_t *qp_offset,
-			 dma_addr_t phys, int local_size,
-			 struct scif_dev *scifdev)
-{
-	void *local_q;
-	void *remote_q;
-	struct scif_qp *remote_qp;
-	int remote_size;
-	int err = 0;
-
-	spin_lock_init(&qp->send_lock);
-	spin_lock_init(&qp->recv_lock);
-	/* Start by figuring out where we need to point */
-	remote_qp = scif_ioremap(phys, sizeof(struct scif_qp), scifdev);
-	if (!remote_qp)
-		return -EIO;
-	qp->remote_qp = remote_qp;
-	if (qp->remote_qp->magic != SCIFEP_MAGIC) {
-		err = -EIO;
-		goto iounmap;
-	}
-	qp->remote_buf = remote_qp->local_buf;
-	remote_size = qp->remote_qp->inbound_q.size;
-	remote_q = scif_ioremap(qp->remote_buf, remote_size, scifdev);
-	if (!remote_q) {
-		err = -EIO;
-		goto iounmap;
-	}
-	qp->remote_qp->local_write = 0;
-	/*
-	 * To setup the outbound_q, the buffer lives in remote memory,
-	 * the read pointer is local, the write pointer is remote
-	 */
-	scif_rb_init(&qp->outbound_q,
-		     &qp->local_read,
-		     &qp->remote_qp->local_write,
-		     remote_q,
-		     get_count_order(remote_size));
-	local_q = kzalloc(local_size, GFP_KERNEL);
-	if (!local_q) {
-		err = -ENOMEM;
-		goto iounmap_1;
-	}
-	err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size);
-	if (err)
-		goto kfree;
-	qp->remote_qp->local_read = 0;
-	/*
-	 * To setup the inbound_q, the buffer lives locally, the read pointer
-	 * is remote and the write pointer is local
-	 */
-	scif_rb_init(&qp->inbound_q,
-		     &qp->remote_qp->local_read,
-		     &qp->local_write,
-		     local_q, get_count_order(local_size));
-	err = scif_map_single(qp_offset, qp, scifdev,
-			      sizeof(struct scif_qp));
-	if (err)
-		goto unmap;
-	qp->local_qp = *qp_offset;
-	return err;
-unmap:
-	scif_unmap_single(qp->local_buf, scifdev, local_size);
-	qp->local_buf = 0;
-kfree:
-	kfree(local_q);
-iounmap_1:
-	scif_iounmap(remote_q, remote_size, scifdev);
-	qp->outbound_q.rb_base = NULL;
-iounmap:
-	scif_iounmap(qp->remote_qp, sizeof(struct scif_qp), scifdev);
-	qp->remote_qp = NULL;
-	return err;
-}
-
-int scif_setup_qp_connect_response(struct scif_dev *scifdev,
-				   struct scif_qp *qp, u64 payload)
-{
-	int err = 0;
-	void *r_buf;
-	int remote_size;
-	phys_addr_t tmp_phys;
-
-	qp->remote_qp = scif_ioremap(payload, sizeof(struct scif_qp), scifdev);
-
-	if (!qp->remote_qp) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	if (qp->remote_qp->magic != SCIFEP_MAGIC) {
-		dev_err(&scifdev->sdev->dev,
-			"SCIFEP_MAGIC mismatch between self %d remote %d\n",
-			scif_dev[scif_info.nodeid].node, scifdev->node);
-		err = -ENODEV;
-		goto error;
-	}
-
-	tmp_phys = qp->remote_qp->local_buf;
-	remote_size = qp->remote_qp->inbound_q.size;
-	r_buf = scif_ioremap(tmp_phys, remote_size, scifdev);
-
-	if (!r_buf)
-		return -EIO;
-
-	qp->local_read = 0;
-	scif_rb_init(&qp->outbound_q,
-		     &qp->local_read,
-		     &qp->remote_qp->local_write,
-		     r_buf,
-		     get_count_order(remote_size));
-	/*
-	 * Because the node QP may already be processing an INIT message, set
-	 * the read pointer so the cached read offset isn't lost
-	 */
-	qp->remote_qp->local_read = qp->inbound_q.current_read_offset;
-	/*
-	 * Set up the inbound_q again now that we know where the
-	 * inbound_read really is.
- */ - scif_rb_init(&qp->inbound_q, - &qp->remote_qp->local_read, - &qp->local_write, - qp->inbound_q.rb_base, - get_count_order(qp->inbound_q.size)); -error: - return err; -} - -static __always_inline void -scif_send_msg_intr(struct scif_dev *scifdev) -{ - struct scif_hw_dev *sdev = scifdev->sdev; - - if (scifdev_is_p2p(scifdev)) - sdev->hw_ops->send_p2p_intr(sdev, scifdev->rdb, &scifdev->mmio); - else - sdev->hw_ops->send_intr(sdev, scifdev->rdb); -} - -int scif_qp_response(phys_addr_t phys, struct scif_dev *scifdev) -{ - int err = 0; - struct scifmsg msg; - - err = scif_setup_qp_connect_response(scifdev, scifdev->qpairs, phys); - if (!err) { - /* - * Now that everything is setup and mapped, we're ready - * to tell the peer about our queue's location - */ - msg.uop = SCIF_INIT; - msg.dst.node = scifdev->node; - err = scif_nodeqp_send(scifdev, &msg); - } - return err; -} - -void scif_send_exit(struct scif_dev *scifdev) -{ - struct scifmsg msg; - int ret; - - scifdev->exit = OP_IN_PROGRESS; - msg.uop = SCIF_EXIT; - msg.src.node = scif_info.nodeid; - msg.dst.node = scifdev->node; - ret = scif_nodeqp_send(scifdev, &msg); - if (ret) - goto done; - /* Wait for a SCIF_EXIT_ACK message */ - wait_event_timeout(scif_info.exitwq, scifdev->exit == OP_COMPLETED, - SCIF_NODE_ALIVE_TIMEOUT); -done: - scifdev->exit = OP_IDLE; -} - -int scif_setup_qp(struct scif_dev *scifdev) -{ - int err = 0; - int local_size; - struct scif_qp *qp; - - local_size = SCIF_NODE_QP_SIZE; - - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) { - err = -ENOMEM; - return err; - } - qp->magic = SCIFEP_MAGIC; - scifdev->qpairs = qp; - err = scif_setup_qp_connect(qp, &scifdev->qp_dma_addr, - local_size, scifdev); - if (err) - goto free_qp; - /* - * We're as setup as we can be. The inbound_q is setup, w/o a usable - * outbound q. When we get a message, the read_ptr will be updated, - * and we will pull the message. 
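scif_send_exit() above is a flag-plus-waitqueue handshake: mark the exit in progress, send SCIF_EXIT, and sleep with a bound until the SCIF_EXIT_ACK handler (scif_exit_ack(), later in this file) flips the state and wakes the queue. The pairing, side by side:

    /* Sender (scif_send_exit) */
    scifdev->exit = OP_IN_PROGRESS;
    if (!scif_nodeqp_send(scifdev, &msg))
            wait_event_timeout(scif_info.exitwq,
                               scifdev->exit == OP_COMPLETED,
                               SCIF_NODE_ALIVE_TIMEOUT);
    scifdev->exit = OP_IDLE;

    /* ACK handler (scif_exit_ack) */
    scifdev->exit = OP_COMPLETED;
    wake_up(&scif_info.exitwq);

The timeout matters here: a dead peer must not be able to stall teardown indefinitely.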
- */ - return err; -free_qp: - kfree(scifdev->qpairs); - scifdev->qpairs = NULL; - return err; -} - -static void scif_p2p_freesg(struct scatterlist *sg) -{ - kfree(sg); -} - -static struct scatterlist * -scif_p2p_setsg(phys_addr_t pa, int page_size, int page_cnt) -{ - struct scatterlist *sg; - struct page *page; - int i; - - sg = kmalloc_array(page_cnt, sizeof(struct scatterlist), GFP_KERNEL); - if (!sg) - return NULL; - sg_init_table(sg, page_cnt); - for (i = 0; i < page_cnt; i++) { - page = pfn_to_page(pa >> PAGE_SHIFT); - sg_set_page(&sg[i], page, page_size, 0); - pa += page_size; - } - return sg; -} - -/* Init p2p mappings required to access peerdev from scifdev */ -static struct scif_p2p_info * -scif_init_p2p_info(struct scif_dev *scifdev, struct scif_dev *peerdev) -{ - struct scif_p2p_info *p2p; - int num_mmio_pages, num_aper_pages, sg_page_shift, err, num_aper_chunks; - struct scif_hw_dev *psdev = peerdev->sdev; - struct scif_hw_dev *sdev = scifdev->sdev; - - num_mmio_pages = psdev->mmio->len >> PAGE_SHIFT; - num_aper_pages = psdev->aper->len >> PAGE_SHIFT; - - p2p = kzalloc(sizeof(*p2p), GFP_KERNEL); - if (!p2p) - return NULL; - p2p->ppi_sg[SCIF_PPI_MMIO] = scif_p2p_setsg(psdev->mmio->pa, - PAGE_SIZE, num_mmio_pages); - if (!p2p->ppi_sg[SCIF_PPI_MMIO]) - goto free_p2p; - p2p->sg_nentries[SCIF_PPI_MMIO] = num_mmio_pages; - sg_page_shift = get_order(min(psdev->aper->len, (u64)(1 << 30))); - num_aper_chunks = num_aper_pages >> (sg_page_shift - PAGE_SHIFT); - p2p->ppi_sg[SCIF_PPI_APER] = scif_p2p_setsg(psdev->aper->pa, - 1 << sg_page_shift, - num_aper_chunks); - p2p->sg_nentries[SCIF_PPI_APER] = num_aper_chunks; - err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], - num_mmio_pages, PCI_DMA_BIDIRECTIONAL); - if (err != num_mmio_pages) - goto scif_p2p_free; - err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER], - num_aper_chunks, PCI_DMA_BIDIRECTIONAL); - if (err != num_aper_chunks) - goto dma_unmap; - p2p->ppi_da[SCIF_PPI_MMIO] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_MMIO]); - p2p->ppi_da[SCIF_PPI_APER] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_APER]); - p2p->ppi_len[SCIF_PPI_MMIO] = num_mmio_pages; - p2p->ppi_len[SCIF_PPI_APER] = num_aper_pages; - p2p->ppi_peer_id = peerdev->node; - return p2p; -dma_unmap: - dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], - p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL); -scif_p2p_free: - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); -free_p2p: - kfree(p2p); - return NULL; -} - -/* Uninitialize and release resources from a p2p mapping */ -static void scif_deinit_p2p_info(struct scif_dev *scifdev, - struct scif_p2p_info *p2p) -{ - struct scif_hw_dev *sdev = scifdev->sdev; - - dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], - p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL); - dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER], - p2p->sg_nentries[SCIF_PPI_APER], DMA_BIDIRECTIONAL); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); - kfree(p2p); -} - -/** - * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message - * @scifdev: SCIF device - * @dst: Destination node - * - * Connect the src and dst node by setting up the p2p connection - * between them. Management node here acts like a proxy. 
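scif_init_p2p_info() above follows the standard scatterlist flow: build an sg table over the peer's MMIO and aperture ranges, DMA-map it, and treat a short return from dma_map_sg() as failure. The skeleton, separated from the SCIF specifics (the demo_ name is illustrative):

    static int demo_map_region(struct device *dev, struct scatterlist *sg,
                               phys_addr_t pa, int nents)
    {
            int i, mapped;

            sg_init_table(sg, nents);
            for (i = 0; i < nents; i++, pa += PAGE_SIZE)
                    sg_set_page(&sg[i], pfn_to_page(pa >> PAGE_SHIFT),
                                PAGE_SIZE, 0);
            mapped = dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
            /* Anything short of a full mapping counts as failure here. */
            return mapped == nents ? 0 : -ENOMEM;
    }

The aperture is chunked with a larger sg_page_shift (up to 1 GiB per entry) purely to keep the scatterlist small.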
- */ -static void scif_node_connect(struct scif_dev *scifdev, int dst) -{ - struct scif_dev *dev_j = scifdev; - struct scif_dev *dev_i = NULL; - struct scif_p2p_info *p2p_ij = NULL; /* bus addr for j from i */ - struct scif_p2p_info *p2p_ji = NULL; /* bus addr for i from j */ - struct scif_p2p_info *p2p; - struct list_head *pos, *tmp; - struct scifmsg msg; - int err; - u64 tmppayload; - - if (dst < 1 || dst > scif_info.maxid) - return; - - dev_i = &scif_dev[dst]; - - if (!_scifdev_alive(dev_i)) - return; - /* - * If the p2p connection is already setup or in the process of setting - * up then just ignore this request. The requested node will get - * informed by SCIF_NODE_ADD_ACK or SCIF_NODE_ADD_NACK - */ - if (!list_empty(&dev_i->p2p)) { - list_for_each_safe(pos, tmp, &dev_i->p2p) { - p2p = list_entry(pos, struct scif_p2p_info, ppi_list); - if (p2p->ppi_peer_id == dev_j->node) - return; - } - } - p2p_ij = scif_init_p2p_info(dev_i, dev_j); - if (!p2p_ij) - return; - p2p_ji = scif_init_p2p_info(dev_j, dev_i); - if (!p2p_ji) { - scif_deinit_p2p_info(dev_i, p2p_ij); - return; - } - list_add_tail(&p2p_ij->ppi_list, &dev_i->p2p); - list_add_tail(&p2p_ji->ppi_list, &dev_j->p2p); - - /* - * Send a SCIF_NODE_ADD to dev_i, pass it its bus address - * as seen from dev_j - */ - msg.uop = SCIF_NODE_ADD; - msg.src.node = dev_j->node; - msg.dst.node = dev_i->node; - - msg.payload[0] = p2p_ji->ppi_da[SCIF_PPI_APER]; - msg.payload[1] = p2p_ij->ppi_da[SCIF_PPI_MMIO]; - msg.payload[2] = p2p_ij->ppi_da[SCIF_PPI_APER]; - msg.payload[3] = p2p_ij->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT; - - err = scif_nodeqp_send(dev_i, &msg); - if (err) { - dev_err(&scifdev->sdev->dev, - "%s %d error %d\n", __func__, __LINE__, err); - return; - } - - /* Same as above but to dev_j */ - msg.uop = SCIF_NODE_ADD; - msg.src.node = dev_i->node; - msg.dst.node = dev_j->node; - - tmppayload = msg.payload[0]; - msg.payload[0] = msg.payload[2]; - msg.payload[2] = tmppayload; - msg.payload[1] = p2p_ji->ppi_da[SCIF_PPI_MMIO]; - msg.payload[3] = p2p_ji->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT; - - scif_nodeqp_send(dev_j, &msg); -} - -static void scif_p2p_setup(void) -{ - int i, j; - - if (!scif_info.p2p_enable) - return; - - for (i = 1; i <= scif_info.maxid; i++) - if (!_scifdev_alive(&scif_dev[i])) - return; - - for (i = 1; i <= scif_info.maxid; i++) { - for (j = 1; j <= scif_info.maxid; j++) { - struct scif_dev *scifdev = &scif_dev[i]; - - if (i == j) - continue; - scif_node_connect(scifdev, j); - } - } -} - -static char *message_types[] = {"BAD", - "INIT", - "EXIT", - "SCIF_EXIT_ACK", - "SCIF_NODE_ADD", - "SCIF_NODE_ADD_ACK", - "SCIF_NODE_ADD_NACK", - "REMOVE_NODE", - "REMOVE_NODE_ACK", - "CNCT_REQ", - "CNCT_GNT", - "CNCT_GNTACK", - "CNCT_GNTNACK", - "CNCT_REJ", - "DISCNCT", - "DISCNT_ACK", - "CLIENT_SENT", - "CLIENT_RCVD", - "SCIF_GET_NODE_INFO", - "REGISTER", - "REGISTER_ACK", - "REGISTER_NACK", - "UNREGISTER", - "UNREGISTER_ACK", - "UNREGISTER_NACK", - "ALLOC_REQ", - "ALLOC_GNT", - "ALLOC_REJ", - "FREE_PHYS", - "FREE_VIRT", - "MUNMAP", - "MARK", - "MARK_ACK", - "MARK_NACK", - "WAIT", - "WAIT_ACK", - "WAIT_NACK", - "SIGNAL_LOCAL", - "SIGNAL_REMOTE", - "SIG_ACK", - "SIG_NACK"}; - -static void -scif_display_message(struct scif_dev *scifdev, struct scifmsg *msg, - const char *label) -{ - if (!scif_info.en_msg_log) - return; - if (msg->uop > SCIF_MAX_MSG) { - dev_err(&scifdev->sdev->dev, - "%s: unknown msg type %d\n", label, msg->uop); - return; - } - dev_info(&scifdev->sdev->dev, - "%s: msg type %s, src %d:%d, dest %d:%d payload 
0x%llx:0x%llx:0x%llx:0x%llx\n", - label, message_types[msg->uop], msg->src.node, msg->src.port, - msg->dst.node, msg->dst.port, msg->payload[0], msg->payload[1], - msg->payload[2], msg->payload[3]); -} - -int _scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_qp *qp = scifdev->qpairs; - int err = -ENOMEM, loop_cnt = 0; - - scif_display_message(scifdev, msg, "Sent"); - if (!qp) { - err = -EINVAL; - goto error; - } - spin_lock(&qp->send_lock); - - while ((err = scif_rb_write(&qp->outbound_q, - msg, sizeof(struct scifmsg)))) { - mdelay(1); -#define SCIF_NODEQP_SEND_TO_MSEC (3 * 1000) - if (loop_cnt++ > (SCIF_NODEQP_SEND_TO_MSEC)) { - err = -ENODEV; - break; - } - } - if (!err) - scif_rb_commit(&qp->outbound_q); - spin_unlock(&qp->send_lock); - if (!err) { - if (scifdev_self(scifdev)) - /* - * For loopback we need to emulate an interrupt by - * queuing work for the queue handling real node - * Qp interrupts. - */ - queue_work(scifdev->intr_wq, &scifdev->intr_bh); - else - scif_send_msg_intr(scifdev); - } -error: - if (err) - dev_dbg(&scifdev->sdev->dev, - "%s %d error %d uop %d\n", - __func__, __LINE__, err, msg->uop); - return err; -} - -/** - * scif_nodeqp_send - Send a message on the node queue pair - * @scifdev: Scif Device. - * @msg: The message to be sent. - */ -int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg) -{ - int err; - struct device *spdev = NULL; - - if (msg->uop > SCIF_EXIT_ACK) { - /* Don't send messages once the exit flow has begun */ - if (OP_IDLE != scifdev->exit) - return -ENODEV; - spdev = scif_get_peer_dev(scifdev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - return err; - } - } - err = _scif_nodeqp_send(scifdev, msg); - if (msg->uop > SCIF_EXIT_ACK) - scif_put_peer_dev(spdev); - return err; -} - -/* - * scif_misc_handler: - * - * Work queue handler for servicing miscellaneous SCIF tasks. - * Examples include: - * 1) Remote fence requests. - * 2) Destruction of temporary registered windows - * created during scif_vreadfrom()/scif_vwriteto(). - * 3) Cleanup of zombie endpoints. - */ -void scif_misc_handler(struct work_struct *work) -{ - scif_rma_handle_remote_fences(); - scif_rma_destroy_windows(); - scif_rma_destroy_tcw_invalid(); - scif_cleanup_zombie_epd(); -} - -/** - * scif_init() - Respond to SCIF_INIT interrupt message - * @scifdev: Remote SCIF device node - * @msg: Interrupt message - */ -static __always_inline void -scif_init(struct scif_dev *scifdev, struct scifmsg *msg) -{ - /* - * Allow the thread waiting for device page updates for the peer QP DMA - * address to complete initializing the inbound_q. - */ - flush_delayed_work(&scifdev->qp_dwork); - - scif_peer_register_device(scifdev); - - if (scif_is_mgmt_node()) { - mutex_lock(&scif_info.conflock); - scif_p2p_setup(); - mutex_unlock(&scif_info.conflock); - } -} - -/** - * scif_exit() - Respond to SCIF_EXIT interrupt message - * @scifdev: Remote SCIF device node - * @unused: Interrupt message (unused) - * - * This function stops the SCIF interface for the node which sent - * the SCIF_EXIT message and starts waiting for that node to - * resetup the queue pair again. 
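_scif_nodeqp_send() above bounds its busy-wait: each failed scif_rb_write() costs one mdelay(1), so SCIF_NODEQP_SEND_TO_MSEC iterations put roughly a three second ceiling on a full ring before the peer is declared dead. The core of the loop, isolated for clarity:

    while ((err = scif_rb_write(&qp->outbound_q,
                                msg, sizeof(struct scifmsg)))) {
            mdelay(1);                      /* ring full; let the peer drain */
            if (loop_cnt++ > SCIF_NODEQP_SEND_TO_MSEC) {
                    err = -ENODEV;          /* peer presumed dead */
                    break;
            }
    }
    if (!err)
            scif_rb_commit(&qp->outbound_q);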
- */
-static __always_inline void
-scif_exit(struct scif_dev *scifdev, struct scifmsg *unused)
-{
-	scifdev->exit_ack_pending = true;
-	if (scif_is_mgmt_node())
-		scif_disconnect_node(scifdev->node, false);
-	else
-		scif_stop(scifdev);
-	schedule_delayed_work(&scifdev->qp_dwork,
-			      msecs_to_jiffies(1000));
-}
-
-/**
- * scif_exit_ack() - Respond to SCIF_EXIT_ACK interrupt message
- * @scifdev: Remote SCIF device node
- * @unused: Interrupt message (unused)
- *
- */
-static __always_inline void
-scif_exit_ack(struct scif_dev *scifdev, struct scifmsg *unused)
-{
-	scifdev->exit = OP_COMPLETED;
-	wake_up(&scif_info.exitwq);
-}
-
-/**
- * scif_node_add() - Respond to SCIF_NODE_ADD interrupt message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * When the mgmt node driver has finished initializing a MIC node queue pair it
- * marks the node as online. It then looks for all currently online MIC cards
- * and sends a SCIF_NODE_ADD message to identify the ID of the new card for
- * peer to peer initialization
- *
- * The local node allocates its incoming queue and sends its address in the
- * SCIF_NODE_ADD_ACK message back to the mgmt node, the mgmt node "reflects"
- * this message to the new node
- */
-static __always_inline void
-scif_node_add(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	struct scif_dev *newdev;
-	dma_addr_t qp_offset;
-	int qp_connect;
-	struct scif_hw_dev *sdev;
-
-	dev_dbg(&scifdev->sdev->dev,
-		"Scifdev %d:%d received NODE_ADD msg for node %d\n",
-		scifdev->node, msg->dst.node, msg->src.node);
-	dev_dbg(&scifdev->sdev->dev,
-		"Remote address for this node's aperture %llx\n",
-		msg->payload[0]);
-	newdev = &scif_dev[msg->src.node];
-	newdev->node = msg->src.node;
-	newdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev;
-	sdev = newdev->sdev;
-
-	if (scif_setup_intr_wq(newdev)) {
-		dev_err(&scifdev->sdev->dev,
-			"failed to setup interrupts for %d\n", msg->src.node);
-		goto interrupt_setup_error;
-	}
-	newdev->mmio.va = ioremap(msg->payload[1], sdev->mmio->len);
-	if (!newdev->mmio.va) {
-		dev_err(&scifdev->sdev->dev,
-			"failed to map mmio for %d\n", msg->src.node);
-		goto mmio_map_error;
-	}
-	newdev->qpairs = kzalloc(sizeof(*newdev->qpairs), GFP_KERNEL);
-	if (!newdev->qpairs)
-		goto qp_alloc_error;
-	/*
-	 * Set the base address of the remote node's memory since it gets
-	 * added to qp_offset
-	 */
-	newdev->base_addr = msg->payload[0];
-
-	qp_connect = scif_setup_qp_connect(newdev->qpairs, &qp_offset,
-					   SCIF_NODE_QP_SIZE, newdev);
-	if (qp_connect) {
-		dev_err(&scifdev->sdev->dev,
-			"failed to setup qp_connect %d\n", qp_connect);
-		goto qp_connect_error;
-	}
-
-	newdev->db = sdev->hw_ops->next_db(sdev);
-	newdev->cookie = sdev->hw_ops->request_irq(sdev, scif_intr_handler,
-						   "SCIF_INTR", newdev,
-						   newdev->db);
-	if (IS_ERR(newdev->cookie))
-		goto qp_connect_error;
-	newdev->qpairs->magic = SCIFEP_MAGIC;
-	newdev->qpairs->qp_state = SCIF_QP_OFFLINE;
-
-	msg->uop = SCIF_NODE_ADD_ACK;
-	msg->dst.node = msg->src.node;
-	msg->src.node = scif_info.nodeid;
-	msg->payload[0] = qp_offset;
-	msg->payload[2] = newdev->db;
-	scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg);
-	return;
-qp_connect_error:
-	kfree(newdev->qpairs);
-	newdev->qpairs = NULL;
-qp_alloc_error:
-	iounmap(newdev->mmio.va);
-	newdev->mmio.va = NULL;
-mmio_map_error:
-interrupt_setup_error:
-	dev_err(&scifdev->sdev->dev,
-		"node add failed for node %d\n", msg->src.node);
-	msg->uop = SCIF_NODE_ADD_NACK;
-	msg->dst.node = msg->src.node;
-	msg->src.node = scif_info.nodeid;
-	scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg);
-}
-
-void scif_poll_qp_state(struct work_struct *work)
-{
-#define SCIF_NODE_QP_RETRY 100
-#define SCIF_NODE_QP_TIMEOUT 100
-	struct scif_dev *peerdev = container_of(work, struct scif_dev,
-						p2p_dwork.work);
-	struct scif_qp *qp = &peerdev->qpairs[0];
-
-	if (qp->qp_state != SCIF_QP_ONLINE ||
-	    qp->remote_qp->qp_state != SCIF_QP_ONLINE) {
-		if (peerdev->p2p_retry++ == SCIF_NODE_QP_RETRY) {
-			dev_err(&peerdev->sdev->dev,
-				"Warning: QP check timeout with state %d\n",
-				qp->qp_state);
-			goto timeout;
-		}
-		schedule_delayed_work(&peerdev->p2p_dwork,
-				      msecs_to_jiffies(SCIF_NODE_QP_TIMEOUT));
-		return;
-	}
-	return;
-timeout:
-	dev_err(&peerdev->sdev->dev,
-		"%s %d remote node %d offline, state = 0x%x\n",
-		__func__, __LINE__, peerdev->node, qp->qp_state);
-	qp->remote_qp->qp_state = SCIF_QP_OFFLINE;
-	scif_peer_unregister_device(peerdev);
-	scif_cleanup_scifdev(peerdev);
-}
-
-/**
- * scif_node_add_ack() - Respond to SCIF_NODE_ADD_ACK interrupt message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * After a MIC node receives the SCIF_NODE_ADD_ACK message it sends this
- * message to the mgmt node to confirm the sequence is finished.
- *
- */
-static __always_inline void
-scif_node_add_ack(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	struct scif_dev *peerdev;
-	struct scif_qp *qp;
-	struct scif_dev *dst_dev = &scif_dev[msg->dst.node];
-
-	dev_dbg(&scifdev->sdev->dev,
-		"Scifdev %d received SCIF_NODE_ADD_ACK msg src %d dst %d\n",
-		scifdev->node, msg->src.node, msg->dst.node);
-	dev_dbg(&scifdev->sdev->dev,
-		"payload %llx %llx %llx %llx\n", msg->payload[0],
-		msg->payload[1], msg->payload[2], msg->payload[3]);
-	if (scif_is_mgmt_node()) {
-		/*
-		 * the lock serializes with scif_qp_response_ack. The mgmt node
-		 * is forwarding the NODE_ADD_ACK message from src to dst, so we
-		 * need to make sure that the dst has already received a
-		 * NODE_ADD for src and set up its end of the qp to dst
-		 */
-		mutex_lock(&scif_info.conflock);
-		msg->payload[1] = scif_info.maxid;
-		scif_nodeqp_send(dst_dev, msg);
-		mutex_unlock(&scif_info.conflock);
-		return;
-	}
-	peerdev = &scif_dev[msg->src.node];
-	peerdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev;
-	peerdev->node = msg->src.node;
-
-	qp = &peerdev->qpairs[0];
-
-	if ((scif_setup_qp_connect_response(peerdev, &peerdev->qpairs[0],
-					    msg->payload[0])))
-		goto local_error;
-	peerdev->rdb = msg->payload[2];
-	qp->remote_qp->qp_state = SCIF_QP_ONLINE;
-
-	scif_peer_register_device(peerdev);
-
-	schedule_delayed_work(&peerdev->p2p_dwork, 0);
-	return;
-local_error:
-	scif_cleanup_scifdev(peerdev);
-}
-
-/**
- * scif_node_add_nack: Respond to SCIF_NODE_ADD_NACK interrupt message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * SCIF_NODE_ADD failed, so inform the waiting wq.
- */
-static __always_inline void
-scif_node_add_nack(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	if (scif_is_mgmt_node()) {
-		struct scif_dev *dst_dev = &scif_dev[msg->dst.node];
-
-		dev_dbg(&scifdev->sdev->dev,
-			"SCIF_NODE_ADD_NACK received from %d\n", scifdev->node);
-		scif_nodeqp_send(dst_dev, msg);
-	}
-}
-
-/**
- * scif_node_remove: Handle SCIF_NODE_REMOVE message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * Handle node removal.
- */
-static __always_inline void
-scif_node_remove(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	int node = msg->payload[0];
-	struct scif_dev *scdev = &scif_dev[node];
-
-	scdev->node_remove_ack_pending = true;
-	scif_handle_remove_node(node);
-}
-
-/**
- * scif_node_remove_ack: Handle SCIF_NODE_REMOVE_ACK message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * The peer has acked a SCIF_NODE_REMOVE message.
- */
-static __always_inline void
-scif_node_remove_ack(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	struct scif_dev *sdev = &scif_dev[msg->payload[0]];
-
-	atomic_inc(&sdev->disconn_rescnt);
-	wake_up(&sdev->disconn_wq);
-}
-
-/**
- * scif_get_node_info_resp: Respond to SCIF_GET_NODE_INFO interrupt message
- * @scifdev: Remote SCIF device node
- * @msg: Interrupt message
- *
- * Retrieve node info, i.e. maxid and total, from the mgmt node.
- */
-static __always_inline void
-scif_get_node_info_resp(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	if (scif_is_mgmt_node()) {
-		swap(msg->dst.node, msg->src.node);
-		mutex_lock(&scif_info.conflock);
-		msg->payload[1] = scif_info.maxid;
-		msg->payload[2] = scif_info.total;
-		mutex_unlock(&scif_info.conflock);
-		scif_nodeqp_send(scifdev, msg);
-	} else {
-		struct completion *node_info =
-			(struct completion *)msg->payload[3];
-
-		mutex_lock(&scif_info.conflock);
-		scif_info.maxid = msg->payload[1];
-		scif_info.total = msg->payload[2];
-		complete_all(node_info);
-		mutex_unlock(&scif_info.conflock);
-	}
-}
-
-static void
-scif_msg_unknown(struct scif_dev *scifdev, struct scifmsg *msg)
-{
-	/* Bogus Node Qp Message? */
-	dev_err(&scifdev->sdev->dev,
-		"Unknown message 0x%x scifdev->node 0x%x\n",
-		msg->uop, scifdev->node);
-}
-
-static void (*scif_intr_func[SCIF_MAX_MSG + 1])
-	    (struct scif_dev *, struct scifmsg *msg) = {
-	scif_msg_unknown,	/* Error */
-	scif_init,		/* SCIF_INIT */
-	scif_exit,		/* SCIF_EXIT */
-	scif_exit_ack,		/* SCIF_EXIT_ACK */
-	scif_node_add,		/* SCIF_NODE_ADD */
-	scif_node_add_ack,	/* SCIF_NODE_ADD_ACK */
-	scif_node_add_nack,	/* SCIF_NODE_ADD_NACK */
-	scif_node_remove,	/* SCIF_NODE_REMOVE */
-	scif_node_remove_ack,	/* SCIF_NODE_REMOVE_ACK */
-	scif_cnctreq,		/* SCIF_CNCT_REQ */
-	scif_cnctgnt,		/* SCIF_CNCT_GNT */
-	scif_cnctgnt_ack,	/* SCIF_CNCT_GNTACK */
-	scif_cnctgnt_nack,	/* SCIF_CNCT_GNTNACK */
-	scif_cnctrej,		/* SCIF_CNCT_REJ */
-	scif_discnct,		/* SCIF_DISCNCT */
-	scif_discnt_ack,	/* SCIF_DISCNT_ACK */
-	scif_clientsend,	/* SCIF_CLIENT_SENT */
-	scif_clientrcvd,	/* SCIF_CLIENT_RCVD */
-	scif_get_node_info_resp,/* SCIF_GET_NODE_INFO */
-	scif_recv_reg,		/* SCIF_REGISTER */
-	scif_recv_reg_ack,	/* SCIF_REGISTER_ACK */
-	scif_recv_reg_nack,	/* SCIF_REGISTER_NACK */
-	scif_recv_unreg,	/* SCIF_UNREGISTER */
-	scif_recv_unreg_ack,	/* SCIF_UNREGISTER_ACK */
-	scif_recv_unreg_nack,	/* SCIF_UNREGISTER_NACK */
-	scif_alloc_req,		/* SCIF_ALLOC_REQ */
-	scif_alloc_gnt_rej,	/* SCIF_ALLOC_GNT */
-	scif_alloc_gnt_rej,	/* SCIF_ALLOC_REJ */
-	scif_free_virt,		/* SCIF_FREE_VIRT */
-	scif_recv_munmap,	/* SCIF_MUNMAP */
-	scif_recv_mark,		/* SCIF_MARK */
-	scif_recv_mark_resp,	/* SCIF_MARK_ACK */
-	scif_recv_mark_resp,	/* SCIF_MARK_NACK */
-	scif_recv_wait,		/* SCIF_WAIT */
-	scif_recv_wait_resp,	/* SCIF_WAIT_ACK */
-	scif_recv_wait_resp,	/* SCIF_WAIT_NACK */
-	scif_recv_sig_local,	/* SCIF_SIG_LOCAL */
-	scif_recv_sig_remote,	/* SCIF_SIG_REMOTE */
-	scif_recv_sig_resp,	/* SCIF_SIG_ACK */
-	scif_recv_sig_resp,	/* SCIF_SIG_NACK */
-};
-
-static int scif_max_msg_id = SCIF_MAX_MSG;
-/**
- * scif_nodeqp_msg_handler() - Common handler for node messages
- * @scifdev: Remote device to respond to
- * @qp: Remote memory pointer
- * @msg: The message to be handled.
- *
- * This routine calls the appropriate routine to handle a Node Qp
- * message receipt
- */
-static void
-scif_nodeqp_msg_handler(struct scif_dev *scifdev,
-			struct scif_qp *qp, struct scifmsg *msg)
-{
-	scif_display_message(scifdev, msg, "Rcvd");
-
-	if (msg->uop > (u32)scif_max_msg_id) {
-		/* Bogus Node Qp Message? */
-		dev_err(&scifdev->sdev->dev,
-			"Unknown message 0x%x scifdev->node 0x%x\n",
-			msg->uop, scifdev->node);
-		return;
-	}
-
-	scif_intr_func[msg->uop](scifdev, msg);
-}
-
-/**
- * scif_nodeqp_intrhandler() - Interrupt handler for node messages
- * @scifdev: Remote device to respond to
- * @qp: Remote memory pointer
- *
- * This routine is triggered by the interrupt mechanism. It reads
- * messages from the node queue RB and calls the Node QP Message handling
- * routine.
- */
-void scif_nodeqp_intrhandler(struct scif_dev *scifdev, struct scif_qp *qp)
-{
-	struct scifmsg msg;
-	int read_size;
-
-	do {
-		read_size = scif_rb_get_next(&qp->inbound_q, &msg, sizeof(msg));
-		if (!read_size)
-			break;
-		scif_nodeqp_msg_handler(scifdev, qp, &msg);
-		/*
-		 * The node queue pair is unmapped so skip the read pointer
-		 * update after receipt of a SCIF_EXIT_ACK
-		 */
-		if (SCIF_EXIT_ACK == msg.uop)
-			break;
-		scif_rb_update_read_ptr(&qp->inbound_q);
-	} while (1);
-}
-
-/**
- * scif_loopb_wq_handler - Loopback Workqueue Handler.
- * @unused: loop back work (unused)
- *
- * This work queue routine is invoked by the loopback work queue handler.
- * It grabs the recv lock, dequeues any available messages from the head
- * of the loopback message list, calls the node QP message handler,
- * waits for it to return, then frees up this message and dequeues more
- * elements of the list if available.
- */
-static void scif_loopb_wq_handler(struct work_struct *unused)
-{
-	struct scif_dev *scifdev = scif_info.loopb_dev;
-	struct scif_qp *qp = scifdev->qpairs;
-	struct scif_loopb_msg *msg;
-
-	do {
-		msg = NULL;
-		spin_lock(&qp->recv_lock);
-		if (!list_empty(&scif_info.loopb_recv_q)) {
-			msg = list_first_entry(&scif_info.loopb_recv_q,
-					       struct scif_loopb_msg,
-					       list);
-			list_del(&msg->list);
-		}
-		spin_unlock(&qp->recv_lock);
-
-		if (msg) {
-			scif_nodeqp_msg_handler(scifdev, qp, &msg->msg);
-			kfree(msg);
-		}
-	} while (msg);
-}
-
-/**
- * scif_loopb_msg_handler() - Workqueue handler for loopback messages.
- * @scifdev: SCIF device
- * @qp: Queue pair.
- *
- * This work queue routine is triggered when a loopback message is received.
- *
- * We need special handling for receiving Node Qp messages on a loopback SCIF
- * device via two workqueues for receiving messages.
- *
- * The reason we need the extra workqueue which is not required with *normal*
- * non-loopback SCIF devices is the potential classic deadlock described below:
- *
- * Thread A tries to send a message on a loopback SCIF device and blocks since
- * there is no space in the RB while it has the send_lock held or another
- * lock called lock X for example.
- *
- * Thread B: The Loopback Node QP message receive workqueue receives the message
- * and tries to send a message (e.g. an ACK) to the loopback SCIF device. It
- * tries to grab the send lock again or lock X and deadlocks with Thread A. The
- * RB cannot be drained any further due to this classic deadlock.
- * - * In order to avoid deadlocks as mentioned above we have an extra level of - * indirection achieved by having two workqueues. - * 1) The first workqueue whose handler is scif_loopb_msg_handler reads - * messages from the Node QP RB, adds them to a list and queues work for the - * second workqueue. - * - * 2) The second workqueue whose handler is scif_loopb_wq_handler dequeues - * messages from the list, handles them, frees up the memory and dequeues - * more elements from the list if possible. - */ -int -scif_loopb_msg_handler(struct scif_dev *scifdev, struct scif_qp *qp) -{ - int read_size; - struct scif_loopb_msg *msg; - - do { - msg = kmalloc(sizeof(*msg), GFP_KERNEL); - if (!msg) - return -ENOMEM; - read_size = scif_rb_get_next(&qp->inbound_q, &msg->msg, - sizeof(struct scifmsg)); - if (read_size != sizeof(struct scifmsg)) { - kfree(msg); - scif_rb_update_read_ptr(&qp->inbound_q); - break; - } - spin_lock(&qp->recv_lock); - list_add_tail(&msg->list, &scif_info.loopb_recv_q); - spin_unlock(&qp->recv_lock); - queue_work(scif_info.loopb_wq, &scif_info.loopb_work); - scif_rb_update_read_ptr(&qp->inbound_q); - } while (read_size == sizeof(struct scifmsg)); - return read_size; -} - -/** - * scif_setup_loopback_qp - One time setup work for Loopback Node Qp. - * @scifdev: SCIF device - * - * Sets up the required loopback workqueues, queue pairs and ring buffers - */ -int scif_setup_loopback_qp(struct scif_dev *scifdev) -{ - int err = 0; - void *local_q; - struct scif_qp *qp; - - err = scif_setup_intr_wq(scifdev); - if (err) - goto exit; - INIT_LIST_HEAD(&scif_info.loopb_recv_q); - snprintf(scif_info.loopb_wqname, sizeof(scif_info.loopb_wqname), - "SCIF LOOPB %d", scifdev->node); - scif_info.loopb_wq = - alloc_ordered_workqueue(scif_info.loopb_wqname, 0); - if (!scif_info.loopb_wq) { - err = -ENOMEM; - goto destroy_intr; - } - INIT_WORK(&scif_info.loopb_work, scif_loopb_wq_handler); - /* Allocate Self Qpair */ - scifdev->qpairs = kzalloc(sizeof(*scifdev->qpairs), GFP_KERNEL); - if (!scifdev->qpairs) { - err = -ENOMEM; - goto destroy_loopb_wq; - } - - qp = scifdev->qpairs; - qp->magic = SCIFEP_MAGIC; - spin_lock_init(&qp->send_lock); - spin_lock_init(&qp->recv_lock); - - local_q = kzalloc(SCIF_NODE_QP_SIZE, GFP_KERNEL); - if (!local_q) { - err = -ENOMEM; - goto free_qpairs; - } - /* - * For loopback the inbound_q and outbound_q are essentially the same - * since the Node sends a message on the loopback interface to the - * outbound_q which is then received on the inbound_q. - */ - scif_rb_init(&qp->outbound_q, - &qp->local_read, - &qp->local_write, - local_q, get_count_order(SCIF_NODE_QP_SIZE)); - - scif_rb_init(&qp->inbound_q, - &qp->local_read, - &qp->local_write, - local_q, get_count_order(SCIF_NODE_QP_SIZE)); - scif_info.nodeid = scifdev->node; - - scif_peer_register_device(scifdev); - - scif_info.loopb_dev = scifdev; - return err; -free_qpairs: - kfree(scifdev->qpairs); -destroy_loopb_wq: - destroy_workqueue(scif_info.loopb_wq); -destroy_intr: - scif_destroy_intr_wq(scifdev); -exit: - return err; -} - -/** - * scif_destroy_loopback_qp - One time uninit work for Loopback Node Qp - * @scifdev: SCIF device - * - * Destroys the workqueues and frees up the Ring Buffer and Queue Pair memory. 
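The two-stage scheme enumerated above keeps the interrupt-driven stage trivial: under recv_lock it only moves one message from the ring to a list and kicks the second workqueue, so it never needs the send-side locks. Stage two runs on its own ordered workqueue and is therefore free to send replies. The hand-off, reduced to its essentials:

    /* Stage 1 (scif_loopb_msg_handler): ring -> list, then kick. */
    spin_lock(&qp->recv_lock);
    list_add_tail(&msg->list, &scif_info.loopb_recv_q);
    spin_unlock(&qp->recv_lock);
    queue_work(scif_info.loopb_wq, &scif_info.loopb_work);

    /* Stage 2 (scif_loopb_wq_handler): list -> handler; may send. */
    scif_nodeqp_msg_handler(scifdev, qp, &msg->msg);
    kfree(msg);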
- */ -int scif_destroy_loopback_qp(struct scif_dev *scifdev) -{ - scif_peer_unregister_device(scifdev); - destroy_workqueue(scif_info.loopb_wq); - scif_destroy_intr_wq(scifdev); - kfree(scifdev->qpairs->outbound_q.rb_base); - kfree(scifdev->qpairs); - scifdev->sdev = NULL; - scif_info.loopb_dev = NULL; - return 0; -} - -void scif_destroy_p2p(struct scif_dev *scifdev) -{ - struct scif_dev *peer_dev; - struct scif_p2p_info *p2p; - struct list_head *pos, *tmp; - int bd; - - mutex_lock(&scif_info.conflock); - /* Free P2P mappings in the given node for all its peer nodes */ - list_for_each_safe(pos, tmp, &scifdev->p2p) { - p2p = list_entry(pos, struct scif_p2p_info, ppi_list); - dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], - p2p->sg_nentries[SCIF_PPI_MMIO], - DMA_BIDIRECTIONAL); - dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_APER], - p2p->sg_nentries[SCIF_PPI_APER], - DMA_BIDIRECTIONAL); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); - list_del(pos); - kfree(p2p); - } - - /* Free P2P mapping created in the peer nodes for the given node */ - for (bd = SCIF_MGMT_NODE + 1; bd <= scif_info.maxid; bd++) { - peer_dev = &scif_dev[bd]; - list_for_each_safe(pos, tmp, &peer_dev->p2p) { - p2p = list_entry(pos, struct scif_p2p_info, ppi_list); - if (p2p->ppi_peer_id == scifdev->node) { - dma_unmap_sg(&peer_dev->sdev->dev, - p2p->ppi_sg[SCIF_PPI_MMIO], - p2p->sg_nentries[SCIF_PPI_MMIO], - DMA_BIDIRECTIONAL); - dma_unmap_sg(&peer_dev->sdev->dev, - p2p->ppi_sg[SCIF_PPI_APER], - p2p->sg_nentries[SCIF_PPI_APER], - DMA_BIDIRECTIONAL); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); - scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); - list_del(pos); - kfree(p2p); - } - } - } - mutex_unlock(&scif_info.conflock); -} diff --git a/drivers/misc/mic/scif/scif_nodeqp.h b/drivers/misc/mic/scif/scif_nodeqp.h deleted file mode 100644 index 95896273138e..000000000000 --- a/drivers/misc/mic/scif/scif_nodeqp.h +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. 
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel SCIF driver.
- *
- */
-#ifndef SCIF_NODEQP
-#define SCIF_NODEQP
-
-#include "scif_rb.h"
-#include "scif_peer_bus.h"
-
-#define SCIF_INIT 1  /* First message sent to the peer node for discovery */
-#define SCIF_EXIT 2  /* Last message from the peer informing intent to exit */
-#define SCIF_EXIT_ACK 3 /* Response to SCIF_EXIT message */
-#define SCIF_NODE_ADD 4  /* Tell Online nodes a new node exists */
-#define SCIF_NODE_ADD_ACK 5  /* Confirm to mgmt node sequence is finished */
-#define SCIF_NODE_ADD_NACK 6 /* SCIF_NODE_ADD failed */
-#define SCIF_NODE_REMOVE 7 /* Request to deactivate a SCIF node */
-#define SCIF_NODE_REMOVE_ACK 8 /* Response to a SCIF_NODE_REMOVE message */
-#define SCIF_CNCT_REQ 9  /* Request a connection to a port (phys addr) */
-#define SCIF_CNCT_GNT 10 /* Grant a connection request (phys addr) */
-#define SCIF_CNCT_GNTACK 11 /* Connecting side accepted the grant */
-#define SCIF_CNCT_GNTNACK 12 /* Connecting side rejected the grant */
-#define SCIF_CNCT_REJ 13 /* Reject a connection request */
-#define SCIF_DISCNCT 14 /* Notify peer that connection is being terminated */
-#define SCIF_DISCNT_ACK 15 /* Response to a SCIF_DISCNCT message */
-#define SCIF_CLIENT_SENT 16 /* Notify the peer that data has been written */
-#define SCIF_CLIENT_RCVD 17 /* Notify the peer that data has been read */
-#define SCIF_GET_NODE_INFO 18 /* Get current node mask from the mgmt node */
-#define SCIF_REGISTER 19 /* Tell peer about a new registered window */
-#define SCIF_REGISTER_ACK 20 /* Notify peer about registration success */
-#define SCIF_REGISTER_NACK 21 /* Notify peer about registration failure */
-#define SCIF_UNREGISTER 22 /* Tell peer about unregistering a window */
-#define SCIF_UNREGISTER_ACK 23 /* Notify peer about unregistration success */
-#define SCIF_UNREGISTER_NACK 24 /* Notify peer about unregistration failure */
-#define SCIF_ALLOC_REQ 25 /* Request a mapped buffer */
-#define SCIF_ALLOC_GNT 26 /* Notify peer about allocation success */
-#define SCIF_ALLOC_REJ 27 /* Notify peer about allocation failure */
-#define SCIF_FREE_VIRT 28 /* Free previously allocated virtual memory */
-#define SCIF_MUNMAP 29 /* Notify peer that a remote window was unmapped */
-#define SCIF_MARK 30 /* SCIF Remote Fence Mark Request */
-#define SCIF_MARK_ACK 31 /* SCIF Remote Fence Mark Success */
-#define SCIF_MARK_NACK 32 /* SCIF Remote Fence Mark Failure */
-#define SCIF_WAIT 33 /* SCIF Remote Fence Wait Request */
-#define SCIF_WAIT_ACK 34 /* SCIF Remote Fence Wait Success */
-#define SCIF_WAIT_NACK 35 /* SCIF Remote Fence Wait Failure */
-#define SCIF_SIG_LOCAL 36 /* SCIF Remote Fence Local Signal Request */
-#define SCIF_SIG_REMOTE 37 /* SCIF Remote Fence Remote Signal Request */
-#define SCIF_SIG_ACK 38 /* SCIF Remote Fence Remote Signal Success */
-#define SCIF_SIG_NACK 39 /* SCIF Remote Fence Remote Signal Failure */
-#define SCIF_MAX_MSG SCIF_SIG_NACK
-
-/*
- * struct scifmsg - Node QP message format
- *
- * @src: Source information
- * @dst: Destination information
- * @uop: The message opcode
- * @payload: Unique payload format for each message
- */
-struct scifmsg {
-	struct scif_port_id src;
-	struct scif_port_id dst;
-	u32 uop;
-	u64 payload[4];
-} __packed;
-
-/*
- * struct scif_allocmsg - Used with SCIF_ALLOC_REQ to request
- * the remote node to allocate memory
- *
- * @phys_addr: Physical address of the buffer
- * @vaddr: Virtual address of the buffer
- * @size: Size of the buffer
- * @state: Current state
- * @allocwq: wait queue for status
- */
-struct scif_allocmsg {
-	dma_addr_t phys_addr;
-	unsigned long vaddr;
-	size_t size;
-	enum scif_msg_state state;
-	wait_queue_head_t allocwq;
-};
-
-/*
- * struct scif_qp - Node Queue Pair
- *
- * A slightly tricky structure: we can only write across the PCIe bus, so
- * any read/write pointer we need to read must be local. We therefore only
- * read the read pointer on the outbound_q (to compute free space for new
- * messages) and the write pointer on the inbound_q (to count pending
- * messages).
- *
- * @magic: Magic value to ensure the peer sees the QP correctly
- * @outbound_q: The outbound ring buffer for sending messages
- * @inbound_q: The inbound ring buffer for receiving messages
- * @local_write: Local write index
- * @local_read: Local read index
- * @remote_qp: The remote queue pair
- * @local_buf: DMA address of local ring buffer
- * @local_qp: DMA address of the local queue pair data structure
- * @remote_buf: DMA address of remote ring buffer
- * @qp_state: QP state, i.e. online or offline; used for P2P
- * @send_lock: Synchronize access to the outbound queue
- * @recv_lock: Synchronize access to the inbound queue
- */
-struct scif_qp {
-	u64 magic;
-#define SCIFEP_MAGIC 0x5c1f000000005c1fULL
-	struct scif_rb outbound_q;
-	struct scif_rb inbound_q;
-
-	u32 local_write __aligned(64);
-	u32 local_read __aligned(64);
-	struct scif_qp *remote_qp;
-	dma_addr_t local_buf;
-	dma_addr_t local_qp;
-	dma_addr_t remote_buf;
-	u32 qp_state;
-#define SCIF_QP_OFFLINE 0xdead
-#define SCIF_QP_ONLINE 0xc0de
-	spinlock_t send_lock;
-	spinlock_t recv_lock;
-};
-
-/*
- * struct scif_loopb_msg - An element in the loopback Node QP message list.
- * - * @msg - The SCIF node QP message - * @list - link in the list of messages - */ -struct scif_loopb_msg { - struct scifmsg msg; - struct list_head list; -}; - -int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg); -int _scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_nodeqp_intrhandler(struct scif_dev *scifdev, struct scif_qp *qp); -int scif_loopb_msg_handler(struct scif_dev *scifdev, struct scif_qp *qp); -int scif_setup_qp(struct scif_dev *scifdev); -int scif_qp_response(phys_addr_t phys, struct scif_dev *dev); -int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset, - int local_size, struct scif_dev *scifdev); -int scif_setup_qp_accept(struct scif_qp *qp, dma_addr_t *qp_offset, - dma_addr_t phys, int local_size, - struct scif_dev *scifdev); -int scif_setup_qp_connect_response(struct scif_dev *scifdev, - struct scif_qp *qp, u64 payload); -int scif_setup_loopback_qp(struct scif_dev *scifdev); -int scif_destroy_loopback_qp(struct scif_dev *scifdev); -void scif_poll_qp_state(struct work_struct *work); -void scif_destroy_p2p(struct scif_dev *scifdev); -void scif_send_exit(struct scif_dev *scifdev); -static inline struct device *scif_get_peer_dev(struct scif_dev *scifdev) -{ - struct scif_peer_dev *spdev; - struct device *spdev_ret; - - rcu_read_lock(); - spdev = rcu_dereference(scifdev->spdev); - if (spdev) - spdev_ret = get_device(&spdev->dev); - else - spdev_ret = ERR_PTR(-ENODEV); - rcu_read_unlock(); - return spdev_ret; -} - -static inline void scif_put_peer_dev(struct device *dev) -{ - put_device(dev); -} -#endif /* SCIF_NODEQP */ diff --git a/drivers/misc/mic/scif/scif_peer_bus.c b/drivers/misc/mic/scif/scif_peer_bus.c deleted file mode 100644 index 6d608308bb60..000000000000 --- a/drivers/misc/mic/scif/scif_peer_bus.c +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. 
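scif_get_peer_dev() above is an instance of the standard RCU lookup-then-pin idiom: dereference an RCU-protected pointer inside a read-side critical section and take a real reference before leaving it. A condensed stand-alone sketch, with hypothetical demo_* names; the pointer is assumed to be published with rcu_assign_pointer() and retired with RCU_INIT_POINTER() plus synchronize_rcu(), as scif_peer_bus.c does below:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/rcupdate.h>

struct demo_peer {
	struct device dev;
};

static struct demo_peer __rcu *demo_peer_ptr;

static struct device *demo_get_peer(void)
{
	struct demo_peer *peer;
	struct device *ret;

	rcu_read_lock();
	peer = rcu_dereference(demo_peer_ptr);
	if (peer)
		ret = get_device(&peer->dev); /* pin before leaving RCU */
	else
		ret = ERR_PTR(-ENODEV);
	rcu_read_unlock();
	return ret;
}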
- */ -#include "scif_main.h" -#include "../bus/scif_bus.h" -#include "scif_peer_bus.h" - -static inline struct scif_peer_dev * -dev_to_scif_peer(struct device *dev) -{ - return container_of(dev, struct scif_peer_dev, dev); -} - -struct bus_type scif_peer_bus = { - .name = "scif_peer_bus", -}; - -static void scif_peer_release_dev(struct device *d) -{ - struct scif_peer_dev *sdev = dev_to_scif_peer(d); - struct scif_dev *scifdev = &scif_dev[sdev->dnode]; - - scif_cleanup_scifdev(scifdev); - kfree(sdev); -} - -static int scif_peer_initialize_device(struct scif_dev *scifdev) -{ - struct scif_peer_dev *spdev; - int ret; - - spdev = kzalloc(sizeof(*spdev), GFP_KERNEL); - if (!spdev) { - ret = -ENOMEM; - goto err; - } - - spdev->dev.parent = scifdev->sdev->dev.parent; - spdev->dev.release = scif_peer_release_dev; - spdev->dnode = scifdev->node; - spdev->dev.bus = &scif_peer_bus; - dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode); - - device_initialize(&spdev->dev); - get_device(&spdev->dev); - rcu_assign_pointer(scifdev->spdev, spdev); - - mutex_lock(&scif_info.conflock); - scif_info.total++; - scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid); - mutex_unlock(&scif_info.conflock); - return 0; -err: - dev_err(&scifdev->sdev->dev, - "dnode %d: initialize_device rc %d\n", scifdev->node, ret); - return ret; -} - -static int scif_peer_add_device(struct scif_dev *scifdev) -{ - struct scif_peer_dev *spdev = rcu_dereference(scifdev->spdev); - char pool_name[16]; - int ret; - - ret = device_add(&spdev->dev); - put_device(&spdev->dev); - if (ret) { - dev_err(&scifdev->sdev->dev, - "dnode %d: peer device_add failed\n", scifdev->node); - goto put_spdev; - } - - scnprintf(pool_name, sizeof(pool_name), "scif-%d", spdev->dnode); - scifdev->signal_pool = dmam_pool_create(pool_name, &scifdev->sdev->dev, - sizeof(struct scif_status), 1, - 0); - if (!scifdev->signal_pool) { - dev_err(&scifdev->sdev->dev, - "dnode %d: dmam_pool_create failed\n", scifdev->node); - ret = -ENOMEM; - goto del_spdev; - } - dev_dbg(&spdev->dev, "Added peer dnode %d\n", spdev->dnode); - return 0; -del_spdev: - device_del(&spdev->dev); -put_spdev: - RCU_INIT_POINTER(scifdev->spdev, NULL); - synchronize_rcu(); - put_device(&spdev->dev); - - mutex_lock(&scif_info.conflock); - scif_info.total--; - mutex_unlock(&scif_info.conflock); - return ret; -} - -void scif_add_peer_device(struct work_struct *work) -{ - struct scif_dev *scifdev = container_of(work, struct scif_dev, - peer_add_work); - - scif_peer_add_device(scifdev); -} - -/* - * Peer device registration is split into a device_initialize and a device_add. - * The reason for doing this is as follows: First, peer device registration - * itself cannot be done in the message processing thread and must be delegated - * to another workqueue, otherwise if SCIF client probe, called during peer - * device registration, calls scif_connect(..), it will block the message - * processing thread causing a deadlock. Next, device_initialize is done in the - * "top-half" message processing thread and device_add in the "bottom-half" - * workqueue. If this is not done, SCIF_CNCT_REQ message processing executing - * concurrently with SCIF_INIT message processing is unable to get a reference - * on the peer device, thereby failing the connect request. 
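Stripped of the SCIF specifics, the split described in the comment above is device_initialize() in the caller plus device_add() from deferred context, with an extra get_device() so the half-registered device can already be referenced. A minimal sketch under those assumptions; the demo_* names are hypothetical:

#include <linux/device.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_peer {
	struct device dev;
	struct work_struct add_work;
};

static void demo_release(struct device *dev)
{
	kfree(container_of(dev, struct demo_peer, dev));
}

/* "Bottom half": may trigger client probes without deadlocking */
static void demo_add_fn(struct work_struct *work)
{
	struct demo_peer *peer = container_of(work, struct demo_peer,
					      add_work);

	if (device_add(&peer->dev))
		put_device(&peer->dev); /* drop device_initialize()'s ref */
}

/* "Top half": called from the message-processing path */
static void demo_register(struct demo_peer *peer)
{
	peer->dev.release = demo_release;
	dev_set_name(&peer->dev, "demo-peer");
	device_initialize(&peer->dev); /* referenceable from here on */
	get_device(&peer->dev);        /* extra ref for concurrent users */
	INIT_WORK(&peer->add_work, demo_add_fn);
	schedule_work(&peer->add_work);
}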
- */ -void scif_peer_register_device(struct scif_dev *scifdev) -{ - int ret; - - mutex_lock(&scifdev->lock); - ret = scif_peer_initialize_device(scifdev); - if (ret) - goto exit; - schedule_work(&scifdev->peer_add_work); -exit: - mutex_unlock(&scifdev->lock); -} - -int scif_peer_unregister_device(struct scif_dev *scifdev) -{ - struct scif_peer_dev *spdev; - - mutex_lock(&scifdev->lock); - /* Flush work to ensure device register is complete */ - flush_work(&scifdev->peer_add_work); - - /* - * Continue holding scifdev->lock since theoretically unregister_device - * can be called simultaneously from multiple threads - */ - spdev = rcu_dereference(scifdev->spdev); - if (!spdev) { - mutex_unlock(&scifdev->lock); - return -ENODEV; - } - - RCU_INIT_POINTER(scifdev->spdev, NULL); - synchronize_rcu(); - mutex_unlock(&scifdev->lock); - - dev_dbg(&spdev->dev, "Removing peer dnode %d\n", spdev->dnode); - device_unregister(&spdev->dev); - - mutex_lock(&scif_info.conflock); - scif_info.total--; - mutex_unlock(&scif_info.conflock); - return 0; -} - -int scif_peer_bus_init(void) -{ - return bus_register(&scif_peer_bus); -} - -void scif_peer_bus_exit(void) -{ - bus_unregister(&scif_peer_bus); -} diff --git a/drivers/misc/mic/scif/scif_peer_bus.h b/drivers/misc/mic/scif/scif_peer_bus.h deleted file mode 100644 index 2ea4c51c18c1..000000000000 --- a/drivers/misc/mic/scif/scif_peer_bus.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#ifndef _SCIF_PEER_BUS_H_ -#define _SCIF_PEER_BUS_H_ - -#include -#include -#include - -struct scif_dev; - -void scif_add_peer_device(struct work_struct *work); -void scif_peer_register_device(struct scif_dev *sdev); -int scif_peer_unregister_device(struct scif_dev *scifdev); -int scif_peer_bus_init(void); -void scif_peer_bus_exit(void); -#endif /* _SCIF_PEER_BUS_H */ diff --git a/drivers/misc/mic/scif/scif_ports.c b/drivers/misc/mic/scif/scif_ports.c deleted file mode 100644 index 4bdb5ef9a139..000000000000 --- a/drivers/misc/mic/scif/scif_ports.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#include - -#include "scif_main.h" - -#define SCIF_PORT_COUNT 0x10000 /* Ports available */ - -struct idr scif_ports; - -/** - * struct scif_port - SCIF port information - * - * @ref_cnt: Reference count since there can be multiple endpoints - * created via scif_accept(..) simultaneously using a port. - */ -struct scif_port { - int ref_cnt; -}; - -/** - * __scif_get_port - Reserve a specified port # for SCIF and add it - * to the global list. - * @start: lowest port # to be reserved (inclusive). - * @end: highest port # to be reserved (exclusive). - * - * @return : Allocated SCIF port #, or -ENOSPC if port unavailable. - * On memory allocation failure, returns -ENOMEM. - */ -static int __scif_get_port(int start, int end) -{ - int id; - struct scif_port *port = kzalloc(sizeof(*port), GFP_ATOMIC); - - if (!port) - return -ENOMEM; - spin_lock(&scif_info.port_lock); - id = idr_alloc(&scif_ports, port, start, end, GFP_ATOMIC); - if (id >= 0) - port->ref_cnt++; - spin_unlock(&scif_info.port_lock); - return id; -} - -/** - * scif_rsrv_port - Reserve a specified port # for SCIF. - * @port : port # to be reserved. - * - * @return : Allocated SCIF port #, or -ENOSPC if port unavailable. 
- * On memory allocation failure, returns -ENOMEM. - */ -int scif_rsrv_port(u16 port) -{ - return __scif_get_port(port, port + 1); -} - -/** - * scif_get_new_port - Get and reserve any port # for SCIF in the range - * SCIF_PORT_RSVD + 1 to SCIF_PORT_COUNT - 1. - * - * @return : Allocated SCIF port #, or -ENOSPC if no ports available. - * On memory allocation failure, returns -ENOMEM. - */ -int scif_get_new_port(void) -{ - return __scif_get_port(SCIF_PORT_RSVD + 1, SCIF_PORT_COUNT); -} - -/** - * scif_get_port - Increment the reference count for a SCIF port - * @id : SCIF port - * - * @return : None - */ -void scif_get_port(u16 id) -{ - struct scif_port *port; - - if (!id) - return; - spin_lock(&scif_info.port_lock); - port = idr_find(&scif_ports, id); - if (port) - port->ref_cnt++; - spin_unlock(&scif_info.port_lock); -} - -/** - * scif_put_port - Release a reserved SCIF port - * @id : SCIF port to be released. - * - * @return : None - */ -void scif_put_port(u16 id) -{ - struct scif_port *port; - - if (!id) - return; - spin_lock(&scif_info.port_lock); - port = idr_find(&scif_ports, id); - if (port) { - port->ref_cnt--; - if (!port->ref_cnt) { - idr_remove(&scif_ports, id); - kfree(port); - } - } - spin_unlock(&scif_info.port_lock); -} diff --git a/drivers/misc/mic/scif/scif_rb.c b/drivers/misc/mic/scif/scif_rb.c deleted file mode 100644 index e425882ae06d..000000000000 --- a/drivers/misc/mic/scif/scif_rb.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2014 Intel Corporation. - * - * Intel SCIF driver. - */ -#include -#include -#include -#include - -#include "scif_rb.h" - -#define scif_rb_ring_cnt(head, tail, size) CIRC_CNT(head, tail, size) -#define scif_rb_ring_space(head, tail, size) CIRC_SPACE(head, tail, size) - -/** - * scif_rb_init - Initializes the ring buffer - * @rb: ring buffer - * @read_ptr: A pointer to the read offset - * @write_ptr: A pointer to the write offset - * @rb_base: A pointer to the base of the ring buffer - * @size: The size of the ring buffer in powers of two - */ -void scif_rb_init(struct scif_rb *rb, u32 *read_ptr, u32 *write_ptr, - void *rb_base, u8 size) -{ - rb->rb_base = rb_base; - rb->size = (1 << size); - rb->read_ptr = read_ptr; - rb->write_ptr = write_ptr; - rb->current_read_offset = *read_ptr; - rb->current_write_offset = *write_ptr; -} - -/* Copies a message to the ring buffer -- handles the wrap around case */ -static void memcpy_torb(struct scif_rb *rb, void *header, - void *msg, u32 size) -{ - u32 size1, size2; - - if (header + size >= rb->rb_base + rb->size) { - /* Need to call two copies if it wraps around */ - size1 = (u32)(rb->rb_base + rb->size - header); - size2 = size - size1; - memcpy_toio((void __iomem __force *)header, msg, size1); - memcpy_toio((void __iomem __force *)rb->rb_base, - msg + size1, size2); - } else { - memcpy_toio((void __iomem __force *)header, msg, size); - } -} - -/* Copies a message from the ring buffer -- handles the wrap around case */ -static void memcpy_fromrb(struct scif_rb *rb, void *header, - void *msg, u32 size) -{ - u32 size1, size2; - - if (header + size >= rb->rb_base + rb->size) { - /* Need to call two copies if it wraps around */ - size1 = (u32)(rb->rb_base + rb->size - header); - size2 = size - size1; - memcpy_fromio(msg, (void __iomem __force *)header, size1); - memcpy_fromio(msg + size1, - (void __iomem __force *)rb->rb_base, size2); - } else { - memcpy_fromio(msg, (void __iomem __force *)header, size); - } 
-} - -/** - * scif_rb_space - Query space available for writing to the RB - * @rb: ring buffer - * - * Return: size available for writing to RB in bytes. - */ -u32 scif_rb_space(struct scif_rb *rb) -{ - rb->current_read_offset = *rb->read_ptr; - /* - * Update from the HW read pointer only once the peer has exposed the - * new empty slot. This barrier is paired with the memory barrier - * scif_rb_update_read_ptr() - */ - mb(); - return scif_rb_ring_space(rb->current_write_offset, - rb->current_read_offset, rb->size); -} - -/** - * scif_rb_write - Write a message to the RB - * @rb: ring buffer - * @msg: buffer to send the message. Must be at least size bytes long - * @size: the size (in bytes) to be copied to the RB - * - * This API does not block if there isn't enough space in the RB. - * Returns: 0 on success or -ENOMEM on failure - */ -int scif_rb_write(struct scif_rb *rb, void *msg, u32 size) -{ - void *header; - - if (scif_rb_space(rb) < size) - return -ENOMEM; - header = rb->rb_base + rb->current_write_offset; - memcpy_torb(rb, header, msg, size); - /* - * Wait until scif_rb_commit(). Update the local ring - * buffer data, not the shared data until commit. - */ - rb->current_write_offset = - (rb->current_write_offset + size) & (rb->size - 1); - return 0; -} - -/** - * scif_rb_commit - To submit the message to let the peer fetch it - * @rb: ring buffer - */ -void scif_rb_commit(struct scif_rb *rb) -{ - /* - * We must ensure ordering between the all the data committed - * previously before we expose the new message to the peer by - * updating the write_ptr. This write barrier is paired with - * the read barrier in scif_rb_count(..) - */ - wmb(); - WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); -#ifdef CONFIG_INTEL_MIC_CARD - /* - * X100 Si bug: For the case where a Core is performing an EXT_WR - * followed by a Doorbell Write, the Core must perform two EXT_WR to the - * same address with the same data before it does the Doorbell Write. - * This way, if ordering is violated for the Interrupt Message, it will - * fall just behind the first Posted associated with the first EXT_WR. - */ - WRITE_ONCE(*rb->write_ptr, rb->current_write_offset); -#endif -} - -/** - * scif_rb_get - To get next message from the ring buffer - * @rb: ring buffer - * @size: Number of bytes to be read - * - * Return: NULL if no bytes to be read from the ring buffer, otherwise the - * pointer to the next byte - */ -static void *scif_rb_get(struct scif_rb *rb, u32 size) -{ - void *header = NULL; - - if (scif_rb_count(rb, size) >= size) - header = rb->rb_base + rb->current_read_offset; - return header; -} - -/* - * scif_rb_get_next - Read from ring buffer. - * @rb: ring buffer - * @msg: buffer to hold the message. Must be at least size bytes long - * @size: Number of bytes to be read - * - * Return: number of bytes read if available bytes are >= size, otherwise - * returns zero. 
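The read/write offsets above index a power-of-two buffer, so the CIRC_CNT()/CIRC_SPACE() macros (wrapped by scif_rb_ring_cnt/scif_rb_ring_space earlier in this file) reduce to a subtraction and a mask, with wrap-around falling out of unsigned arithmetic. A stand-alone illustration of that arithmetic, assuming only that the size is a power of two:

#include <assert.h>
#include <stdint.h>

#define RB_SIZE 4096u	/* must be a power of two */

static uint32_t rb_cnt(uint32_t head, uint32_t tail)
{
	return (head - tail) & (RB_SIZE - 1);	/* bytes ready to read */
}

static uint32_t rb_space(uint32_t head, uint32_t tail)
{
	/* one byte is sacrificed so that "full" != "empty" */
	return (tail - (head + 1)) & (RB_SIZE - 1);
}

int main(void)
{
	uint32_t head = 4090, tail = 10;	/* writer nearly wrapped */

	assert(rb_cnt(head, tail) == 4080);
	assert(rb_space(head, tail) == 15);
	head = (head + 10) & (RB_SIZE - 1);	/* write 10 bytes, wraps to 4 */
	assert(head == 4);
	assert(rb_cnt(head, tail) == 4090);
	return 0;
}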
- */ -u32 scif_rb_get_next(struct scif_rb *rb, void *msg, u32 size) -{ - void *header = NULL; - int read_size = 0; - - header = scif_rb_get(rb, size); - if (header) { - u32 next_cmd_offset = - (rb->current_read_offset + size) & (rb->size - 1); - - read_size = size; - rb->current_read_offset = next_cmd_offset; - memcpy_fromrb(rb, header, msg, size); - } - return read_size; -} - -/** - * scif_rb_update_read_ptr - * @rb: ring buffer - */ -void scif_rb_update_read_ptr(struct scif_rb *rb) -{ - u32 new_offset; - - new_offset = rb->current_read_offset; - /* - * We must ensure ordering between the all the data committed or read - * previously before we expose the empty slot to the peer by updating - * the read_ptr. This barrier is paired with the memory barrier in - * scif_rb_space(..) - */ - mb(); - WRITE_ONCE(*rb->read_ptr, new_offset); -#ifdef CONFIG_INTEL_MIC_CARD - /* - * X100 Si Bug: For the case where a Core is performing an EXT_WR - * followed by a Doorbell Write, the Core must perform two EXT_WR to the - * same address with the same data before it does the Doorbell Write. - * This way, if ordering is violated for the Interrupt Message, it will - * fall just behind the first Posted associated with the first EXT_WR. - */ - WRITE_ONCE(*rb->read_ptr, new_offset); -#endif -} - -/** - * scif_rb_count - * @rb: ring buffer - * @size: Number of bytes expected to be read - * - * Return: number of bytes that can be read from the RB - */ -u32 scif_rb_count(struct scif_rb *rb, u32 size) -{ - if (scif_rb_ring_cnt(rb->current_write_offset, - rb->current_read_offset, - rb->size) < size) { - rb->current_write_offset = *rb->write_ptr; - /* - * Update from the HW write pointer if empty only once the peer - * has exposed the new message. This read barrier is paired - * with the write barrier in scif_rb_commit(..) - */ - smp_rmb(); - } - return scif_rb_ring_cnt(rb->current_write_offset, - rb->current_read_offset, - rb->size); -} diff --git a/drivers/misc/mic/scif/scif_rb.h b/drivers/misc/mic/scif/scif_rb.h deleted file mode 100644 index 166dffe3093d..000000000000 --- a/drivers/misc/mic/scif/scif_rb.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Intel SCIF driver. - */ -#ifndef SCIF_RB_H -#define SCIF_RB_H -/* - * This file describes a general purpose, byte based ring buffer. Writers to the - * ring buffer need to synchronize using a lock. The same is true for readers, - * although in practice, the ring buffer has a single reader. It is lockless - * between producer and consumer so it can handle being used across the PCIe - * bus. The ring buffer ensures that there are no reads across the PCIe bus for - * performance reasons. Two of these are used to form a single bidirectional - * queue-pair across PCIe. - */ -/* - * struct scif_rb - SCIF Ring Buffer - * - * @rb_base: The base of the memory used for storing RB messages - * @read_ptr: Pointer to the read offset - * @write_ptr: Pointer to the write offset - * @size: Size of the memory in rb_base - * @current_read_offset: Cached read offset for performance - * @current_write_offset: Cached write offset for performance - */ -struct scif_rb { - void *rb_base; - u32 *read_ptr; - u32 *write_ptr; - u32 size; - u32 current_read_offset; - u32 current_write_offset; -}; - -/* methods used by both */ -void scif_rb_init(struct scif_rb *rb, u32 *read_ptr, u32 *write_ptr, - void *rb_base, u8 size); -/* writer only methods */ -/* write a new command, then scif_rb_commit() */ -int scif_rb_write(struct scif_rb *rb, void *msg, u32 size); -/* after write(), then scif_rb_commit() */ -void scif_rb_commit(struct scif_rb *rb); -/* query space available for writing to a RB. */ -u32 scif_rb_space(struct scif_rb *rb); - -/* reader only methods */ -/* read a new message from the ring buffer of size bytes */ -u32 scif_rb_get_next(struct scif_rb *rb, void *msg, u32 size); -/* update the read pointer so that the space can be reused */ -void scif_rb_update_read_ptr(struct scif_rb *rb); -/* count the number of bytes that can be read */ -u32 scif_rb_count(struct scif_rb *rb, u32 size); -#endif diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c deleted file mode 100644 index 18fb9d8b8a4b..000000000000 --- a/drivers/misc/mic/scif/scif_rma.c +++ /dev/null @@ -1,1760 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel SCIF driver. 
- */
-#include
-#include
-#include
-#include
-
-#include "scif_main.h"
-#include "scif_map.h"
-
-/* Used to skip ulimit checks for registrations with SCIF_MAP_KERNEL flag */
-#define SCIF_MAP_ULIMIT 0x40
-
-bool scif_ulimit_check = true;
-
-/**
- * scif_rma_ep_init:
- * @ep: end point
- *
- * Initialize RMA per EP data structures.
- */
-void scif_rma_ep_init(struct scif_endpt *ep)
-{
-	struct scif_endpt_rma_info *rma = &ep->rma_info;
-
-	mutex_init(&rma->rma_lock);
-	init_iova_domain(&rma->iovad, PAGE_SIZE, SCIF_IOVA_START_PFN);
-	spin_lock_init(&rma->tc_lock);
-	mutex_init(&rma->mmn_lock);
-	INIT_LIST_HEAD(&rma->reg_list);
-	INIT_LIST_HEAD(&rma->remote_reg_list);
-	atomic_set(&rma->tw_refcount, 0);
-	atomic_set(&rma->tcw_refcount, 0);
-	atomic_set(&rma->tcw_total_pages, 0);
-	atomic_set(&rma->fence_refcount, 0);
-
-	rma->async_list_del = 0;
-	rma->dma_chan = NULL;
-	INIT_LIST_HEAD(&rma->mmn_list);
-	INIT_LIST_HEAD(&rma->vma_list);
-	init_waitqueue_head(&rma->markwq);
-}
-
-/**
- * scif_rma_ep_can_uninit:
- * @ep: end point
- *
- * Returns 1 if an endpoint can be uninitialized and 0 otherwise.
- */
-int scif_rma_ep_can_uninit(struct scif_endpt *ep)
-{
-	int ret = 0;
-
-	mutex_lock(&ep->rma_info.rma_lock);
-	/* Destroy RMA Info only if the lists are empty and there are no users */
-	if (list_empty(&ep->rma_info.reg_list) &&
-	    list_empty(&ep->rma_info.remote_reg_list) &&
-	    list_empty(&ep->rma_info.mmn_list) &&
-	    !atomic_read(&ep->rma_info.tw_refcount) &&
-	    !atomic_read(&ep->rma_info.tcw_refcount) &&
-	    !atomic_read(&ep->rma_info.fence_refcount))
-		ret = 1;
-	mutex_unlock(&ep->rma_info.rma_lock);
-	return ret;
-}
-
-/**
- * scif_create_pinned_pages:
- * @nr_pages: number of pages in window
- * @prot: read/write protection
- *
- * Allocate and prepare a set of pinned pages.
- */
-static struct scif_pinned_pages *
-scif_create_pinned_pages(int nr_pages, int prot)
-{
-	struct scif_pinned_pages *pin;
-
-	might_sleep();
-	pin = scif_zalloc(sizeof(*pin));
-	if (!pin)
-		goto error;
-
-	pin->pages = scif_zalloc(nr_pages * sizeof(*pin->pages));
-	if (!pin->pages)
-		goto error_free_pinned_pages;
-
-	pin->prot = prot;
-	pin->magic = SCIFEP_MAGIC;
-	return pin;
-
-error_free_pinned_pages:
-	scif_free(pin, sizeof(*pin));
-error:
-	return NULL;
-}
-
-/**
- * scif_destroy_pinned_pages:
- * @pin: A set of pinned pages.
- *
- * Deallocate resources for pinned pages.
- */
-static int scif_destroy_pinned_pages(struct scif_pinned_pages *pin)
-{
-	int writeable = pin->prot & SCIF_PROT_WRITE;
-	int kernel = SCIF_MAP_KERNEL & pin->map_flags;
-
-	/*
-	 * Kernel pages are never pinned via pin_user_pages(), so only
-	 * user pages need to be released (and dirtied if written to).
-	 */
-	if (!kernel)
-		unpin_user_pages_dirty_lock(pin->pages, pin->nr_pages,
-					    writeable);
-	scif_free(pin->pages,
-		  pin->nr_pages * sizeof(*pin->pages));
-	scif_free(pin, sizeof(*pin));
-	return 0;
-}
-
-/*
- * scif_create_window:
- * @ep: end point
- * @nr_pages: number of pages
- * @offset: registration offset
- * @temp: true if a temporary window is being created
- *
- * Allocate and prepare a self registration window.
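scif_create_pinned_pages() above (and scif_create_window() just below) both use the kernel's conventional goto-unwind shape for multi-step allocation: each failure jumps to a label that frees exactly what was already taken, in reverse order. The shape, reduced to a stand-alone sketch with hypothetical demo_* names:

#include <stdlib.h>

struct demo_window {
	void *dma_addr;
	void *num_pages;
};

static struct demo_window *demo_create(size_t nr_pages)
{
	struct demo_window *w = calloc(1, sizeof(*w));

	if (!w)
		goto error;
	w->dma_addr = calloc(nr_pages, sizeof(long));
	if (!w->dma_addr)
		goto error_free_window;
	w->num_pages = calloc(nr_pages, sizeof(long));
	if (!w->num_pages)
		goto error_free_dma;
	return w;

error_free_dma:
	free(w->dma_addr);	/* undo step 2 */
error_free_window:
	free(w);		/* undo step 1 */
error:
	return NULL;
}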
- */ -struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, - s64 offset, bool temp) -{ - struct scif_window *window; - - might_sleep(); - window = scif_zalloc(sizeof(*window)); - if (!window) - goto error; - - window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); - if (!window->dma_addr) - goto error_free_window; - - window->num_pages = scif_zalloc(nr_pages * sizeof(*window->num_pages)); - if (!window->num_pages) - goto error_free_window; - - window->offset = offset; - window->ep = (u64)ep; - window->magic = SCIFEP_MAGIC; - window->reg_state = OP_IDLE; - init_waitqueue_head(&window->regwq); - window->unreg_state = OP_IDLE; - init_waitqueue_head(&window->unregwq); - INIT_LIST_HEAD(&window->list); - window->type = SCIF_WINDOW_SELF; - window->temp = temp; - return window; - -error_free_window: - scif_free(window->dma_addr, - nr_pages * sizeof(*window->dma_addr)); - scif_free(window, sizeof(*window)); -error: - return NULL; -} - -/** - * scif_destroy_incomplete_window: - * @ep: end point - * @window: registration window - * - * Deallocate resources for self window. - */ -static void scif_destroy_incomplete_window(struct scif_endpt *ep, - struct scif_window *window) -{ - int err; - int nr_pages = window->nr_pages; - struct scif_allocmsg *alloc = &window->alloc_handle; - struct scifmsg msg; - -retry: - /* Wait for a SCIF_ALLOC_GNT/REJ message */ - err = wait_event_timeout(alloc->allocwq, - alloc->state != OP_IN_PROGRESS, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err && scifdev_alive(ep)) - goto retry; - - mutex_lock(&ep->rma_info.rma_lock); - if (alloc->state == OP_COMPLETED) { - msg.uop = SCIF_FREE_VIRT; - msg.src = ep->port; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = window->alloc_handle.vaddr; - msg.payload[2] = (u64)window; - msg.payload[3] = SCIF_REGISTER; - _scif_nodeqp_send(ep->remote_dev, &msg); - } - mutex_unlock(&ep->rma_info.rma_lock); - - scif_free_window_offset(ep, window, window->offset); - scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); - scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); - scif_free(window, sizeof(*window)); -} - -/** - * scif_unmap_window: - * @remote_dev: SCIF remote device - * @window: registration window - * - * Delete any DMA mappings created for a registered self window - */ -void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window) -{ - int j; - - if (scif_is_iommu_enabled() && !scifdev_self(remote_dev)) { - if (window->st) { - dma_unmap_sg(&remote_dev->sdev->dev, - window->st->sgl, window->st->nents, - DMA_BIDIRECTIONAL); - sg_free_table(window->st); - kfree(window->st); - window->st = NULL; - } - } else { - for (j = 0; j < window->nr_contig_chunks; j++) { - if (window->dma_addr[j]) { - scif_unmap_single(window->dma_addr[j], - remote_dev, - window->num_pages[j] << - PAGE_SHIFT); - window->dma_addr[j] = 0x0; - } - } - } -} - -static inline struct mm_struct *__scif_acquire_mm(void) -{ - if (scif_ulimit_check) - return get_task_mm(current); - return NULL; -} - -static inline void __scif_release_mm(struct mm_struct *mm) -{ - if (mm) - mmput(mm); -} - -static inline int -__scif_dec_pinned_vm_lock(struct mm_struct *mm, - int nr_pages) -{ - if (!mm || !nr_pages || !scif_ulimit_check) - return 0; - - atomic64_sub(nr_pages, &mm->pinned_vm); - return 0; -} - -static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, - int nr_pages) -{ - unsigned long locked, lock_limit; - - if (!mm || !nr_pages || !scif_ulimit_check) - return 0; - - lock_limit = 
rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - locked = atomic64_add_return(nr_pages, &mm->pinned_vm); - - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { - atomic64_sub(nr_pages, &mm->pinned_vm); - dev_err(scif_info.mdev.this_device, - "locked(%lu) > lock_limit(%lu)\n", - locked, lock_limit); - return -ENOMEM; - } - return 0; -} - -/** - * scif_destroy_window: - * @ep: end point - * @window: registration window - * - * Deallocate resources for self window. - */ -int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window) -{ - int j; - struct scif_pinned_pages *pinned_pages = window->pinned_pages; - int nr_pages = window->nr_pages; - - might_sleep(); - if (!window->temp && window->mm) { - __scif_dec_pinned_vm_lock(window->mm, window->nr_pages); - __scif_release_mm(window->mm); - window->mm = NULL; - } - - scif_free_window_offset(ep, window, window->offset); - scif_unmap_window(ep->remote_dev, window); - /* - * Decrement references for this set of pinned pages from - * this window. - */ - j = atomic_sub_return(1, &pinned_pages->ref_count); - if (j < 0) - dev_err(scif_info.mdev.this_device, - "%s %d incorrect ref count %d\n", - __func__, __LINE__, j); - /* - * If the ref count for pinned_pages is zero then someone - * has already called scif_unpin_pages() for it and we should - * destroy the page cache. - */ - if (!j) - scif_destroy_pinned_pages(window->pinned_pages); - scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); - scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); - window->magic = 0; - scif_free(window, sizeof(*window)); - return 0; -} - -/** - * scif_create_remote_lookup: - * @remote_dev: SCIF remote device - * @window: remote window - * - * Allocate and prepare lookup entries for the remote - * end to copy over the physical addresses. - * Returns 0 on success and appropriate errno on failure. - */ -static int scif_create_remote_lookup(struct scif_dev *remote_dev, - struct scif_window *window) -{ - int i, j, err = 0; - int nr_pages = window->nr_pages; - bool vmalloc_dma_phys, vmalloc_num_pages; - - might_sleep(); - /* Map window */ - err = scif_map_single(&window->mapped_offset, - window, remote_dev, sizeof(*window)); - if (err) - goto error_window; - - /* Compute the number of lookup entries. 21 == 2MB Shift */ - window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE, - ((2) * 1024 * 1024)) >> 21; - - window->dma_addr_lookup.lookup = - scif_alloc_coherent(&window->dma_addr_lookup.offset, - remote_dev, window->nr_lookup * - sizeof(*window->dma_addr_lookup.lookup), - GFP_KERNEL | __GFP_ZERO); - if (!window->dma_addr_lookup.lookup) { - err = -ENOMEM; - goto error_window; - } - - window->num_pages_lookup.lookup = - scif_alloc_coherent(&window->num_pages_lookup.offset, - remote_dev, window->nr_lookup * - sizeof(*window->num_pages_lookup.lookup), - GFP_KERNEL | __GFP_ZERO); - if (!window->num_pages_lookup.lookup) { - err = -ENOMEM; - goto error_window; - } - - vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]); - vmalloc_num_pages = is_vmalloc_addr(&window->num_pages[0]); - - /* Now map each of the pages containing physical addresses */ - for (i = 0, j = 0; i < nr_pages; i += SCIF_NR_ADDR_IN_PAGE, j++) { - err = scif_map_page(&window->dma_addr_lookup.lookup[j], - vmalloc_dma_phys ? - vmalloc_to_page(&window->dma_addr[i]) : - virt_to_page(&window->dma_addr[i]), - remote_dev); - if (err) - goto error_window; - err = scif_map_page(&window->num_pages_lookup.lookup[j], - vmalloc_num_pages ? 
- vmalloc_to_page(&window->num_pages[i]) : - virt_to_page(&window->num_pages[i]), - remote_dev); - if (err) - goto error_window; - } - return 0; -error_window: - return err; -} - -/** - * scif_destroy_remote_lookup: - * @remote_dev: SCIF remote device - * @window: remote window - * - * Destroy lookup entries used for the remote - * end to copy over the physical addresses. - */ -static void scif_destroy_remote_lookup(struct scif_dev *remote_dev, - struct scif_window *window) -{ - int i, j; - - if (window->nr_lookup) { - struct scif_rma_lookup *lup = &window->dma_addr_lookup; - struct scif_rma_lookup *npup = &window->num_pages_lookup; - - for (i = 0, j = 0; i < window->nr_pages; - i += SCIF_NR_ADDR_IN_PAGE, j++) { - if (lup->lookup && lup->lookup[j]) - scif_unmap_single(lup->lookup[j], - remote_dev, - PAGE_SIZE); - if (npup->lookup && npup->lookup[j]) - scif_unmap_single(npup->lookup[j], - remote_dev, - PAGE_SIZE); - } - if (lup->lookup) - scif_free_coherent(lup->lookup, lup->offset, - remote_dev, window->nr_lookup * - sizeof(*lup->lookup)); - if (npup->lookup) - scif_free_coherent(npup->lookup, npup->offset, - remote_dev, window->nr_lookup * - sizeof(*npup->lookup)); - if (window->mapped_offset) - scif_unmap_single(window->mapped_offset, - remote_dev, sizeof(*window)); - window->nr_lookup = 0; - } -} - -/** - * scif_create_remote_window: - * @scifdev: SCIF device - * @nr_pages: number of pages in window - * - * Allocate and prepare a remote registration window. - */ -static struct scif_window * -scif_create_remote_window(struct scif_dev *scifdev, int nr_pages) -{ - struct scif_window *window; - - might_sleep(); - window = scif_zalloc(sizeof(*window)); - if (!window) - goto error_ret; - - window->magic = SCIFEP_MAGIC; - window->nr_pages = nr_pages; - - window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); - if (!window->dma_addr) - goto error_window; - - window->num_pages = scif_zalloc(nr_pages * - sizeof(*window->num_pages)); - if (!window->num_pages) - goto error_window; - - if (scif_create_remote_lookup(scifdev, window)) - goto error_window; - - window->type = SCIF_WINDOW_PEER; - window->unreg_state = OP_IDLE; - INIT_LIST_HEAD(&window->list); - return window; -error_window: - scif_destroy_remote_window(window); -error_ret: - return NULL; -} - -/** - * scif_destroy_remote_window: - * @window: remote registration window - * - * Deallocate resources for remote window. - */ -void -scif_destroy_remote_window(struct scif_window *window) -{ - scif_free(window->dma_addr, window->nr_pages * - sizeof(*window->dma_addr)); - scif_free(window->num_pages, window->nr_pages * - sizeof(*window->num_pages)); - window->magic = 0; - scif_free(window, sizeof(*window)); -} - -/** - * scif_iommu_map: create DMA mappings if the IOMMU is enabled - * @remote_dev: SCIF remote device - * @window: remote registration window - * - * Map the physical pages using dma_map_sg(..) 
and then detect the number - * of contiguous DMA mappings allocated - */ -static int scif_iommu_map(struct scif_dev *remote_dev, - struct scif_window *window) -{ - struct scatterlist *sg; - int i, err; - scif_pinned_pages_t pin = window->pinned_pages; - - window->st = kzalloc(sizeof(*window->st), GFP_KERNEL); - if (!window->st) - return -ENOMEM; - - err = sg_alloc_table(window->st, window->nr_pages, GFP_KERNEL); - if (err) - return err; - - for_each_sg(window->st->sgl, sg, window->st->nents, i) - sg_set_page(sg, pin->pages[i], PAGE_SIZE, 0x0); - - err = dma_map_sg(&remote_dev->sdev->dev, window->st->sgl, - window->st->nents, DMA_BIDIRECTIONAL); - if (!err) - return -ENOMEM; - /* Detect contiguous ranges of DMA mappings */ - sg = window->st->sgl; - for (i = 0; sg; i++) { - dma_addr_t last_da; - - window->dma_addr[i] = sg_dma_address(sg); - window->num_pages[i] = sg_dma_len(sg) >> PAGE_SHIFT; - last_da = sg_dma_address(sg) + sg_dma_len(sg); - while ((sg = sg_next(sg)) && sg_dma_address(sg) == last_da) { - window->num_pages[i] += - (sg_dma_len(sg) >> PAGE_SHIFT); - last_da = window->dma_addr[i] + - sg_dma_len(sg); - } - window->nr_contig_chunks++; - } - return 0; -} - -/** - * scif_map_window: - * @remote_dev: SCIF remote device - * @window: self registration window - * - * Map pages of a window into the aperture/PCI. - * Also determine addresses required for DMA. - */ -int -scif_map_window(struct scif_dev *remote_dev, struct scif_window *window) -{ - int i, j, k, err = 0, nr_contig_pages; - scif_pinned_pages_t pin; - phys_addr_t phys_prev, phys_curr; - - might_sleep(); - - pin = window->pinned_pages; - - if (intel_iommu_enabled && !scifdev_self(remote_dev)) - return scif_iommu_map(remote_dev, window); - - for (i = 0, j = 0; i < window->nr_pages; i += nr_contig_pages, j++) { - phys_prev = page_to_phys(pin->pages[i]); - nr_contig_pages = 1; - - /* Detect physically contiguous chunks */ - for (k = i + 1; k < window->nr_pages; k++) { - phys_curr = page_to_phys(pin->pages[k]); - if (phys_curr != (phys_prev + PAGE_SIZE)) - break; - phys_prev = phys_curr; - nr_contig_pages++; - } - window->num_pages[j] = nr_contig_pages; - window->nr_contig_chunks++; - if (scif_is_mgmt_node()) { - /* - * Management node has to deal with SMPT on X100 and - * hence the DMA mapping is required - */ - err = scif_map_single(&window->dma_addr[j], - phys_to_virt(page_to_phys( - pin->pages[i])), - remote_dev, - nr_contig_pages << PAGE_SHIFT); - if (err) - return err; - } else { - window->dma_addr[j] = page_to_phys(pin->pages[i]); - } - } - return err; -} - -/** - * scif_send_scif_unregister: - * @ep: end point - * @window: self registration window - * - * Send a SCIF_UNREGISTER message. - */ -static int scif_send_scif_unregister(struct scif_endpt *ep, - struct scif_window *window) -{ - struct scifmsg msg; - - msg.uop = SCIF_UNREGISTER; - msg.src = ep->port; - msg.payload[0] = window->alloc_handle.vaddr; - msg.payload[1] = (u64)window; - return scif_nodeqp_send(ep->remote_dev, &msg); -} - -/** - * scif_unregister_window: - * @window: self registration window - * - * Send an unregistration request and wait for a response. 
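The retry loop that follows is a recurring SCIF idiom: wait_event_timeout() bounded by SCIF_NODE_ALIVE_TIMEOUT, re-armed for as long as the peer node is still alive, so a healthy-but-slow peer never sees a spurious failure while a dead one cannot wedge the caller. Reduced to a sketch, with demo_* names, simplified state values, and a placeholder 3 s timeout:

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/wait.h>

enum { OP_IN_PROGRESS, OP_COMPLETED, OP_FAILED };

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_state;		/* written by the reply handler */

static int demo_wait_for_reply(bool (*peer_alive)(void))
{
	long left;

retry:
	left = wait_event_timeout(demo_wq, demo_state != OP_IN_PROGRESS,
				  msecs_to_jiffies(3000));
	if (!left && peer_alive())
		goto retry;		/* timed out, but peer still up */
	return left ? 0 : -ENODEV;	/* peer died: give up */
}

/* The reply side simply updates the state and wakes the waiter: */
static void demo_reply(int new_state)
{
	demo_state = new_state;
	wake_up(&demo_wq);
}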
- */ -int scif_unregister_window(struct scif_window *window) -{ - int err = 0; - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - bool send_msg = false; - - might_sleep(); - switch (window->unreg_state) { - case OP_IDLE: - { - window->unreg_state = OP_IN_PROGRESS; - send_msg = true; - } - fallthrough; - case OP_IN_PROGRESS: - { - scif_get_window(window, 1); - mutex_unlock(&ep->rma_info.rma_lock); - if (send_msg) { - err = scif_send_scif_unregister(ep, window); - if (err) { - window->unreg_state = OP_COMPLETED; - goto done; - } - } else { - /* Return ENXIO since unregistration is in progress */ - mutex_lock(&ep->rma_info.rma_lock); - return -ENXIO; - } -retry: - /* Wait for a SCIF_UNREGISTER_(N)ACK message */ - err = wait_event_timeout(window->unregwq, - window->unreg_state != OP_IN_PROGRESS, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err && scifdev_alive(ep)) - goto retry; - if (!err) { - err = -ENODEV; - window->unreg_state = OP_COMPLETED; - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", __func__, __LINE__, err); - } - if (err > 0) - err = 0; -done: - mutex_lock(&ep->rma_info.rma_lock); - scif_put_window(window, 1); - break; - } - case OP_FAILED: - { - if (!scifdev_alive(ep)) { - err = -ENODEV; - window->unreg_state = OP_COMPLETED; - } - break; - } - case OP_COMPLETED: - break; - default: - err = -ENODEV; - } - - if (window->unreg_state == OP_COMPLETED && window->ref_count) - scif_put_window(window, window->nr_pages); - - if (!window->ref_count) { - atomic_inc(&ep->rma_info.tw_refcount); - list_del_init(&window->list); - scif_free_window_offset(ep, window, window->offset); - mutex_unlock(&ep->rma_info.rma_lock); - if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL)) && - scifdev_alive(ep)) { - scif_drain_dma_intr(ep->remote_dev->sdev, - ep->rma_info.dma_chan); - } else { - if (!__scif_dec_pinned_vm_lock(window->mm, - window->nr_pages)) { - __scif_release_mm(window->mm); - window->mm = NULL; - } - } - scif_queue_for_cleanup(window, &scif_info.rma); - mutex_lock(&ep->rma_info.rma_lock); - } - return err; -} - -/** - * scif_send_alloc_request: - * @ep: end point - * @window: self registration window - * - * Send a remote window allocation request - */ -static int scif_send_alloc_request(struct scif_endpt *ep, - struct scif_window *window) -{ - struct scifmsg msg; - struct scif_allocmsg *alloc = &window->alloc_handle; - - /* Set up the Alloc Handle */ - alloc->state = OP_IN_PROGRESS; - init_waitqueue_head(&alloc->allocwq); - - /* Send out an allocation request */ - msg.uop = SCIF_ALLOC_REQ; - msg.payload[1] = window->nr_pages; - msg.payload[2] = (u64)&window->alloc_handle; - return _scif_nodeqp_send(ep->remote_dev, &msg); -} - -/** - * scif_prep_remote_window: - * @ep: end point - * @window: self registration window - * - * Send a remote window allocation request, wait for an allocation response, - * and prepares the remote window by copying over the page lists - */ -static int scif_prep_remote_window(struct scif_endpt *ep, - struct scif_window *window) -{ - struct scifmsg msg; - struct scif_window *remote_window; - struct scif_allocmsg *alloc = &window->alloc_handle; - dma_addr_t *dma_phys_lookup, *tmp, *num_pages_lookup, *tmp1; - int i = 0, j = 0; - int nr_contig_chunks, loop_nr_contig_chunks; - int remaining_nr_contig_chunks, nr_lookup; - int err, map_err; - - map_err = scif_map_window(ep->remote_dev, window); - if (map_err) - dev_err(&ep->remote_dev->sdev->dev, - "%s %d map_err %d\n", __func__, __LINE__, map_err); - remaining_nr_contig_chunks = window->nr_contig_chunks; 
- nr_contig_chunks = window->nr_contig_chunks; -retry: - /* Wait for a SCIF_ALLOC_GNT/REJ message */ - err = wait_event_timeout(alloc->allocwq, - alloc->state != OP_IN_PROGRESS, - SCIF_NODE_ALIVE_TIMEOUT); - mutex_lock(&ep->rma_info.rma_lock); - /* Synchronize with the thread waking up allocwq */ - mutex_unlock(&ep->rma_info.rma_lock); - if (!err && scifdev_alive(ep)) - goto retry; - - if (!err) - err = -ENODEV; - - if (err > 0) - err = 0; - else - return err; - - /* Bail out. The remote end rejected this request */ - if (alloc->state == OP_FAILED) - return -ENOMEM; - - if (map_err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, map_err); - msg.uop = SCIF_FREE_VIRT; - msg.src = ep->port; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = window->alloc_handle.vaddr; - msg.payload[2] = (u64)window; - msg.payload[3] = SCIF_REGISTER; - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) - err = _scif_nodeqp_send(ep->remote_dev, &msg); - else - err = -ENOTCONN; - spin_unlock(&ep->lock); - return err; - } - - remote_window = scif_ioremap(alloc->phys_addr, sizeof(*window), - ep->remote_dev); - - /* Compute the number of lookup entries. 21 == 2MB Shift */ - nr_lookup = ALIGN(nr_contig_chunks, SCIF_NR_ADDR_IN_PAGE) - >> ilog2(SCIF_NR_ADDR_IN_PAGE); - - dma_phys_lookup = - scif_ioremap(remote_window->dma_addr_lookup.offset, - nr_lookup * - sizeof(*remote_window->dma_addr_lookup.lookup), - ep->remote_dev); - num_pages_lookup = - scif_ioremap(remote_window->num_pages_lookup.offset, - nr_lookup * - sizeof(*remote_window->num_pages_lookup.lookup), - ep->remote_dev); - - while (remaining_nr_contig_chunks) { - loop_nr_contig_chunks = min_t(int, remaining_nr_contig_chunks, - (int)SCIF_NR_ADDR_IN_PAGE); - /* #1/2 - Copy physical addresses over to the remote side */ - - /* #2/2 - Copy DMA addresses (addresses that are fed into the - * DMA engine) We transfer bus addresses which are then - * converted into a MIC physical address on the remote - * side if it is a MIC, if the remote node is a mgmt node we - * transfer the MIC physical address - */ - tmp = scif_ioremap(dma_phys_lookup[j], - loop_nr_contig_chunks * - sizeof(*window->dma_addr), - ep->remote_dev); - tmp1 = scif_ioremap(num_pages_lookup[j], - loop_nr_contig_chunks * - sizeof(*window->num_pages), - ep->remote_dev); - if (scif_is_mgmt_node()) { - memcpy_toio((void __force __iomem *)tmp, - &window->dma_addr[i], loop_nr_contig_chunks - * sizeof(*window->dma_addr)); - memcpy_toio((void __force __iomem *)tmp1, - &window->num_pages[i], loop_nr_contig_chunks - * sizeof(*window->num_pages)); - } else { - if (scifdev_is_p2p(ep->remote_dev)) { - /* - * add remote node's base address for this node - * to convert it into a MIC address - */ - int m; - dma_addr_t dma_addr; - - for (m = 0; m < loop_nr_contig_chunks; m++) { - dma_addr = window->dma_addr[i + m] + - ep->remote_dev->base_addr; - writeq(dma_addr, - (void __force __iomem *)&tmp[m]); - } - memcpy_toio((void __force __iomem *)tmp1, - &window->num_pages[i], - loop_nr_contig_chunks - * sizeof(*window->num_pages)); - } else { - /* Mgmt node or loopback - transfer DMA - * addresses as is, this is the same as a - * MIC physical address (we use the dma_addr - * and not the phys_addr array since the - * phys_addr is only setup if there is a mmap() - * request from the mgmt node) - */ - memcpy_toio((void __force __iomem *)tmp, - &window->dma_addr[i], - loop_nr_contig_chunks * - sizeof(*window->dma_addr)); - memcpy_toio((void __force __iomem *)tmp1, - &window->num_pages[i], 
- loop_nr_contig_chunks * - sizeof(*window->num_pages)); - } - } - remaining_nr_contig_chunks -= loop_nr_contig_chunks; - i += loop_nr_contig_chunks; - j++; - scif_iounmap(tmp, loop_nr_contig_chunks * - sizeof(*window->dma_addr), ep->remote_dev); - scif_iounmap(tmp1, loop_nr_contig_chunks * - sizeof(*window->num_pages), ep->remote_dev); - } - - /* Prepare the remote window for the peer */ - remote_window->peer_window = (u64)window; - remote_window->offset = window->offset; - remote_window->prot = window->prot; - remote_window->nr_contig_chunks = nr_contig_chunks; - remote_window->ep = ep->remote_ep; - scif_iounmap(num_pages_lookup, - nr_lookup * - sizeof(*remote_window->num_pages_lookup.lookup), - ep->remote_dev); - scif_iounmap(dma_phys_lookup, - nr_lookup * - sizeof(*remote_window->dma_addr_lookup.lookup), - ep->remote_dev); - scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev); - window->peer_window = alloc->vaddr; - return err; -} - -/** - * scif_send_scif_register: - * @ep: end point - * @window: self registration window - * - * Send a SCIF_REGISTER message if EP is connected and wait for a - * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT - * message so that the peer can free its remote window allocated earlier. - */ -static int scif_send_scif_register(struct scif_endpt *ep, - struct scif_window *window) -{ - int err = 0; - struct scifmsg msg; - - msg.src = ep->port; - msg.payload[0] = ep->remote_ep; - msg.payload[1] = window->alloc_handle.vaddr; - msg.payload[2] = (u64)window; - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) { - msg.uop = SCIF_REGISTER; - window->reg_state = OP_IN_PROGRESS; - err = _scif_nodeqp_send(ep->remote_dev, &msg); - spin_unlock(&ep->lock); - if (!err) { -retry: - /* Wait for a SCIF_REGISTER_(N)ACK message */ - err = wait_event_timeout(window->regwq, - window->reg_state != - OP_IN_PROGRESS, - SCIF_NODE_ALIVE_TIMEOUT); - if (!err && scifdev_alive(ep)) - goto retry; - err = !err ? -ENODEV : 0; - if (window->reg_state == OP_FAILED) - err = -ENOTCONN; - } - } else { - msg.uop = SCIF_FREE_VIRT; - msg.payload[3] = SCIF_REGISTER; - err = _scif_nodeqp_send(ep->remote_dev, &msg); - spin_unlock(&ep->lock); - if (!err) - err = -ENOTCONN; - } - return err; -} - -/** - * scif_get_window_offset: - * @ep: end point descriptor - * @flags: flags - * @offset: offset hint - * @num_pages: number of pages - * @out_offset: computed offset returned by reference. - * - * Compute/Claim a new offset for this EP. - */ -int scif_get_window_offset(struct scif_endpt *ep, int flags, s64 offset, - int num_pages, s64 *out_offset) -{ - s64 page_index; - struct iova *iova_ptr; - int err = 0; - - if (flags & SCIF_MAP_FIXED) { - page_index = SCIF_IOVA_PFN(offset); - iova_ptr = reserve_iova(&ep->rma_info.iovad, page_index, - page_index + num_pages - 1); - if (!iova_ptr) - err = -EADDRINUSE; - } else { - iova_ptr = alloc_iova(&ep->rma_info.iovad, num_pages, - SCIF_DMA_63BIT_PFN - 1, 0); - if (!iova_ptr) - err = -ENOMEM; - } - if (!err) - *out_offset = (iova_ptr->pfn_lo) << PAGE_SHIFT; - return err; -} - -/** - * scif_free_window_offset: - * @ep: end point descriptor - * @window: registration window - * @offset: Offset to be freed - * - * Free offset for this EP. The callee is supposed to grab - * the RMA mutex before calling this API. 
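scif_get_window_offset() above delegates the actual range bookkeeping to the IOVA allocator: reserve_iova() for SCIF_MAP_FIXED requests at an exact page range, alloc_iova() otherwise. A condensed sketch of that split; the demo_* names and DEMO_LIMIT_PFN are placeholders, and the domain is assumed to be initialized as in scif_rma_ep_init():

#include <linux/errno.h>
#include <linux/iova.h>

#define DEMO_LIMIT_PFN (1UL << 30)	/* placeholder upper bound */

static int demo_get_offset(struct iova_domain *iovad, bool fixed,
			   unsigned long pfn, unsigned long nr_pages,
			   unsigned long *out_pfn)
{
	struct iova *iv;

	if (fixed)	/* the caller asked for this exact range */
		iv = reserve_iova(iovad, pfn, pfn + nr_pages - 1);
	else		/* any free range below the limit will do */
		iv = alloc_iova(iovad, nr_pages, DEMO_LIMIT_PFN - 1, 0);
	if (!iv)
		return fixed ? -EADDRINUSE : -ENOMEM;
	*out_pfn = iv->pfn_lo;
	return 0;
}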
- */ -void scif_free_window_offset(struct scif_endpt *ep, - struct scif_window *window, s64 offset) -{ - if ((window && !window->offset_freed) || !window) { - free_iova(&ep->rma_info.iovad, offset >> PAGE_SHIFT); - if (window) - window->offset_freed = true; - } -} - -/** - * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remote side is requesting a memory allocation. - */ -void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg) -{ - int err; - struct scif_window *window = NULL; - int nr_pages = msg->payload[1]; - - window = scif_create_remote_window(scifdev, nr_pages); - if (!window) { - err = -ENOMEM; - goto error; - } - - /* The peer's allocation request is granted */ - msg->uop = SCIF_ALLOC_GNT; - msg->payload[0] = (u64)window; - msg->payload[1] = window->mapped_offset; - err = scif_nodeqp_send(scifdev, msg); - if (err) - scif_destroy_remote_window(window); - return; -error: - /* The peer's allocation request is rejected */ - dev_err(&scifdev->sdev->dev, - "%s %d error %d alloc_ptr %p nr_pages 0x%x\n", - __func__, __LINE__, err, window, nr_pages); - msg->uop = SCIF_ALLOC_REJ; - scif_nodeqp_send(scifdev, msg); -} - -/** - * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remote side responded to a memory allocation. - */ -void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_allocmsg *handle = (struct scif_allocmsg *)msg->payload[2]; - struct scif_window *window = container_of(handle, struct scif_window, - alloc_handle); - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - - mutex_lock(&ep->rma_info.rma_lock); - handle->vaddr = msg->payload[0]; - handle->phys_addr = msg->payload[1]; - if (msg->uop == SCIF_ALLOC_GNT) - handle->state = OP_COMPLETED; - else - handle->state = OP_FAILED; - wake_up(&handle->allocwq); - mutex_unlock(&ep->rma_info.rma_lock); -} - -/** - * scif_free_virt: Respond to SCIF_FREE_VIRT interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Free up memory kmalloc'd earlier. - */ -void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_window *window = (struct scif_window *)msg->payload[1]; - - scif_destroy_remote_window(window); -} - -static void -scif_fixup_aper_base(struct scif_dev *dev, struct scif_window *window) -{ - int j; - struct scif_hw_dev *sdev = dev->sdev; - phys_addr_t apt_base = 0; - - /* - * Add the aperture base if the DMA address is not card relative - * since the DMA addresses need to be an offset into the bar - */ - if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && - sdev->aper && !sdev->card_rel_da) - apt_base = sdev->aper->pa; - else - return; - - for (j = 0; j < window->nr_contig_chunks; j++) { - if (window->num_pages[j]) - window->dma_addr[j] += apt_base; - else - break; - } -} - -/** - * scif_recv_reg: Respond to SCIF_REGISTER interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Update remote window list with a new registered window. 
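scif_alloc_req() above also shows the reply convention these handlers share: the incoming scifmsg is reused as the response, with only the opcode rewritten, so the cookies carried in the payload survive the round trip untouched. Schematically, with hypothetical demo_* names:

enum demo_uop { DEMO_ALLOC_REQ, DEMO_ALLOC_GNT, DEMO_ALLOC_REJ };

struct demo_msg {
	enum demo_uop uop;
	unsigned long payload[4];	/* cookies echoed back verbatim */
};

static void demo_handle_alloc_req(struct demo_msg *msg, bool granted,
				  void (*send)(struct demo_msg *))
{
	/* Reuse the incoming message; only the opcode changes. */
	msg->uop = granted ? DEMO_ALLOC_GNT : DEMO_ALLOC_REJ;
	send(msg);
}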
- */ -void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; - struct scif_window *window = - (struct scif_window *)msg->payload[1]; - - mutex_lock(&ep->rma_info.rma_lock); - spin_lock(&ep->lock); - if (ep->state == SCIFEP_CONNECTED) { - msg->uop = SCIF_REGISTER_ACK; - scif_nodeqp_send(ep->remote_dev, msg); - scif_fixup_aper_base(ep->remote_dev, window); - /* No further failures expected. Insert new window */ - scif_insert_window(window, &ep->rma_info.remote_reg_list); - } else { - msg->uop = SCIF_REGISTER_NACK; - scif_nodeqp_send(ep->remote_dev, msg); - } - spin_unlock(&ep->lock); - mutex_unlock(&ep->rma_info.rma_lock); - /* free up any lookup resources now that page lists are transferred */ - scif_destroy_remote_lookup(ep->remote_dev, window); - /* - * We could not insert the window but we need to - * destroy the window. - */ - if (msg->uop == SCIF_REGISTER_NACK) - scif_destroy_remote_window(window); -} - -/** - * scif_recv_unreg: Respond to SCIF_UNREGISTER interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Remove window from remote registration list; - */ -void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_rma_req req; - struct scif_window *window = NULL; - struct scif_window *recv_window = - (struct scif_window *)msg->payload[0]; - struct scif_endpt *ep; - int del_window = 0; - - ep = (struct scif_endpt *)recv_window->ep; - req.out_window = &window; - req.offset = recv_window->offset; - req.prot = 0; - req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; - req.type = SCIF_WINDOW_FULL; - req.head = &ep->rma_info.remote_reg_list; - msg->payload[0] = ep->remote_ep; - - mutex_lock(&ep->rma_info.rma_lock); - /* Does a valid window exist? */ - if (scif_query_window(&req)) { - dev_err(&scifdev->sdev->dev, - "%s %d -ENXIO\n", __func__, __LINE__); - msg->uop = SCIF_UNREGISTER_ACK; - goto error; - } - if (window) { - if (window->ref_count) - scif_put_window(window, window->nr_pages); - else - dev_err(&scifdev->sdev->dev, - "%s %d ref count should be +ve\n", - __func__, __LINE__); - window->unreg_state = OP_COMPLETED; - if (!window->ref_count) { - msg->uop = SCIF_UNREGISTER_ACK; - atomic_inc(&ep->rma_info.tw_refcount); - ep->rma_info.async_list_del = 1; - list_del_init(&window->list); - del_window = 1; - } else { - /* NACK! There are valid references to this window */ - msg->uop = SCIF_UNREGISTER_NACK; - } - } else { - /* The window did not make its way to the list at all. ACK */ - msg->uop = SCIF_UNREGISTER_ACK; - scif_destroy_remote_window(recv_window); - } -error: - mutex_unlock(&ep->rma_info.rma_lock); - if (del_window) - scif_drain_dma_intr(ep->remote_dev->sdev, - ep->rma_info.dma_chan); - scif_nodeqp_send(ep->remote_dev, msg); - if (del_window) - scif_queue_for_cleanup(window, &scif_info.rma); -} - -/** - * scif_recv_reg_ack: Respond to SCIF_REGISTER_ACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Wake up the window waiting to complete registration. 
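[The unregister path above ACKs only once the last reference is gone; otherwise the peer gets a NACK and the window stays on the remote list. The decision reduces to a reference-count drop, roughly as below; this is a hedged stand-in, not the kernel helper:

#include <stdbool.h>

struct win { int ref_count; };

/* Drop nr_pages references; true means ACK (the window may be torn down),
 * false means NACK (live references remain), as in scif_recv_unreg(). */
static bool unreg_may_ack(struct win *w, int nr_pages)
{
    if (w->ref_count >= nr_pages)
        w->ref_count -= nr_pages;
    return w->ref_count == 0;
}
]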
- */ -void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_window *window = - (struct scif_window *)msg->payload[2]; - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - - mutex_lock(&ep->rma_info.rma_lock); - window->reg_state = OP_COMPLETED; - wake_up(&window->regwq); - mutex_unlock(&ep->rma_info.rma_lock); -} - -/** - * scif_recv_reg_nack: Respond to SCIF_REGISTER_NACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Wake up the window waiting to inform it that registration - * cannot be completed. - */ -void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_window *window = - (struct scif_window *)msg->payload[2]; - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - - mutex_lock(&ep->rma_info.rma_lock); - window->reg_state = OP_FAILED; - wake_up(&window->regwq); - mutex_unlock(&ep->rma_info.rma_lock); -} - -/** - * scif_recv_unreg_ack: Respond to SCIF_UNREGISTER_ACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Wake up the window waiting to complete unregistration. - */ -void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_window *window = - (struct scif_window *)msg->payload[1]; - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - - mutex_lock(&ep->rma_info.rma_lock); - window->unreg_state = OP_COMPLETED; - wake_up(&window->unregwq); - mutex_unlock(&ep->rma_info.rma_lock); -} - -/** - * scif_recv_unreg_nack: Respond to SCIF_UNREGISTER_NACK interrupt message - * @scifdev: SCIF device - * @msg: Interrupt message - * - * Wake up the window waiting to inform it that unregistration - * cannot be completed immediately. - */ -void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg) -{ - struct scif_window *window = - (struct scif_window *)msg->payload[1]; - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - - mutex_lock(&ep->rma_info.rma_lock); - window->unreg_state = OP_FAILED; - wake_up(&window->unregwq); - mutex_unlock(&ep->rma_info.rma_lock); -} - -int __scif_pin_pages(void *addr, size_t len, int *out_prot, - int map_flags, scif_pinned_pages_t *pages) -{ - struct scif_pinned_pages *pinned_pages; - int nr_pages, err = 0, i; - bool vmalloc_addr = false; - bool try_upgrade = false; - int prot = *out_prot; - int ulimit = 0; - struct mm_struct *mm = NULL; - - /* Unsupported flags */ - if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT)) - return -EINVAL; - ulimit = !!(map_flags & SCIF_MAP_ULIMIT); - - /* Unsupported protection requested */ - if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) - return -EINVAL; - - /* addr/len must be page aligned. len should be non zero */ - if (!len || - (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || - (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) - return -EINVAL; - - might_sleep(); - - nr_pages = len >> PAGE_SHIFT; - - /* Allocate a set of pinned pages */ - pinned_pages = scif_create_pinned_pages(nr_pages, prot); - if (!pinned_pages) - return -ENOMEM; - - if (map_flags & SCIF_MAP_KERNEL) { - if (is_vmalloc_addr(addr)) - vmalloc_addr = true; - - for (i = 0; i < nr_pages; i++) { - if (vmalloc_addr) - pinned_pages->pages[i] = - vmalloc_to_page(addr + (i * PAGE_SIZE)); - else - pinned_pages->pages[i] = - virt_to_page(addr + (i * PAGE_SIZE)); - } - pinned_pages->nr_pages = nr_pages; - pinned_pages->map_flags = SCIF_MAP_KERNEL; - } else { - /* - * SCIF supports registration caching. 
If a registration has - * been requested with read only permissions, then we try - * to pin the pages with RW permissions so that a subsequent - * transfer with RW permission can hit the cache instead of - * invalidating it. If the upgrade fails with RW then we - * revert back to R permission and retry - */ - if (prot == SCIF_PROT_READ) - try_upgrade = true; - prot |= SCIF_PROT_WRITE; -retry: - mm = current->mm; - if (ulimit) { - err = __scif_check_inc_pinned_vm(mm, nr_pages); - if (err) { - pinned_pages->nr_pages = 0; - goto error_unmap; - } - } - - pinned_pages->nr_pages = pin_user_pages_fast( - (u64)addr, - nr_pages, - (prot & SCIF_PROT_WRITE) ? FOLL_WRITE : 0, - pinned_pages->pages); - if (nr_pages != pinned_pages->nr_pages) { - if (pinned_pages->nr_pages < 0) - pinned_pages->nr_pages = 0; - if (try_upgrade) { - if (ulimit) - __scif_dec_pinned_vm_lock(mm, nr_pages); - /* Roll back any pinned pages */ - unpin_user_pages(pinned_pages->pages, - pinned_pages->nr_pages); - prot &= ~SCIF_PROT_WRITE; - try_upgrade = false; - goto retry; - } - } - pinned_pages->map_flags = 0; - } - - if (pinned_pages->nr_pages < nr_pages) { - err = -EFAULT; - goto dec_pinned; - } - - *out_prot = prot; - atomic_set(&pinned_pages->ref_count, 1); - *pages = pinned_pages; - return err; -dec_pinned: - if (ulimit) - __scif_dec_pinned_vm_lock(mm, nr_pages); - /* Something went wrong! Rollback */ -error_unmap: - scif_destroy_pinned_pages(pinned_pages); - *pages = NULL; - dev_dbg(scif_info.mdev.this_device, - "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len); - return err; -} - -int scif_pin_pages(void *addr, size_t len, int prot, - int map_flags, scif_pinned_pages_t *pages) -{ - return __scif_pin_pages(addr, len, &prot, map_flags, pages); -} -EXPORT_SYMBOL_GPL(scif_pin_pages); - -int scif_unpin_pages(scif_pinned_pages_t pinned_pages) -{ - int err = 0, ret; - - if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic) - return -EINVAL; - - ret = atomic_sub_return(1, &pinned_pages->ref_count); - if (ret < 0) { - dev_err(scif_info.mdev.this_device, - "%s %d scif_unpin_pages called without pinning? rc %d\n", - __func__, __LINE__, ret); - return -EINVAL; - } - /* - * Destroy the window if the ref count for this set of pinned - * pages has dropped to zero. If it is positive then there is - * a valid registered window which is backed by these pages and - * it will be destroyed once all such windows are unregistered. - */ - if (!ret) - err = scif_destroy_pinned_pages(pinned_pages); - - return err; -} -EXPORT_SYMBOL_GPL(scif_unpin_pages); - -static inline void -scif_insert_local_window(struct scif_window *window, struct scif_endpt *ep) -{ - mutex_lock(&ep->rma_info.rma_lock); - scif_insert_window(window, &ep->rma_info.reg_list); - mutex_unlock(&ep->rma_info.rma_lock); -} - -off_t scif_register_pinned_pages(scif_epd_t epd, - scif_pinned_pages_t pinned_pages, - off_t offset, int map_flags) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - s64 computed_offset; - struct scif_window *window; - int err; - size_t len; - struct device *spdev; - - /* Unsupported flags */ - if (map_flags & ~SCIF_MAP_FIXED) - return -EINVAL; - - len = pinned_pages->nr_pages << PAGE_SHIFT; - - /* - * Offset is not page aligned/negative or offset+len - * wraps around with SCIF_MAP_FIXED. 
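[The SCIF_MAP_FIXED checks that follow reject a negative or non-page-aligned offset and any offset + len that would wrap past LONG_MAX. Pulled out as a standalone predicate, with PAGE_SIZE hard-coded purely for illustration:

#include <stdbool.h>
#include <stddef.h>
#include <limits.h>

#define PAGE_SIZE 4096UL

static bool fixed_offset_ok(long offset, size_t len)
{
    if (offset < 0)
        return false;               /* negative offsets are rejected */
    if (offset & (PAGE_SIZE - 1))
        return false;               /* must be page aligned */
    if (len > LONG_MAX - offset)
        return false;               /* offset + len must not wrap */
    return true;
}
]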
- */ - if ((map_flags & SCIF_MAP_FIXED) && - ((ALIGN(offset, PAGE_SIZE) != offset) || - (offset < 0) || - (len > LONG_MAX - offset))) - return -EINVAL; - - might_sleep(); - - err = scif_verify_epd(ep); - if (err) - return err; - /* - * It is an error to pass pinned_pages to scif_register_pinned_pages() - * after calling scif_unpin_pages(). - */ - if (!atomic_add_unless(&pinned_pages->ref_count, 1, 0)) - return -EINVAL; - - /* Compute the offset for this registration */ - err = scif_get_window_offset(ep, map_flags, offset, - len, &computed_offset); - if (err) { - atomic_sub(1, &pinned_pages->ref_count); - return err; - } - - /* Allocate and prepare self registration window */ - window = scif_create_window(ep, pinned_pages->nr_pages, - computed_offset, false); - if (!window) { - atomic_sub(1, &pinned_pages->ref_count); - scif_free_window_offset(ep, NULL, computed_offset); - return -ENOMEM; - } - - window->pinned_pages = pinned_pages; - window->nr_pages = pinned_pages->nr_pages; - window->prot = pinned_pages->prot; - - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - scif_destroy_window(ep, window); - return err; - } - err = scif_send_alloc_request(ep, window); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error_unmap; - } - - /* Prepare the remote registration window */ - err = scif_prep_remote_window(ep, window); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error_unmap; - } - - /* Tell the peer about the new window */ - err = scif_send_scif_register(ep, window); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error_unmap; - } - - scif_put_peer_dev(spdev); - /* No further failures expected. Insert new window */ - scif_insert_local_window(window, ep); - return computed_offset; -error_unmap: - scif_destroy_window(ep, window); - scif_put_peer_dev(spdev); - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - return err; -} -EXPORT_SYMBOL_GPL(scif_register_pinned_pages); - -off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, - int prot, int map_flags) -{ - scif_pinned_pages_t pinned_pages; - off_t err; - struct scif_endpt *ep = (struct scif_endpt *)epd; - s64 computed_offset; - struct scif_window *window; - struct mm_struct *mm = NULL; - struct device *spdev; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI register: ep %p addr %p len 0x%lx offset 0x%lx prot 0x%x map_flags 0x%x\n", - epd, addr, len, offset, prot, map_flags); - /* Unsupported flags */ - if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL)) - return -EINVAL; - - /* - * Offset is not page aligned/negative or offset+len - * wraps around with SCIF_MAP_FIXED. - */ - if ((map_flags & SCIF_MAP_FIXED) && - ((ALIGN(offset, PAGE_SIZE) != offset) || - (offset < 0) || - (len > LONG_MAX - offset))) - return -EINVAL; - - /* Unsupported protection requested */ - if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) - return -EINVAL; - - /* addr/len must be page aligned. 
len should be non zero */ - if (!len || (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || - (ALIGN(len, PAGE_SIZE) != len)) - return -EINVAL; - - might_sleep(); - - err = scif_verify_epd(ep); - if (err) - return err; - - /* Compute the offset for this registration */ - err = scif_get_window_offset(ep, map_flags, offset, - len >> PAGE_SHIFT, &computed_offset); - if (err) - return err; - - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - scif_free_window_offset(ep, NULL, computed_offset); - return err; - } - /* Allocate and prepare self registration window */ - window = scif_create_window(ep, len >> PAGE_SHIFT, - computed_offset, false); - if (!window) { - scif_free_window_offset(ep, NULL, computed_offset); - scif_put_peer_dev(spdev); - return -ENOMEM; - } - - window->nr_pages = len >> PAGE_SHIFT; - - err = scif_send_alloc_request(ep, window); - if (err) { - scif_destroy_incomplete_window(ep, window); - scif_put_peer_dev(spdev); - return err; - } - - if (!(map_flags & SCIF_MAP_KERNEL)) { - mm = __scif_acquire_mm(); - map_flags |= SCIF_MAP_ULIMIT; - } - /* Pin down the pages */ - err = __scif_pin_pages(addr, len, &prot, - map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT), - &pinned_pages); - if (err) { - scif_destroy_incomplete_window(ep, window); - __scif_release_mm(mm); - goto error; - } - - window->pinned_pages = pinned_pages; - window->prot = pinned_pages->prot; - window->mm = mm; - - /* Prepare the remote registration window */ - err = scif_prep_remote_window(ep, window); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %ld\n", __func__, __LINE__, err); - goto error_unmap; - } - - /* Tell the peer about the new window */ - err = scif_send_scif_register(ep, window); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %ld\n", __func__, __LINE__, err); - goto error_unmap; - } - - scif_put_peer_dev(spdev); - /* No further failures expected. Insert new window */ - scif_insert_local_window(window, ep); - dev_dbg(&ep->remote_dev->sdev->dev, - "SCIFAPI register: ep %p addr %p len 0x%lx computed_offset 0x%llx\n", - epd, addr, len, computed_offset); - return computed_offset; -error_unmap: - scif_destroy_window(ep, window); -error: - scif_put_peer_dev(spdev); - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %ld\n", __func__, __LINE__, err); - return err; -} -EXPORT_SYMBOL_GPL(scif_register); - -int -scif_unregister(scif_epd_t epd, off_t offset, size_t len) -{ - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct scif_window *window = NULL; - struct scif_rma_req req; - int nr_pages, err; - struct device *spdev; - - dev_dbg(scif_info.mdev.this_device, - "SCIFAPI unregister: ep %p offset 0x%lx len 0x%lx\n", - ep, offset, len); - /* len must be page aligned. len should be non zero */ - if (!len || - (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) - return -EINVAL; - - /* Offset is not page aligned or offset+len wraps around */ - if ((ALIGN(offset, PAGE_SIZE) != offset) || - (offset < 0) || - (len > LONG_MAX - offset)) - return -EINVAL; - - err = scif_verify_epd(ep); - if (err) - return err; - - might_sleep(); - nr_pages = len >> PAGE_SHIFT; - - req.out_window = &window; - req.offset = offset; - req.prot = 0; - req.nr_bytes = len; - req.type = SCIF_WINDOW_FULL; - req.head = &ep->rma_info.reg_list; - - spdev = scif_get_peer_dev(ep->remote_dev); - if (IS_ERR(spdev)) { - err = PTR_ERR(spdev); - return err; - } - mutex_lock(&ep->rma_info.rma_lock); - /* Does a valid window exist? 
*/ - err = scif_query_window(&req); - if (err) { - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); - goto error; - } - /* Unregister all the windows in this range */ - err = scif_rma_list_unregister(window, offset, nr_pages); - if (err) - dev_err(&ep->remote_dev->sdev->dev, - "%s %d err %d\n", __func__, __LINE__, err); -error: - mutex_unlock(&ep->rma_info.rma_lock); - scif_put_peer_dev(spdev); - return err; -} -EXPORT_SYMBOL_GPL(scif_unregister); diff --git a/drivers/misc/mic/scif/scif_rma.h b/drivers/misc/mic/scif/scif_rma.h deleted file mode 100644 index 964dd0fc3657..000000000000 --- a/drivers/misc/mic/scif/scif_rma.h +++ /dev/null @@ -1,477 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Intel SCIF driver. 
- * - */ -#ifndef SCIF_RMA_H -#define SCIF_RMA_H - -#include -#include - -#include "../bus/scif_bus.h" - -/* If this bit is set then the mark is a remote fence mark */ -#define SCIF_REMOTE_FENCE_BIT 31 -/* Magic value used to indicate a remote fence request */ -#define SCIF_REMOTE_FENCE BIT_ULL(SCIF_REMOTE_FENCE_BIT) - -#define SCIF_MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL) -#define SCIF_KMEM_UNALIGNED_BUF_SIZE (SCIF_MAX_UNALIGNED_BUF_SIZE + \ - (L1_CACHE_BYTES << 1)) - -#define SCIF_IOVA_START_PFN (1) -#define SCIF_IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) -#define SCIF_DMA_64BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(64)) -#define SCIF_DMA_63BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(63)) - -/* - * struct scif_endpt_rma_info - Per Endpoint Remote Memory Access Information - * - * @reg_list: List of registration windows for self - * @remote_reg_list: List of registration windows for peer - * @iovad: Offset generator - * @rma_lock: Synchronizes access to self/remote list and also protects the - * window from being destroyed while RMAs are in progress. - * @tc_lock: Synchronizes access to temporary cached windows list - * for SCIF Registration Caching. - * @mmn_lock: Synchronizes access to the list of MMU notifiers registered - * @tw_refcount: Keeps track of number of outstanding temporary registered - * windows created by scif_vreadfrom/scif_vwriteto which have - * not been destroyed. - * @tcw_refcount: Same as tw_refcount but for temporary cached windows - * @tcw_total_pages: Same as tcw_refcount but in terms of pages pinned - * @mmn_list: MMU notifier so that we can destroy the windows when required - * @fence_refcount: Keeps track of number of outstanding remote fence - * requests which have been received by the peer. - * @dma_chan: DMA channel used for all DMA transfers for this endpoint. - * @async_list_del: Detect asynchronous list entry deletion - * @vma_list: List of vmas with remote memory mappings - * @markwq: Wait queue used for scif_fence_mark/scif_fence_wait -*/ -struct scif_endpt_rma_info { - struct list_head reg_list; - struct list_head remote_reg_list; - struct iova_domain iovad; - struct mutex rma_lock; - spinlock_t tc_lock; - struct mutex mmn_lock; - atomic_t tw_refcount; - atomic_t tcw_refcount; - atomic_t tcw_total_pages; - struct list_head mmn_list; - atomic_t fence_refcount; - struct dma_chan *dma_chan; - int async_list_del; - struct list_head vma_list; - wait_queue_head_t markwq; -}; - -/* - * struct scif_fence_info - used for tracking fence requests - * - * @state: State of this transfer - * @wq: Fences wait on this queue - * @dma_mark: Used for storing the DMA mark - */ -struct scif_fence_info { - enum scif_msg_state state; - struct completion comp; - int dma_mark; -}; - -/* - * struct scif_remote_fence_info - used for tracking remote fence requests - * - * @msg: List of SCIF node QP fence messages - * @list: Link to list of remote fence requests - */ -struct scif_remote_fence_info { - struct scifmsg msg; - struct list_head list; -}; - -/* - * Specifies whether an RMA operation can span across partial windows, a single - * window or multiple contiguous windows. Mmaps can span across partial windows. - * Unregistration can span across complete windows. scif_get_pages() can span a - * single window. A window can also be of type self or peer. - */ -enum scif_window_type { - SCIF_WINDOW_PARTIAL, - SCIF_WINDOW_SINGLE, - SCIF_WINDOW_FULL, - SCIF_WINDOW_SELF, - SCIF_WINDOW_PEER -}; - -/* The number of physical addresses that can be stored in a PAGE. 
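[The constant defined next encodes the fan-out used by the RMA lookup tables described below: assuming 4 KiB pages and 8-byte addresses, one page holds 512 DMA addresses (2 MiB of registered memory), and a page of such lookup offsets therefore covers 1 GiB. A quick self-check of that arithmetic:

#include <assert.h>
#include <stdio.h>

#define SCIF_NR_ADDR_IN_PAGE (0x1000 >> 3)   /* 512 addresses per 4 KiB page */

int main(void)
{
    unsigned long per_lookup_page =
        SCIF_NR_ADDR_IN_PAGE * 0x1000UL;     /* 512 * 4 KiB = 2 MiB mapped */
    unsigned long per_page_of_offsets =
        SCIF_NR_ADDR_IN_PAGE * per_lookup_page; /* 512 * 2 MiB = 1 GiB */

    assert(per_lookup_page == 2UL << 20);
    assert(per_page_of_offsets == 1UL << 30);
    printf("%lu %lu\n", per_lookup_page, per_page_of_offsets);
    return 0;
}
]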
*/ -#define SCIF_NR_ADDR_IN_PAGE (0x1000 >> 3) - -/* - * struct scif_rma_lookup - RMA lookup data structure for page list transfers - * - * Store an array of lookup offsets. Each offset in this array maps - * one 4K page containing 512 physical addresses i.e. 2MB. 512 such - * offsets in a 4K page will correspond to 1GB of registered address space. - - * @lookup: Array of offsets - * @offset: DMA offset of lookup array - */ -struct scif_rma_lookup { - dma_addr_t *lookup; - dma_addr_t offset; -}; - -/* - * struct scif_pinned_pages - A set of pinned pages obtained with - * scif_pin_pages() which could be part of multiple registered - * windows across different end points. - * - * @nr_pages: Number of pages which is defined as a s64 instead of an int - * to avoid sign extension with buffers >= 2GB - * @prot: read/write protections - * @map_flags: Flags specified during the pin operation - * @ref_count: Reference count bumped in terms of number of pages - * @magic: A magic value - * @pages: Array of pointers to struct pages populated with get_user_pages(..) - */ -struct scif_pinned_pages { - s64 nr_pages; - int prot; - int map_flags; - atomic_t ref_count; - u64 magic; - struct page **pages; -}; - -/* - * struct scif_status - Stores DMA status update information - * - * @src_dma_addr: Source buffer DMA address - * @val: src location for value to be written to the destination - * @ep: SCIF endpoint - */ -struct scif_status { - dma_addr_t src_dma_addr; - u64 val; - struct scif_endpt *ep; -}; - -/* - * struct scif_cb_arg - Stores the argument of the callback func - * - * @src_dma_addr: Source buffer DMA address - * @status: DMA status - * @ep: SCIF endpoint - */ -struct scif_cb_arg { - dma_addr_t src_dma_addr; - struct scif_status *status; - struct scif_endpt *ep; -}; - -/* - * struct scif_window - Registration Window for Self and Remote - * - * @nr_pages: Number of pages which is defined as a s64 instead of an int - * to avoid sign extension with buffers >= 2GB - * @nr_contig_chunks: Number of contiguous physical chunks - * @prot: read/write protections - * @ref_count: reference count in terms of number of pages - * @magic: Cookie to detect corruption - * @offset: registered offset - * @va_for_temp: va address that this window represents - * @dma_mark: Used to determine if all DMAs against the window are done - * @ep: Pointer to EP. Useful for passing EP around with messages to - avoid expensive list traversals. - * @list: link to list of windows for the endpoint - * @type: self or peer window - * @peer_window: Pointer to peer window. 
Useful for sending messages to peer - * without requiring an extra list traversal - * @unreg_state: unregistration state - * @offset_freed: True if the offset has been freed - * @temp: True for temporary windows created via scif_vreadfrom/scif_vwriteto - * @mm: memory descriptor for the task_struct which initiated the RMA - * @st: scatter gather table for DMA mappings with IOMMU enabled - * @pinned_pages: The set of pinned_pages backing this window - * @alloc_handle: Handle for sending ALLOC_REQ - * @regwq: Wait Queue for an registration (N)ACK - * @reg_state: Registration state - * @unregwq: Wait Queue for an unregistration (N)ACK - * @dma_addr_lookup: Lookup for physical addresses used for DMA - * @nr_lookup: Number of entries in lookup - * @mapped_offset: Offset used to map the window by the peer - * @dma_addr: Array of physical addresses used for Mgmt node & MIC initiated DMA - * @num_pages: Array specifying number of pages for each physical address - */ -struct scif_window { - s64 nr_pages; - int nr_contig_chunks; - int prot; - int ref_count; - u64 magic; - s64 offset; - unsigned long va_for_temp; - int dma_mark; - u64 ep; - struct list_head list; - enum scif_window_type type; - u64 peer_window; - enum scif_msg_state unreg_state; - bool offset_freed; - bool temp; - struct mm_struct *mm; - struct sg_table *st; - union { - struct { - struct scif_pinned_pages *pinned_pages; - struct scif_allocmsg alloc_handle; - wait_queue_head_t regwq; - enum scif_msg_state reg_state; - wait_queue_head_t unregwq; - }; - struct { - struct scif_rma_lookup dma_addr_lookup; - struct scif_rma_lookup num_pages_lookup; - int nr_lookup; - dma_addr_t mapped_offset; - }; - }; - dma_addr_t *dma_addr; - u64 *num_pages; -} __packed; - -/* - * scif_mmu_notif - SCIF mmu notifier information - * - * @mmu_notifier ep_mmu_notifier: MMU notifier operations - * @tc_reg_list: List of temp registration windows for self - * @mm: memory descriptor for the task_struct which initiated the RMA - * @ep: SCIF endpoint - * @list: link to list of MMU notifier information - */ -struct scif_mmu_notif { -#ifdef CONFIG_MMU_NOTIFIER - struct mmu_notifier ep_mmu_notifier; -#endif - struct list_head tc_reg_list; - struct mm_struct *mm; - struct scif_endpt *ep; - struct list_head list; -}; - -enum scif_rma_dir { - SCIF_LOCAL_TO_REMOTE, - SCIF_REMOTE_TO_LOCAL -}; - -extern struct kmem_cache *unaligned_cache; -/* Initialize RMA for this EP */ -void scif_rma_ep_init(struct scif_endpt *ep); -/* Check if epd can be uninitialized */ -int scif_rma_ep_can_uninit(struct scif_endpt *ep); -/* Obtain a new offset. Callee must grab RMA lock */ -int scif_get_window_offset(struct scif_endpt *ep, int flags, - s64 offset, int nr_pages, s64 *out_offset); -/* Free offset. 
Callee must grab RMA lock */ -void scif_free_window_offset(struct scif_endpt *ep, - struct scif_window *window, s64 offset); -/* Create self registration window */ -struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, - s64 offset, bool temp); -/* Destroy self registration window.*/ -int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window); -void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window); -/* Map pages of self window to Aperture/PCI */ -int scif_map_window(struct scif_dev *remote_dev, - struct scif_window *window); -/* Unregister a self window */ -int scif_unregister_window(struct scif_window *window); -/* Destroy remote registration window */ -void -scif_destroy_remote_window(struct scif_window *window); -/* remove valid remote memory mappings from process address space */ -void scif_zap_mmaps(int node); -/* Query if any applications have remote memory mappings */ -bool scif_rma_do_apps_have_mmaps(int node); -/* Cleanup remote registration lists for zombie endpoints */ -void scif_cleanup_rma_for_zombies(int node); -/* Reserve a DMA channel for a particular endpoint */ -int scif_reserve_dma_chan(struct scif_endpt *ep); -/* Setup a DMA mark for an endpoint */ -int _scif_fence_mark(scif_epd_t epd, int *mark); -int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, - enum scif_window_type type); -void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg); -void scif_mmu_notif_handler(struct work_struct *work); -void scif_rma_handle_remote_fences(void); -void scif_rma_destroy_windows(void); -void scif_rma_destroy_tcw_invalid(void); -int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan); - -struct scif_window_iter { - s64 offset; - int index; -}; - -static inline void -scif_init_window_iter(struct scif_window *window, struct scif_window_iter *iter) -{ - iter->offset = window->offset; - iter->index = 0; -} - -dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off, - size_t *nr_bytes, - struct scif_window_iter *iter); -static inline -dma_addr_t __scif_off_to_dma_addr(struct scif_window *window, s64 off) -{ - return scif_off_to_dma_addr(window, off, NULL, NULL); -} - -static inline bool scif_unaligned(off_t src_offset, off_t dst_offset) -{ - src_offset = src_offset & (L1_CACHE_BYTES - 1); - dst_offset = dst_offset & (L1_CACHE_BYTES - 1); - return 
!(src_offset == dst_offset); -} - -/* - * scif_zalloc: - * @size: Size of the allocation request. - * - * Helper API which attempts to allocate zeroed pages via - * __get_free_pages(..) first and then falls back on - * vzalloc(..) if that fails. - */ -static inline void *scif_zalloc(size_t size) -{ - void *ret = NULL; - size_t align = ALIGN(size, PAGE_SIZE); - - if (align && get_order(align) < MAX_ORDER) - ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(align)); - return ret ? ret : vzalloc(align); -} - -/* - * scif_free: - * @addr: Address to be freed. - * @size: Size of the allocation. - * Helper API which frees memory allocated via scif_zalloc(). - */ -static inline void scif_free(void *addr, size_t size) -{ - size_t align = ALIGN(size, PAGE_SIZE); - - if (is_vmalloc_addr(addr)) - vfree(addr); - else - free_pages((unsigned long)addr, get_order(align)); -} - -static inline void scif_get_window(struct scif_window *window, int nr_pages) -{ - window->ref_count += nr_pages; -} - -static inline void scif_put_window(struct scif_window *window, int nr_pages) -{ - window->ref_count -= nr_pages; -} - -static inline void scif_set_window_ref(struct scif_window *window, int nr_pages) -{ - window->ref_count = nr_pages; -} - -static inline void -scif_queue_for_cleanup(struct scif_window *window, struct list_head *list) -{ - spin_lock(&scif_info.rmalock); - list_add_tail(&window->list, list); - spin_unlock(&scif_info.rmalock); - schedule_work(&scif_info.misc_work); -} - -static inline void __scif_rma_destroy_tcw_helper(struct scif_window *window) -{ - list_del_init(&window->list); - scif_queue_for_cleanup(window, &scif_info.rma_tc); -} - -static inline bool scif_is_iommu_enabled(void) -{ -#ifdef CONFIG_INTEL_IOMMU - return intel_iommu_enabled; -#else - return false; -#endif -} -#endif /* SCIF_RMA_H */ diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c deleted file mode 100644 index ef923ba134c8..000000000000 --- a/drivers/misc/mic/scif/scif_rma_list.c +++ /dev/null @@ -1,282 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel SCIF driver. - */ -#include "scif_main.h" -#include -#include - -/* - * scif_insert_tcw: - * - * Insert a temp window to the temp registration list sorted by va_for_temp. - * RMA lock must be held. - */ -void scif_insert_tcw(struct scif_window *window, struct list_head *head) -{ - struct scif_window *curr = NULL; - struct scif_window *prev = list_entry(head, struct scif_window, list); - struct list_head *item; - - INIT_LIST_HEAD(&window->list); - /* Compare with tail and if the entry is new tail add it to the end */ - if (!list_empty(head)) { - curr = list_entry(head->prev, struct scif_window, list); - if (curr->va_for_temp < window->va_for_temp) { - list_add_tail(&window->list, head); - return; - } - } - list_for_each(item, head) { - curr = list_entry(item, struct scif_window, list); - if (curr->va_for_temp > window->va_for_temp) - break; - prev = curr; - } - list_add(&window->list, &prev->list); -} - -/* - * scif_insert_window: - * - * Insert a window to the self registration list sorted by offset. - * RMA lock must be held. 
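[scif_insert_window() below keeps the registration list ordered by offset by walking to the first entry with a larger offset and linking in front of it. The same idea on a minimal singly linked list; the kernel code walks a struct list_head, and these names are illustrative:

#include <stddef.h>
#include <stdint.h>

struct win {
    int64_t offset;
    struct win *next;
};

static void insert_sorted(struct win **head, struct win *w)
{
    struct win **pp = head;

    while (*pp && (*pp)->offset <= w->offset)
        pp = &(*pp)->next;          /* stop before the first larger offset */
    w->next = *pp;
    *pp = w;
}
]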
- */ -void scif_insert_window(struct scif_window *window, struct list_head *head) -{ - struct scif_window *curr = NULL, *prev = NULL; - struct list_head *item; - - INIT_LIST_HEAD(&window->list); - list_for_each(item, head) { - curr = list_entry(item, struct scif_window, list); - if (curr->offset > window->offset) - break; - prev = curr; - } - if (!prev) - list_add(&window->list, head); - else - list_add(&window->list, &prev->list); - scif_set_window_ref(window, window->nr_pages); -} - -/* - * scif_query_tcw: - * - * Query the temp cached registration list of ep for an overlapping window - * in case of permission mismatch, destroy the previous window. if permissions - * match and overlap is partial, destroy the window but return the new range - * RMA lock must be held. - */ -int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *req) -{ - struct list_head *item, *temp, *head = req->head; - struct scif_window *window; - u64 start_va_window, start_va_req = req->va_for_temp; - u64 end_va_window, end_va_req = start_va_req + req->nr_bytes; - - if (!req->nr_bytes) - return -EINVAL; - /* - * Avoid traversing the entire list to find out that there - * is no entry that matches - */ - if (!list_empty(head)) { - window = list_last_entry(head, struct scif_window, list); - end_va_window = window->va_for_temp + - (window->nr_pages << PAGE_SHIFT); - if (start_va_req > end_va_window) - return -ENXIO; - } - list_for_each_safe(item, temp, head) { - window = list_entry(item, struct scif_window, list); - start_va_window = window->va_for_temp; - end_va_window = window->va_for_temp + - (window->nr_pages << PAGE_SHIFT); - if (start_va_req < start_va_window && - end_va_req < start_va_window) - break; - if (start_va_req >= end_va_window) - continue; - if ((window->prot & req->prot) == req->prot) { - if (start_va_req >= start_va_window && - end_va_req <= end_va_window) { - *req->out_window = window; - return 0; - } - /* expand window */ - if (start_va_req < start_va_window) { - req->nr_bytes += - start_va_window - start_va_req; - req->va_for_temp = start_va_window; - } - if (end_va_req >= end_va_window) - req->nr_bytes += end_va_window - end_va_req; - } - /* Destroy the old window to create a new one */ - __scif_rma_destroy_tcw_helper(window); - break; - } - return -ENXIO; -} - -/* - * scif_query_window: - * - * Query the registration list and check if a valid contiguous - * range of windows exist. - * RMA lock must be held. - */ -int scif_query_window(struct scif_rma_req *req) -{ - struct list_head *item; - struct scif_window *window; - s64 end_offset, offset = req->offset; - u64 tmp_min, nr_bytes_left = req->nr_bytes; - - if (!req->nr_bytes) - return -EINVAL; - - list_for_each(item, req->head) { - window = list_entry(item, struct scif_window, list); - end_offset = window->offset + - (window->nr_pages << PAGE_SHIFT); - if (offset < window->offset) - /* Offset not found! */ - return -ENXIO; - if (offset >= end_offset) - continue; - /* Check read/write protections. */ - if ((window->prot & req->prot) != req->prot) - return -EPERM; - if (nr_bytes_left == req->nr_bytes) - /* Store the first window */ - *req->out_window = window; - tmp_min = min((u64)end_offset - offset, nr_bytes_left); - nr_bytes_left -= tmp_min; - offset += tmp_min; - /* - * Range requested encompasses - * multiple windows contiguously. 
- */ - if (!nr_bytes_left) { - /* Done for partial window */ - if (req->type == SCIF_WINDOW_PARTIAL || - req->type == SCIF_WINDOW_SINGLE) - return 0; - /* Extra logic for full windows */ - if (offset == end_offset) - /* Spanning multiple whole windows */ - return 0; - /* Not spanning multiple whole windows */ - return -ENXIO; - } - if (req->type == SCIF_WINDOW_SINGLE) - break; - } - dev_err(scif_info.mdev.this_device, - "%s %d ENXIO\n", __func__, __LINE__); - return -ENXIO; -} - -/* - * scif_rma_list_unregister: - * - * Traverse the self registration list starting from window: - * 1) Call scif_unregister_window(..) - * RMA lock must be held. - */ -int scif_rma_list_unregister(struct scif_window *window, - s64 offset, int nr_pages) -{ - struct scif_endpt *ep = (struct scif_endpt *)window->ep; - struct list_head *head = &ep->rma_info.reg_list; - s64 end_offset; - int err = 0; - int loop_nr_pages; - struct scif_window *_window; - - list_for_each_entry_safe_from(window, _window, head, list) { - end_offset = window->offset + (window->nr_pages << PAGE_SHIFT); - loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT), - nr_pages); - err = scif_unregister_window(window); - if (err) - return err; - nr_pages -= loop_nr_pages; - offset += (loop_nr_pages << PAGE_SHIFT); - if (!nr_pages) - break; - } - return 0; -} - -/* - * scif_unmap_all_window: - * - * Traverse all the windows in the self registration list and: - * 1) Delete any DMA mappings created - */ -void scif_unmap_all_windows(scif_epd_t epd) -{ - struct list_head *item, *tmp; - struct scif_window *window; - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct list_head *head = &ep->rma_info.reg_list; - - mutex_lock(&ep->rma_info.rma_lock); - list_for_each_safe(item, tmp, head) { - window = list_entry(item, struct scif_window, list); - scif_unmap_window(ep->remote_dev, window); - } - mutex_unlock(&ep->rma_info.rma_lock); -} - -/* - * scif_unregister_all_window: - * - * Traverse all the windows in the self registration list and: - * 1) Call scif_unregister_window(..) - * RMA lock must be held. - */ -int scif_unregister_all_windows(scif_epd_t epd) -{ - struct list_head *item, *tmp; - struct scif_window *window; - struct scif_endpt *ep = (struct scif_endpt *)epd; - struct list_head *head = &ep->rma_info.reg_list; - int err = 0; - - mutex_lock(&ep->rma_info.rma_lock); -retry: - item = NULL; - tmp = NULL; - list_for_each_safe(item, tmp, head) { - window = list_entry(item, struct scif_window, list); - ep->rma_info.async_list_del = 0; - err = scif_unregister_window(window); - if (err) - dev_err(scif_info.mdev.this_device, - "%s %d err %d\n", - __func__, __LINE__, err); - /* - * Need to restart list traversal if there has been - * an asynchronous list entry deletion. - */ - if (READ_ONCE(ep->rma_info.async_list_del)) - goto retry; - } - mutex_unlock(&ep->rma_info.rma_lock); - if (!list_empty(&ep->rma_info.mmn_list)) { - spin_lock(&scif_info.rmalock); - list_add_tail(&ep->mmu_list, &scif_info.mmu_notif_cleanup); - spin_unlock(&scif_info.rmalock); - schedule_work(&scif_info.mmu_notif_work); - } - return err; -} diff --git a/drivers/misc/mic/scif/scif_rma_list.h b/drivers/misc/mic/scif/scif_rma_list.h deleted file mode 100644 index 0f8e0ed65614..000000000000 --- a/drivers/misc/mic/scif/scif_rma_list.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2015 Intel Corporation. - * - * Intel SCIF driver. 
- */ -#ifndef SCIF_RMA_LIST_H -#define SCIF_RMA_LIST_H - -/* - * struct scif_rma_req - Self Registration list RMA Request query - * - * @out_window - Returns the window if found - * @offset: Starting offset - * @nr_bytes: number of bytes - * @prot: protection requested i.e. read or write or both - * @type: Specify single, partial or multiple windows - * @head: Head of list on which to search - * @va_for_temp: VA for searching temporary cached windows - */ -struct scif_rma_req { - struct scif_window **out_window; - union { - s64 offset; - unsigned long va_for_temp; - }; - size_t nr_bytes; - int prot; - enum scif_window_type type; - struct list_head *head; -}; - -/* Insert */ -void scif_insert_window(struct scif_window *window, struct list_head *head); -void scif_insert_tcw(struct scif_window *window, - struct list_head *head); -/* Query */ -int scif_query_window(struct scif_rma_req *request); -int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *request); -/* Called from close to unregister all self windows */ -int scif_unregister_all_windows(scif_epd_t epd); -void scif_unmap_all_windows(scif_epd_t epd); -/* Traverse list and unregister */ -int scif_rma_list_unregister(struct scif_window *window, s64 offset, - int nr_pages); -#endif /* SCIF_RMA_LIST_H */ diff --git a/drivers/misc/mic/vop/Makefile b/drivers/misc/mic/vop/Makefile deleted file mode 100644 index 51b9b0022786..000000000000 --- a/drivers/misc/mic/vop/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile - Intel MIC Linux driver. -# Copyright(c) 2016, Intel Corporation. -# -obj-$(CONFIG_VOP) := vop.o - -vop-objs += vop_main.o -vop-objs += vop_debugfs.o -vop-objs += vop_vringh.o diff --git a/drivers/misc/mic/vop/vop_debugfs.c b/drivers/misc/mic/vop/vop_debugfs.c deleted file mode 100644 index 9d4f175f4dd1..000000000000 --- a/drivers/misc/mic/vop/vop_debugfs.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Intel Virtio Over PCIe (VOP) driver. 
- */ -#include -#include - -#include "vop_main.h" - -static int vop_dp_show(struct seq_file *s, void *pos) -{ - struct mic_device_desc *d; - struct mic_device_ctrl *dc; - struct mic_vqconfig *vqconfig; - __u32 *features; - __u8 *config; - struct vop_info *vi = s->private; - struct vop_device *vpdev = vi->vpdev; - struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev); - int j, k; - - seq_printf(s, "Bootparam: magic 0x%x\n", - bootparam->magic); - seq_printf(s, "Bootparam: h2c_config_db %d\n", - bootparam->h2c_config_db); - seq_printf(s, "Bootparam: node_id %d\n", - bootparam->node_id); - seq_printf(s, "Bootparam: c2h_scif_db %d\n", - bootparam->c2h_scif_db); - seq_printf(s, "Bootparam: h2c_scif_db %d\n", - bootparam->h2c_scif_db); - seq_printf(s, "Bootparam: scif_host_dma_addr 0x%llx\n", - bootparam->scif_host_dma_addr); - seq_printf(s, "Bootparam: scif_card_dma_addr 0x%llx\n", - bootparam->scif_card_dma_addr); - - for (j = sizeof(*bootparam); - j < MIC_DP_SIZE; j += mic_total_desc_size(d)) { - d = (void *)bootparam + j; - dc = (void *)d + mic_aligned_desc_size(d); - - /* end of list */ - if (d->type == 0) - break; - - if (d->type == -1) - continue; - - seq_printf(s, "Type %d ", d->type); - seq_printf(s, "Num VQ %d ", d->num_vq); - seq_printf(s, "Feature Len %d\n", d->feature_len); - seq_printf(s, "Config Len %d ", d->config_len); - seq_printf(s, "Shutdown Status %d\n", d->status); - - for (k = 0; k < d->num_vq; k++) { - vqconfig = mic_vq_config(d) + k; - seq_printf(s, "vqconfig[%d]: ", k); - seq_printf(s, "address 0x%llx ", - vqconfig->address); - seq_printf(s, "num %d ", vqconfig->num); - seq_printf(s, "used address 0x%llx\n", - vqconfig->used_address); - } - - features = (__u32 *)mic_vq_features(d); - seq_printf(s, "Features: Host 0x%x ", features[0]); - seq_printf(s, "Guest 0x%x\n", features[1]); - - config = mic_vq_configspace(d); - for (k = 0; k < d->config_len; k++) - seq_printf(s, "config[%d]=%d\n", k, config[k]); - - seq_puts(s, "Device control:\n"); - seq_printf(s, "Config Change %d ", dc->config_change); - seq_printf(s, "Vdev reset %d\n", dc->vdev_reset); - seq_printf(s, "Guest Ack %d ", dc->guest_ack); - seq_printf(s, "Host ack %d\n", dc->host_ack); - seq_printf(s, "Used address updated %d ", - dc->used_address_updated); - seq_printf(s, "Vdev 0x%llx\n", dc->vdev); - seq_printf(s, "c2h doorbell %d ", dc->c2h_vdev_db); - seq_printf(s, "h2c doorbell %d\n", dc->h2c_vdev_db); - } - schedule_work(&vi->hotplug_work); - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(vop_dp); - -static int vop_vdev_info_show(struct seq_file *s, void *unused) -{ - struct vop_info *vi = s->private; - struct list_head *pos, *tmp; - struct vop_vdev *vdev; - int i, j; - - mutex_lock(&vi->vop_mutex); - list_for_each_safe(pos, tmp, &vi->vdev_list) { - vdev = list_entry(pos, struct vop_vdev, list); - seq_printf(s, "VDEV type %d state %s in %ld out %ld in_dma %ld out_dma %ld\n", - vdev->virtio_id, - vop_vdevup(vdev) ? 
"UP" : "DOWN", - vdev->in_bytes, - vdev->out_bytes, - vdev->in_bytes_dma, - vdev->out_bytes_dma); - for (i = 0; i < MIC_MAX_VRINGS; i++) { - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; - struct vop_vringh *vvr = &vdev->vvr[i]; - struct vringh *vrh = &vvr->vrh; - int num = vrh->vring.num; - - if (!num) - continue; - desc = vrh->vring.desc; - seq_printf(s, "vring i %d avail_idx %d", - i, vvr->vring.info->avail_idx & (num - 1)); - seq_printf(s, " vring i %d avail_idx %d\n", - i, vvr->vring.info->avail_idx); - seq_printf(s, "vrh i %d weak_barriers %d", - i, vrh->weak_barriers); - seq_printf(s, " last_avail_idx %d last_used_idx %d", - vrh->last_avail_idx, vrh->last_used_idx); - seq_printf(s, " completed %d\n", vrh->completed); - for (j = 0; j < num; j++) { - seq_printf(s, "desc[%d] addr 0x%llx len %d", - j, desc->addr, desc->len); - seq_printf(s, " flags 0x%x next %d\n", - desc->flags, desc->next); - desc++; - } - avail = vrh->vring.avail; - seq_printf(s, "avail flags 0x%x idx %d\n", - vringh16_to_cpu(vrh, avail->flags), - vringh16_to_cpu(vrh, - avail->idx) & (num - 1)); - seq_printf(s, "avail flags 0x%x idx %d\n", - vringh16_to_cpu(vrh, avail->flags), - vringh16_to_cpu(vrh, avail->idx)); - for (j = 0; j < num; j++) - seq_printf(s, "avail ring[%d] %d\n", - j, avail->ring[j]); - used = vrh->vring.used; - seq_printf(s, "used flags 0x%x idx %d\n", - vringh16_to_cpu(vrh, used->flags), - vringh16_to_cpu(vrh, used->idx) & (num - 1)); - seq_printf(s, "used flags 0x%x idx %d\n", - vringh16_to_cpu(vrh, used->flags), - vringh16_to_cpu(vrh, used->idx)); - for (j = 0; j < num; j++) - seq_printf(s, "used ring[%d] id %d len %d\n", - j, vringh32_to_cpu(vrh, - used->ring[j].id), - vringh32_to_cpu(vrh, - used->ring[j].len)); - } - } - mutex_unlock(&vi->vop_mutex); - - return 0; -} - -DEFINE_SHOW_ATTRIBUTE(vop_vdev_info); - -void vop_init_debugfs(struct vop_info *vi) -{ - char name[16]; - - snprintf(name, sizeof(name), "%s%d", KBUILD_MODNAME, vi->vpdev->dnode); - vi->dbg = debugfs_create_dir(name, NULL); - debugfs_create_file("dp", 0444, vi->dbg, vi, &vop_dp_fops); - debugfs_create_file("vdev_info", 0444, vi->dbg, vi, &vop_vdev_info_fops); -} - -void vop_exit_debugfs(struct vop_info *vi) -{ - debugfs_remove_recursive(vi->dbg); -} diff --git a/drivers/misc/mic/vop/vop_main.c b/drivers/misc/mic/vop/vop_main.c deleted file mode 100644 index 714b94f42d38..000000000000 --- a/drivers/misc/mic/vop/vop_main.c +++ /dev/null @@ -1,784 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Adapted from: - * - * virtio for kvm on s390 - * - * Copyright IBM Corp. 2008 - * - * Author(s): Christian Borntraeger - * - * Intel Virtio Over PCIe (VOP) driver. - */ -#include -#include -#include -#include -#include - -#include "vop_main.h" - -#define VOP_MAX_VRINGS 4 - -/* - * _vop_vdev - Allocated per virtio device instance injected by the peer. 
- * - * @vdev: Virtio device - * @desc: Virtio device page descriptor - * @dc: Virtio device control - * @vpdev: VOP device which is the parent for this virtio device - * @vr: Buffer for accessing the VRING - * @used_virt: Virtual address of used ring - * @used: DMA address of used ring - * @used_size: Size of the used buffer - * @reset_done: Track whether VOP reset is complete - * @virtio_cookie: Cookie returned upon requesting a interrupt - * @c2h_vdev_db: The doorbell used by the guest to interrupt the host - * @h2c_vdev_db: The doorbell used by the host to interrupt the guest - * @dnode: The destination node - */ -struct _vop_vdev { - struct virtio_device vdev; - struct mic_device_desc __iomem *desc; - struct mic_device_ctrl __iomem *dc; - struct vop_device *vpdev; - void __iomem *vr[VOP_MAX_VRINGS]; - void *used_virt[VOP_MAX_VRINGS]; - dma_addr_t used[VOP_MAX_VRINGS]; - int used_size[VOP_MAX_VRINGS]; - struct completion reset_done; - struct mic_irq *virtio_cookie; - int c2h_vdev_db; - int h2c_vdev_db; - int dnode; -}; - -#define to_vopvdev(vd) container_of(vd, struct _vop_vdev, vdev) - -#define _vop_aligned_desc_size(d) __mic_align(_vop_desc_size(d), 8) - -/* Helper API to obtain the parent of the virtio device */ -static inline struct device *_vop_dev(struct _vop_vdev *vdev) -{ - return vdev->vdev.dev.parent; -} - -static inline unsigned _vop_desc_size(struct mic_device_desc __iomem *desc) -{ - return sizeof(*desc) - + ioread8(&desc->num_vq) * sizeof(struct mic_vqconfig) - + ioread8(&desc->feature_len) * 2 - + ioread8(&desc->config_len); -} - -static inline struct mic_vqconfig __iomem * -_vop_vq_config(struct mic_device_desc __iomem *desc) -{ - return (struct mic_vqconfig __iomem *)(desc + 1); -} - -static inline u8 __iomem * -_vop_vq_features(struct mic_device_desc __iomem *desc) -{ - return (u8 __iomem *)(_vop_vq_config(desc) + ioread8(&desc->num_vq)); -} - -static inline u8 __iomem * -_vop_vq_configspace(struct mic_device_desc __iomem *desc) -{ - return _vop_vq_features(desc) + ioread8(&desc->feature_len) * 2; -} - -static inline unsigned -_vop_total_desc_size(struct mic_device_desc __iomem *desc) -{ - return _vop_aligned_desc_size(desc) + sizeof(struct mic_device_ctrl); -} - -/* This gets the device's feature bits. */ -static u64 vop_get_features(struct virtio_device *vdev) -{ - unsigned int i, bits; - u64 features = 0; - struct mic_device_desc __iomem *desc = to_vopvdev(vdev)->desc; - u8 __iomem *in_features = _vop_vq_features(desc); - int feature_len = ioread8(&desc->feature_len); - - bits = min_t(unsigned, feature_len, sizeof(vdev->features)) * 8; - for (i = 0; i < bits; i++) - if (ioread8(&in_features[i / 8]) & (BIT(i % 8))) - features |= BIT_ULL(i); - - return features; -} - -static void vop_transport_features(struct virtio_device *vdev) -{ - /* - * Packed ring isn't enabled on virtio_vop for now, - * because virtio_vop uses vring_new_virtqueue() which - * creates virtio rings on preallocated memory. - */ - __virtio_clear_bit(vdev, VIRTIO_F_RING_PACKED); - __virtio_set_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); -} - -static int vop_finalize_features(struct virtio_device *vdev) -{ - unsigned int i, bits; - struct mic_device_desc __iomem *desc = to_vopvdev(vdev)->desc; - u8 feature_len = ioread8(&desc->feature_len); - /* Second half of bitmap is features we accept. */ - u8 __iomem *out_features = - _vop_vq_features(desc) + feature_len; - - /* Give virtio_ring a chance to accept features. */ - vring_transport_features(vdev); - - /* Give virtio_vop a chance to accept features. 
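[vop_get_features() above unpacks the descriptor's feature bitmap byte by byte: bit i lives at byte i / 8, position i % 8, capped at the 64 bits the feature word can carry. The same loop in standalone form, with the io accessors dropped:

#include <stdint.h>

static uint64_t unpack_features(const uint8_t *bitmap, unsigned feature_len)
{
    unsigned bits = (feature_len < 8 ? feature_len : 8) * 8;
    uint64_t features = 0;

    for (unsigned i = 0; i < bits; i++)
        if (bitmap[i / 8] & (1u << (i % 8)))
            features |= 1ULL << i;   /* bitmap bit i -> feature bit i */
    return features;
}
]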
*/ - vop_transport_features(vdev); - - memset_io(out_features, 0, feature_len); - bits = min_t(unsigned, feature_len, - sizeof(vdev->features)) * 8; - for (i = 0; i < bits; i++) { - if (__virtio_test_bit(vdev, i)) - iowrite8(ioread8(&out_features[i / 8]) | (1 << (i % 8)), - &out_features[i / 8]); - } - return 0; -} - -/* - * Reading and writing elements in config space - */ -static void vop_get(struct virtio_device *vdev, unsigned int offset, - void *buf, unsigned len) -{ - struct mic_device_desc __iomem *desc = to_vopvdev(vdev)->desc; - - if (offset + len > ioread8(&desc->config_len)) - return; - memcpy_fromio(buf, _vop_vq_configspace(desc) + offset, len); -} - -static void vop_set(struct virtio_device *vdev, unsigned int offset, - const void *buf, unsigned len) -{ - struct mic_device_desc __iomem *desc = to_vopvdev(vdev)->desc; - - if (offset + len > ioread8(&desc->config_len)) - return; - memcpy_toio(_vop_vq_configspace(desc) + offset, buf, len); -} - -/* - * The operations to get and set the status word just access the status - * field of the device descriptor. set_status also interrupts the host - * to tell about status changes. - */ -static u8 vop_get_status(struct virtio_device *vdev) -{ - return ioread8(&to_vopvdev(vdev)->desc->status); -} - -static void vop_set_status(struct virtio_device *dev, u8 status) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - struct vop_device *vpdev = vdev->vpdev; - - if (!status) - return; - iowrite8(status, &vdev->desc->status); - vpdev->hw_ops->send_intr(vpdev, vdev->c2h_vdev_db); -} - -/* Inform host on a virtio device reset and wait for ack from host */ -static void vop_reset_inform_host(struct virtio_device *dev) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - struct mic_device_ctrl __iomem *dc = vdev->dc; - struct vop_device *vpdev = vdev->vpdev; - int retry; - - iowrite8(0, &dc->host_ack); - iowrite8(1, &dc->vdev_reset); - vpdev->hw_ops->send_intr(vpdev, vdev->c2h_vdev_db); - - /* Wait till host completes all card accesses and acks the reset */ - for (retry = 100; retry--;) { - if (ioread8(&dc->host_ack)) - break; - msleep(100); - } - - dev_dbg(_vop_dev(vdev), "%s: retry: %d\n", __func__, retry); - - /* Reset status to 0 in case we timed out */ - iowrite8(0, &vdev->desc->status); -} - -static void vop_reset(struct virtio_device *dev) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - - dev_dbg(_vop_dev(vdev), "%s: virtio id %d\n", - __func__, dev->id.device); - - vop_reset_inform_host(dev); - complete_all(&vdev->reset_done); -} - -/* - * The virtio_ring code calls this API when it wants to notify the Host. 
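[vop_get() and vop_set() above clamp every config-space access against config_len and silently drop anything out of range, matching virtio's tolerance for short config reads. A minimal stand-in for the read side, without the io accessors:

#include <stdint.h>
#include <string.h>

static void cfg_read(const uint8_t *cfg, uint8_t cfg_len,
                     unsigned offset, void *buf, unsigned len)
{
    if (offset + len > cfg_len)
        return;                     /* out-of-bounds access is ignored */
    memcpy(buf, cfg + offset, len);
}
]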
- */ -static bool vop_notify(struct virtqueue *vq) -{ - struct _vop_vdev *vdev = vq->priv; - struct vop_device *vpdev = vdev->vpdev; - - vpdev->hw_ops->send_intr(vpdev, vdev->c2h_vdev_db); - return true; -} - -static void vop_del_vq(struct virtqueue *vq, int n) -{ - struct _vop_vdev *vdev = to_vopvdev(vq->vdev); - struct vop_device *vpdev = vdev->vpdev; - - dma_unmap_single(&vpdev->dev, vdev->used[n], - vdev->used_size[n], DMA_BIDIRECTIONAL); - free_pages((unsigned long)vdev->used_virt[n], - get_order(vdev->used_size[n])); - vring_del_virtqueue(vq); - vpdev->hw_ops->unmap(vpdev, vdev->vr[n]); - vdev->vr[n] = NULL; -} - -static void vop_del_vqs(struct virtio_device *dev) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - struct virtqueue *vq, *n; - int idx = 0; - - dev_dbg(_vop_dev(vdev), "%s\n", __func__); - - list_for_each_entry_safe(vq, n, &dev->vqs, list) - vop_del_vq(vq, idx++); -} - -static struct virtqueue *vop_new_virtqueue(unsigned int index, - unsigned int num, - struct virtio_device *vdev, - bool context, - void *pages, - bool (*notify)(struct virtqueue *vq), - void (*callback)(struct virtqueue *vq), - const char *name, - void *used) -{ - bool weak_barriers = false; - struct vring vring; - - vring_init(&vring, num, pages, MIC_VIRTIO_RING_ALIGN); - vring.used = used; - - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, - notify, callback, name); -} - -/* - * This routine will assign vring's allocated in host/io memory. Code in - * virtio_ring.c however continues to access this io memory as if it were local - * memory without io accessors. - */ -static struct virtqueue *vop_find_vq(struct virtio_device *dev, - unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name, bool ctx) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - struct vop_device *vpdev = vdev->vpdev; - struct mic_vqconfig __iomem *vqconfig; - struct mic_vqconfig config; - struct virtqueue *vq; - void __iomem *va; - struct _mic_vring_info __iomem *info; - void *used; - int vr_size, _vr_size, err, magic; - u8 type = ioread8(&vdev->desc->type); - - if (index >= ioread8(&vdev->desc->num_vq)) - return ERR_PTR(-ENOENT); - - if (!name) - return ERR_PTR(-ENOENT); - - /* First assign the vring's allocated in host memory */ - vqconfig = _vop_vq_config(vdev->desc) + index; - memcpy_fromio(&config, vqconfig, sizeof(config)); - _vr_size = round_up(vring_size(le16_to_cpu(config.num), MIC_VIRTIO_RING_ALIGN), 4); - vr_size = PAGE_ALIGN(_vr_size + sizeof(struct _mic_vring_info)); - va = vpdev->hw_ops->remap(vpdev, le64_to_cpu(config.address), vr_size); - if (!va) - return ERR_PTR(-ENOMEM); - vdev->vr[index] = va; - memset_io(va, 0x0, _vr_size); - - info = va + _vr_size; - magic = ioread32(&info->magic); - - if (WARN(magic != MIC_MAGIC + type + index, "magic mismatch")) { - err = -EIO; - goto unmap; - } - - vdev->used_size[index] = PAGE_ALIGN(sizeof(__u16) * 3 + - sizeof(struct vring_used_elem) * - le16_to_cpu(config.num)); - used = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(vdev->used_size[index])); - vdev->used_virt[index] = used; - if (!used) { - err = -ENOMEM; - dev_err(_vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto unmap; - } - - vq = vop_new_virtqueue(index, le16_to_cpu(config.num), dev, ctx, - (void __force *)va, vop_notify, callback, - name, used); - if (!vq) { - err = -ENOMEM; - goto free_used; - } - - vdev->used[index] = dma_map_single(&vpdev->dev, used, - vdev->used_size[index], - DMA_BIDIRECTIONAL); - if (dma_mapping_error(&vpdev->dev, 
vdev->used[index])) { - err = -ENOMEM; - dev_err(_vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto del_vq; - } - writeq(vdev->used[index], &vqconfig->used_address); - - vq->priv = vdev; - return vq; -del_vq: - vring_del_virtqueue(vq); -free_used: - free_pages((unsigned long)used, - get_order(vdev->used_size[index])); -unmap: - vpdev->hw_ops->unmap(vpdev, vdev->vr[index]); - return ERR_PTR(err); -} - -static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char * const names[], const bool *ctx, - struct irq_affinity *desc) -{ - struct _vop_vdev *vdev = to_vopvdev(dev); - struct vop_device *vpdev = vdev->vpdev; - struct mic_device_ctrl __iomem *dc = vdev->dc; - int i, err, retry, queue_idx = 0; - - /* We must have this many virtqueues. */ - if (nvqs > ioread8(&vdev->desc->num_vq)) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - if (!names[i]) { - vqs[i] = NULL; - continue; - } - - dev_dbg(_vop_dev(vdev), "%s: %d: %s\n", - __func__, i, names[i]); - vqs[i] = vop_find_vq(dev, queue_idx++, callbacks[i], names[i], - ctx ? ctx[i] : false); - if (IS_ERR(vqs[i])) { - err = PTR_ERR(vqs[i]); - goto error; - } - } - - iowrite8(1, &dc->used_address_updated); - /* - * Send an interrupt to the host to inform it that used - * rings have been re-assigned. - */ - vpdev->hw_ops->send_intr(vpdev, vdev->c2h_vdev_db); - for (retry = 100; --retry;) { - if (!ioread8(&dc->used_address_updated)) - break; - msleep(100); - } - - dev_dbg(_vop_dev(vdev), "%s: retry: %d\n", __func__, retry); - if (!retry) { - err = -ENODEV; - goto error; - } - - return 0; -error: - vop_del_vqs(dev); - return err; -} - -/* - * The config ops structure as defined by virtio config - */ -static const struct virtio_config_ops vop_vq_config_ops = { - .get_features = vop_get_features, - .finalize_features = vop_finalize_features, - .get = vop_get, - .set = vop_set, - .get_status = vop_get_status, - .set_status = vop_set_status, - .reset = vop_reset, - .find_vqs = vop_find_vqs, - .del_vqs = vop_del_vqs, -}; - -static irqreturn_t vop_virtio_intr_handler(int irq, void *data) -{ - struct _vop_vdev *vdev = data; - struct vop_device *vpdev = vdev->vpdev; - struct virtqueue *vq; - - vpdev->hw_ops->ack_interrupt(vpdev, vdev->h2c_vdev_db); - list_for_each_entry(vq, &vdev->vdev.vqs, list) - vring_interrupt(0, vq); - - return IRQ_HANDLED; -} - -static void vop_virtio_release_dev(struct device *_d) -{ - struct virtio_device *vdev = - container_of(_d, struct virtio_device, dev); - struct _vop_vdev *vop_vdev = - container_of(vdev, struct _vop_vdev, vdev); - - kfree(vop_vdev); -} - -/* - * adds a new device and register it with virtio - * appropriate drivers are loaded by the device model - */ -static int _vop_add_device(struct mic_device_desc __iomem *d, - unsigned int offset, struct vop_device *vpdev, - int dnode) -{ - struct _vop_vdev *vdev, *reg_dev = NULL; - int ret; - u8 type = ioread8(&d->type); - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - - vdev->vpdev = vpdev; - vdev->vdev.dev.parent = &vpdev->dev; - vdev->vdev.dev.release = vop_virtio_release_dev; - vdev->vdev.id.device = type; - vdev->vdev.config = &vop_vq_config_ops; - vdev->desc = d; - vdev->dc = (void __iomem *)d + _vop_aligned_desc_size(d); - vdev->dnode = dnode; - vdev->vdev.priv = (void *)(unsigned long)dnode; - init_completion(&vdev->reset_done); - - vdev->h2c_vdev_db = vpdev->hw_ops->next_db(vpdev); - vdev->virtio_cookie = vpdev->hw_ops->request_irq(vpdev, - 
vop_virtio_intr_handler, "virtio intr", - vdev, vdev->h2c_vdev_db); - if (IS_ERR(vdev->virtio_cookie)) { - ret = PTR_ERR(vdev->virtio_cookie); - goto kfree; - } - iowrite8((u8)vdev->h2c_vdev_db, &vdev->dc->h2c_vdev_db); - vdev->c2h_vdev_db = ioread8(&vdev->dc->c2h_vdev_db); - - ret = register_virtio_device(&vdev->vdev); - reg_dev = vdev; - if (ret) { - dev_err(_vop_dev(vdev), - "Failed to register vop device %u type %u\n", - offset, type); - goto free_irq; - } - writeq((unsigned long)vdev, &vdev->dc->vdev); - dev_dbg(_vop_dev(vdev), "%s: registered vop device %u type %u vdev %p\n", - __func__, offset, type, vdev); - - return 0; - -free_irq: - vpdev->hw_ops->free_irq(vpdev, vdev->virtio_cookie, vdev); -kfree: - if (reg_dev) - put_device(&vdev->vdev.dev); - else - kfree(vdev); - return ret; -} - -/* - * match for a vop device with a specific desc pointer - */ -static int vop_match_desc(struct device *dev, void *data) -{ - struct virtio_device *_dev = dev_to_virtio(dev); - struct _vop_vdev *vdev = to_vopvdev(_dev); - - return vdev->desc == (void __iomem *)data; -} - -static struct _vop_vdev *vop_dc_to_vdev(struct mic_device_ctrl __iomem *dc) -{ - return (struct _vop_vdev *)(unsigned long)readq(&dc->vdev); -} - -static void _vop_handle_config_change(struct mic_device_desc __iomem *d, - unsigned int offset, - struct vop_device *vpdev) -{ - struct mic_device_ctrl __iomem *dc - = (void __iomem *)d + _vop_aligned_desc_size(d); - struct _vop_vdev *vdev = vop_dc_to_vdev(dc); - - if (ioread8(&dc->config_change) != MIC_VIRTIO_PARAM_CONFIG_CHANGED) - return; - - dev_dbg(&vpdev->dev, "%s %d\n", __func__, __LINE__); - virtio_config_changed(&vdev->vdev); - iowrite8(1, &dc->guest_ack); -} - -/* - * removes a virtio device if a hot remove event has been - * requested by the host. - */ -static int _vop_remove_device(struct mic_device_desc __iomem *d, - unsigned int offset, struct vop_device *vpdev) -{ - struct mic_device_ctrl __iomem *dc - = (void __iomem *)d + _vop_aligned_desc_size(d); - struct _vop_vdev *vdev = vop_dc_to_vdev(dc); - u8 status; - int ret = -1; - - if (ioread8(&dc->config_change) == MIC_VIRTIO_PARAM_DEV_REMOVE) { - struct device *dev = get_device(&vdev->vdev.dev); - - dev_dbg(&vpdev->dev, - "%s %d config_change %d type %d vdev %p\n", - __func__, __LINE__, - ioread8(&dc->config_change), ioread8(&d->type), vdev); - status = ioread8(&d->status); - reinit_completion(&vdev->reset_done); - unregister_virtio_device(&vdev->vdev); - vpdev->hw_ops->free_irq(vpdev, vdev->virtio_cookie, vdev); - iowrite8(-1, &dc->h2c_vdev_db); - if (status & VIRTIO_CONFIG_S_DRIVER_OK) - wait_for_completion(&vdev->reset_done); - put_device(dev); - iowrite8(1, &dc->guest_ack); - dev_dbg(&vpdev->dev, "%s %d guest_ack %d\n", - __func__, __LINE__, ioread8(&dc->guest_ack)); - iowrite8(-1, &d->type); - ret = 0; - } - return ret; -} - -#define REMOVE_DEVICES true - -static void _vop_scan_devices(void __iomem *dp, struct vop_device *vpdev, - bool remove, int dnode) -{ - s8 type; - unsigned int i; - struct mic_device_desc __iomem *d; - struct mic_device_ctrl __iomem *dc; - struct device *dev; - - for (i = sizeof(struct mic_bootparam); - i < MIC_DP_SIZE; i += _vop_total_desc_size(d)) { - d = dp + i; - dc = (void __iomem *)d + _vop_aligned_desc_size(d); - /* - * This read barrier is paired with the corresponding write - * barrier on the host which is inserted before adding or - * removing a virtio device descriptor, by updating the type. 
- */ - rmb(); - type = ioread8(&d->type); - - /* end of list */ - if (type == 0) - break; - - if (type == -1) - continue; - - /* device already exists */ - dev = device_find_child(&vpdev->dev, (void __force *)d, - vop_match_desc); - if (dev) { - if (remove) - iowrite8(MIC_VIRTIO_PARAM_DEV_REMOVE, - &dc->config_change); - put_device(dev); - _vop_handle_config_change(d, i, vpdev); - _vop_remove_device(d, i, vpdev); - if (remove) { - iowrite8(0, &dc->config_change); - iowrite8(0, &dc->guest_ack); - } - continue; - } - - /* new device */ - dev_dbg(&vpdev->dev, "%s %d Adding new virtio device %p\n", - __func__, __LINE__, d); - if (!remove) - _vop_add_device(d, i, vpdev, dnode); - } -} - -static void vop_scan_devices(struct vop_info *vi, - struct vop_device *vpdev, bool remove) -{ - void __iomem *dp = vpdev->hw_ops->get_remote_dp(vpdev); - - if (!dp) - return; - mutex_lock(&vi->vop_mutex); - _vop_scan_devices(dp, vpdev, remove, vpdev->dnode); - mutex_unlock(&vi->vop_mutex); -} - -/* - * vop_hotplug_device tries to find changes in the device page. - */ -static void vop_hotplug_devices(struct work_struct *work) -{ - struct vop_info *vi = container_of(work, struct vop_info, - hotplug_work); - - vop_scan_devices(vi, vi->vpdev, !REMOVE_DEVICES); -} - -/* - * Interrupt handler for hot plug/config changes etc. - */ -static irqreturn_t vop_extint_handler(int irq, void *data) -{ - struct vop_info *vi = data; - struct mic_bootparam __iomem *bp; - struct vop_device *vpdev = vi->vpdev; - - bp = vpdev->hw_ops->get_remote_dp(vpdev); - dev_dbg(&vpdev->dev, "%s %d hotplug work\n", - __func__, __LINE__); - vpdev->hw_ops->ack_interrupt(vpdev, ioread8(&bp->h2c_config_db)); - schedule_work(&vi->hotplug_work); - return IRQ_HANDLED; -} - -static int vop_driver_probe(struct vop_device *vpdev) -{ - struct vop_info *vi; - int rc; - - vi = kzalloc(sizeof(*vi), GFP_KERNEL); - if (!vi) { - rc = -ENOMEM; - goto exit; - } - dev_set_drvdata(&vpdev->dev, vi); - vi->vpdev = vpdev; - - mutex_init(&vi->vop_mutex); - INIT_WORK(&vi->hotplug_work, vop_hotplug_devices); - if (vpdev->dnode) { - rc = vop_host_init(vi); - if (rc < 0) - goto free; - } else { - struct mic_bootparam __iomem *bootparam; - - vop_scan_devices(vi, vpdev, !REMOVE_DEVICES); - - vi->h2c_config_db = vpdev->hw_ops->next_db(vpdev); - vi->cookie = vpdev->hw_ops->request_irq(vpdev, - vop_extint_handler, - "virtio_config_intr", - vi, vi->h2c_config_db); - if (IS_ERR(vi->cookie)) { - rc = PTR_ERR(vi->cookie); - goto free; - } - bootparam = vpdev->hw_ops->get_remote_dp(vpdev); - iowrite8(vi->h2c_config_db, &bootparam->h2c_config_db); - } - vop_init_debugfs(vi); - return 0; -free: - kfree(vi); -exit: - return rc; -} - -static void vop_driver_remove(struct vop_device *vpdev) -{ - struct vop_info *vi = dev_get_drvdata(&vpdev->dev); - - if (vpdev->dnode) { - vop_host_uninit(vi); - } else { - struct mic_bootparam __iomem *bootparam = - vpdev->hw_ops->get_remote_dp(vpdev); - if (bootparam) - iowrite8(-1, &bootparam->h2c_config_db); - vpdev->hw_ops->free_irq(vpdev, vi->cookie, vi); - flush_work(&vi->hotplug_work); - vop_scan_devices(vi, vpdev, REMOVE_DEVICES); - } - vop_exit_debugfs(vi); - kfree(vi); -} - -static const struct vop_device_id id_table[] = { - { VOP_DEV_TRNSP, VOP_DEV_ANY_ID }, - { 0 }, -}; - -static struct vop_driver vop_driver = { - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = vop_driver_probe, - .remove = vop_driver_remove, -}; - -module_vop_driver(vop_driver); - -MODULE_DEVICE_TABLE(mbus, id_table); 
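[Editorial aside: the scan loop in _vop_scan_devices() above treats each descriptor's type byte as a tiny state machine: 0 terminates the walk, -1 marks a slot whose device was removed (and may be reused), and anything else is a live virtio device type. A minimal user-space sketch of that walk, with a toy descriptor layout standing in for the real mic_device_desc:

	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>

	struct toy_desc {
		int8_t type;	/* 0: end of list, -1: removed, else device type */
		uint8_t len;	/* total size of this entry, incl. this header */
	};

	static void scan_device_page(uint8_t *dp, size_t dp_size)
	{
		size_t i = 0;

		while (i + sizeof(struct toy_desc) <= dp_size) {
			struct toy_desc *d = (struct toy_desc *)(dp + i);

			if (d->type == 0 || d->len == 0)
				break;			/* end of list / malformed */
			if (d->type != -1)		/* skip removed slots */
				printf("live device type %d at offset %zu\n",
				       d->type, i);
			i += d->len;
		}
	}
]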
-MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Intel(R) Virtio Over PCIe (VOP) driver"); -MODULE_LICENSE("GPL v2"); diff --git a/drivers/misc/mic/vop/vop_main.h b/drivers/misc/mic/vop/vop_main.h deleted file mode 100644 index 2451d9218137..000000000000 --- a/drivers/misc/mic/vop/vop_main.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Intel Virtio Over PCIe (VOP) driver. - */ -#ifndef _VOP_MAIN_H_ -#define _VOP_MAIN_H_ - -#include -#include -#include -#include - -#include -#include "../common/mic_dev.h" - -#include "../bus/vop_bus.h" - -/* - * Note on endianness. - * 1. Host can be both BE or LE - * 2. Guest/card is LE. Host uses le_to_cpu to access desc/avail - * rings and ioreadXX/iowriteXX to access used ring. - * 3. Device page exposed by host to guest contains LE values. Guest - * accesses these using ioreadXX/iowriteXX etc. This way in general we - * obey the virtio spec according to which guest works with native - * endianness and host is aware of guest endianness and does all - * required endianness conversion. - * 4. Data provided from user space to guest (in ADD_DEVICE and - * CONFIG_CHANGE ioctl's) is not interpreted by the driver and should be - * in guest endianness. - */ - -/* - * vop_info - Allocated per invocation of VOP probe - * - * @vpdev: VOP device - * @hotplug_work: Handle virtio device creation, deletion and configuration - * @cookie: Cookie received upon requesting a virtio configuration interrupt - * @h2c_config_db: The doorbell used by the peer to indicate a config change - * @vdev_list: List of "active" virtio devices injected in the peer node - * @vop_mutex: Synchronize access to the device page as well as serialize - * creation/deletion of virtio devices on the peer node - * @dp: Peer device page information - * @dbg: Debugfs entry - * @dma_ch: The DMA channel used by this transport for data transfers. - * @name: Name for this transport used in misc device creation. - * @miscdev: The misc device registered. - */ -struct vop_info { - struct vop_device *vpdev; - struct work_struct hotplug_work; - struct mic_irq *cookie; - int h2c_config_db; - struct list_head vdev_list; - struct mutex vop_mutex; - void __iomem *dp; - struct dentry *dbg; - struct dma_chan *dma_ch; - char name[16]; - struct miscdevice miscdev; -}; - -/** - * struct vop_vringh - Virtio ring host information. - * - * @vring: The VOP vring used for setting up user space mappings. - * @vrh: The host VRINGH used for accessing the card vrings. - * @riov: The VRINGH read kernel IOV. - * @wiov: The VRINGH write kernel IOV. - * @head: The VRINGH head index address passed to vringh_getdesc_kern(..). - * @vr_mutex: Mutex for synchronizing access to the VRING. - * @buf: Temporary kernel buffer used to copy in/out data - * from/to the card via DMA. - * @buf_da: dma address of buf. - * @vdev: Back pointer to VOP virtio device for vringh_notify(..). - */ -struct vop_vringh { - struct mic_vring vring; - struct vringh vrh; - struct vringh_kiov riov; - struct vringh_kiov wiov; - u16 head; - struct mutex vr_mutex; - void *buf; - dma_addr_t buf_da; - struct vop_vdev *vdev; -}; - -/** - * struct vop_vdev - Host information for a card Virtio device. - * - * @virtio_id - Virtio device id. - * @waitq - Waitqueue to allow ring3 apps to poll. - * @vpdev - pointer to VOP bus device. - * @poll_wake - Used for waking up threads blocked in poll. 
- * @out_bytes - Debug stats for number of bytes copied from host to card. - * @in_bytes - Debug stats for number of bytes copied from card to host. - * @out_bytes_dma - Debug stats for number of bytes copied from host to card - * using DMA. - * @in_bytes_dma - Debug stats for number of bytes copied from card to host - * using DMA. - * @tx_len_unaligned - Debug stats for number of bytes copied to the card where - * the transfer length did not have the required DMA alignment. - * @tx_dst_unaligned - Debug stats for number of bytes copied where the - * destination address on the card did not have the required DMA alignment. - * @vvr - Store per VRING data structures. - * @virtio_bh_work - Work struct used to schedule virtio bottom half handling. - * @dd - Virtio device descriptor. - * @dc - Virtio device control fields. - * @list - List of Virtio devices. - * @virtio_db - The doorbell used by the card to interrupt the host. - * @virtio_cookie - The cookie returned while requesting interrupts. - * @vi: Transport information. - * @vdev_mutex: Mutex synchronizing virtio device injection, - * removal and data transfers. - * @destroy: Track if a virtio device is being destroyed. - * @deleted: The virtio device has been deleted. - */ -struct vop_vdev { - int virtio_id; - wait_queue_head_t waitq; - struct vop_device *vpdev; - int poll_wake; - unsigned long out_bytes; - unsigned long in_bytes; - unsigned long out_bytes_dma; - unsigned long in_bytes_dma; - unsigned long tx_len_unaligned; - unsigned long tx_dst_unaligned; - unsigned long rx_dst_unaligned; - struct vop_vringh vvr[MIC_MAX_VRINGS]; - struct work_struct virtio_bh_work; - struct mic_device_desc *dd; - struct mic_device_ctrl *dc; - struct list_head list; - int virtio_db; - struct mic_irq *virtio_cookie; - struct vop_info *vi; - struct mutex vdev_mutex; - struct completion destroy; - bool deleted; -}; - -/* Helper API to check if a virtio device is running */ -static inline bool vop_vdevup(struct vop_vdev *vdev) -{ - return !!vdev->dd->status; -} - -void vop_init_debugfs(struct vop_info *vi); -void vop_exit_debugfs(struct vop_info *vi); -int vop_host_init(struct vop_info *vi); -void vop_host_uninit(struct vop_info *vi); -#endif diff --git a/drivers/misc/mic/vop/vop_vringh.c b/drivers/misc/mic/vop/vop_vringh.c deleted file mode 100644 index 7014ffe88632..000000000000 --- a/drivers/misc/mic/vop/vop_vringh.c +++ /dev/null @@ -1,1166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2016 Intel Corporation. - * - * Intel Virtio Over PCIe (VOP) driver. 
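[Editorial aside: the endianness note in vop_main.h above boils down to two access styles on the host side, depending on which memory backs a field. Kernel-style fragments illustrating the rule (field and variable names assumed for illustration):

	/* desc/avail ring configuration: normal memory holding LE values */
	u16 num  = le16_to_cpu(vqconfig->num);
	u64 addr = le64_to_cpu(vqconfig->address);

	/* used ring and device page: IO memory, so IO accessors only */
	u8 status = ioread8(&d->status);
	iowrite8(1, &dc->guest_ack);
]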
- */
-#include <linux/sched.h>
-#include <linux/poll.h>
-#include <linux/dma-mapping.h>
-
-#include <linux/mic_common.h>
-#include "../common/mic_dev.h"
-
-#include <linux/mic_ioctl.h>
-#include "vop_main.h"
-
-/* Helper API to obtain the VOP PCIe device */
-static inline struct device *vop_dev(struct vop_vdev *vdev)
-{
-	return vdev->vpdev->dev.parent;
-}
-
-/* Helper API to check if a virtio device is initialized */
-static inline int vop_vdev_inited(struct vop_vdev *vdev)
-{
-	if (!vdev)
-		return -EINVAL;
-	/* Device has not been created yet */
-	if (!vdev->dd || !vdev->dd->type) {
-		dev_err(vop_dev(vdev), "%s %d err %d\n",
-			__func__, __LINE__, -EINVAL);
-		return -EINVAL;
-	}
-	/* Device has been removed/deleted */
-	if (vdev->dd->type == -1) {
-		dev_dbg(vop_dev(vdev), "%s %d err %d\n",
-			__func__, __LINE__, -ENODEV);
-		return -ENODEV;
-	}
-	return 0;
-}
-
-static void _vop_notify(struct vringh *vrh)
-{
-	struct vop_vringh *vvrh = container_of(vrh, struct vop_vringh, vrh);
-	struct vop_vdev *vdev = vvrh->vdev;
-	struct vop_device *vpdev = vdev->vpdev;
-	s8 db = vdev->dc->h2c_vdev_db;
-
-	if (db != -1)
-		vpdev->hw_ops->send_intr(vpdev, db);
-}
-
-static void vop_virtio_init_post(struct vop_vdev *vdev)
-{
-	struct mic_vqconfig *vqconfig = mic_vq_config(vdev->dd);
-	struct vop_device *vpdev = vdev->vpdev;
-	int i, used_size;
-
-	for (i = 0; i < vdev->dd->num_vq; i++) {
-		used_size = PAGE_ALIGN(sizeof(u16) * 3 +
-			sizeof(struct vring_used_elem) *
-			le16_to_cpu(vqconfig->num));
-		if (!le64_to_cpu(vqconfig[i].used_address)) {
-			dev_warn(vop_dev(vdev), "used_address zero??\n");
-			continue;
-		}
-		vdev->vvr[i].vrh.vring.used =
-			(void __force *)vpdev->hw_ops->remap(
-			vpdev,
-			le64_to_cpu(vqconfig[i].used_address),
-			used_size);
-	}
-
-	vdev->dc->used_address_updated = 0;
-
-	dev_info(vop_dev(vdev), "%s: device type %d LINKUP\n",
-		 __func__, vdev->virtio_id);
-}
-
-static inline void vop_virtio_device_reset(struct vop_vdev *vdev)
-{
-	int i;
-
-	dev_dbg(vop_dev(vdev), "%s: status %d device type %d RESET\n",
-		__func__, vdev->dd->status, vdev->virtio_id);
-
-	for (i = 0; i < vdev->dd->num_vq; i++)
-		/*
-		 * Avoid lockdep false positive. The + 1 is for the vop
-		 * mutex which is held in the reset devices code path.
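[Editorial aside: taking the same mutex class repeatedly would normally trip lockdep's recursive-locking check, so the call that follows hands each acquisition its own subclass via mutex_lock_nested(). The pattern in isolation (illustrative names):

	/* subclass 0 is the plain mutex_lock() class used for the vop
	 * mutex, so the per-vring mutexes start at subclass 1 */
	for (i = 0; i < num_vq; i++)
		mutex_lock_nested(&vvr[i].vr_mutex, i + 1);
	/* ... operate on every ring ... */
	for (i = 0; i < num_vq; i++)
		mutex_unlock(&vvr[i].vr_mutex);
]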
- */ - mutex_lock_nested(&vdev->vvr[i].vr_mutex, i + 1); - - /* 0 status means "reset" */ - vdev->dd->status = 0; - vdev->dc->vdev_reset = 0; - vdev->dc->host_ack = 1; - - for (i = 0; i < vdev->dd->num_vq; i++) { - struct vringh *vrh = &vdev->vvr[i].vrh; - - vdev->vvr[i].vring.info->avail_idx = 0; - vrh->completed = 0; - vrh->last_avail_idx = 0; - vrh->last_used_idx = 0; - } - - for (i = 0; i < vdev->dd->num_vq; i++) - mutex_unlock(&vdev->vvr[i].vr_mutex); -} - -static void vop_virtio_reset_devices(struct vop_info *vi) -{ - struct list_head *pos, *tmp; - struct vop_vdev *vdev; - - list_for_each_safe(pos, tmp, &vi->vdev_list) { - vdev = list_entry(pos, struct vop_vdev, list); - vop_virtio_device_reset(vdev); - vdev->poll_wake = 1; - wake_up(&vdev->waitq); - } -} - -static void vop_bh_handler(struct work_struct *work) -{ - struct vop_vdev *vdev = container_of(work, struct vop_vdev, - virtio_bh_work); - - if (vdev->dc->used_address_updated) - vop_virtio_init_post(vdev); - - if (vdev->dc->vdev_reset) - vop_virtio_device_reset(vdev); - - vdev->poll_wake = 1; - wake_up(&vdev->waitq); -} - -static irqreturn_t _vop_virtio_intr_handler(int irq, void *data) -{ - struct vop_vdev *vdev = data; - struct vop_device *vpdev = vdev->vpdev; - - vpdev->hw_ops->ack_interrupt(vpdev, vdev->virtio_db); - schedule_work(&vdev->virtio_bh_work); - return IRQ_HANDLED; -} - -static int vop_virtio_config_change(struct vop_vdev *vdev, void *argp) -{ - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake); - int ret = 0, retry, i; - struct vop_device *vpdev = vdev->vpdev; - struct vop_info *vi = dev_get_drvdata(&vpdev->dev); - struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev); - s8 db = bootparam->h2c_config_db; - - mutex_lock(&vi->vop_mutex); - for (i = 0; i < vdev->dd->num_vq; i++) - mutex_lock_nested(&vdev->vvr[i].vr_mutex, i + 1); - - if (db == -1 || vdev->dd->type == -1) { - ret = -EIO; - goto exit; - } - - memcpy(mic_vq_configspace(vdev->dd), argp, vdev->dd->config_len); - vdev->dc->config_change = MIC_VIRTIO_PARAM_CONFIG_CHANGED; - vpdev->hw_ops->send_intr(vpdev, db); - - for (retry = 100; retry--;) { - ret = wait_event_timeout(wake, vdev->dc->guest_ack, - msecs_to_jiffies(100)); - if (ret) - break; - } - - dev_dbg(vop_dev(vdev), - "%s %d retry: %d\n", __func__, __LINE__, retry); - vdev->dc->config_change = 0; - vdev->dc->guest_ack = 0; -exit: - for (i = 0; i < vdev->dd->num_vq; i++) - mutex_unlock(&vdev->vvr[i].vr_mutex); - mutex_unlock(&vi->vop_mutex); - return ret; -} - -static int vop_copy_dp_entry(struct vop_vdev *vdev, - struct mic_device_desc *argp, __u8 *type, - struct mic_device_desc **devpage) -{ - struct vop_device *vpdev = vdev->vpdev; - struct mic_device_desc *devp; - struct mic_vqconfig *vqconfig; - int ret = 0, i; - bool slot_found = false; - - vqconfig = mic_vq_config(argp); - for (i = 0; i < argp->num_vq; i++) { - if (le16_to_cpu(vqconfig[i].num) > MIC_MAX_VRING_ENTRIES) { - ret = -EINVAL; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - goto exit; - } - } - - /* Find the first free device page entry */ - for (i = sizeof(struct mic_bootparam); - i < MIC_DP_SIZE - mic_total_desc_size(argp); - i += mic_total_desc_size(devp)) { - devp = vpdev->hw_ops->get_dp(vpdev) + i; - if (devp->type == 0 || devp->type == -1) { - slot_found = true; - break; - } - } - if (!slot_found) { - ret = -EINVAL; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - goto exit; - } - /* - * Save off the type before doing the memcpy. 
Type will be set in the - * end after completing all initialization for the new device. - */ - *type = argp->type; - argp->type = 0; - memcpy(devp, argp, mic_desc_size(argp)); - - *devpage = devp; -exit: - return ret; -} - -static void vop_init_device_ctrl(struct vop_vdev *vdev, - struct mic_device_desc *devpage) -{ - struct mic_device_ctrl *dc; - - dc = (void *)devpage + mic_aligned_desc_size(devpage); - - dc->config_change = 0; - dc->guest_ack = 0; - dc->vdev_reset = 0; - dc->host_ack = 0; - dc->used_address_updated = 0; - dc->c2h_vdev_db = -1; - dc->h2c_vdev_db = -1; - vdev->dc = dc; -} - -static int vop_virtio_add_device(struct vop_vdev *vdev, - struct mic_device_desc *argp) -{ - struct vop_info *vi = vdev->vi; - struct vop_device *vpdev = vi->vpdev; - struct mic_device_desc *dd = NULL; - struct mic_vqconfig *vqconfig; - int vr_size, i, j, ret; - u8 type = 0; - s8 db = -1; - char irqname[16]; - struct mic_bootparam *bootparam; - u16 num; - dma_addr_t vr_addr; - - bootparam = vpdev->hw_ops->get_dp(vpdev); - init_waitqueue_head(&vdev->waitq); - INIT_LIST_HEAD(&vdev->list); - vdev->vpdev = vpdev; - - ret = vop_copy_dp_entry(vdev, argp, &type, &dd); - if (ret) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - return ret; - } - - vop_init_device_ctrl(vdev, dd); - - vdev->dd = dd; - vdev->virtio_id = type; - vqconfig = mic_vq_config(dd); - INIT_WORK(&vdev->virtio_bh_work, vop_bh_handler); - - for (i = 0; i < dd->num_vq; i++) { - struct vop_vringh *vvr = &vdev->vvr[i]; - struct mic_vring *vr = &vdev->vvr[i].vring; - - num = le16_to_cpu(vqconfig[i].num); - mutex_init(&vvr->vr_mutex); - vr_size = PAGE_ALIGN(round_up(vring_size(num, MIC_VIRTIO_RING_ALIGN), 4) + - sizeof(struct _mic_vring_info)); - vr->va = (void *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(vr_size)); - if (!vr->va) { - ret = -ENOMEM; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - goto err; - } - vr->len = vr_size; - vr->info = vr->va + round_up(vring_size(num, MIC_VIRTIO_RING_ALIGN), 4); - vr->info->magic = cpu_to_le32(MIC_MAGIC + vdev->virtio_id + i); - vr_addr = dma_map_single(&vpdev->dev, vr->va, vr_size, - DMA_BIDIRECTIONAL); - if (dma_mapping_error(&vpdev->dev, vr_addr)) { - free_pages((unsigned long)vr->va, get_order(vr_size)); - ret = -ENOMEM; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - goto err; - } - vqconfig[i].address = cpu_to_le64(vr_addr); - - vring_init(&vr->vr, num, vr->va, MIC_VIRTIO_RING_ALIGN); - ret = vringh_init_kern(&vvr->vrh, - *(u32 *)mic_vq_features(vdev->dd), - num, false, vr->vr.desc, vr->vr.avail, - vr->vr.used); - if (ret) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - goto err; - } - vringh_kiov_init(&vvr->riov, NULL, 0); - vringh_kiov_init(&vvr->wiov, NULL, 0); - vvr->head = USHRT_MAX; - vvr->vdev = vdev; - vvr->vrh.notify = _vop_notify; - dev_dbg(&vpdev->dev, - "%s %d index %d va %p info %p vr_size 0x%x\n", - __func__, __LINE__, i, vr->va, vr->info, vr_size); - vvr->buf = (void *)__get_free_pages(GFP_KERNEL, - get_order(VOP_INT_DMA_BUF_SIZE)); - vvr->buf_da = dma_map_single(&vpdev->dev, - vvr->buf, VOP_INT_DMA_BUF_SIZE, - DMA_BIDIRECTIONAL); - } - - snprintf(irqname, sizeof(irqname), "vop%dvirtio%d", vpdev->index, - vdev->virtio_id); - vdev->virtio_db = vpdev->hw_ops->next_db(vpdev); - vdev->virtio_cookie = vpdev->hw_ops->request_irq(vpdev, - _vop_virtio_intr_handler, irqname, vdev, - vdev->virtio_db); - if (IS_ERR(vdev->virtio_cookie)) { - ret = PTR_ERR(vdev->virtio_cookie); - 
dev_dbg(&vpdev->dev, "request irq failed\n"); - goto err; - } - - vdev->dc->c2h_vdev_db = vdev->virtio_db; - - /* - * Order the type update with previous stores. This write barrier - * is paired with the corresponding read barrier before the uncached - * system memory read of the type, on the card while scanning the - * device page. - */ - smp_wmb(); - dd->type = type; - argp->type = type; - - if (bootparam) { - db = bootparam->h2c_config_db; - if (db != -1) - vpdev->hw_ops->send_intr(vpdev, db); - } - dev_dbg(&vpdev->dev, "Added virtio id %d db %d\n", dd->type, db); - return 0; -err: - vqconfig = mic_vq_config(dd); - for (j = 0; j < i; j++) { - struct vop_vringh *vvr = &vdev->vvr[j]; - - dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[j].address), - vvr->vring.len, DMA_BIDIRECTIONAL); - free_pages((unsigned long)vvr->vring.va, - get_order(vvr->vring.len)); - } - return ret; -} - -static void vop_dev_remove(struct vop_info *pvi, struct mic_device_ctrl *devp, - struct vop_device *vpdev) -{ - struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev); - s8 db; - int ret, retry; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wake); - - devp->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE; - db = bootparam->h2c_config_db; - if (db != -1) - vpdev->hw_ops->send_intr(vpdev, db); - else - goto done; - for (retry = 15; retry--;) { - ret = wait_event_timeout(wake, devp->guest_ack, - msecs_to_jiffies(1000)); - if (ret) - break; - } -done: - devp->config_change = 0; - devp->guest_ack = 0; -} - -static void vop_virtio_del_device(struct vop_vdev *vdev) -{ - struct vop_info *vi = vdev->vi; - struct vop_device *vpdev = vdev->vpdev; - int i; - struct mic_vqconfig *vqconfig; - struct mic_bootparam *bootparam = vpdev->hw_ops->get_dp(vpdev); - - if (!bootparam) - goto skip_hot_remove; - vop_dev_remove(vi, vdev->dc, vpdev); -skip_hot_remove: - vpdev->hw_ops->free_irq(vpdev, vdev->virtio_cookie, vdev); - flush_work(&vdev->virtio_bh_work); - vqconfig = mic_vq_config(vdev->dd); - for (i = 0; i < vdev->dd->num_vq; i++) { - struct vop_vringh *vvr = &vdev->vvr[i]; - - dma_unmap_single(&vpdev->dev, - vvr->buf_da, VOP_INT_DMA_BUF_SIZE, - DMA_BIDIRECTIONAL); - free_pages((unsigned long)vvr->buf, - get_order(VOP_INT_DMA_BUF_SIZE)); - vringh_kiov_cleanup(&vvr->riov); - vringh_kiov_cleanup(&vvr->wiov); - dma_unmap_single(&vpdev->dev, le64_to_cpu(vqconfig[i].address), - vvr->vring.len, DMA_BIDIRECTIONAL); - free_pages((unsigned long)vvr->vring.va, - get_order(vvr->vring.len)); - } - /* - * Order the type update with previous stores. This write barrier - * is paired with the corresponding read barrier before the uncached - * system memory read of the type, on the card while scanning the - * device page. - */ - smp_wmb(); - vdev->dd->type = -1; -} - -/* - * vop_sync_dma - Wrapper for synchronous DMAs. - * - * @dev - The address of the pointer to the device instance used - * for DMA registration. - * @dst - destination DMA address. - * @src - source DMA address. - * @len - size of the transfer. 
- * - * Return DMA_SUCCESS on success - */ -static int vop_sync_dma(struct vop_vdev *vdev, dma_addr_t dst, dma_addr_t src, - size_t len) -{ - int err = 0; - struct dma_device *ddev; - struct dma_async_tx_descriptor *tx; - struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev); - struct dma_chan *vop_ch = vi->dma_ch; - - if (!vop_ch) { - err = -EBUSY; - goto error; - } - ddev = vop_ch->device; - tx = ddev->device_prep_dma_memcpy(vop_ch, dst, src, len, - DMA_PREP_FENCE); - if (!tx) { - err = -ENOMEM; - goto error; - } else { - dma_cookie_t cookie; - - cookie = tx->tx_submit(tx); - if (dma_submit_error(cookie)) { - err = -ENOMEM; - goto error; - } - dma_async_issue_pending(vop_ch); - err = dma_sync_wait(vop_ch, cookie); - } -error: - if (err) - dev_err(&vi->vpdev->dev, "%s %d err %d\n", - __func__, __LINE__, err); - return err; -} - -#define VOP_USE_DMA true - -/* - * Initiates the copies across the PCIe bus from card memory to a user - * space buffer. When transfers are done using DMA, source/destination - * addresses and transfer length must follow the alignment requirements of - * the MIC DMA engine. - */ -static int vop_virtio_copy_to_user(struct vop_vdev *vdev, void __user *ubuf, - size_t len, u64 daddr, size_t dlen, - int vr_idx) -{ - struct vop_device *vpdev = vdev->vpdev; - void __iomem *dbuf = vpdev->hw_ops->remap(vpdev, daddr, len); - struct vop_vringh *vvr = &vdev->vvr[vr_idx]; - struct vop_info *vi = dev_get_drvdata(&vpdev->dev); - size_t dma_alignment; - bool x200; - size_t dma_offset, partlen; - int err; - - if (!VOP_USE_DMA || !vi->dma_ch) { - if (copy_to_user(ubuf, (void __force *)dbuf, len)) { - err = -EFAULT; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - vdev->in_bytes += len; - err = 0; - goto err; - } - - dma_alignment = 1 << vi->dma_ch->device->copy_align; - x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1); - - dma_offset = daddr - round_down(daddr, dma_alignment); - daddr -= dma_offset; - len += dma_offset; - /* - * X100 uses DMA addresses as seen by the card so adding - * the aperture base is not required for DMA. However x200 - * requires DMA addresses to be an offset into the bar so - * add the aperture base for x200. - */ - if (x200) - daddr += vpdev->aper->pa; - while (len) { - partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE); - err = vop_sync_dma(vdev, vvr->buf_da, daddr, - ALIGN(partlen, dma_alignment)); - if (err) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - if (copy_to_user(ubuf, vvr->buf + dma_offset, - partlen - dma_offset)) { - err = -EFAULT; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - daddr += partlen; - ubuf += partlen; - dbuf += partlen; - vdev->in_bytes_dma += partlen; - vdev->in_bytes += partlen; - len -= partlen; - dma_offset = 0; - } - err = 0; -err: - vpdev->hw_ops->unmap(vpdev, dbuf); - dev_dbg(vop_dev(vdev), - "%s: ubuf %p dbuf %p len 0x%zx vr_idx 0x%x\n", - __func__, ubuf, dbuf, len, vr_idx); - return err; -} - -/* - * Initiates copies across the PCIe bus from a user space buffer to card - * memory. When transfers are done using DMA, source/destination addresses - * and transfer length must follow the alignment requirements of the MIC - * DMA engine. 
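[Editorial aside: the head-alignment fix-up in vop_virtio_copy_to_user() above is easiest to follow with concrete numbers (values assumed; a copy_align of 6 means a 64-byte DMA granule):

	dma_alignment = 1 << 6;			/* 64 bytes */
	/* suppose daddr = 0x1234 and len = 0x100 on entry */
	dma_offset = daddr - round_down(daddr, dma_alignment);	/* 0x34 */
	daddr -= dma_offset;			/* 0x1200, now aligned */
	len += dma_offset;			/* 0x134 */
	/* DMA the aligned span into the bounce buffer, then skip the
	 * first dma_offset bytes when copying out to user space */
]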
- */ -static int vop_virtio_copy_from_user(struct vop_vdev *vdev, void __user *ubuf, - size_t len, u64 daddr, size_t dlen, - int vr_idx) -{ - struct vop_device *vpdev = vdev->vpdev; - void __iomem *dbuf = vpdev->hw_ops->remap(vpdev, daddr, len); - struct vop_vringh *vvr = &vdev->vvr[vr_idx]; - struct vop_info *vi = dev_get_drvdata(&vdev->vpdev->dev); - size_t dma_alignment; - bool x200; - size_t partlen; - bool dma = VOP_USE_DMA && vi->dma_ch; - int err = 0; - size_t offset = 0; - - if (dma) { - dma_alignment = 1 << vi->dma_ch->device->copy_align; - x200 = is_dma_copy_aligned(vi->dma_ch->device, 1, 1, 1); - - if (daddr & (dma_alignment - 1)) { - vdev->tx_dst_unaligned += len; - dma = false; - } else if (ALIGN(len, dma_alignment) > dlen) { - vdev->tx_len_unaligned += len; - dma = false; - } - } - - if (!dma) - goto memcpy; - - /* - * X100 uses DMA addresses as seen by the card so adding - * the aperture base is not required for DMA. However x200 - * requires DMA addresses to be an offset into the bar so - * add the aperture base for x200. - */ - if (x200) - daddr += vpdev->aper->pa; - while (len) { - partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE); - - if (copy_from_user(vvr->buf, ubuf, partlen)) { - err = -EFAULT; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - err = vop_sync_dma(vdev, daddr, vvr->buf_da, - ALIGN(partlen, dma_alignment)); - if (err) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - daddr += partlen; - ubuf += partlen; - dbuf += partlen; - vdev->out_bytes_dma += partlen; - vdev->out_bytes += partlen; - len -= partlen; - } -memcpy: - /* - * We are copying to IO below and should ideally use something - * like copy_from_user_toio(..) if it existed. - */ - while (len) { - partlen = min_t(size_t, len, VOP_INT_DMA_BUF_SIZE); - - if (copy_from_user(vvr->buf, ubuf + offset, partlen)) { - err = -EFAULT; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - memcpy_toio(dbuf + offset, vvr->buf, partlen); - offset += partlen; - vdev->out_bytes += partlen; - len -= partlen; - } - err = 0; -err: - vpdev->hw_ops->unmap(vpdev, dbuf); - dev_dbg(vop_dev(vdev), - "%s: ubuf %p dbuf %p len 0x%zx vr_idx 0x%x\n", - __func__, ubuf, dbuf, len, vr_idx); - return err; -} - -#define MIC_VRINGH_READ true - -/* Determine the total number of bytes consumed in a VRINGH KIOV */ -static inline u32 vop_vringh_iov_consumed(struct vringh_kiov *iov) -{ - int i; - u32 total = iov->consumed; - - for (i = 0; i < iov->i; i++) - total += iov->iov[i].iov_len; - return total; -} - -/* - * Traverse the VRINGH KIOV and issue the APIs to trigger the copies. - * This API is heavily based on the vringh_iov_xfer(..) implementation - * in vringh.c. The reason we cannot reuse vringh_iov_pull_kern(..) - * and vringh_iov_push_kern(..) directly is because there is no - * way to override the VRINGH xfer(..) routines as of v3.10. 
- */ -static int vop_vringh_copy(struct vop_vdev *vdev, struct vringh_kiov *iov, - void __user *ubuf, size_t len, bool read, int vr_idx, - size_t *out_len) -{ - int ret = 0; - size_t partlen, tot_len = 0; - - while (len && iov->i < iov->used) { - struct kvec *kiov = &iov->iov[iov->i]; - unsigned long daddr = (unsigned long)kiov->iov_base; - - partlen = min(kiov->iov_len, len); - if (read) - ret = vop_virtio_copy_to_user(vdev, ubuf, partlen, - daddr, - kiov->iov_len, - vr_idx); - else - ret = vop_virtio_copy_from_user(vdev, ubuf, partlen, - daddr, - kiov->iov_len, - vr_idx); - if (ret) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - break; - } - len -= partlen; - ubuf += partlen; - tot_len += partlen; - iov->consumed += partlen; - kiov->iov_len -= partlen; - kiov->iov_base += partlen; - if (!kiov->iov_len) { - /* Fix up old iov element then increment. */ - kiov->iov_len = iov->consumed; - kiov->iov_base -= iov->consumed; - - iov->consumed = 0; - iov->i++; - } - } - *out_len = tot_len; - return ret; -} - -/* - * Use the standard VRINGH infrastructure in the kernel to fetch new - * descriptors, initiate the copies and update the used ring. - */ -static int _vop_virtio_copy(struct vop_vdev *vdev, struct mic_copy_desc *copy) -{ - int ret = 0; - u32 iovcnt = copy->iovcnt; - struct iovec iov; - struct iovec __user *u_iov = copy->iov; - void __user *ubuf = NULL; - struct vop_vringh *vvr = &vdev->vvr[copy->vr_idx]; - struct vringh_kiov *riov = &vvr->riov; - struct vringh_kiov *wiov = &vvr->wiov; - struct vringh *vrh = &vvr->vrh; - u16 *head = &vvr->head; - struct mic_vring *vr = &vvr->vring; - size_t len = 0, out_len; - - copy->out_len = 0; - /* Fetch a new IOVEC if all previous elements have been processed */ - if (riov->i == riov->used && wiov->i == wiov->used) { - ret = vringh_getdesc_kern(vrh, riov, wiov, - head, GFP_KERNEL); - /* Check if there are available descriptors */ - if (ret <= 0) - return ret; - } - while (iovcnt) { - if (!len) { - /* Copy over a new iovec from user space. */ - ret = copy_from_user(&iov, u_iov, sizeof(*u_iov)); - if (ret) { - ret = -EINVAL; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - break; - } - len = iov.iov_len; - ubuf = iov.iov_base; - } - /* Issue all the read descriptors first */ - ret = vop_vringh_copy(vdev, riov, ubuf, len, - MIC_VRINGH_READ, copy->vr_idx, &out_len); - if (ret) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - break; - } - len -= out_len; - ubuf += out_len; - copy->out_len += out_len; - /* Issue the write descriptors next */ - ret = vop_vringh_copy(vdev, wiov, ubuf, len, - !MIC_VRINGH_READ, copy->vr_idx, &out_len); - if (ret) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, ret); - break; - } - len -= out_len; - ubuf += out_len; - copy->out_len += out_len; - if (!len) { - /* One user space iovec is now completed */ - iovcnt--; - u_iov++; - } - /* Exit loop if all elements in KIOVs have been processed. */ - if (riov->i == riov->used && wiov->i == wiov->used) - break; - } - /* - * Update the used ring if a descriptor was available and some data was - * copied in/out and the user asked for a used ring update. 
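[Editorial aside: the block that follows is the tail of the standard vringh cycle this file drives: fetch a descriptor chain, transfer data through the read/write KIOVs, then retire the chain on the used ring and notify if needed. The cycle in isolation (kernel-style sketch, setup assumed):

	struct vringh_kiov riov, wiov;
	u16 head;
	int ret;

	ret = vringh_getdesc_kern(vrh, &riov, &wiov, &head, GFP_KERNEL);
	if (ret > 0) {
		/* ... copy data via riov/wiov ... */
		vringh_complete_kern(vrh, head, total_consumed);
		if (vringh_need_notify_kern(vrh) > 0)
			vringh_notify(vrh);
	}
]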
- */ - if (*head != USHRT_MAX && copy->out_len && copy->update_used) { - u32 total = 0; - - /* Determine the total data consumed */ - total += vop_vringh_iov_consumed(riov); - total += vop_vringh_iov_consumed(wiov); - vringh_complete_kern(vrh, *head, total); - *head = USHRT_MAX; - if (vringh_need_notify_kern(vrh) > 0) - vringh_notify(vrh); - vringh_kiov_cleanup(riov); - vringh_kiov_cleanup(wiov); - /* Update avail idx for user space */ - vr->info->avail_idx = vrh->last_avail_idx; - } - return ret; -} - -static inline int vop_verify_copy_args(struct vop_vdev *vdev, - struct mic_copy_desc *copy) -{ - if (!vdev || copy->vr_idx >= vdev->dd->num_vq) - return -EINVAL; - return 0; -} - -/* Copy a specified number of virtio descriptors in a chain */ -static int vop_virtio_copy_desc(struct vop_vdev *vdev, - struct mic_copy_desc *copy) -{ - int err; - struct vop_vringh *vvr; - - err = vop_verify_copy_args(vdev, copy); - if (err) - return err; - - vvr = &vdev->vvr[copy->vr_idx]; - mutex_lock(&vvr->vr_mutex); - if (!vop_vdevup(vdev)) { - err = -ENODEV; - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - goto err; - } - err = _vop_virtio_copy(vdev, copy); - if (err) { - dev_err(vop_dev(vdev), "%s %d err %d\n", - __func__, __LINE__, err); - } -err: - mutex_unlock(&vvr->vr_mutex); - return err; -} - -static int vop_open(struct inode *inode, struct file *f) -{ - struct vop_vdev *vdev; - struct vop_info *vi = container_of(f->private_data, - struct vop_info, miscdev); - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - vdev->vi = vi; - mutex_init(&vdev->vdev_mutex); - f->private_data = vdev; - init_completion(&vdev->destroy); - complete(&vdev->destroy); - return 0; -} - -static int vop_release(struct inode *inode, struct file *f) -{ - struct vop_vdev *vdev = f->private_data, *vdev_tmp; - struct vop_info *vi = vdev->vi; - struct list_head *pos, *tmp; - bool found = false; - - mutex_lock(&vdev->vdev_mutex); - if (vdev->deleted) - goto unlock; - mutex_lock(&vi->vop_mutex); - list_for_each_safe(pos, tmp, &vi->vdev_list) { - vdev_tmp = list_entry(pos, struct vop_vdev, list); - if (vdev == vdev_tmp) { - vop_virtio_del_device(vdev); - list_del(pos); - found = true; - break; - } - } - mutex_unlock(&vi->vop_mutex); -unlock: - mutex_unlock(&vdev->vdev_mutex); - if (!found) - wait_for_completion(&vdev->destroy); - f->private_data = NULL; - kfree(vdev); - return 0; -} - -static long vop_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct vop_vdev *vdev = f->private_data; - struct vop_info *vi = vdev->vi; - void __user *argp = (void __user *)arg; - int ret; - - switch (cmd) { - case MIC_VIRTIO_ADD_DEVICE: - { - struct mic_device_desc dd, *dd_config; - - if (copy_from_user(&dd, argp, sizeof(dd))) - return -EFAULT; - - if (mic_aligned_desc_size(&dd) > MIC_MAX_DESC_BLK_SIZE || - dd.num_vq > MIC_MAX_VRINGS) - return -EINVAL; - - dd_config = memdup_user(argp, mic_desc_size(&dd)); - if (IS_ERR(dd_config)) - return PTR_ERR(dd_config); - - /* Ensure desc has not changed between the two reads */ - if (memcmp(&dd, dd_config, sizeof(dd))) { - ret = -EINVAL; - goto free_ret; - } - mutex_lock(&vdev->vdev_mutex); - mutex_lock(&vi->vop_mutex); - ret = vop_virtio_add_device(vdev, dd_config); - if (ret) - goto unlock_ret; - list_add_tail(&vdev->list, &vi->vdev_list); -unlock_ret: - mutex_unlock(&vi->vop_mutex); - mutex_unlock(&vdev->vdev_mutex); -free_ret: - kfree(dd_config); - return ret; - } - case MIC_VIRTIO_COPY_DESC: - { - struct mic_copy_desc copy; - - 
mutex_lock(&vdev->vdev_mutex); - ret = vop_vdev_inited(vdev); - if (ret) - goto _unlock_ret; - - if (copy_from_user(©, argp, sizeof(copy))) { - ret = -EFAULT; - goto _unlock_ret; - } - - ret = vop_virtio_copy_desc(vdev, ©); - if (ret < 0) - goto _unlock_ret; - if (copy_to_user( - &((struct mic_copy_desc __user *)argp)->out_len, - ©.out_len, sizeof(copy.out_len))) - ret = -EFAULT; -_unlock_ret: - mutex_unlock(&vdev->vdev_mutex); - return ret; - } - case MIC_VIRTIO_CONFIG_CHANGE: - { - void *buf; - - mutex_lock(&vdev->vdev_mutex); - ret = vop_vdev_inited(vdev); - if (ret) - goto __unlock_ret; - buf = memdup_user(argp, vdev->dd->config_len); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto __unlock_ret; - } - ret = vop_virtio_config_change(vdev, buf); - kfree(buf); -__unlock_ret: - mutex_unlock(&vdev->vdev_mutex); - return ret; - } - default: - return -ENOIOCTLCMD; - }; - return 0; -} - -/* - * We return EPOLLIN | EPOLLOUT from poll when new buffers are enqueued, and - * not when previously enqueued buffers may be available. This means that - * in the card->host (TX) path, when userspace is unblocked by poll it - * must drain all available descriptors or it can stall. - */ -static __poll_t vop_poll(struct file *f, poll_table *wait) -{ - struct vop_vdev *vdev = f->private_data; - __poll_t mask = 0; - - mutex_lock(&vdev->vdev_mutex); - if (vop_vdev_inited(vdev)) { - mask = EPOLLERR; - goto done; - } - poll_wait(f, &vdev->waitq, wait); - if (vop_vdev_inited(vdev)) { - mask = EPOLLERR; - } else if (vdev->poll_wake) { - vdev->poll_wake = 0; - mask = EPOLLIN | EPOLLOUT; - } -done: - mutex_unlock(&vdev->vdev_mutex); - return mask; -} - -static inline int -vop_query_offset(struct vop_vdev *vdev, unsigned long offset, - unsigned long *size, unsigned long *pa) -{ - struct vop_device *vpdev = vdev->vpdev; - unsigned long start = MIC_DP_SIZE; - int i; - - /* - * MMAP interface is as follows: - * offset region - * 0x0 virtio device_page - * 0x1000 first vring - * 0x1000 + size of 1st vring second vring - * .... - */ - if (!offset) { - *pa = virt_to_phys(vpdev->hw_ops->get_dp(vpdev)); - *size = MIC_DP_SIZE; - return 0; - } - - for (i = 0; i < vdev->dd->num_vq; i++) { - struct vop_vringh *vvr = &vdev->vvr[i]; - - if (offset == start) { - *pa = virt_to_phys(vvr->vring.va); - *size = vvr->vring.len; - return 0; - } - start += vvr->vring.len; - } - return -1; -} - -/* - * Maps the device page and virtio rings to user space for readonly access. 
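[Editorial aside: from user space, the layout that vop_query_offset() implements above means one descriptor exposes several read-only windows at fixed offsets. A hypothetical sketch against the misc device registered below (device name, ring length and error handling assumed):

	#include <fcntl.h>
	#include <stddef.h>
	#include <sys/mman.h>

	int main(void)
	{
		int fd = open("/dev/vop_virtio0", O_RDONLY);	/* name assumed */
		/* device page at offset 0 */
		void *dp = mmap(NULL, 0x1000, PROT_READ, MAP_SHARED, fd, 0);
		/* first vring immediately after the device page; a real
		 * client would read its length out of the device page */
		size_t vr0_len = 0x4000;			/* assumed */
		void *vr0 = mmap(NULL, vr0_len, PROT_READ, MAP_SHARED,
				 fd, 0x1000);

		return (dp == MAP_FAILED || vr0 == MAP_FAILED) ? 1 : 0;
	}
]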
- */
-static int vop_mmap(struct file *f, struct vm_area_struct *vma)
-{
-	struct vop_vdev *vdev = f->private_data;
-	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long pa, size = vma->vm_end - vma->vm_start, size_rem = size;
-	int i, err;
-
-	err = vop_vdev_inited(vdev);
-	if (err)
-		goto ret;
-	if (vma->vm_flags & VM_WRITE) {
-		err = -EACCES;
-		goto ret;
-	}
-	while (size_rem) {
-		i = vop_query_offset(vdev, offset, &size, &pa);
-		if (i < 0) {
-			err = -EINVAL;
-			goto ret;
-		}
-		err = remap_pfn_range(vma, vma->vm_start + offset,
-				      pa >> PAGE_SHIFT, size,
-				      vma->vm_page_prot);
-		if (err)
-			goto ret;
-		size_rem -= size;
-		offset += size;
-	}
-ret:
-	return err;
-}
-
-static const struct file_operations vop_fops = {
-	.open = vop_open,
-	.release = vop_release,
-	.unlocked_ioctl = vop_ioctl,
-	.poll = vop_poll,
-	.mmap = vop_mmap,
-	.owner = THIS_MODULE,
-};
-
-int vop_host_init(struct vop_info *vi)
-{
-	int rc;
-	struct miscdevice *mdev;
-	struct vop_device *vpdev = vi->vpdev;
-
-	INIT_LIST_HEAD(&vi->vdev_list);
-	vi->dma_ch = vpdev->dma_ch;
-	mdev = &vi->miscdev;
-	mdev->minor = MISC_DYNAMIC_MINOR;
-	snprintf(vi->name, sizeof(vi->name), "vop_virtio%d", vpdev->index);
-	mdev->name = vi->name;
-	mdev->fops = &vop_fops;
-	mdev->parent = &vpdev->dev;
-
-	rc = misc_register(mdev);
-	if (rc)
-		dev_err(&vpdev->dev, "%s failed rc %d\n", __func__, rc);
-	return rc;
-}
-
-void vop_host_uninit(struct vop_info *vi)
-{
-	struct list_head *pos, *tmp;
-	struct vop_vdev *vdev;
-
-	mutex_lock(&vi->vop_mutex);
-	vop_virtio_reset_devices(vi);
-	list_for_each_safe(pos, tmp, &vi->vdev_list) {
-		vdev = list_entry(pos, struct vop_vdev, list);
-		list_del(pos);
-		reinit_completion(&vdev->destroy);
-		mutex_unlock(&vi->vop_mutex);
-		mutex_lock(&vdev->vdev_mutex);
-		vop_virtio_del_device(vdev);
-		vdev->deleted = true;
-		mutex_unlock(&vdev->vdev_mutex);
-		complete(&vdev->destroy);
-		mutex_lock(&vi->vop_mutex);
-	}
-	mutex_unlock(&vi->vop_mutex);
-	misc_deregister(&vi->miscdev);
-}
diff --git a/include/linux/mic_bus.h b/include/linux/mic_bus.h
deleted file mode 100644
index e99c789424e0..000000000000
--- a/include/linux/mic_bus.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Intel MIC Platform Software Stack (MPSS)
- *
- * Copyright(c) 2014 Intel Corporation.
- *
- * Intel MIC Bus driver.
- *
- * This implementation is very similar to the virtio bus driver
- * implementation @ include/linux/virtio.h.
- */
-#ifndef _MIC_BUS_H_
-#define _MIC_BUS_H_
-/*
- * Everything a mbus driver needs to work with any particular mbus
- * implementation.
- */
-#include <linux/interrupt.h>
-#include <linux/dma-mapping.h>
-
-struct mbus_device_id {
-	__u32 device;
-	__u32 vendor;
-};
-
-#define MBUS_DEV_DMA_HOST 2
-#define MBUS_DEV_DMA_MIC 3
-#define MBUS_DEV_ANY_ID 0xffffffff
-
-/**
- * mbus_device - representation of a device using mbus
- * @mmio_va: virtual address of mmio space
- * @hw_ops: the hardware ops supported by this device.
- * @id: the device type identification (used to match it with a driver).
- * @dev: underlying device.
- * be used to communicate with.
- * @index: unique position on the mbus bus
- */
-struct mbus_device {
-	void __iomem *mmio_va;
-	struct mbus_hw_ops *hw_ops;
-	struct mbus_device_id id;
-	struct device dev;
-	int index;
-};
-
-/**
- * mbus_driver - operations for a mbus I/O driver
- * @driver: underlying device driver (populate name and owner).
- * @id_table: the ids serviced by this driver.
- * @probe: the function to call when a device is found. Returns 0 or -errno.
- * @remove: the function to call when a device is removed. - */ -struct mbus_driver { - struct device_driver driver; - const struct mbus_device_id *id_table; - int (*probe)(struct mbus_device *dev); - void (*scan)(struct mbus_device *dev); - void (*remove)(struct mbus_device *dev); -}; - -/** - * struct mic_irq - opaque pointer used as cookie - */ -struct mic_irq; - -/** - * mbus_hw_ops - Hardware operations for accessing a MIC device on the MIC bus. - */ -struct mbus_hw_ops { - struct mic_irq* (*request_threaded_irq)(struct mbus_device *mbdev, - irq_handler_t handler, - irq_handler_t thread_fn, - const char *name, void *data, - int intr_src); - void (*free_irq)(struct mbus_device *mbdev, - struct mic_irq *cookie, void *data); - void (*ack_interrupt)(struct mbus_device *mbdev, int num); -}; - -struct mbus_device * -mbus_register_device(struct device *pdev, int id, const struct dma_map_ops *dma_ops, - struct mbus_hw_ops *hw_ops, int index, - void __iomem *mmio_va); -void mbus_unregister_device(struct mbus_device *mbdev); - -int mbus_register_driver(struct mbus_driver *drv); -void mbus_unregister_driver(struct mbus_driver *drv); - -static inline struct mbus_device *dev_to_mbus(struct device *_dev) -{ - return container_of(_dev, struct mbus_device, dev); -} - -static inline struct mbus_driver *drv_to_mbus(struct device_driver *drv) -{ - return container_of(drv, struct mbus_driver, driver); -} - -#endif /* _MIC_BUS_H */ diff --git a/include/linux/scif.h b/include/linux/scif.h deleted file mode 100644 index 329e695b8fe5..000000000000 --- a/include/linux/scif.h +++ /dev/null @@ -1,1339 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2014 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * BSD LICENSE - * - * Copyright(c) 2014 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel SCIF driver.
- *
- */
-#ifndef __SCIF_H__
-#define __SCIF_H__
-
-#include <linux/types.h>
-#include <linux/poll.h>
-#include <linux/device.h>
-#include <linux/scif_ioctl.h>
-
-#define SCIF_ACCEPT_SYNC 1
-#define SCIF_SEND_BLOCK 1
-#define SCIF_RECV_BLOCK 1
-
-enum {
-	SCIF_PROT_READ = (1 << 0),
-	SCIF_PROT_WRITE = (1 << 1)
-};
-
-enum {
-	SCIF_MAP_FIXED = 0x10,
-	SCIF_MAP_KERNEL = 0x20,
-};
-
-enum {
-	SCIF_FENCE_INIT_SELF = (1 << 0),
-	SCIF_FENCE_INIT_PEER = (1 << 1),
-	SCIF_SIGNAL_LOCAL = (1 << 4),
-	SCIF_SIGNAL_REMOTE = (1 << 5)
-};
-
-enum {
-	SCIF_RMA_USECPU = (1 << 0),
-	SCIF_RMA_USECACHE = (1 << 1),
-	SCIF_RMA_SYNC = (1 << 2),
-	SCIF_RMA_ORDERED = (1 << 3)
-};
-
-/* End of SCIF Admin Reserved Ports */
-#define SCIF_ADMIN_PORT_END 1024
-
-/* End of SCIF Reserved Ports */
-#define SCIF_PORT_RSVD 1088
-
-typedef struct scif_endpt *scif_epd_t;
-typedef struct scif_pinned_pages *scif_pinned_pages_t;
-
-/**
- * struct scif_range - SCIF registered range used in kernel mode
- * @cookie: cookie used internally by SCIF
- * @nr_pages: number of pages of PAGE_SIZE
- * @prot_flags: R/W protection
- * @phys_addr: Array of bus addresses
- * @va: Array of kernel virtual addresses backed by the pages in the phys_addr
- * array. The va is populated only when called on the host for a remote
- * SCIF connection on MIC. This is required to support the use case of DMA
- * between MIC and another device which is not a SCIF node e.g., an IB or
- * ethernet NIC.
- */
-struct scif_range {
-	void *cookie;
-	int nr_pages;
-	int prot_flags;
-	dma_addr_t *phys_addr;
-	void __iomem **va;
-};
-
-/**
- * struct scif_pollepd - SCIF endpoint to be monitored via scif_poll
- * @epd: SCIF endpoint
- * @events: requested events
- * @revents: returned events
- */
-struct scif_pollepd {
-	scif_epd_t epd;
-	__poll_t events;
-	__poll_t revents;
-};
-
-/**
- * scif_peer_dev - representation of a peer SCIF device
- *
- * Peer devices show up as PCIe devices for the mgmt node but not the cards.
- * The mgmt node discovers all the cards on the PCIe bus and informs the other
- * cards about their peers. Upon notification of a peer a node adds a peer
- * device to the peer bus to maintain symmetry in the way devices are
- * discovered across all nodes in the SCIF network.
- *
- * @dev: underlying device
- * @dnode - The destination node which this device will communicate with.
- */ -struct scif_peer_dev { - struct device dev; - u8 dnode; -}; - -/** - * scif_client - representation of a SCIF client - * @name: client name - * @probe - client method called when a peer device is registered - * @remove - client method called when a peer device is unregistered - * @si - subsys_interface used internally for implementing SCIF clients - */ -struct scif_client { - const char *name; - void (*probe)(struct scif_peer_dev *spdev); - void (*remove)(struct scif_peer_dev *spdev); - struct subsys_interface si; -}; - -#define SCIF_OPEN_FAILED ((scif_epd_t)-1) -#define SCIF_REGISTER_FAILED ((off_t)-1) -#define SCIF_MMAP_FAILED ((void *)-1) - -/** - * scif_open() - Create an endpoint - * - * Return: - * Upon successful completion, scif_open() returns an endpoint descriptor to - * be used in subsequent SCIF functions calls to refer to that endpoint; - * otherwise in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is - * returned and errno is set to indicate the error; in kernel mode a NULL - * scif_epd_t is returned. - * - * Errors: - * ENOMEM - Insufficient kernel memory was available - */ -scif_epd_t scif_open(void); - -/** - * scif_bind() - Bind an endpoint to a port - * @epd: endpoint descriptor - * @pn: port number - * - * scif_bind() binds endpoint epd to port pn, where pn is a port number on the - * local node. If pn is zero, a port number greater than or equal to - * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to - * exactly one local port. Ports less than 1024 when requested can only be bound - * by system (or root) processes or by processes executed by privileged users. - * - * Return: - * Upon successful completion, scif_bind() returns the port number to which epd - * is bound; otherwise in user mode -1 is returned and errno is set to - * indicate the error; in kernel mode the negative of one of the following - * errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint or the port is already bound - * EISCONN - The endpoint is already connected - * ENOSPC - No port number available for assignment - * EACCES - The port requested is protected and the user is not the superuser - */ -int scif_bind(scif_epd_t epd, u16 pn); - -/** - * scif_listen() - Listen for connections on an endpoint - * @epd: endpoint descriptor - * @backlog: maximum pending connection requests - * - * scif_listen() marks the endpoint epd as a listening endpoint - that is, as - * an endpoint that will be used to accept incoming connection requests. Once - * so marked, the endpoint is said to be in the listening state and may not be - * used as the endpoint of a connection. - * - * The endpoint, epd, must have been bound to a port. - * - * The backlog argument defines the maximum length to which the queue of - * pending connections for epd may grow. If a connection request arrives when - * the queue is full, the client may receive an error with an indication that - * the connection was refused. - * - * Return: - * Upon successful completion, scif_listen() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. 
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINVAL - the endpoint is not bound to a port - * EISCONN - The endpoint is already connected or listening - */ -int scif_listen(scif_epd_t epd, int backlog); - -/** - * scif_connect() - Initiate a connection on a port - * @epd: endpoint descriptor - * @dst: global id of port to which to connect - * - * The scif_connect() function requests the connection of endpoint epd to remote - * port dst. If the connection is successful, a peer endpoint, bound to dst, is - * created on node dst.node. On successful return, the connection is complete. - * - * If the endpoint epd has not already been bound to a port, scif_connect() - * will bind it to an unused local port. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * In user space, scif_connect() supports an asynchronous connection mode - * if the application has set the O_NONBLOCK flag on the endpoint via the - * fcntl() system call. Setting this flag will result in the calling process - * not to wait during scif_connect(). - * - * Return: - * Upon successful completion, scif_connect() returns the port ID to which the - * endpoint, epd, is bound; otherwise in user mode -1 is returned and errno is - * set to indicate the error; in kernel mode the negative of one of the - * following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNREFUSED - The destination was not listening for connections or refused - * the connection request - * EINVAL - dst.port is not a valid port ID - * EISCONN - The endpoint is already connected - * ENOMEM - No buffer space is available - * ENODEV - The destination node does not exist, or the node is lost or existed, - * but is not currently in the network since it may have crashed - * ENOSPC - No port number available for assignment - * EOPNOTSUPP - The endpoint is listening and cannot be connected - */ -int scif_connect(scif_epd_t epd, struct scif_port_id *dst); - -/** - * scif_accept() - Accept a connection on an endpoint - * @epd: endpoint descriptor - * @peer: global id of port to which connected - * @newepd: new connected endpoint descriptor - * @flags: flags - * - * The scif_accept() call extracts the first connection request from the queue - * of pending connections for the port on which epd is listening. scif_accept() - * creates a new endpoint, bound to the same port as epd, and allocates a new - * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new - * endpoint is connected to the endpoint through which the connection was - * requested. epd is unaffected by this call, and remains in the listening - * state. - * - * On successful return, peer holds the global port identifier (node id and - * local port number) of the port which requested the connection. - * - * A connection is terminated when an endpoint of the connection is closed, - * either explicitly by scif_close(), or when a process that owns one of the - * endpoints of the connection is terminated. - * - * The number of connections that can (subsequently) be accepted on epd is only - * limited by system resources (memory). - * - * The flags argument is formed by OR'ing together zero or more of the - * following values. - * SCIF_ACCEPT_SYNC - block until a connection request is presented. 
If - * SCIF_ACCEPT_SYNC is not in flags, and no pending - * connections are present on the queue, scif_accept() - * fails with an EAGAIN error - * - * In user mode, the select() and poll() functions can be used to determine - * when there is a connection request. In kernel mode, the scif_poll() - * function may be used for this purpose. A readable event will be delivered - * when a connection is requested. - * - * Return: - * Upon successful completion, scif_accept() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EAGAIN - SCIF_ACCEPT_SYNC is not set and no connections are present to be - * accepted or SCIF_ACCEPT_SYNC is not set and remote node failed to complete - * its connection request - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * EINTR - Interrupted function - * EINVAL - epd is not a listening endpoint, or flags is invalid, or peer is - * NULL, or newepd is NULL - * ENODEV - The requesting node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOENT - Secondary part of epd registration failed - */ -int scif_accept(scif_epd_t epd, struct scif_port_id *peer, scif_epd_t - *newepd, int flags); - -/** - * scif_close() - Close an endpoint - * @epd: endpoint descriptor - * - * scif_close() closes an endpoint and performs necessary teardown of - * facilities associated with that endpoint. - * - * If epd is a listening endpoint then it will no longer accept connection - * requests on the port to which it is bound. Any pending connection requests - * are rejected. - * - * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs - * which are in-process through epd or its peer endpoint will complete before - * scif_close() returns. Registered windows of the local and peer endpoints are - * released as if scif_unregister() was called against each window. - * - * Closing a SCIF endpoint does not affect local registered memory mapped by - * a SCIF endpoint on a remote node. The local memory remains mapped by the peer - * SCIF endpoint explicitly removed by calling munmap(..) by the peer. - * - * If the peer endpoint's receive queue is not empty at the time that epd is - * closed, then the peer endpoint can be passed as the endpoint parameter to - * scif_recv() until the receive queue is empty. - * - * epd is freed and may no longer be accessed. - * - * Return: - * Upon successful completion, scif_close() returns 0; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode the - * negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - */ -int scif_close(scif_epd_t epd); - -/** - * scif_send() - Send a message - * @epd: endpoint descriptor - * @msg: message buffer address - * @len: message length - * @flags: blocking mode flags - * - * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data - * are copied from memory starting at address msg. On successful execution the - * return value of scif_send() is the number of bytes that were sent, and is - * zero if no bytes were sent because len was zero. scif_send() may be called - * only when the endpoint is in a connected state. - * - * If a scif_send() call is non-blocking, then it sends only those bytes which - * can be sent without waiting, up to a maximum of len bytes. 
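[Editorial sketch] The connection-establishment calls documented above pair up across two nodes roughly as follows; this assumes the same <linux/scif.h> include as the earlier sketch, and the my_* helpers and node/port values are illustrative:

/* Client side: open an endpoint and connect it to a remote port. */
static int my_scif_connect(scif_epd_t *epdp, u16 node, u16 port)
{
	struct scif_port_id dst = { .node = node, .port = port };
	scif_epd_t epd = scif_open();
	int rc;

	if (!epd)
		return -ENOMEM;
	rc = scif_connect(epd, &dst);	/* returns bound port or -errno */
	if (rc < 0) {
		scif_close(epd);
		return rc;
	}
	*epdp = epd;
	return 0;
}

/* Listener side: block until a peer connects, yielding a new endpoint. */
static int my_scif_accept_one(scif_epd_t lepd, scif_epd_t *newepd)
{
	struct scif_port_id peer;	/* filled in with the requester's id */

	return scif_accept(lepd, &peer, newepd, SCIF_ACCEPT_SYNC);
}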
- *
- * If a scif_send() call is blocking, then it normally returns after sending
- * all len bytes. If a blocking call is interrupted or the connection is
- * reset, the call is considered successful if some bytes were sent or len is
- * zero, otherwise the call is considered unsuccessful.
- *
- * In user mode, the select() and poll() functions can be used to determine
- * when the send queue is not full. In kernel mode, the scif_poll() function
- * may be used for this purpose.
- *
- * It is recommended that scif_send()/scif_recv() only be used for short
- * control-type message communication between SCIF endpoints. The SCIF RMA
- * APIs are expected to provide better performance for transfer sizes of
- * 1024 bytes or longer for the current MIC hardware and software
- * implementation.
- *
- * scif_send() will block until the entire message is sent if SCIF_SEND_BLOCK
- * is passed as the flags argument.
- *
- * Return:
- * Upon successful completion, scif_send() returns the number of bytes sent;
- * otherwise in user mode -1 is returned and errno is set to indicate the
- * error; in kernel mode the negative of one of the following errors is
- * returned.
- *
- * Errors:
- * EBADF, ENOTTY - epd is not a valid endpoint descriptor
- * ECONNRESET - Connection reset by peer
- * EINVAL - flags is invalid, or len is negative
- * ENODEV - The remote node is lost or existed, but is not currently in the
- * network since it may have crashed
- * ENOMEM - Not enough space
- * ENOTCONN - The endpoint is not connected
- */
-int scif_send(scif_epd_t epd, void *msg, int len, int flags);
-
-/**
- * scif_recv() - Receive a message
- * @epd: endpoint descriptor
- * @msg: message buffer address
- * @len: message buffer length
- * @flags: blocking mode flags
- *
- * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
- * data are copied to memory starting at address msg. On successful execution
- * the return value of scif_recv() is the number of bytes that were received,
- * and is zero if no bytes were received because len was zero. scif_recv() may
- * be called only when the endpoint is in a connected state.
- *
- * If a scif_recv() call is non-blocking, then it receives only those bytes
- * which can be received without waiting, up to a maximum of len bytes.
- *
- * If a scif_recv() call is blocking, then it normally returns after receiving
- * all len bytes. If the blocking call was interrupted due to a disconnection,
- * subsequent calls to scif_recv() will copy all bytes received up to the point
- * of disconnection.
- *
- * In user mode, the select() and poll() functions can be used to determine
- * when data is available to be received. In kernel mode, the scif_poll()
- * function may be used for this purpose.
- *
- * It is recommended that scif_send()/scif_recv() only be used for short
- * control-type message communication between SCIF endpoints. The SCIF RMA
- * APIs are expected to provide better performance for transfer sizes of
- * 1024 bytes or longer for the current MIC hardware and software
- * implementation.
- *
- * scif_recv() will block until the entire message is received if
- * SCIF_RECV_BLOCK is passed as the flags argument.
- *
- * Return:
- * Upon successful completion, scif_recv() returns the number of bytes
- * received; otherwise in user mode -1 is returned and errno is set to
- * indicate the error; in kernel mode the negative of one of the following
- * errors is returned.
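[Editorial sketch] A hedged sketch of the blocking send/receive pattern described above, over an already-connected endpoint; the buffer layout and the my_* name are illustrative:

static int my_scif_ping(scif_epd_t epd)
{
	char buf[64] = "ping";
	int rc;

	/* SCIF_SEND_BLOCK: wait until the whole 64-byte message is sent */
	rc = scif_send(epd, buf, sizeof(buf), SCIF_SEND_BLOCK);
	if (rc < 0)
		return rc;

	/* SCIF_RECV_BLOCK: wait until the whole reply has arrived */
	rc = scif_recv(epd, buf, sizeof(buf), SCIF_RECV_BLOCK);
	return rc < 0 ? rc : 0;
}

As the text above recommends, this style is for short control messages; bulk data is better moved with the RMA calls documented below.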
- * - * Errors: - * EAGAIN - The destination node is returning from a low power state - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or len is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -int scif_recv(scif_epd_t epd, void *msg, int len, int flags); - -/** - * scif_register() - Mark a memory region for remote access. - * @epd: endpoint descriptor - * @addr: starting virtual address - * @len: length of range - * @offset: offset of window - * @prot_flags: read/write protection flags - * @map_flags: mapping flags - * - * The scif_register() function opens a window, a range of whole pages of the - * registered address space of the endpoint epd, starting at offset po and - * continuing for len bytes. The value of po, further described below, is a - * function of the parameters offset and len, and the value of map_flags. Each - * page of the window represents the physical memory page which backs the - * corresponding page of the range of virtual address pages starting at addr - * and continuing for len bytes. addr and len are constrained to be multiples - * of the page size. A successful scif_register() call returns po. - * - * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset - * exactly, and offset is constrained to be a multiple of the page size. The - * mapping established by scif_register() will not replace any existing - * registration; an error is returned if any page within the range [offset, - * offset + len - 1] intersects an existing window. - * - * When SCIF_MAP_FIXED is not set, the implementation uses offset in an - * implementation-defined manner to arrive at po. The po value so chosen will - * be an area of the registered address space that the implementation deems - * suitable for a mapping of len bytes. An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. - * - * The physical pages which are so represented by a window are available for - * access in calls to mmap(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on - * existing window. - * - * If the process will fork(), it is recommended that the registered - * virtual address range be marked with MADV_DONTFORK. Doing so will prevent - * problems due to copy-on-write semantics. - * - * The prot_flags argument is formed by OR'ing together one or more of the - * following values. 
- * SCIF_PROT_READ - allow read operations from the window
- * SCIF_PROT_WRITE - allow write operations to the window
- *
- * Return:
- * Upon successful completion, scif_register() returns the offset at which the
- * mapping was placed (po); otherwise in user mode SCIF_REGISTER_FAILED (that
- * is (off_t)-1) is returned and errno is set to indicate the error; in
- * kernel mode the negative of one of the following errors is returned.
- *
- * Errors:
- * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags, and pages in the range
- * [offset, offset + len - 1] are already registered
- * EAGAIN - The mapping could not be performed due to lack of resources
- * EBADF, ENOTTY - epd is not a valid endpoint descriptor
- * ECONNRESET - Connection reset by peer
- * EINVAL - map_flags is invalid, or prot_flags is invalid, or SCIF_MAP_FIXED
- * is set in map_flags and offset is not a multiple of the page size, or addr
- * is not a multiple of the page size, or len is not a multiple of the page
- * size, or is 0, or offset is negative
- * ENODEV - The remote node is lost or existed, but is not currently in the
- * network since it may have crashed
- * ENOMEM - Not enough space
- * ENOTCONN - The endpoint is not connected
- */
-off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
-		    int prot_flags, int map_flags);
-
-/**
- * scif_unregister() - Unregister a memory region marked for remote access.
- * @epd: endpoint descriptor
- * @offset: start of range to unregister
- * @len: length of range to unregister
- *
- * The scif_unregister() function closes those previously registered windows
- * which are entirely within the range [offset, offset + len - 1]. It is an
- * error to specify a range which intersects only a subrange of a window.
- *
- * On a successful return, pages within the window may no longer be specified
- * in calls to mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
- * scif_vwriteto(), scif_get_pages(), and scif_fence_signal(). The window,
- * however, continues to exist until all previous references against it are
- * removed. A window is referenced if there is a mapping to it created by
- * mmap(), or if scif_get_pages() was called against the window
- * (and the pages have not been returned via scif_put_pages()). A window is
- * also referenced while an RMA, in which some range of the window is a source
- * or destination, is in progress. Finally a window is referenced while some
- * offset in that window was specified to scif_fence_signal(), and the RMAs
- * marked by that call to scif_fence_signal() have not completed. While a
- * window is in this state, its registered address space pages are not
- * available for use in a new registered window.
- *
- * When all such references to the window have been removed, its references to
- * all the physical pages which it represents are removed. Similarly, the
- * registered address space pages of the window become available for
- * registration in a new window.
- *
- * Return:
- * Upon successful completion, scif_unregister() returns 0; otherwise in user
- * mode -1 is returned and errno is set to indicate the error; in kernel mode
- * the negative of one of the following errors is returned. In the event of an
- * error, no windows are unregistered.
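[Editorial sketch] A sketch of one register/unregister round trip under the rules above, assuming buf is page aligned; the fixed offset 0x100000 and the my_* name are illustrative (without SCIF_MAP_FIXED the implementation picks the offset):

static int my_scif_expose(scif_epd_t epd, void *buf /* PAGE_SIZE aligned */)
{
	off_t po;
	int rc;

	po = scif_register(epd, buf, PAGE_SIZE, 0x100000,
			   SCIF_PROT_READ | SCIF_PROT_WRITE, SCIF_MAP_FIXED);
	if (po < 0)
		return (int)po;	/* negative errno in kernel mode */

	/* ... the peer may now RMA against [po, po + PAGE_SIZE - 1] ... */

	return scif_unregister(epd, po, PAGE_SIZE);
}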
- * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - the range [offset, offset + len - 1] intersects a subrange of a - * window, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_unregister(scif_epd_t epd, off_t offset, size_t len); - -/** - * scif_readfrom() - Copy from a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space to - * which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space - * from which to copy - * @rma_flags: transfer mode flags - * - * scif_readfrom() copies len bytes from the remote registered address space of - * the peer of endpoint epd, starting at the offset roffset to the local - * registered address space of epd, starting at the offset loffset. - * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if loffset and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. 
- * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_readfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered - * address space of epd, or, The range [roffset, roffset + len - 1] is invalid - * for the registered address space of the peer of epd, or loffset or roffset - * is negative - */ -int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t - roffset, int rma_flags); - -/** - * scif_writeto() - Copy to a remote address space - * @epd: endpoint descriptor - * @loffset: offset in local registered address space - * from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_writeto() copies len bytes from the local registered address space of - * epd, starting at the offset loffset to the remote registered address space - * of the peer of endpoint epd, starting at the offset roffset. - * - * Each of the specified ranges [loffset, loffset + len - 1] and [roffset, - * roffset + len - 1] must be within some registered window or windows of the - * local and remote nodes. A range may intersect multiple registered windows, - * but only if those windows are contiguous in the registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_writeto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. - * - * The DMA transfer of individual bytes is not guaranteed to complete in - * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last - * cacheline or partial cacheline of the source range will become visible on - * the destination node after all other transferred data in the source - * range has become visible on the destination node. - * - * The optimal DMA performance will likely be realized if both - * loffset and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if loffset and roffset are not cacheline - * aligned but are separated by some multiple of 64. The lowest level of - * performance is likely if loffset and roffset are not separated by a multiple - * of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. 
- * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
- *	engine.
- * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
- *	transfer has completed. Passing this flag results in the
- *	current implementation busy waiting and consuming CPU cycles
- *	while the DMA transfer is in progress for best performance by
- *	avoiding the interrupt latency.
- * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
- *	the source range becomes visible on the destination node
- *	after all other transferred data in the source range has
- *	become visible on the destination
- *
- * Return:
- * Upon successful completion, scif_writeto() returns 0; otherwise in user
- * mode -1 is returned and errno is set to indicate the error; in kernel mode
- * the negative of one of the following errors is returned.
- *
- * Errors:
- * EACCES - Attempt to write to a read-only range
- * EBADF, ENOTTY - epd is not a valid endpoint descriptor
- * ECONNRESET - Connection reset by peer
- * EINVAL - rma_flags is invalid
- * ENODEV - The remote node is lost or existed, but is not currently in the
- * network since it may have crashed
- * ENOTCONN - The endpoint is not connected
- * ENXIO - The range [loffset, loffset + len - 1] is invalid for the registered
- * address space of epd, or the range [roffset, roffset + len - 1] is invalid
- * for the registered address space of the peer of epd, or loffset or roffset
- * is negative
- */
-int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
-		 roffset, int rma_flags);
-
-/**
- * scif_vreadfrom() - Copy from a remote address space
- * @epd: endpoint descriptor
- * @addr: address to which to copy
- * @len: length of range to copy
- * @roffset: offset in remote registered address space
- *	from which to copy
- * @rma_flags: transfer mode flags
- *
- * scif_vreadfrom() copies len bytes from the remote registered address
- * space of the peer of endpoint epd, starting at the offset roffset, to local
- * memory, starting at addr.
- *
- * The specified range [roffset, roffset + len - 1] must be within some
- * registered window or windows of the remote nodes. The range may
- * intersect multiple registered windows, but only if those windows are
- * contiguous in the registered address space.
- *
- * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
- * programmed read/writes. Otherwise the data is copied using DMA. If
- * rma_flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after
- * the transfer is complete. Otherwise, the transfer may be performed
- * asynchronously. The order in which any two asynchronous RMA operations
- * complete is non-deterministic. The synchronization functions,
- * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used to
- * synchronize to the completion of asynchronous RMA operations on the same
- * endpoint.
- *
- * The DMA transfer of individual bytes is not guaranteed to complete in
- * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
- * cacheline or partial cacheline of the source range will become visible on
- * the destination node after all other transferred data in the source
- * range has become visible on the destination node.
- *
- * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
- * the specified local memory range may remain in a pinned state even after
- * the specified transfer completes.
This may reduce overhead if some or all of - * the same virtual address range is referenced in a subsequent call of - * scif_vreadfrom() or scif_vwriteto(). - * - * The optimal DMA performance will likely be realized if both - * addr and roffset are cacheline aligned (are a multiple of 64). Lower - * performance will likely be realized if addr and roffset are not - * cacheline aligned but are separated by some multiple of 64. The lowest level - * of performance is likely if addr and roffset are not separated by a - * multiple of 64. - * - * The rma_flags argument is formed by ORing together zero or more of the - * following values. - * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA - * engine. - * SCIF_RMA_USECACHE - enable registration caching - * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the - * transfer has completed. Passing this flag results in the - * current implementation busy waiting and consuming CPU cycles - * while the DMA transfer is in progress for best performance by - * avoiding the interrupt latency. - * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of - * the source range becomes visible on the destination node - * after all other transferred data in the source range has - * become visible on the destination - * - * Return: - * Upon successful completion, scif_vreadfrom() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EACCES - Attempt to write to a read-only range - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - rma_flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the - * registered address space of epd - */ -int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset, - int rma_flags); - -/** - * scif_vwriteto() - Copy to a remote address space - * @epd: endpoint descriptor - * @addr: address from which to copy - * @len: length of range to copy - * @roffset: offset in remote registered address space to - * which to copy - * @rma_flags: transfer mode flags - * - * scif_vwriteto() copies len bytes from the local memory, starting at addr, to - * the remote registered address space of the peer of endpoint epd, starting at - * the offset roffset. - * - * The specified range [roffset, roffset + len - 1] must be within some - * registered window or windows of the remote nodes. The range may intersect - * multiple registered windows, but only if those windows are contiguous in the - * registered address space. - * - * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using - * programmed read/writes. Otherwise the data is copied using DMA. If rma_- - * flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after the - * transfer is complete. Otherwise, the transfer may be performed asynchron- - * ously. The order in which any two asynchronous RMA operations complete - * is non-deterministic. The synchronization functions, scif_fence_mark()/ - * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to - * the completion of asynchronous RMA operations on the same endpoint. 
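[Editorial sketch] A sketch of a synchronous, ordered RMA using the transfer calls documented above: copy one page from local window offset loff to remote offset roff. The offsets are assumed to fall inside already-registered windows, and the my_* name is illustrative:

static int my_scif_push_page(scif_epd_t epd, off_t loff, off_t roff)
{
	/*
	 * DMA engine transfer; SCIF_RMA_SYNC waits for completion and
	 * SCIF_RMA_ORDERED makes the tail cacheline visible last.
	 */
	return scif_writeto(epd, loff, PAGE_SIZE, roff,
			    SCIF_RMA_SYNC | SCIF_RMA_ORDERED);
}

Dropping SCIF_RMA_SYNC makes the transfer asynchronous, in which case completion must be awaited with the fence calls documented below.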
- *
- * The DMA transfer of individual bytes is not guaranteed to complete in
- * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
- * cacheline or partial cacheline of the source range will become visible on
- * the destination node after all other transferred data in the source
- * range has become visible on the destination node.
- *
- * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
- * the specified local memory range may remain in a pinned state even after
- * the specified transfer completes. This may reduce overhead if some or all of
- * the same virtual address range is referenced in a subsequent call of
- * scif_vreadfrom() or scif_vwriteto().
- *
- * The optimal DMA performance will likely be realized if both
- * addr and roffset are cacheline aligned (are a multiple of 64). Lower
- * performance will likely be realized if addr and roffset are not cacheline
- * aligned but are separated by some multiple of 64. The lowest level of
- * performance is likely if addr and roffset are not separated by a multiple
- * of 64.
- *
- * The rma_flags argument is formed by ORing together zero or more of the
- * following values.
- * SCIF_RMA_USECPU - perform the transfer using the CPU, otherwise use the DMA
- *	engine.
- * SCIF_RMA_USECACHE - allow registration caching
- * SCIF_RMA_SYNC - perform the transfer synchronously, returning after the
- *	transfer has completed. Passing this flag results in the
- *	current implementation busy waiting and consuming CPU cycles
- *	while the DMA transfer is in progress for best performance by
- *	avoiding the interrupt latency.
- * SCIF_RMA_ORDERED - ensure that the last cacheline or partial cacheline of
- *	the source range becomes visible on the destination node
- *	after all other transferred data in the source range has
- *	become visible on the destination
- *
- * Return:
- * Upon successful completion, scif_vwriteto() returns 0; otherwise in user
- * mode -1 is returned and errno is set to indicate the error; in kernel mode
- * the negative of one of the following errors is returned.
- *
- * Errors:
- * EACCES - Attempt to write to a read-only range
- * EBADF, ENOTTY - epd is not a valid endpoint descriptor
- * ECONNRESET - Connection reset by peer
- * EINVAL - rma_flags is invalid
- * ENODEV - The remote node is lost or existed, but is not currently in the
- * network since it may have crashed
- * ENOTCONN - The endpoint is not connected
- * ENXIO - Offsets in the range [roffset, roffset + len - 1] are invalid for the
- * registered address space of epd
- */
-int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
-		  int rma_flags);
-
-/**
- * scif_fence_mark() - Mark previously issued RMAs
- * @epd: endpoint descriptor
- * @flags: control flags
- * @mark: marked value returned as output.
- *
- * scif_fence_mark() returns after marking the current set of all uncompleted
- * RMAs initiated through the endpoint epd or the current set of all
- * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
- * marked with a value returned at mark. The application may subsequently call
- * scif_fence_wait(), passing the value returned at mark, to await completion
- * of all RMAs so marked.
- *
- * The flags argument has exactly one of the following values.
- * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * - * Return: - * Upon successful completion, scif_fence_mark() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_mark(scif_epd_t epd, int flags, int *mark); - -/** - * scif_fence_wait() - Wait for completion of marked RMAs - * @epd: endpoint descriptor - * @mark: mark request - * - * scif_fence_wait() returns after all RMAs marked with mark have completed. - * The value passed in mark must have been obtained in a previous call to - * scif_fence_mark(). - * - * Return: - * Upon successful completion, scif_fence_wait() returns 0; otherwise in user - * mode -1 is returned and errno is set to indicate the error; in kernel mode - * the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENOMEM - Insufficient kernel memory was available - */ -int scif_fence_wait(scif_epd_t epd, int mark); - -/** - * scif_fence_signal() - Request a memory update on completion of RMAs - * @epd: endpoint descriptor - * @loff: local offset - * @lval: local value to write to loffset - * @roff: remote offset - * @rval: remote value to write to roffset - * @flags: flags - * - * scif_fence_signal() returns after marking the current set of all uncompleted - * RMAs initiated through the endpoint epd or marking the current set of all - * uncompleted RMAs initiated through the peer of endpoint epd. - * - * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the - * marked set, lval is written to memory at the address corresponding to offset - * loff in the local registered address space of epd. loff must be within a - * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion - * of the RMAs in the marked set, rval is written to memory at the address - * corresponding to offset roff in the remote registered address space of epd. - * roff must be within a remote registered window of the peer of epd. Note - * that any specified offset must be DWORD (4 byte / 32 bit) aligned. - * - * The flags argument is formed by OR'ing together the following. - * Exactly one of the following values. - * SCIF_FENCE_INIT_SELF - RMA operations initiated through endpoint - * epd are marked - * SCIF_FENCE_INIT_PEER - RMA operations initiated through the peer - * of endpoint epd are marked - * One or more of the following values. - * SCIF_SIGNAL_LOCAL - On completion of the marked set of RMAs, write lval to - * memory at the address corresponding to offset loff in the local - * registered address space of epd. 
- * SCIF_SIGNAL_REMOTE - On completion of the marked set of RMAs, write rval to - * memory at the address corresponding to offset roff in the remote - * registered address space of epd. - * - * Return: - * Upon successful completion, scif_fence_signal() returns 0; otherwise in - * user mode -1 is returned and errno is set to indicate the error; in kernel - * mode the negative of one of the following errors is returned. - * - * Errors: - * EBADF, ENOTTY - epd is not a valid endpoint descriptor - * ECONNRESET - Connection reset by peer - * EINVAL - flags is invalid, or loff or roff are not DWORD aligned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - loff is invalid for the registered address of epd, or roff is invalid - * for the registered address space, of the peer of epd - */ -int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, off_t roff, - u64 rval, int flags); - -/** - * scif_get_node_ids() - Return information about online nodes - * @nodes: array in which to return online node IDs - * @len: number of entries in the nodes array - * @self: address to place the node ID of the local node - * - * scif_get_node_ids() fills in the nodes array with up to len node IDs of the - * nodes in the SCIF network. If there is not enough space in nodes, as - * indicated by the len parameter, only len node IDs are returned in nodes. The - * return value of scif_get_node_ids() is the total number of nodes currently in - * the SCIF network. By checking the return value against the len parameter, - * the user may determine if enough space for nodes was allocated. - * - * The node ID of the local node is returned at self. - * - * Return: - * Upon successful completion, scif_get_node_ids() returns the actual number of - * online nodes in the SCIF network including 'self'; otherwise in user mode - * -1 is returned and errno is set to indicate the error; in kernel mode no - * errors are returned. - */ -int scif_get_node_ids(u16 *nodes, int len, u16 *self); - -/** - * scif_pin_pages() - Pin a set of pages - * @addr: Virtual address of range to pin - * @len: Length of range to pin - * @prot_flags: Page protection flags - * @map_flags: Page classification flags - * @pinned_pages: Handle to pinned pages - * - * scif_pin_pages() pins (locks in physical memory) the physical pages which - * back the range of virtual address pages starting at addr and continuing for - * len bytes. addr and len are constrained to be multiples of the page size. A - * successful scif_pin_pages() call returns a handle to pinned_pages which may - * be used in subsequent calls to scif_register_pinned_pages(). - * - * The pages will remain pinned as long as there is a reference against the - * scif_pinned_pages_t value returned by scif_pin_pages() and until - * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A - * reference is added to a scif_pinned_pages_t value each time a window is - * created by calling scif_register_pinned_pages() and passing the - * scif_pinned_pages_t value. A reference is removed from a - * scif_pinned_pages_t value each time such a window is deleted. - * - * Subsequent operations which change the memory pages to which virtual - * addresses are mapped (such as mmap(), munmap()) have no effect on the - * scif_pinned_pages_t value or windows created against it. 
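[Editorial sketch] Returning to the fencing calls documented above, a sketch of draining a batch of asynchronous RMAs issued without SCIF_RMA_SYNC: mark the locally initiated set, then wait for that mark. The my_* name is illustrative:

static int my_scif_flush_rmas(scif_epd_t epd)
{
	int mark, rc;

	/* mark every uncompleted RMA initiated through this endpoint */
	rc = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);
	if (rc < 0)
		return rc;

	/* block until every RMA in the marked set has completed */
	return scif_fence_wait(epd, mark);
}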
- *
- * If the process will fork(), it is recommended that the registered
- * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
- * problems due to copy-on-write semantics.
- *
- * The prot_flags argument is formed by OR'ing together one or more of the
- * following values.
- * SCIF_PROT_READ - allow read operations against the pages
- * SCIF_PROT_WRITE - allow write operations against the pages
- * The map_flags argument can be set as SCIF_MAP_KERNEL to interpret addr as a
- * kernel space address. By default, addr is interpreted as a user space
- * address.
- *
- * Return:
- * Upon successful completion, scif_pin_pages() returns 0; otherwise the
- * negative of one of the following errors is returned.
- *
- * Errors:
- * EINVAL - prot_flags is invalid, map_flags is invalid, or offset is negative
- * ENOMEM - Not enough space
- */
-int scif_pin_pages(void *addr, size_t len, int prot_flags, int map_flags,
-		   scif_pinned_pages_t *pinned_pages);
-
-/**
- * scif_unpin_pages() - Unpin a set of pages
- * @pinned_pages: Handle to pinned pages to be unpinned
- *
- * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new
- * windows against pinned_pages. The physical pages represented by pinned_pages
- * will remain pinned until all windows previously registered against
- * pinned_pages are deleted (the window is scif_unregister()'d and all
- * references to the window are removed; see scif_unregister()).
- *
- * pinned_pages must have been obtained from a previous call to
- * scif_pin_pages(). After calling scif_unpin_pages(), it is an error to pass
- * pinned_pages to scif_register_pinned_pages().
- *
- * Return:
- * Upon successful completion, scif_unpin_pages() returns 0; otherwise the
- * negative of one of the following errors is returned.
- *
- * Errors:
- * EINVAL - pinned_pages is not valid
- */
-int scif_unpin_pages(scif_pinned_pages_t pinned_pages);
-
-/**
- * scif_register_pinned_pages() - Mark a memory region for remote access.
- * @epd: endpoint descriptor
- * @pinned_pages: Handle to pinned pages
- * @offset: Registered address space offset
- * @map_flags: Flags which control where pages are mapped
- *
- * The scif_register_pinned_pages() function opens a window, a range of whole
- * pages of the registered address space of the endpoint epd, starting at
- * offset po. The value of po, further described below, is a function of the
- * parameters offset and pinned_pages, and the value of map_flags. Each page of
- * the window represents a corresponding physical memory page of the range
- * represented by pinned_pages; the length of the window is the same as the
- * length of the range represented by pinned_pages. A successful
- * scif_register_pinned_pages() call returns po as the return value.
- *
- * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
- * exactly, and offset is constrained to be a multiple of the page size. The
- * mapping established by scif_register_pinned_pages() will not replace any
- * existing registration; an error is returned if any page of the new window
- * would intersect an existing window.
- *
- * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
- * implementation-defined manner to arrive at po. The po so chosen will be an
- * area of the registered address space that the implementation deems suitable
- * for a mapping of the required size.
An offset value of 0 is interpreted as - * granting the implementation complete freedom in selecting po, subject to - * constraints described below. A non-zero value of offset is taken to be a - * suggestion of an offset near which the mapping should be placed. When the - * implementation selects a value for po, it does not replace any extant - * window. In all cases, po will be a multiple of the page size. - * - * The physical pages which are so represented by a window are available for - * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(), - * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the - * physical pages represented by the window will not be reused by the memory - * subsystem for any other purpose. Note that the same physical page may be - * represented by multiple windows. - * - * Windows created by scif_register_pinned_pages() are unregistered by - * scif_unregister(). - * - * The map_flags argument can be set to SCIF_MAP_FIXED which interprets a - * fixed offset. - * - * Return: - * Upon successful completion, scif_register_pinned_pages() returns the offset - * at which the mapping was placed (po); otherwise the negative of one of the - * following errors is returned. - * - * Errors: - * EADDRINUSE - SCIF_MAP_FIXED is set in map_flags and pages in the new window - * would intersect an existing window - * EAGAIN - The mapping could not be performed due to lack of resources - * ECONNRESET - Connection reset by peer - * EINVAL - map_flags is invalid, or SCIF_MAP_FIXED is set in map_flags, and - * offset is not a multiple of the page size, or offset is negative - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOMEM - Not enough space - * ENOTCONN - The endpoint is not connected - */ -off_t scif_register_pinned_pages(scif_epd_t epd, - scif_pinned_pages_t pinned_pages, - off_t offset, int map_flags); - -/** - * scif_get_pages() - Add references to remote registered pages - * @epd: endpoint descriptor - * @offset: remote registered offset - * @len: length of range of pages - * @pages: returned scif_range structure - * - * scif_get_pages() returns the addresses of the physical pages represented by - * those pages of the registered address space of the peer of epd, starting at - * offset and continuing for len bytes. offset and len are constrained to be - * multiples of the page size. - * - * All of the pages in the specified range [offset, offset + len - 1] must be - * within a single window of the registered address space of the peer of epd. - * - * The addresses are returned as a virtually contiguous array pointed to by the - * phys_addr component of the scif_range structure whose address is returned in - * pages. The nr_pages component of scif_range is the length of the array. The - * prot_flags component of scif_range holds the protection flag value passed - * when the pages were registered. - * - * Each physical page whose address is returned by scif_get_pages() remains - * available and will not be released for reuse until the scif_range structure - * is returned in a call to scif_put_pages(). The scif_range structure returned - * by scif_get_pages() must be unmodified. - * - * It is an error to call scif_close() on an endpoint on which a scif_range - * structure of that endpoint has not been returned to scif_put_pages(). - * - * Return: - * Upon successful completion, scif_get_pages() returns 0; otherwise the - * negative of one of the following errors is returned. 
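[Editorial sketch] A sketch of the pinned-pages flow described above: pin a page-aligned kernel buffer, expose it through a window, then tear down in reverse order. The my_* name is illustrative, offset 0 with map_flags 0 lets the implementation choose po, and SCIF_MAP_KERNEL marks kbuf as a kernel-space address:

static int my_scif_expose_pinned(scif_epd_t epd, void *kbuf, size_t len)
{
	scif_pinned_pages_t pp;
	off_t po;
	int rc;

	rc = scif_pin_pages(kbuf, len, SCIF_PROT_READ | SCIF_PROT_WRITE,
			    SCIF_MAP_KERNEL, &pp);
	if (rc < 0)
		return rc;

	po = scif_register_pinned_pages(epd, pp, 0, 0);
	if (po < 0) {
		scif_unpin_pages(pp);
		return (int)po;
	}

	/* ... window at po is live; the peer may RMA against it ... */

	rc = scif_unregister(epd, po, len);
	scif_unpin_pages(pp);	/* pages stay pinned until all refs drop */
	return rc;
}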
- * Errors: - * ECONNRESET - Connection reset by peer. - * EINVAL - offset is not a multiple of the page size, or offset is negative, or - * len is not a multiple of the page size - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - * ENXIO - Offsets in the range [offset, offset + len - 1] are invalid - * for the registered address space of the peer epd - */ -int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, - struct scif_range **pages); - -/** - * scif_put_pages() - Remove references from remote registered pages - * @pages: pages to be returned - * - * scif_put_pages() releases a scif_range structure previously obtained by - * calling scif_get_pages(). The physical pages represented by pages may - * be reused when the window which represented those pages is unregistered. - * Therefore, those pages must not be accessed after calling scif_put_pages(). - * - * Return: - * Upon successful completion, scif_put_pages() returns 0; otherwise the - * negative of one of the following errors is returned. - * Errors: - * EINVAL - pages does not point to a valid scif_range structure, or - * the scif_range structure pointed to by pages was already returned - * ENODEV - The remote node is lost or existed, but is not currently in the - * network since it may have crashed - * ENOTCONN - The endpoint is not connected - */ -int scif_put_pages(struct scif_range *pages); - -/** - * scif_poll() - Wait for some event on an endpoint - * @epds: Array of endpoint descriptors - * @nepds: Length of epds - * @timeout: Upper limit on time for which scif_poll() will block - * - * scif_poll() waits for one of a set of endpoints to become ready to perform - * an I/O operation. - * - * The epds argument specifies the endpoint descriptors to be examined and the - * events of interest for each endpoint descriptor. epds is a pointer to an - * array with one member for each open endpoint descriptor of interest. - * - * The number of items in the epds array is specified in nepds. The epd field - * of scif_pollepd is an endpoint descriptor of an open endpoint. The field - * events is a bitmask specifying the events which the application is - * interested in. The field revents is an output parameter, filled by the - * kernel with the events that actually occurred. The bits returned in revents - * can include any of those specified in events, or one of the values EPOLLERR, - * EPOLLHUP, or EPOLLNVAL. (These three bits are meaningless in the events - * field, and will be set in the revents field whenever the corresponding - * condition is true.) - * - * If none of the events requested (and no error) has occurred for any of the - * endpoint descriptors, then scif_poll() blocks until one of the events occurs. - * - * The timeout argument specifies an upper limit on the time for which - * scif_poll() will block, in milliseconds. Specifying a negative value in - * timeout means an infinite timeout. - * - * The following bits may be set in events and returned in revents. - * EPOLLIN - Data may be received without blocking. For a connected - * endpoint, this means that scif_recv() may be called without blocking. For a - * listening endpoint, this means that scif_accept() may be called without - * blocking. - * EPOLLOUT - Data may be sent without blocking. For a connected endpoint, this - * means that scif_send() may be called without blocking. 
EPOLLOUT may also be - * used to block waiting for a non-blocking connect to complete. This bit value - * has no meaning for a listening endpoint and is ignored if specified. - * - * The following bits are only returned in revents, and are ignored if set in - * events. - * EPOLLERR - An error occurred on the endpoint - * EPOLLHUP - The connection to the peer endpoint was disconnected - * EPOLLNVAL - The specified endpoint descriptor is invalid. - * - * Return: - * Upon successful completion, scif_poll() returns a non-negative value. A - * positive value indicates the total number of endpoint descriptors that have - * been selected (that is, endpoint descriptors for which the revents member is - * non-zero). A value of 0 indicates that the call timed out and no endpoint - * descriptors have been selected. Otherwise in user mode -1 is returned and - * errno is set to indicate the error; in kernel mode the negative of one of - * the following errors is returned. - * - * Errors: - * EINTR - A signal occurred before any requested event - * EINVAL - The nepds argument is greater than {OPEN_MAX} - * ENOMEM - There was no space to allocate file descriptor tables - */ -int scif_poll(struct scif_pollepd *epds, unsigned int nepds, long timeout); - -/** - * scif_client_register() - Register a SCIF client - * @client: client to be registered - * - * scif_client_register() registers a SCIF client. The probe() method - * of the client is called when SCIF peer devices come online and the - * remove() method is called when the peer devices disappear. - * - * Return: - * Upon successful completion, scif_client_register() returns a non-negative - * value. Otherwise the return value is the same as subsys_interface_register() - * in the kernel. - */ -int scif_client_register(struct scif_client *client); - -/** - * scif_client_unregister() - Unregister a SCIF client - * @client: client to be unregistered - * - * scif_client_unregister() unregisters a SCIF client. - * - * Return: - * None - */ -void scif_client_unregister(struct scif_client *client); - -#endif /* __SCIF_H__ */ diff --git a/include/uapi/linux/mic_common.h b/include/uapi/linux/mic_common.h deleted file mode 100644 index 504e523f702c..000000000000 --- a/include/uapi/linux/mic_common.h +++ /dev/null @@ -1,235 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC driver. - * - */ -#ifndef __MIC_COMMON_H_ -#define __MIC_COMMON_H_ - -#include - -#define __mic_align(a, x) (((a) + (x) - 1) & ~((x) - 1)) - -/** - * struct mic_device_desc: Virtio device information shared between the - * virtio driver and userspace backend - * - * @type: Device type: console/network/disk etc. Type 0/-1 terminates. - * @num_vq: Number of virtqueues. - * @feature_len: Number of bytes of feature bits. Multiply by 2: one for - host features and one for guest acknowledgements. 
- * @config_len: Number of bytes of the config array after virtqueues. - * @status: A status byte, written by the Guest. - * @config: Start of the following variable length config. - */ -struct mic_device_desc { - __s8 type; - __u8 num_vq; - __u8 feature_len; - __u8 config_len; - __u8 status; - __le64 config[0]; -} __attribute__ ((aligned(8))); - -/** - * struct mic_device_ctrl: Per virtio device information in the device page - * used internally by the host and card side drivers. - * - * @vdev: Used for storing MIC vdev information by the guest. - * @config_change: Set to 1 by host when a config change is requested. - * @vdev_reset: Set to 1 by guest to indicate virtio device has been reset. - * @guest_ack: Set to 1 by guest to ack a command. - * @host_ack: Set to 1 by host to ack a command. - * @used_address_updated: Set to 1 by guest when the used address should be - * updated. - * @c2h_vdev_db: The doorbell number to be used by guest. Set by host. - * @h2c_vdev_db: The doorbell number to be used by host. Set by guest. - */ -struct mic_device_ctrl { - __le64 vdev; - __u8 config_change; - __u8 vdev_reset; - __u8 guest_ack; - __u8 host_ack; - __u8 used_address_updated; - __s8 c2h_vdev_db; - __s8 h2c_vdev_db; -} __attribute__ ((aligned(8))); - -/** - * struct mic_bootparam: Virtio device independent information in device page - * - * @magic: A magic value used by the card to ensure it can see the host - * @h2c_config_db: Host to Card Virtio config doorbell set by card - * @node_id: Unique id of the node - * @h2c_scif_db - Host to card SCIF doorbell set by card - * @c2h_scif_db - Card to host SCIF doorbell set by host - * @scif_host_dma_addr - SCIF host queue pair DMA address - * @scif_card_dma_addr - SCIF card queue pair DMA address - */ -struct mic_bootparam { - __le32 magic; - __s8 h2c_config_db; - __u8 node_id; - __u8 h2c_scif_db; - __u8 c2h_scif_db; - __u64 scif_host_dma_addr; - __u64 scif_card_dma_addr; -} __attribute__ ((aligned(8))); - -/** - * struct mic_device_page: High level representation of the device page - * - * @bootparam: The bootparam structure is used for sharing information and - * status updates between MIC host and card drivers. - * @desc: Array of MIC virtio device descriptors. - */ -struct mic_device_page { - struct mic_bootparam bootparam; - struct mic_device_desc desc[0]; -}; -/** - * struct mic_vqconfig: This is how we expect the device configuration field - * for a virtqueue to be laid out in config space. - * - * @address: Guest/MIC physical address of the virtio ring - * (avail and desc rings) - * @used_address: Guest/MIC physical address of the used ring - * @num: The number of entries in the virtio_ring - */ -struct mic_vqconfig { - __le64 address; - __le64 used_address; - __le16 num; -} __attribute__ ((aligned(8))); - -/* - * The alignment to use between consumer and producer parts of vring. - * This is pagesize for historical reasons. - */ -#define MIC_VIRTIO_RING_ALIGN 4096 - -#define MIC_MAX_VRINGS 4 -#define MIC_VRING_ENTRIES 128 - -/* - * Max vring entries (power of 2) to ensure desc and avail rings - * fit in a single page - */ -#define MIC_MAX_VRING_ENTRIES 128 - -/** - * Max size of the desc block in bytes: includes: - * - struct mic_device_desc - * - struct mic_vqconfig (num_vq of these) - * - host and guest features - * - virtio device config space - */ -#define MIC_MAX_DESC_BLK_SIZE 256 - -/** - * struct _mic_vring_info - Host vring info exposed to userspace backend - * for the avail index and magic for the card. 
- * - * @avail_idx: host avail idx - * @magic: A magic debug cookie. - */ -struct _mic_vring_info { - __u16 avail_idx; - __le32 magic; -}; - -/** - * struct mic_vring - Vring information. - * - * @vr: The virtio ring. - * @info: Host vring information exposed to the userspace backend for the - * avail index and magic for the card. - * @va: The va for the buffer allocated for vr and info. - * @len: The length of the buffer required for allocating vr and info. - */ -struct mic_vring { - struct vring vr; - struct _mic_vring_info *info; - void *va; - int len; -}; - -#define mic_aligned_desc_size(d) __mic_align(mic_desc_size(d), 8) - -#ifndef INTEL_MIC_CARD -static inline unsigned mic_desc_size(const struct mic_device_desc *desc) -{ - return sizeof(*desc) + desc->num_vq * sizeof(struct mic_vqconfig) - + desc->feature_len * 2 + desc->config_len; -} - -static inline struct mic_vqconfig * -mic_vq_config(const struct mic_device_desc *desc) -{ - return (struct mic_vqconfig *)(desc + 1); -} - -static inline __u8 *mic_vq_features(const struct mic_device_desc *desc) -{ - return (__u8 *)(mic_vq_config(desc) + desc->num_vq); -} - -static inline __u8 *mic_vq_configspace(const struct mic_device_desc *desc) -{ - return mic_vq_features(desc) + desc->feature_len * 2; -} -static inline unsigned mic_total_desc_size(struct mic_device_desc *desc) -{ - return mic_aligned_desc_size(desc) + sizeof(struct mic_device_ctrl); -} -#endif - -/* Device page size */ -#define MIC_DP_SIZE 4096 - -#define MIC_MAGIC 0xc0ffee00 - -/** - * enum mic_states - MIC states. - */ -enum mic_states { - MIC_READY = 0, - MIC_BOOTING, - MIC_ONLINE, - MIC_SHUTTING_DOWN, - MIC_RESETTING, - MIC_RESET_FAILED, - MIC_LAST -}; - -/** - * enum mic_status - MIC status reported by card after - * a host or card initiated shutdown or a card crash. - */ -enum mic_status { - MIC_NOP = 0, - MIC_CRASHED, - MIC_HALTED, - MIC_POWER_OFF, - MIC_RESTART, - MIC_STATUS_LAST -}; - -#endif diff --git a/include/uapi/linux/mic_ioctl.h b/include/uapi/linux/mic_ioctl.h deleted file mode 100644 index 687b9cd9d3e2..000000000000 --- a/include/uapi/linux/mic_ioctl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC Host driver. - * - */ -#ifndef _MIC_IOCTL_H_ -#define _MIC_IOCTL_H_ - -#include - -/* - * mic_copy - MIC virtio descriptor copy. - * - * @iov: An array of IOVEC structures containing user space buffers. - * @iovcnt: Number of IOVEC structures in iov. - * @vr_idx: The vring index. - * @update_used: A non zero value results in used index being updated. - * @out_len: The aggregate of the total length written to or read from - * the virtio device. 
- */ -struct mic_copy_desc { -#ifdef __KERNEL__ - struct iovec __user *iov; -#else - struct iovec *iov; -#endif - __u32 iovcnt; - __u8 vr_idx; - __u8 update_used; - __u32 out_len; -}; - -/* - * Add a new virtio device - * The (struct mic_device_desc *) pointer points to a device page entry - * for the virtio device consisting of: - * - struct mic_device_desc - * - struct mic_vqconfig (num_vq of these) - * - host and guest features - * - virtio device config space - * The total size referenced by the pointer should equal the size returned - * by desc_size() in mic_common.h - */ -#define MIC_VIRTIO_ADD_DEVICE _IOWR('s', 1, struct mic_device_desc *) - -/* - * Copy the number of entries in the iovec and update the used index - * if requested by the user. - */ -#define MIC_VIRTIO_COPY_DESC _IOWR('s', 2, struct mic_copy_desc *) - -/* - * Notify virtio device of a config change - * The (__u8 *) pointer points to config space values for the device - * as they should be written into the device page. The total size - * referenced by the pointer should equal the config_len field of struct - * mic_device_desc. - */ -#define MIC_VIRTIO_CONFIG_CHANGE _IOWR('s', 5, __u8 *) - -#endif diff --git a/samples/mic/mpssd/.gitignore b/samples/mic/mpssd/.gitignore deleted file mode 100644 index aa03f1eb37a0..000000000000 --- a/samples/mic/mpssd/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -mpssd diff --git a/samples/mic/mpssd/Makefile b/samples/mic/mpssd/Makefile deleted file mode 100644 index a7a6e0c70424..000000000000 --- a/samples/mic/mpssd/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -ifndef CROSS_COMPILE -uname_M := $(shell uname -m 2>/dev/null || echo not) -ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/) - -ifeq ($(ARCH),x86) - -PROGS := mpssd -CC = $(CROSS_COMPILE)gcc -CFLAGS := -I../../../usr/include -I../../../tools/include - -ifdef DEBUG -CFLAGS += -DDEBUG=$(DEBUG) -endif - -all: $(PROGS) -mpssd: mpssd.c sysfs.c - $(CC) $(CFLAGS) mpssd.c sysfs.c -o mpssd -lpthread - -install: - install mpssd /usr/sbin/mpssd - install micctrl /usr/sbin/micctrl - -clean: - rm -fr $(PROGS) - -endif -endif diff --git a/samples/mic/mpssd/micctrl b/samples/mic/mpssd/micctrl deleted file mode 100755 index 030a60b04046..000000000000 --- a/samples/mic/mpssd/micctrl +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# micctrl - Controls MIC boot/start/stop. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: micctrl -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -sysfs="/sys/class/mic" - -_status() -{ - f=$sysfs/$1 - echo -e $1 state: "`cat $f/state`" shutdown_status: "`cat $f/shutdown_status`" -} - -status() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _status $1 - return $? - fi - for f in $sysfs/* - do - _status `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_reset() -{ - f=$sysfs/$1 - echo reset > $f/state -} - -reset() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _reset $1 - return $? - fi - for f in $sysfs/* - do - _reset `basename $f` - RETVAL=$? 
- [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_boot() -{ - f=$sysfs/$1 - echo "linux" > $f/bootmode - echo "mic/uos.img" > $f/firmware - echo "mic/$1.image" > $f/ramdisk - echo "boot" > $f/state -} - -boot() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _boot $1 - return $? - fi - for f in $sysfs/* - do - _boot `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_shutdown() -{ - f=$sysfs/$1 - echo shutdown > $f/state -} - -shutdown() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _shutdown $1 - return $? - fi - for f in $sysfs/* - do - _shutdown `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -_wait() -{ - f=$sysfs/$1 - while [ "`cat $f/state`" != "offline" -a "`cat $f/state`" != "online" ] - do - sleep 1 - echo -e "Waiting for $1 to go offline" - done -} - -wait() -{ - if [ "`echo $1 | head -c3`" == "mic" ]; then - _wait $1 - return $? - fi - # Wait for the cards to go offline - for f in $sysfs/* - do - _wait `basename $f` - RETVAL=$? - [ $RETVAL -ne 0 ] && return $RETVAL - done - return 0 -} - -if [ ! -d "$sysfs" ]; then - echo -e $"Module unloaded " - exit 3 -fi - -case $1 in - -s) - status $2 - ;; - -r) - reset $2 - ;; - -b) - boot $2 - ;; - -S) - shutdown $2 - ;; - -w) - wait $2 - ;; - *) - echo $"Usage: $0 {-s (status) |-r (reset) |-b (boot) |-S (shutdown) |-w (wait)}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpss b/samples/mic/mpssd/mpss deleted file mode 100755 index 248ac7313c71..000000000000 --- a/samples/mic/mpssd/mpss +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0-only -# Intel MIC Platform Software Stack (MPSS) -# -# Copyright(c) 2013 Intel Corporation. -# -# Intel MIC User Space Tools. -# -# mpss Start mpssd. -# -# chkconfig: 2345 95 05 -# description: start MPSS stack processing. -# -### BEGIN INIT INFO -# Provides: mpss -# Required-Start: -# Required-Stop: -# Short-Description: MPSS stack control -# Description: MPSS stack control -### END INIT INFO - -# Source function library. -. /etc/init.d/functions - -exec=/usr/sbin/mpssd -sysfs="/sys/class/mic" -mic_modules="mic_host mic_x100_dma scif vop" - -start() -{ - [ -x $exec ] || exit 5 - - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -1`" = "mpssd" ]; then - echo -e $"MPSSD already running! " - success - echo - return 0 - fi - - echo -e $"Starting MPSS Stack" - echo -e $"Loading MIC drivers:" $mic_modules - - modprobe -a $mic_modules - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - - # Start the daemon - echo -n $"Starting MPSSD " - $exec - RETVAL=$? - if [ $RETVAL -ne 0 ]; then - failure - echo - return $RETVAL - fi - success - echo - - sleep 5 - - # Boot the cards - micctrl -b - - # Wait till ping works - for f in $sysfs/* - do - count=100 - ipaddr=`cat $f/cmdline` - ipaddr=${ipaddr#*address,} - ipaddr=`echo $ipaddr | cut -d, -f1 | cut -d\; -f1` - while [ $count -ge 0 ] - do - echo -e "Pinging "`basename $f`" " - ping -c 1 $ipaddr &> /dev/null - RETVAL=$? - if [ $RETVAL -eq 0 ]; then - success - break - fi - sleep 1 - count=`expr $count - 1` - done - [ $RETVAL -ne 0 ] && failure || success - echo - done - return $RETVAL -} - -stop() -{ - echo -e $"Shutting down MPSS Stack: " - - # Bail out if module is unloaded - if [ ! -d "$sysfs" ]; then - echo -n $"Module unloaded " - success - echo - return 0 - fi - - # Shut down the cards. 
- micctrl -S - - # Wait for the cards to go offline - for f in $sysfs/* - do - while [ "`cat $f/state`" != "ready" ] - do - sleep 1 - echo -e "Waiting for "`basename $f`" to become ready" - done - done - - # Display the status of the cards - micctrl -s - - # Kill MPSSD now - echo -n $"Killing MPSSD" - killall -9 mpssd 2>/dev/null - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -restart() -{ - stop - sleep 5 - start -} - -status() -{ - micctrl -s - if [ "`ps -e | awk '{print $4}' | grep mpssd | head -n 1`" = "mpssd" ]; then - echo "mpssd is running" - else - echo "mpssd is stopped" - fi - return 0 -} - -unload() -{ - if [ ! -d "$sysfs" ]; then - echo -n $"No MIC_HOST Module: " - success - echo - return - fi - - stop - - sleep 5 - echo -n $"Removing MIC drivers:" $mic_modules - modprobe -r $mic_modules - RETVAL=$? - [ $RETVAL -ne 0 ] && failure || success - echo - return $RETVAL -} - -case $1 in - start) - start - ;; - stop) - stop - ;; - restart) - restart - ;; - status) - status - ;; - unload) - unload - ;; - *) - echo $"Usage: $0 {start|stop|restart|status|unload}" - exit 2 -esac - -exit $? diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c deleted file mode 100644 index c03a05d498f0..000000000000 --- a/samples/mic/mpssd/mpssd.c +++ /dev/null @@ -1,1815 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ - -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "mpssd.h" -#include -#include -#include - -static void *init_mic(void *arg); - -static FILE *logfp; -static struct mic_info mic_list; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define min_t(type, x, y) ({ \ - type __min1 = (x); \ - type __min2 = (y); \ - __min1 < __min2 ? 
__min1 : __min2; }) - -/* align addr on a size boundary - adjust address up/down if needed */ -#define _ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) -#define _ALIGN_UP(addr, size) _ALIGN_DOWN(addr + size - 1, size) - -/* align addr on a size boundary - adjust address up if needed */ -#define _ALIGN(addr, size) _ALIGN_UP(addr, size) - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - -#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) - -#define GSO_ENABLED 1 -#define MAX_GSO_SIZE (64 * 1024) -#define ETH_H_LEN 14 -#define MAX_NET_PKT_SIZE (_ALIGN_UP(MAX_GSO_SIZE + ETH_H_LEN, 64)) -#define MIC_DEVICE_PAGE_END 0x1000 - -#ifndef VIRTIO_NET_HDR_F_DATA_VALID -#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ -#endif - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_console_config cons_config; -} virtcons_dev_page = { - .dd = { - .type = VIRTIO_ID_CONSOLE, - .num_vq = ARRAY_SIZE(virtcons_dev_page.vqconfig), - .feature_len = sizeof(virtcons_dev_page.host_features), - .config_len = sizeof(virtcons_dev_page.cons_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -}; - -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[2]; - __u32 host_features, guest_acknowledgements; - struct virtio_net_config net_config; -} virtnet_dev_page = { - .dd = { - .type = VIRTIO_ID_NET, - .num_vq = ARRAY_SIZE(virtnet_dev_page.vqconfig), - .feature_len = sizeof(virtnet_dev_page.host_features), - .config_len = sizeof(virtnet_dev_page.net_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .vqconfig[1] = { - .num = htole16(MIC_VRING_ENTRIES), - }, -#if GSO_ENABLED - .host_features = htole32( - 1 << VIRTIO_NET_F_CSUM | - 1 << VIRTIO_NET_F_GSO | - 1 << VIRTIO_NET_F_GUEST_TSO4 | - 1 << VIRTIO_NET_F_GUEST_TSO6 | - 1 << VIRTIO_NET_F_GUEST_ECN), -#else - .host_features = 0, -#endif -}; - -static const char *mic_config_dir = "/etc/mpss"; -static const char *virtblk_backend = "VIRTBLK_BACKEND"; -static struct { - struct mic_device_desc dd; - struct mic_vqconfig vqconfig[1]; - __u32 host_features, guest_acknowledgements; - struct virtio_blk_config blk_config; -} virtblk_dev_page = { - .dd = { - .type = VIRTIO_ID_BLOCK, - .num_vq = ARRAY_SIZE(virtblk_dev_page.vqconfig), - .feature_len = sizeof(virtblk_dev_page.host_features), - .config_len = sizeof(virtblk_dev_page.blk_config), - }, - .vqconfig[0] = { - .num = htole16(MIC_VRING_ENTRIES), - }, - .host_features = - htole32(1<name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - snprintf(ipaddr, IFNAMSIZ, "172.31.%d.254/24", mic->id + 1); - - pid = fork(); - if (pid == 0) { - ifargv[0] = "ip"; - ifargv[1] = "addr"; - ifargv[2] = "add"; - ifargv[3] = ipaddr; - ifargv[4] = "dev"; - ifargv[5] = dev; - ifargv[6] = NULL; - mpsslog("Configuring %s ipaddr %s\n", dev, ipaddr); - ret = execvp("ip", ifargv); - if (ret < 0) { - mpsslog("%s execvp failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - } - if (pid < 0) { - mpsslog("%s fork failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - - ret = waitpid(pid, NULL, 0); - if (ret < 
0) { - mpsslog("%s waitpid failed errno %s\n", - mic->name, strerror(errno)); - return ret; - } - mpsslog("MIC name %s %s %d DONE!\n", - mic->name, __func__, __LINE__); - return 0; -} - -static int tun_alloc(struct mic_info *mic, char *dev) -{ - struct ifreq ifr; - int fd, err; -#if GSO_ENABLED - unsigned offload; -#endif - fd = open("/dev/net/tun", O_RDWR); - if (fd < 0) { - mpsslog("Could not open /dev/net/tun %s\n", strerror(errno)); - goto done; - } - - memset(&ifr, 0, sizeof(ifr)); - - ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; - if (*dev) - strncpy(ifr.ifr_name, dev, IFNAMSIZ); - - err = ioctl(fd, TUNSETIFF, (void *)&ifr); - if (err < 0) { - mpsslog("%s %s %d TUNSETIFF failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#if GSO_ENABLED - offload = TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN; - - err = ioctl(fd, TUNSETOFFLOAD, offload); - if (err < 0) { - mpsslog("%s %s %d TUNSETOFFLOAD failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - close(fd); - return err; - } -#endif - strcpy(dev, ifr.ifr_name); - mpsslog("Created TAP %s\n", dev); -done: - return fd; -} - -#define NET_FD_VIRTIO_NET 0 -#define NET_FD_TUN 1 -#define MAX_NET_FD 2 - -static void set_dp(struct mic_info *mic, int type, void *dp) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - mic->mic_console.console_dp = dp; - return; - case VIRTIO_ID_NET: - mic->mic_net.net_dp = dp; - return; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.block_dp = dp; - return; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); -} - -static void *get_dp(struct mic_info *mic, int type) -{ - switch (type) { - case VIRTIO_ID_CONSOLE: - return mic->mic_console.console_dp; - case VIRTIO_ID_NET: - return mic->mic_net.net_dp; - case VIRTIO_ID_BLOCK: - return mic->mic_virtblk.block_dp; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - assert(0); - return NULL; -} - -static struct mic_device_desc *get_device_desc(struct mic_info *mic, int type) -{ - struct mic_device_desc *d; - int i; - void *dp = get_dp(mic, type); - - for (i = sizeof(struct mic_bootparam); i < PAGE_SIZE; - i += mic_total_desc_size(d)) { - d = dp + i; - - /* End of list */ - if (d->type == 0) - break; - - if (d->type == -1) - continue; - - mpsslog("%s %s d-> type %d d %p\n", - mic->name, __func__, d->type, d); - - if (d->type == (__u8)type) - return d; - } - mpsslog("%s %s %d not found\n", mic->name, __func__, type); - return NULL; -} - -/* See comments in vhost.c for explanation of next_desc() */ -static unsigned next_desc(struct vring_desc *desc) -{ - unsigned int next; - - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) - return -1U; - next = le16toh(desc->next); - return next; -} - -/* Sum up all the IOVEC length */ -static ssize_t -sum_iovec_len(struct mic_copy_desc *copy) -{ - ssize_t sum = 0; - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - sum += copy->iov[i].iov_len; - return sum; -} - -static inline void verify_out_len(struct mic_info *mic, - struct mic_copy_desc *copy) -{ - if (copy->out_len != sum_iovec_len(copy)) { - mpsslog("%s %s %d BUG copy->out_len 0x%x len 0x%zx\n", - mic->name, __func__, __LINE__, - copy->out_len, sum_iovec_len(copy)); - assert(copy->out_len == sum_iovec_len(copy)); - } -} - -/* Display an iovec */ -static void -disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, - const char *s, int line) -{ - unsigned int i; - - for (i = 0; i < copy->iovcnt; i++) - mpsslog("%s %s %d copy->iov[%d] addr %p len 0x%zx\n", - 
mic->name, s, line, i, - copy->iov[i].iov_base, copy->iov[i].iov_len); -} - -static inline __u16 read_avail_idx(struct mic_vring *vr) -{ - return READ_ONCE(vr->info->avail_idx); -} - -static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, - struct mic_copy_desc *copy, ssize_t len) -{ - copy->vr_idx = tx ? 0 : 1; - copy->update_used = true; - if (type == VIRTIO_ID_NET) - copy->iov[1].iov_len = len - sizeof(struct virtio_net_hdr); - else - copy->iov[0].iov_len = len; -} - -/* Central API which triggers the copies */ -static int -mic_virtio_copy(struct mic_info *mic, int fd, - struct mic_vring *vr, struct mic_copy_desc *copy) -{ - int ret; - - ret = ioctl(fd, MIC_VIRTIO_COPY_DESC, copy); - if (ret) { - mpsslog("%s %s %d errno %s ret %d\n", - mic->name, __func__, __LINE__, - strerror(errno), ret); - } - return ret; -} - -static inline unsigned _vring_size(unsigned int num, unsigned long align) -{ - return _ALIGN_UP(((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) - + align - 1) & ~(align - 1)) - + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num, 4); -} - -/* - * This initialization routine requires at least one - * vring i.e. vr0. vr1 is optional. - */ -static void * -init_vr(struct mic_info *mic, int fd, int type, - struct mic_vring *vr0, struct mic_vring *vr1, int num_vq) -{ - int vr_size; - char *va; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - va = mmap(NULL, MIC_DEVICE_PAGE_END + vr_size * num_vq, - PROT_READ, MAP_SHARED, fd, 0); - if (MAP_FAILED == va) { - mpsslog("%s %s %d mmap failed errno %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - goto done; - } - set_dp(mic, type, va); - vr0->va = (struct mic_vring *)&va[MIC_DEVICE_PAGE_END]; - vr0->info = vr0->va + - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN); - vring_init(&vr0->vr, - MIC_VRING_ENTRIES, vr0->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr0 %p vr0->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr0->va, vr0->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr0->info->magic), MIC_MAGIC + type); - assert(le32toh(vr0->info->magic) == MIC_MAGIC + type); - if (vr1) { - vr1->va = (struct mic_vring *) - &va[MIC_DEVICE_PAGE_END + vr_size]; - vr1->info = vr1->va + _vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN); - vring_init(&vr1->vr, - MIC_VRING_ENTRIES, vr1->va, MIC_VIRTIO_RING_ALIGN); - mpsslog("%s %s vr1 %p vr1->info %p vr_size 0x%x vring 0x%x ", - __func__, mic->name, vr1->va, vr1->info, vr_size, - _vring_size(MIC_VRING_ENTRIES, MIC_VIRTIO_RING_ALIGN)); - mpsslog("magic 0x%x expected 0x%x\n", - le32toh(vr1->info->magic), MIC_MAGIC + type + 1); - assert(le32toh(vr1->info->magic) == MIC_MAGIC + type + 1); - } -done: - return va; -} - -static int -wait_for_card_driver(struct mic_info *mic, int fd, int type) -{ - struct pollfd pollfd; - int err; - struct mic_device_desc *desc = get_device_desc(mic, type); - __u8 prev_status; - - if (!desc) - return -ENODEV; - prev_status = desc->status; - pollfd.fd = fd; - mpsslog("%s %s Waiting .... desc-> type %d status 0x%x\n", - mic->name, __func__, type, desc->status); - - while (1) { - pollfd.events = POLLIN; - pollfd.revents = 0; - err = poll(&pollfd, 1, -1); - if (err < 0) { - mpsslog("%s %s poll failed %s\n", - mic->name, __func__, strerror(errno)); - continue; - } - - if (pollfd.revents) { - if (desc->status != prev_status) { - mpsslog("%s %s Waiting... 
desc-> type %d " - "status 0x%x\n", - mic->name, __func__, type, - desc->status); - prev_status = desc->status; - } - if (desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - mpsslog("%s %s poll.revents %d\n", - mic->name, __func__, pollfd.revents); - mpsslog("%s %s desc-> type %d status 0x%x\n", - mic->name, __func__, type, - desc->status); - break; - } - } - } - return 0; -} - -/* Spin till we have some descriptors */ -static void -spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) -{ - __u16 avail_idx = read_avail_idx(vr); - - while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) { -#ifdef DEBUG - mpsslog("%s %s waiting for desc avail %d info_avail %d\n", - mic->name, __func__, - le16toh(vr->vr.avail->idx), vr->info->avail_idx); -#endif - sched_yield(); - } -} - -static void * -virtio_net(void *arg) -{ - static __u8 vnet_hdr[2][sizeof(struct virtio_net_hdr)]; - static __u8 vnet_buf[2][MAX_NET_PKT_SIZE] __attribute__ ((aligned(64))); - struct iovec vnet_iov[2][2] = { - { { .iov_base = vnet_hdr[0], .iov_len = sizeof(vnet_hdr[0]) }, - { .iov_base = vnet_buf[0], .iov_len = sizeof(vnet_buf[0]) } }, - { { .iov_base = vnet_hdr[1], .iov_len = sizeof(vnet_hdr[1]) }, - { .iov_base = vnet_buf[1], .iov_len = sizeof(vnet_buf[1]) } }, - }; - struct iovec *iov0 = vnet_iov[0], *iov1 = vnet_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - char if_name[IFNAMSIZ]; - struct pollfd net_poll[MAX_NET_FD]; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - int err; - - snprintf(if_name, IFNAMSIZ, "mic%d", mic->id); - mic->mic_net.tap_fd = tun_alloc(mic, if_name); - if (mic->mic_net.tap_fd < 0) - goto done; - - if (tap_configure(mic, if_name)) - goto done; - mpsslog("MIC name %s id %d\n", mic->name, mic->id); - - net_poll[NET_FD_VIRTIO_NET].fd = mic->mic_net.virtio_net_fd; - net_poll[NET_FD_VIRTIO_NET].events = POLLIN; - net_poll[NET_FD_TUN].fd = mic->mic_net.tap_fd; - net_poll[NET_FD_TUN].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET, &tx_vr, &rx_vr, - virtnet_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto done; - } - - copy.iovcnt = 2; - desc = get_device_desc(mic, VIRTIO_ID_NET); - - while (1) { - ssize_t len; - - net_poll[NET_FD_VIRTIO_NET].revents = 0; - net_poll[NET_FD_TUN].revents = 0; - - /* Start polling for data from tap and virtio net */ - err = poll(net_poll, 2, -1); - if (err < 0) { - mpsslog("%s poll failed %s\n", - __func__, strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_net.virtio_net_fd, - VIRTIO_ID_NET); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - /* - * Check if there is data to be read from TUN and write to - * virtio net fd if there is. 
- */ - if (net_poll[NET_FD_TUN].revents & POLLIN) { - copy.iov = iov0; - len = readv(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len > 0) { - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *)vnet_hdr[0]; - - /* Disable checksums on the card since we are on - a reliable PCIe link */ - hdr->flags |= VIRTIO_NET_HDR_F_DATA_VALID; -#ifdef DEBUG - mpsslog("%s %s %d hdr->flags 0x%x ", mic->name, - __func__, __LINE__, hdr->flags); - mpsslog("copy.out_len %d hdr->gso_type 0x%x\n", - copy.out_len, hdr->gso_type); -#endif -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_NET, 1, &tx_vr, ©, - len); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &tx_vr, - ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0[1].iov_len = MAX_NET_PKT_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", mic->name, - __func__, __LINE__, strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - /* - * Check if there is data to be read from virtio net and - * write to TUN if there is. - */ - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_NET, 0, &rx_vr, ©, - MAX_NET_PKT_SIZE - + sizeof(struct virtio_net_hdr)); - - err = mic_virtio_copy(mic, - mic->mic_net.virtio_net_fd, &rx_vr, - ©); - if (!err) { -#ifdef DEBUG - struct virtio_net_hdr *hdr - = (struct virtio_net_hdr *) - vnet_hdr[1]; - - mpsslog("%s %s %d hdr->flags 0x%x, ", - mic->name, __func__, __LINE__, - hdr->flags); - mpsslog("out_len %d gso_type 0x%x\n", - copy.out_len, - hdr->gso_type); -#endif - /* Set the correct output iov_len */ - iov1[1].iov_len = copy.out_len - - sizeof(struct virtio_net_hdr); - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(©)); -#endif - len = writev(net_poll[NET_FD_TUN].fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (net_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -done: - pthread_exit(NULL); -} - -/* virtio_console */ -#define VIRTIO_CONSOLE_FD 0 -#define MONITOR_FD (VIRTIO_CONSOLE_FD + 1) -#define MAX_CONSOLE_FD (MONITOR_FD + 1) /* must be the last one + 1 */ -#define MAX_BUFFER_SIZE PAGE_SIZE - -static void * -virtio_console(void *arg) -{ - static __u8 vcons_buf[2][PAGE_SIZE]; - struct iovec vcons_iov[2] = { - { .iov_base = vcons_buf[0], .iov_len = sizeof(vcons_buf[0]) }, - { 
.iov_base = vcons_buf[1], .iov_len = sizeof(vcons_buf[1]) }, - }; - struct iovec *iov0 = &vcons_iov[0], *iov1 = &vcons_iov[1]; - struct mic_info *mic = (struct mic_info *)arg; - int err; - struct pollfd console_poll[MAX_CONSOLE_FD]; - int pty_fd; - char *pts_name; - ssize_t len; - struct mic_vring tx_vr, rx_vr; - struct mic_copy_desc copy; - struct mic_device_desc *desc; - - pty_fd = posix_openpt(O_RDWR); - if (pty_fd < 0) { - mpsslog("can't open a pseudoterminal master device: %s\n", - strerror(errno)); - goto _return; - } - pts_name = ptsname(pty_fd); - if (pts_name == NULL) { - mpsslog("can't get pts name\n"); - goto _close_pty; - } - printf("%s console message goes to %s\n", mic->name, pts_name); - mpsslog("%s console message goes to %s\n", mic->name, pts_name); - err = grantpt(pty_fd); - if (err < 0) { - mpsslog("can't grant access: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - err = unlockpt(pty_fd); - if (err < 0) { - mpsslog("can't unlock a pseudoterminal: %s %s\n", - pts_name, strerror(errno)); - goto _close_pty; - } - console_poll[MONITOR_FD].fd = pty_fd; - console_poll[MONITOR_FD].events = POLLIN; - - console_poll[VIRTIO_CONSOLE_FD].fd = mic->mic_console.virtio_console_fd; - console_poll[VIRTIO_CONSOLE_FD].events = POLLIN; - - if (MAP_FAILED == init_vr(mic, mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE, &tx_vr, &rx_vr, - virtcons_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - goto _close_pty; - } - - copy.iovcnt = 1; - desc = get_device_desc(mic, VIRTIO_ID_CONSOLE); - - for (;;) { - console_poll[MONITOR_FD].revents = 0; - console_poll[VIRTIO_CONSOLE_FD].revents = 0; - err = poll(console_poll, MAX_CONSOLE_FD, -1); - if (err < 0) { - mpsslog("%s %d: poll failed: %s\n", __func__, __LINE__, - strerror(errno)); - continue; - } - if (!(desc->status & VIRTIO_CONFIG_S_DRIVER_OK)) { - err = wait_for_card_driver(mic, - mic->mic_console.virtio_console_fd, - VIRTIO_ID_CONSOLE); - if (err) { - mpsslog("%s %s %d Exiting...\n", - mic->name, __func__, __LINE__); - break; - } - } - - if (console_poll[MONITOR_FD].revents & POLLIN) { - copy.iov = iov0; - len = readv(pty_fd, copy.iov, copy.iovcnt); - if (len > 0) { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read from tap 0x%lx\n", - mic->name, __func__, __LINE__, - len); -#endif - spin_for_descriptors(mic, &tx_vr); - txrx_prepare(VIRTIO_ID_CONSOLE, 1, &tx_vr, - ©, len); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &tx_vr, ©); - if (err < 0) { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - } - if (!err) - verify_out_len(mic, ©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d wrote to net 0x%lx\n", - mic->name, __func__, __LINE__, - sum_iovec_len(©)); -#endif - /* Reinitialize IOV for next run */ - iov0->iov_len = PAGE_SIZE; - } else if (len < 0) { - disp_iovec(mic, ©, __func__, __LINE__); - mpsslog("%s %s %d read failed %s ", - mic->name, __func__, __LINE__, - strerror(errno)); - mpsslog("cnt %d sum %zd\n", - copy.iovcnt, sum_iovec_len(©)); - } - } - - if (console_poll[VIRTIO_CONSOLE_FD].revents & POLLIN) { - while (rx_vr.info->avail_idx != - le16toh(rx_vr.vr.avail->idx)) { - copy.iov = iov1; - txrx_prepare(VIRTIO_ID_CONSOLE, 0, &rx_vr, - ©, PAGE_SIZE); - - err = mic_virtio_copy(mic, - mic->mic_console.virtio_console_fd, - &rx_vr, ©); - if (!err) { - /* Set the correct output iov_len */ - iov1->iov_len = copy.out_len; - verify_out_len(mic, 
©); -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, __LINE__); - mpsslog("read from net 0x%lx\n", - sum_iovec_len(©)); -#endif - len = writev(pty_fd, - copy.iov, copy.iovcnt); - if (len != sum_iovec_len(©)) { - mpsslog("Tun write failed %s ", - strerror(errno)); - mpsslog("len 0x%zx ", len); - mpsslog("read_len 0x%zx\n", - sum_iovec_len(©)); - } else { -#ifdef DEBUG - disp_iovec(mic, ©, __func__, - __LINE__); - mpsslog("%s %s %d ", - mic->name, __func__, - __LINE__); - mpsslog("wrote to tap 0x%lx\n", - len); -#endif - } - } else { - mpsslog("%s %s %d mic_virtio_copy %s\n", - mic->name, __func__, __LINE__, - strerror(errno)); - break; - } - } - } - if (console_poll[NET_FD_VIRTIO_NET].revents & POLLERR) - mpsslog("%s: %s: POLLERR\n", __func__, mic->name); - } -_close_pty: - close(pty_fd); -_return: - pthread_exit(NULL); -} - -static void -add_virtio_device(struct mic_info *mic, struct mic_device_desc *dd) -{ - char path[PATH_MAX]; - int fd, err; - - snprintf(path, PATH_MAX, "/dev/vop_virtio%d", mic->id); - fd = open(path, O_RDWR); - if (fd < 0) { - mpsslog("Could not open %s %s\n", path, strerror(errno)); - return; - } - - err = ioctl(fd, MIC_VIRTIO_ADD_DEVICE, dd); - if (err < 0) { - mpsslog("Could not add %d %s\n", dd->type, strerror(errno)); - close(fd); - return; - } - switch (dd->type) { - case VIRTIO_ID_NET: - mic->mic_net.virtio_net_fd = fd; - mpsslog("Added VIRTIO_ID_NET for %s\n", mic->name); - break; - case VIRTIO_ID_CONSOLE: - mic->mic_console.virtio_console_fd = fd; - mpsslog("Added VIRTIO_ID_CONSOLE for %s\n", mic->name); - break; - case VIRTIO_ID_BLOCK: - mic->mic_virtblk.virtio_block_fd = fd; - mpsslog("Added VIRTIO_ID_BLOCK for %s\n", mic->name); - break; - } -} - -static bool -set_backend_file(struct mic_info *mic) -{ - FILE *config; - char buff[PATH_MAX], *line, *evv, *p; - - snprintf(buff, PATH_MAX, "%s/mpssd%03d.conf", mic_config_dir, mic->id); - config = fopen(buff, "r"); - if (config == NULL) - return false; - do { /* look for "virtblk_backend=XXXX" */ - line = fgets(buff, PATH_MAX, config); - if (line == NULL) - break; - if (*line == '#') - continue; - p = strchr(line, '\n'); - if (p) - *p = '\0'; - } while (strncmp(line, virtblk_backend, strlen(virtblk_backend)) != 0); - fclose(config); - if (line == NULL) - return false; - evv = strchr(line, '='); - if (evv == NULL) - return false; - mic->mic_virtblk.backend_file = malloc(strlen(evv) + 1); - if (mic->mic_virtblk.backend_file == NULL) { - mpsslog("%s %d can't allocate memory\n", mic->name, mic->id); - return false; - } - strcpy(mic->mic_virtblk.backend_file, evv + 1); - return true; -} - -#define SECTOR_SIZE 512 -static bool -set_backend_size(struct mic_info *mic) -{ - mic->mic_virtblk.backend_size = lseek(mic->mic_virtblk.backend, 0, - SEEK_END); - if (mic->mic_virtblk.backend_size < 0) { - mpsslog("%s: can't seek: %s\n", - mic->name, mic->mic_virtblk.backend_file); - return false; - } - virtblk_dev_page.blk_config.capacity = - mic->mic_virtblk.backend_size / SECTOR_SIZE; - if ((mic->mic_virtblk.backend_size % SECTOR_SIZE) != 0) - virtblk_dev_page.blk_config.capacity++; - - virtblk_dev_page.blk_config.capacity = - htole64(virtblk_dev_page.blk_config.capacity); - - return true; -} - -static bool -open_backend(struct mic_info *mic) -{ - if (!set_backend_file(mic)) - goto _error_exit; - mic->mic_virtblk.backend = open(mic->mic_virtblk.backend_file, O_RDWR); - if (mic->mic_virtblk.backend < 0) { - mpsslog("%s: can't open: %s\n", mic->name, - 
mic->mic_virtblk.backend_file); - goto _error_free; - } - if (!set_backend_size(mic)) - goto _error_close; - mic->mic_virtblk.backend_addr = mmap(NULL, - mic->mic_virtblk.backend_size, - PROT_READ|PROT_WRITE, MAP_SHARED, - mic->mic_virtblk.backend, 0L); - if (mic->mic_virtblk.backend_addr == MAP_FAILED) { - mpsslog("%s: can't map: %s %s\n", - mic->name, mic->mic_virtblk.backend_file, - strerror(errno)); - goto _error_close; - } - return true; - - _error_close: - close(mic->mic_virtblk.backend); - _error_free: - free(mic->mic_virtblk.backend_file); - _error_exit: - return false; -} - -static void -close_backend(struct mic_info *mic) -{ - munmap(mic->mic_virtblk.backend_addr, mic->mic_virtblk.backend_size); - close(mic->mic_virtblk.backend); - free(mic->mic_virtblk.backend_file); -} - -static bool -start_virtblk(struct mic_info *mic, struct mic_vring *vring) -{ - if (((unsigned long)&virtblk_dev_page.blk_config % 8) != 0) { - mpsslog("%s: blk_config is not 8 byte aligned.\n", - mic->name); - return false; - } - add_virtio_device(mic, &virtblk_dev_page.dd); - if (MAP_FAILED == init_vr(mic, mic->mic_virtblk.virtio_block_fd, - VIRTIO_ID_BLOCK, vring, NULL, - virtblk_dev_page.dd.num_vq)) { - mpsslog("%s init_vr failed %s\n", - mic->name, strerror(errno)); - return false; - } - return true; -} - -static void -stop_virtblk(struct mic_info *mic) -{ - int vr_size, ret; - - vr_size = PAGE_ALIGN(_vring_size(MIC_VRING_ENTRIES, - MIC_VIRTIO_RING_ALIGN) + - sizeof(struct _mic_vring_info)); - ret = munmap(mic->mic_virtblk.block_dp, - MIC_DEVICE_PAGE_END + vr_size * virtblk_dev_page.dd.num_vq); - if (ret < 0) - mpsslog("%s munmap errno %d\n", mic->name, errno); - close(mic->mic_virtblk.virtio_block_fd); -} - -static __u8 -header_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(struct virtio_blk_outhdr)) { - mpsslog("%s() %d: length is not sizeof(virtio_blk_outhd)\n", - __func__, __LINE__); - return -EIO; - } - if (!(le16toh(desc->flags) & VRING_DESC_F_NEXT)) { - mpsslog("%s() %d: alone\n", - __func__, __LINE__); - return -EIO; - } - if (le16toh(desc->flags) & VRING_DESC_F_WRITE) { - mpsslog("%s() %d: not read\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -read_header(int fd, struct virtio_blk_outhdr *hdr, __u32 desc_idx) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_len = sizeof(*hdr); - iovec.iov_base = hdr; - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static int -transfer_blocks(int fd, struct iovec *iovec, __u32 iovcnt) -{ - struct mic_copy_desc copy; - - copy.iov = iovec; - copy.iovcnt = iovcnt; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = false; /* do not update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -static __u8 -status_error_check(struct vring_desc *desc) -{ - if (le32toh(desc->len) != sizeof(__u8)) { - mpsslog("%s() %d: length is not sizeof(status)\n", - __func__, __LINE__); - return -EIO; - } - return 0; -} - -static int -write_status(int fd, __u8 *status) -{ - struct iovec iovec; - struct mic_copy_desc copy; - - iovec.iov_base = status; - iovec.iov_len = sizeof(*status); - copy.iov = &iovec; - copy.iovcnt = 1; - copy.vr_idx = 0; /* only one vring on virtio_block */ - copy.update_used = true; /* Update used index */ - return ioctl(fd, MIC_VIRTIO_COPY_DESC, ©); -} - -#ifndef VIRTIO_BLK_T_GET_ID -#define 
VIRTIO_BLK_T_GET_ID 8 -#endif - -static void * -virtio_block(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int ret; - struct pollfd block_poll; - struct mic_vring vring; - __u16 avail_idx; - __u32 desc_idx; - struct vring_desc *desc; - struct iovec *iovec, *piov; - __u8 status; - __u32 buffer_desc_idx; - struct virtio_blk_outhdr hdr; - void *fos; - - for (;;) { /* forever */ - if (!open_backend(mic)) { /* No virtblk */ - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) - sleep(1); - continue; - } - - /* backend file is specified. */ - if (!start_virtblk(mic, &vring)) - goto _close_backend; - iovec = malloc(sizeof(*iovec) * - le32toh(virtblk_dev_page.blk_config.seg_max)); - if (!iovec) { - mpsslog("%s: can't alloc iovec: %s\n", - mic->name, strerror(ENOMEM)); - goto _stop_virtblk; - } - - block_poll.fd = mic->mic_virtblk.virtio_block_fd; - block_poll.events = POLLIN; - for (mic->mic_virtblk.signaled = 0; - !mic->mic_virtblk.signaled;) { - block_poll.revents = 0; - /* timeout in 1 sec to see signaled */ - ret = poll(&block_poll, 1, 1000); - if (ret < 0) { - mpsslog("%s %d: poll failed: %s\n", - __func__, __LINE__, - strerror(errno)); - continue; - } - - if (!(block_poll.revents & POLLIN)) { -#ifdef DEBUG - mpsslog("%s %d: block_poll.revents=0x%x\n", - __func__, __LINE__, block_poll.revents); -#endif - continue; - } - - /* POLLIN */ - while (vring.info->avail_idx != - le16toh(vring.vr.avail->idx)) { - /* read header element */ - avail_idx = - vring.info->avail_idx & - (vring.vr.num - 1); - desc_idx = le16toh( - vring.vr.avail->ring[avail_idx]); - desc = &vring.vr.desc[desc_idx]; -#ifdef DEBUG - mpsslog("%s() %d: avail_idx=%d ", - __func__, __LINE__, - vring.info->avail_idx); - mpsslog("vring.vr.num=%d desc=%p\n", - vring.vr.num, desc); -#endif - status = header_error_check(desc); - ret = read_header( - mic->mic_virtblk.virtio_block_fd, - &hdr, desc_idx); - if (ret < 0) { - mpsslog("%s() %d %s: ret=%d %s\n", - __func__, __LINE__, - mic->name, ret, - strerror(errno)); - break; - } - /* buffer element */ - piov = iovec; - status = 0; - fos = mic->mic_virtblk.backend_addr + - (hdr.sector * SECTOR_SIZE); - buffer_desc_idx = next_desc(desc); - desc_idx = buffer_desc_idx; - for (desc = &vring.vr.desc[buffer_desc_idx]; - desc->flags & VRING_DESC_F_NEXT; - desc_idx = next_desc(desc), - desc = &vring.vr.desc[desc_idx]) { - piov->iov_len = desc->len; - piov->iov_base = fos; - piov++; - fos += desc->len; - } - /* Returning NULLs for VIRTIO_BLK_T_GET_ID. */ - if (hdr.type & ~(VIRTIO_BLK_T_OUT | - VIRTIO_BLK_T_GET_ID)) { - /* - VIRTIO_BLK_T_IN - does not do - anything. Probably for documenting. - VIRTIO_BLK_T_SCSI_CMD - for - virtio_scsi. - VIRTIO_BLK_T_FLUSH - turned off in - config space. - VIRTIO_BLK_T_BARRIER - defined but not - used in anywhere. 
- */ - mpsslog("%s() %d: type %x ", - __func__, __LINE__, - hdr.type); - mpsslog("is not supported\n"); - status = -ENOTSUP; - - } else { - ret = transfer_blocks( - mic->mic_virtblk.virtio_block_fd, - iovec, - piov - iovec); - if (ret < 0 && - status != 0) - status = ret; - } - /* write status and update used pointer */ - if (status != 0) - status = status_error_check(desc); - ret = write_status( - mic->mic_virtblk.virtio_block_fd, - &status); -#ifdef DEBUG - mpsslog("%s() %d: write status=%d on desc=%p\n", - __func__, __LINE__, - status, desc); -#endif - } - } - free(iovec); -_stop_virtblk: - stop_virtblk(mic); -_close_backend: - close_backend(mic); - } /* forever */ - - pthread_exit(NULL); -} - -static void -reset(struct mic_info *mic) -{ -#define RESET_TIMEOUT 120 - int i = RESET_TIMEOUT; - setsysfs(mic->name, "state", "reset"); - while (i) { - char *state; - state = readsysfs(mic->name, "state"); - if (!state) - goto retry; - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - free(state); - break; - } - free(state); -retry: - sleep(1); - i--; - } -} - -static int -get_mic_shutdown_status(struct mic_info *mic, char *shutdown_status) -{ - if (!strcmp(shutdown_status, "nop")) - return MIC_NOP; - if (!strcmp(shutdown_status, "crashed")) - return MIC_CRASHED; - if (!strcmp(shutdown_status, "halted")) - return MIC_HALTED; - if (!strcmp(shutdown_status, "poweroff")) - return MIC_POWER_OFF; - if (!strcmp(shutdown_status, "restart")) - return MIC_RESTART; - mpsslog("%s: BUG invalid status %s\n", mic->name, shutdown_status); - /* Invalid state */ - assert(0); -}; - -static int get_mic_state(struct mic_info *mic) -{ - char *state = NULL; - enum mic_states mic_state; - - while (!state) { - state = readsysfs(mic->name, "state"); - sleep(1); - } - mpsslog("%s: %s %d state %s\n", - mic->name, __func__, __LINE__, state); - - if (!strcmp(state, "ready")) { - mic_state = MIC_READY; - } else if (!strcmp(state, "booting")) { - mic_state = MIC_BOOTING; - } else if (!strcmp(state, "online")) { - mic_state = MIC_ONLINE; - } else if (!strcmp(state, "shutting_down")) { - mic_state = MIC_SHUTTING_DOWN; - } else if (!strcmp(state, "reset_failed")) { - mic_state = MIC_RESET_FAILED; - } else if (!strcmp(state, "resetting")) { - mic_state = MIC_RESETTING; - } else { - mpsslog("%s: BUG invalid state %s\n", mic->name, state); - assert(0); - } - - free(state); - return mic_state; -}; - -static void mic_handle_shutdown(struct mic_info *mic) -{ -#define SHUTDOWN_TIMEOUT 60 - int i = SHUTDOWN_TIMEOUT; - char *shutdown_status; - while (i) { - shutdown_status = readsysfs(mic->name, "shutdown_status"); - if (!shutdown_status) { - sleep(1); - continue; - } - mpsslog("%s: %s %d shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - switch (get_mic_shutdown_status(mic, shutdown_status)) { - case MIC_RESTART: - mic->restart = 1; - case MIC_HALTED: - case MIC_POWER_OFF: - case MIC_CRASHED: - free(shutdown_status); - goto reset; - default: - break; - } - free(shutdown_status); - sleep(1); - i--; - } -reset: - if (!i) - mpsslog("%s: %s %d timing out waiting for shutdown_status %s\n", - mic->name, __func__, __LINE__, shutdown_status); - reset(mic); -} - -static int open_state_fd(struct mic_info *mic) -{ - char pathname[PATH_MAX]; - int fd; - - snprintf(pathname, PATH_MAX - 1, "%s/%s/%s", - MICSYSFSDIR, mic->name, "state"); - - fd = open(pathname, O_RDONLY); - if (fd < 0) - mpsslog("%s: opening file %s failed %s\n", - mic->name, pathname, 
strerror(errno)); - return fd; -} - -static int block_till_state_change(int fd, struct mic_info *mic) -{ - struct pollfd ufds[1]; - char value[PAGE_SIZE]; - int ret; - - ufds[0].fd = fd; - ufds[0].events = POLLERR | POLLPRI; - ret = poll(ufds, 1, -1); - if (ret < 0) { - mpsslog("%s: %s %d poll failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = lseek(fd, 0, SEEK_SET); - if (ret < 0) { - mpsslog("%s: %s %d Failed to seek to 0: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - ret = read(fd, value, sizeof(value)); - if (ret < 0) { - mpsslog("%s: %s %d Failed to read sysfs entry: %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - return ret; - } - - return 0; -} - -static void * -mic_config(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - int fd, ret, stat = 0; - - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto exit; - } - - do { - ret = block_till_state_change(fd, mic); - if (ret < 0) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - goto close_exit; - } - - switch (get_mic_state(mic)) { - case MIC_SHUTTING_DOWN: - mic_handle_shutdown(mic); - break; - case MIC_READY: - case MIC_RESET_FAILED: - ret = kill(mic->pid, SIGTERM); - mpsslog("%s: %s %d kill pid %d ret %d\n", - mic->name, __func__, __LINE__, - mic->pid, ret); - if (!ret) { - ret = waitpid(mic->pid, &stat, - WIFSIGNALED(stat)); - mpsslog("%s: %s %d waitpid ret %d pid %d\n", - mic->name, __func__, __LINE__, - ret, mic->pid); - } - if (mic->boot_on_resume) { - setsysfs(mic->name, "state", "boot"); - mic->boot_on_resume = 0; - } - goto close_exit; - default: - break; - } - } while (1); - -close_exit: - close(fd); -exit: - init_mic(mic); - pthread_exit(NULL); -} - -static void -set_cmdline(struct mic_info *mic) -{ - char buffer[PATH_MAX]; - int len; - - len = snprintf(buffer, PATH_MAX, - "clocksource=tsc highres=off nohz=off "); - len += snprintf(buffer + len, PATH_MAX - len, - "cpufreq_on;corec6_off;pc3_off;pc6_off "); - len += snprintf(buffer + len, PATH_MAX - len, - "ifcfg=static;address,172.31.%d.1;netmask,255.255.255.0", - mic->id + 1); - - setsysfs(mic->name, "cmdline", buffer); - mpsslog("%s: Command line: \"%s\"\n", mic->name, buffer); - snprintf(buffer, PATH_MAX, "172.31.%d.1", mic->id + 1); - mpsslog("%s: IPADDR: \"%s\"\n", mic->name, buffer); -} - -static void -set_log_buf_info(struct mic_info *mic) -{ - int fd; - off_t len; - char system_map[] = "/lib/firmware/mic/System.map"; - char *map, *temp, log_buf[17] = {'\0'}; - - fd = open(system_map, O_RDONLY); - if (fd < 0) { - mpsslog("%s: Opening System.map failed: %d\n", - mic->name, errno); - return; - } - len = lseek(fd, 0, SEEK_END); - if (len < 0) { - mpsslog("%s: Reading System.map size failed: %d\n", - mic->name, errno); - close(fd); - return; - } - map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0); - if (map == MAP_FAILED) { - mpsslog("%s: mmap of System.map failed: %d\n", - mic->name, errno); - close(fd); - return; - } - temp = strstr(map, "__log_buf"); - if (!temp) { - mpsslog("%s: __log_buf not found: %d\n", mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_addr", log_buf); - mpsslog("%s: log_buf_addr: %s\n", mic->name, log_buf); - temp = strstr(map, "log_buf_len"); - if (!temp) { - mpsslog("%s: log_buf_len not found: %d\n", 
mic->name, errno); - munmap(map, len); - close(fd); - return; - } - strncpy(log_buf, temp - 19, 16); - setsysfs(mic->name, "log_buf_len", log_buf); - mpsslog("%s: log_buf_len: %s\n", mic->name, log_buf); - munmap(map, len); - close(fd); -} - -static void -change_virtblk_backend(int x, siginfo_t *siginfo, void *p) -{ - struct mic_info *mic; - - for (mic = mic_list.next; mic != NULL; mic = mic->next) - mic->mic_virtblk.signaled = 1/* true */; -} - -static void -set_mic_boot_params(struct mic_info *mic) -{ - set_log_buf_info(mic); - set_cmdline(mic); -} - -static void * -init_mic(void *arg) -{ - struct mic_info *mic = (struct mic_info *)arg; - struct sigaction ignore = { - .sa_flags = 0, - .sa_handler = SIG_IGN - }; - struct sigaction act = { - .sa_flags = SA_SIGINFO, - .sa_sigaction = change_virtblk_backend, - }; - char buffer[PATH_MAX]; - int err, fd; - - /* - * Currently, one virtio block device is supported for each MIC card - * at a time. Any user (or test) can send a SIGUSR1 to the MIC daemon. - * The signal informs the virtio block backend about a change in the - * configuration file which specifies the virtio backend file name on - * the host. Virtio block backend then re-reads the configuration file - * and switches to the new block device. This signalling mechanism may - * not be required once multiple virtio block devices are supported by - * the MIC daemon. - */ - sigaction(SIGUSR1, &ignore, NULL); -retry: - fd = open_state_fd(mic); - if (fd < 0) { - mpsslog("%s: %s %d open state fd failed %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - goto retry; - } - - if (mic->restart) { - snprintf(buffer, PATH_MAX, "boot"); - setsysfs(mic->name, "state", buffer); - mpsslog("%s restarting mic %d\n", - mic->name, mic->restart); - mic->restart = 0; - } - - while (1) { - while (block_till_state_change(fd, mic)) { - mpsslog("%s: %s %d block_till_state_change error %s\n", - mic->name, __func__, __LINE__, strerror(errno)); - sleep(2); - continue; - } - - if (get_mic_state(mic) == MIC_BOOTING) - break; - } - - mic->pid = fork(); - switch (mic->pid) { - case 0: - add_virtio_device(mic, &virtcons_dev_page.dd); - add_virtio_device(mic, &virtnet_dev_page.dd); - err = pthread_create(&mic->mic_console.console_thread, NULL, - virtio_console, mic); - if (err) - mpsslog("%s virtcons pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_net.net_thread, NULL, - virtio_net, mic); - if (err) - mpsslog("%s virtnet pthread_create failed %s\n", - mic->name, strerror(err)); - err = pthread_create(&mic->mic_virtblk.block_thread, NULL, - virtio_block, mic); - if (err) - mpsslog("%s virtblk pthread_create failed %s\n", - mic->name, strerror(err)); - sigemptyset(&act.sa_mask); - err = sigaction(SIGUSR1, &act, NULL); - if (err) - mpsslog("%s sigaction SIGUSR1 failed %s\n", - mic->name, strerror(errno)); - while (1) - sleep(60); - case -1: - mpsslog("fork failed MIC name %s id %d errno %d\n", - mic->name, mic->id, errno); - break; - default: - err = pthread_create(&mic->config_thread, NULL, - mic_config, mic); - if (err) - mpsslog("%s mic_config pthread_create failed %s\n", - mic->name, strerror(err)); - } - - return NULL; -} - -static void -start_daemon(void) -{ - struct mic_info *mic; - int err; - - for (mic = mic_list.next; mic; mic = mic->next) { - set_mic_boot_params(mic); - err = pthread_create(&mic->init_thread, NULL, init_mic, mic); - if (err) - mpsslog("%s init_mic pthread_create failed %s\n", - mic->name, strerror(err)); - } - - while (1) - sleep(60); 
-} - -static int -init_mic_list(void) -{ - struct mic_info *mic = &mic_list; - struct dirent *file; - DIR *dp; - int cnt = 0; - - dp = opendir(MICSYSFSDIR); - if (!dp) - return 0; - - while ((file = readdir(dp)) != NULL) { - if (!strncmp(file->d_name, "mic", 3)) { - mic->next = calloc(1, sizeof(struct mic_info)); - if (mic->next) { - mic = mic->next; - mic->id = atoi(&file->d_name[3]); - mic->name = malloc(strlen(file->d_name) + 16); - if (mic->name) - strcpy(mic->name, file->d_name); - mpsslog("MIC name %s id %d\n", mic->name, - mic->id); - cnt++; - } - } - } - - closedir(dp); - return cnt; -} - -void -mpsslog(char *format, ...) -{ - va_list args; - char buffer[4096]; - char ts[52], *ts1; - time_t t; - - if (logfp == NULL) - return; - - va_start(args, format); - vsprintf(buffer, format, args); - va_end(args); - - time(&t); - ts1 = ctime_r(&t, ts); - ts1[strlen(ts1) - 1] = '\0'; - fprintf(logfp, "%s: %s", ts1, buffer); - - fflush(logfp); -} - -int -main(int argc, char *argv[]) -{ - int cnt; - pid_t pid; - - myname = argv[0]; - - logfp = fopen(LOGFILE_NAME, "a+"); - if (!logfp) { - fprintf(stderr, "cannot open logfile '%s'\n", LOGFILE_NAME); - exit(1); - } - pid = fork(); - switch (pid) { - case 0: - break; - case -1: - exit(2); - default: - exit(0); - } - - mpsslog("MIC Daemon start\n"); - - cnt = init_mic_list(); - if (cnt == 0) { - mpsslog("MIC module not loaded\n"); - exit(3); - } - mpsslog("MIC found %d devices\n", cnt); - - start_daemon(); - - exit(0); -} diff --git a/samples/mic/mpssd/mpssd.h b/samples/mic/mpssd/mpssd.h deleted file mode 100644 index 5f98bdafe653..000000000000 --- a/samples/mic/mpssd/mpssd.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. - */ -#ifndef _MPSSD_H_ -#define _MPSSD_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MICSYSFSDIR "/sys/class/mic" -#define LOGFILE_NAME "/var/log/mpssd" -#define PAGE_SIZE 4096 - -struct mic_console_info { - pthread_t console_thread; - int virtio_console_fd; - void *console_dp; -}; - -struct mic_net_info { - pthread_t net_thread; - int virtio_net_fd; - int tap_fd; - void *net_dp; -}; - -struct mic_virtblk_info { - pthread_t block_thread; - int virtio_block_fd; - void *block_dp; - volatile sig_atomic_t signaled; - char *backend_file; - int backend; - void *backend_addr; - long backend_size; -}; - -struct mic_info { - int id; - char *name; - pthread_t config_thread; - pthread_t init_thread; - pid_t pid; - struct mic_console_info mic_console; - struct mic_net_info mic_net; - struct mic_virtblk_info mic_virtblk; - int restart; - int boot_on_resume; - struct mic_info *next; -}; - -__attribute__((format(printf, 1, 2))) -void mpsslog(char *format, ...); -char *readsysfs(char *dir, char *entry); -int setsysfs(char *dir, char *entry, char *value); -#endif diff --git a/samples/mic/mpssd/sysfs.c b/samples/mic/mpssd/sysfs.c deleted file mode 100644 index 3fb08eb7ed9d..000000000000 --- a/samples/mic/mpssd/sysfs.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * Intel MIC User Space Tools. 
- */ - -#include "mpssd.h" - -#define PAGE_SIZE 4096 - -char * -readsysfs(char *dir, char *entry) -{ - char filename[PATH_MAX]; - char value[PAGE_SIZE]; - char *string = NULL; - int fd; - int len; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, - "%s/%s/%s", MICSYSFSDIR, dir, entry); - - fd = open(filename, O_RDONLY); - if (fd < 0) { - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - return NULL; - } - - len = read(fd, value, sizeof(value)); - if (len < 0) { - mpsslog("Failed to read sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto readsys_ret; - } - if (len == 0) - goto readsys_ret; - - value[len - 1] = '\0'; - - string = malloc(strlen(value) + 1); - if (string) - strcpy(string, value); - -readsys_ret: - close(fd); - return string; -} - -int -setsysfs(char *dir, char *entry, char *value) -{ - char filename[PATH_MAX]; - char *oldvalue; - int fd, ret = 0; - - if (dir == NULL) - snprintf(filename, PATH_MAX, "%s/%s", MICSYSFSDIR, entry); - else - snprintf(filename, PATH_MAX, "%s/%s/%s", - MICSYSFSDIR, dir, entry); - - oldvalue = readsysfs(dir, entry); - - fd = open(filename, O_RDWR); - if (fd < 0) { - ret = errno; - mpsslog("Failed to open sysfs entry '%s': %s\n", - filename, strerror(errno)); - goto done; - } - - if (!oldvalue || strcmp(value, oldvalue)) { - if (write(fd, value, strlen(value)) < 0) { - ret = errno; - mpsslog("Failed to write new sysfs entry '%s': %s\n", - filename, strerror(errno)); - } - } - close(fd); -done: - if (oldvalue) - free(oldvalue); - return ret; -} -- cgit v1.2.3 From b59e286be280fa3c2e94a0716ddcee6ba02bc8ba Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 27 Oct 2020 20:33:12 +0800 Subject: ICMPv6: Add ICMPv6 Parameter Problem, code 3 definition Based on RFC7112, Section 6: IANA has added the following "Type 4 - Parameter Problem" message to the "Internet Control Message Protocol version 6 (ICMPv6) Parameters" registry: CODE NAME/DESCRIPTION 3 IPv6 First Fragment has incomplete IPv6 Header Chain Signed-off-by: Hangbin Liu Signed-off-by: Jakub Kicinski --- include/uapi/linux/icmpv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index c1661febc2dc..0564fd7ccde4 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -138,6 +138,7 @@ struct icmp6hdr { #define ICMPV6_HDR_FIELD 0 #define ICMPV6_UNK_NEXTHDR 1 #define ICMPV6_UNK_OPTION 2 +#define ICMPV6_HDR_INCOMP 3 /* * constants for (set|get)sockopt -- cgit v1.2.3