Diffstat (limited to 'net'): 163 files changed, 3860 insertions, 2216 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 1218fb3b52da..4674235b0d9b 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -37,6 +37,7 @@ #include <linux/uio.h> #include <net/9p/9p.h> #include <linux/parser.h> +#include <linux/seq_file.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "protocol.h" @@ -77,6 +78,30 @@ inline int p9_is_proto_dotu(struct p9_client *clnt) } EXPORT_SYMBOL(p9_is_proto_dotu); +int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) +{ + if (clnt->msize != 8192) + seq_printf(m, ",msize=%u", clnt->msize); + seq_printf(m, "trans=%s", clnt->trans_mod->name); + + switch (clnt->proto_version) { + case p9_proto_legacy: + seq_puts(m, ",noextend"); + break; + case p9_proto_2000u: + seq_puts(m, ",version=9p2000.u"); + break; + case p9_proto_2000L: + /* Default */ + break; + } + + if (clnt->trans_mod->show_options) + return clnt->trans_mod->show_options(m, clnt); + return 0; +} +EXPORT_SYMBOL(p9_show_client_options); + /* * Some error codes are taken directly from the server replies, * make sure they are valid. diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index dca3cdd1a014..ddfa86648f95 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -41,6 +41,7 @@ #include <linux/file.h> #include <linux/parser.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> @@ -51,6 +52,9 @@ #define MAX_SOCK_BUF (64*1024) #define MAXPOLLWADDR 2 +static struct p9_trans_module p9_tcp_trans; +static struct p9_trans_module p9_fd_trans; + /** * struct p9_fd_opts - per-transport options * @rfd: file descriptor for reading (trans=fd) @@ -63,7 +67,7 @@ struct p9_fd_opts { int rfd; int wfd; u16 port; - int privport; + bool privport; }; /* @@ -720,6 +724,20 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) return 0; } +static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) +{ + if (clnt->trans_mod == &p9_tcp_trans) { + if (clnt->trans_opts.tcp.port != P9_PORT) + seq_printf(m, "port=%u", clnt->trans_opts.tcp.port); + } else if (clnt->trans_mod == &p9_fd_trans) { + if (clnt->trans_opts.fd.rfd != ~0) + seq_printf(m, "rfd=%u", clnt->trans_opts.fd.rfd); + if (clnt->trans_opts.fd.wfd != ~0) + seq_printf(m, "wfd=%u", clnt->trans_opts.fd.wfd); + } + return 0; +} + /** * parse_opts - parse mount options into p9_fd_opts structure * @params: options string passed from mount @@ -738,7 +756,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) opts->port = P9_PORT; opts->rfd = ~0; opts->wfd = ~0; - opts->privport = 0; + opts->privport = false; if (!params) return 0; @@ -776,7 +794,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts) opts->wfd = option; break; case Opt_privport: - opts->privport = 1; + opts->privport = true; break; default: continue; @@ -942,6 +960,8 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) csocket = NULL; + client->trans_opts.tcp.port = opts.port; + client->trans_opts.tcp.privport = opts.privport; sin_server.sin_family = AF_INET; sin_server.sin_addr.s_addr = in_aton(addr); sin_server.sin_port = htons(opts.port); @@ -1020,6 +1040,8 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) struct p9_fd_opts opts; parse_opts(args, &opts); + client->trans_opts.fd.rfd = opts.rfd; + client->trans_opts.fd.wfd = opts.wfd; if (opts.rfd == ~0 || opts.wfd == ~0) { pr_err("Insufficient options for proto=fd\n"); @@ -1044,6 +1066,7 @@ static struct 
p9_trans_module p9_tcp_trans = { .request = p9_fd_request, .cancel = p9_fd_cancel, .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, .owner = THIS_MODULE, }; @@ -1056,6 +1079,7 @@ static struct p9_trans_module p9_unix_trans = { .request = p9_fd_request, .cancel = p9_fd_cancel, .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, .owner = THIS_MODULE, }; @@ -1068,6 +1092,7 @@ static struct p9_trans_module p9_fd_trans = { .request = p9_fd_request, .cancel = p9_fd_cancel, .cancelled = p9_fd_cancelled, + .show_options = p9_fd_show_options, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 553ed4ecb6a0..6d8e3031978f 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -43,6 +43,7 @@ #include <linux/parser.h> #include <linux/semaphore.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> @@ -70,6 +71,8 @@ * @dm_mr: DMA Memory Region pointer * @lkey: The local access only memory region key * @timeout: Number of uSecs to wait for connection management events + * @privport: Whether a privileged port may be used + * @port: The port to use * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. @@ -95,6 +98,8 @@ struct p9_trans_rdma { struct ib_qp *qp; struct ib_cq *cq; long timeout; + bool privport; + u16 port; int sq_depth; struct semaphore sq_sem; int rq_depth; @@ -133,10 +138,10 @@ struct p9_rdma_context { */ struct p9_rdma_opts { short port; + bool privport; int sq_depth; int rq_depth; long timeout; - int privport; }; /* @@ -159,6 +164,23 @@ static match_table_t tokens = { {Opt_err, NULL}, }; +static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) +{ + struct p9_trans_rdma *rdma = clnt->trans; + + if (rdma->port != P9_PORT) + seq_printf(m, ",port=%u", rdma->port); + if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) + seq_printf(m, ",sq=%u", rdma->sq_depth); + if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) + seq_printf(m, ",rq=%u", rdma->rq_depth); + if (rdma->timeout != P9_RDMA_TIMEOUT) + seq_printf(m, ",timeout=%lu", rdma->timeout); + if (rdma->privport) + seq_puts(m, ",privport"); + return 0; +} + /** * parse_opts - parse mount options into rdma options structure * @params: options string passed from mount @@ -177,7 +199,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; - opts->privport = 0; + opts->privport = false; if (!params) return 0; @@ -218,7 +240,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts) opts->timeout = option; break; case Opt_privport: - opts->privport = 1; + opts->privport = true; break; default: continue; @@ -560,6 +582,8 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) if (!rdma) return NULL; + rdma->port = opts->port; + rdma->privport = opts->privport; rdma->sq_depth = opts->sq_depth; rdma->rq_depth = opts->rq_depth; rdma->timeout = opts->timeout; @@ -733,6 +757,7 @@ static struct p9_trans_module p9_rdma_trans = { .request = rdma_request, .cancel = rdma_cancel, .cancelled = rdma_cancelled, + .show_options = p9_rdma_show_options, }; /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index e1133bc634b5..8a3ce79b1307 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1549,9 +1549,41 @@ 
batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, return found; } +/** + * batadv_tt_global_sync_flags - update TT sync flags + * @tt_global: the TT global entry to update sync flags in + * + * Updates the sync flag bits in the tt_global flag attribute with a logical + * OR of all sync flags from any of its TT orig entries. + */ +static void +batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) +{ + struct batadv_tt_orig_list_entry *orig_entry; + const struct hlist_head *head; + u16 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + head = &tt_global->orig_list; + hlist_for_each_entry_rcu(orig_entry, head, list) + flags |= orig_entry->flags; + rcu_read_unlock(); + + flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); + tt_global->common.flags = flags; +} + +/** + * batadv_tt_global_orig_entry_add - add or update a TT orig entry + * @tt_global: the TT global entry to add an orig entry in + * @orig_node: the originator to add an orig entry for + * @ttvn: translation table version number of this changeset + * @flags: TT sync flags + */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, - struct batadv_orig_node *orig_node, int ttvn) + struct batadv_orig_node *orig_node, int ttvn, + u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; @@ -1561,7 +1593,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; - goto out; + orig_entry->flags = flags; + goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); @@ -1573,6 +1606,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; + orig_entry->flags = flags; kref_init(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); @@ -1582,6 +1616,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); +sync_flags: + batadv_tt_global_sync_flags(tt_global); out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); @@ -1703,10 +1739,10 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } /* the change can carry possible "attribute" flags like the - * TT_CLIENT_WIFI, therefore they have to be copied in the + * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ - common->flags |= flags; + common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a @@ -1723,7 +1759,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } add_orig_entry: /* add the new orig_entry (if needed) or update it */ - batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn); + batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, + flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", @@ -1946,6 +1983,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_orig_list_entry *orig, bool best) { + u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; @@ -1975,7 +2013,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u8(msg, 
BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || - nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) @@ -2589,6 +2627,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; + struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; @@ -2627,8 +2666,9 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* find out if this global entry is announced by this * originator */ - if (!batadv_tt_global_entry_has_orig(tt_global, - orig_node)) + tt_orig = batadv_tt_global_orig_entry_find(tt_global, + orig_node); + if (!tt_orig) continue; /* use network order to read the VID: this ensures that @@ -2640,10 +2680,12 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* compute the CRC on flags that have to be kept in sync * among nodes */ - flags = tt_common->flags & BATADV_TT_SYNC_MASK; + flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); + + batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ea43a6449247..a62795868794 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1260,6 +1260,7 @@ struct batadv_tt_global_entry { * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client * @orig_node: pointer to orig node announcing this non-mesh client * @ttvn: translation table version number which added the non-mesh client + * @flags: per orig entry TT sync flags * @list: list node for batadv_tt_global_entry::orig_list * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -1267,6 +1268,7 @@ struct batadv_tt_global_entry { struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; + u8 flags; struct hlist_node list; struct kref refcount; struct rcu_head rcu; diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 002743ea509c..8112893037bd 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -734,7 +734,7 @@ static void hidp_stop(struct hid_device *hid) hid->claimed = 0; } -static struct hid_ll_driver hidp_hid_driver = { +struct hid_ll_driver hidp_hid_driver = { .parse = hidp_parse, .start = hidp_start, .stop = hidp_stop, @@ -743,6 +743,7 @@ static struct hid_ll_driver hidp_hid_driver = { .raw_request = hidp_raw_request, .output_report = hidp_output_report, }; +EXPORT_SYMBOL_GPL(hidp_hid_driver); /* This function sets up the hid device. It does not add it to the HID system. That is done in hidp_add_connection(). 
*/ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f0f3447e8aa4..5a7be3bddfa9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -34,11 +34,11 @@ static struct lock_class_key bridge_netdev_addr_lock_key; netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - const unsigned char *dest = skb->data; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; + const unsigned char *dest; u16 vid = 0; rcu_read_lock(); @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) brstats->tx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; +#endif BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); @@ -61,6 +64,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; + dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true); } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 013f2290bfa5..7637f58c1226 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -131,11 +131,11 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); - const unsigned char *dest = eth_hdr(skb)->h_dest; enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; + const unsigned char *dest; struct net_bridge *br; u16 vid = 0; @@ -153,6 +153,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); local_rcv = !!(br->dev->flags & IFF_PROMISC); + dest = eth_hdr(skb)->h_dest; if (is_multicast_ether_addr(dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(dest)) { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 09dcdb9c0f3c..a0b11e7d67d9 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -323,7 +323,8 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, __mdb_entry_to_br_ip(entry, &complete_info->ip); mdb.obj.complete_priv = complete_info; mdb.obj.complete = br_mdb_complete; - switchdev_port_obj_add(port_dev, &mdb.obj); + if (switchdev_port_obj_add(port_dev, &mdb.obj)) + kfree(complete_info); } } else if (port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 181a44d0f1da..f6b1c7de059d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { - if (!fdb->added_by_user) + if (!fdb->added_by_user || !fdb->dst) return; switch (type) { diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c index c1dc48686200..da1c2fdc08c8 100644 --- a/net/bridge/netfilter/ebt_nflog.c +++ b/net/bridge/netfilter/ebt_nflog.c @@ -30,6 +30,7 @@ ebt_nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) 
li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; + li.u.ulog.flags = 0; nf_log_packet(net, PF_BRIDGE, xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", info->prefix); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 47e94b560ba0..5c036d2f401e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type) case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; + case CEPH_MSG_OSD_BACKOFF: return "osd_backoff"; default: return "unknown"; } } @@ -598,7 +599,11 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; - int err = -ENOMEM; + int err; + + err = wait_for_random_bytes(); + if (err < 0) + return ERR_PTR(err); client = kzalloc(sizeof(*client), GFP_KERNEL); if (client == NULL) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 5bf94c04f645..4b428f46a8ca 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -1,6 +1,7 @@ #ifdef __KERNEL__ # include <linux/slab.h> # include <linux/crush/crush.h> +void clear_choose_args(struct crush_map *c); #else # include "crush_compat.h" # include "crush.h" @@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map) #ifndef __KERNEL__ kfree(map->choose_tries); +#else + clear_choose_args(map); #endif kfree(map); } diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index b5cd8c21bfdf..417df675c71b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin) * */ +static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg, + int position) +{ + if (!arg || !arg->weight_set) + return bucket->item_weights; + + if (position >= arg->weight_set_size) + position = arg->weight_set_size - 1; + return arg->weight_set[position].weights; +} + +static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket, + const struct crush_choose_arg *arg) +{ + if (!arg || !arg->ids) + return bucket->h.items; + + return arg->ids; +} + static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, - int x, int r) + int x, int r, + const struct crush_choose_arg *arg, + int position) { unsigned int i, high = 0; unsigned int u; - unsigned int w; __s64 ln, draw, high_draw = 0; + __u32 *weights = get_choose_arg_weights(bucket, arg, position); + __s32 *ids = get_choose_arg_ids(bucket, arg); for (i = 0; i < bucket->h.size; i++) { - w = bucket->item_weights[i]; - if (w) { - u = crush_hash32_3(bucket->h.hash, x, - bucket->h.items[i], r); + dprintk("weight 0x%x item %d\n", weights[i], ids[i]); + if (weights[i]) { + u = crush_hash32_3(bucket->h.hash, x, ids[i], r); u &= 0xffff; /* @@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, * weight means a larger (less negative) value * for draw. 
*/ - draw = div64_s64(ln, w); + draw = div64_s64(ln, weights[i]); } else { draw = S64_MIN; } @@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, static int crush_bucket_choose(const struct crush_bucket *in, struct crush_work_bucket *work, - int x, int r) + int x, int r, + const struct crush_choose_arg *arg, + int position) { dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); BUG_ON(in->size == 0); @@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in, case CRUSH_BUCKET_STRAW2: return bucket_straw2_choose( (const struct crush_bucket_straw2 *)in, - x, r); + x, r, arg, position); default: dprintk("unknown bucket %d alg %d\n", in->id, in->alg); return in->items[0]; @@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int vary_r, unsigned int stable, int *out2, - int parent_r) + int parent_r, + const struct crush_choose_arg *choose_args) { int rep; unsigned int ftotal, flocal; @@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map, else item = crush_bucket_choose( in, work->work[-1-in->id], - x, r); + x, r, + (choose_args ? + &choose_args[-1-in->id] : 0), + outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); skip_rep = 1; @@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map, vary_r, stable, NULL, - sub_r) <= outpos) + sub_r, + choose_args) <= outpos) /* didn't get leaf */ reject = 1; } else { @@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map, unsigned int recurse_tries, int recurse_to_leaf, int *out2, - int parent_r) + int parent_r, + const struct crush_choose_arg *choose_args) { const struct crush_bucket *in = bucket; int endpos = outpos + left; @@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map, item = crush_bucket_choose( in, work->work[-1-in->id], - x, r); + x, r, + (choose_args ? + &choose_args[-1-in->id] : 0), + outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); out[rep] = CRUSH_ITEM_NONE; @@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map, x, 1, numrep, 0, out2, rep, recurse_tries, 0, - 0, NULL, r); + 0, NULL, r, + choose_args); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -823,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v) * set the pointer first and then reserve the space for it to * point to by incrementing the point. */ - v += sizeof(struct crush_work *); + v += sizeof(struct crush_work); w->work = v; v += map->max_buckets * sizeof(struct crush_work_bucket *); for (b = 0; b < map->max_buckets; ++b) { @@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v) * @weight: weight vector (for map leaves) * @weight_max: size of weight vector * @cwin: pointer to at least crush_work_size() bytes of memory + * @choose_args: weights and ids for each known bucket */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, - void *cwin) + void *cwin, const struct crush_choose_arg *choose_args) { int result_len; struct crush_work *cw = cwin; @@ -968,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map, for (i = 0; i < wsize; i++) { int bno; - /* - * see CRUSH_N, CRUSH_N_MINUS macros. 
- * basically, numrep <= 0 means relative to - * the provided result_max - */ numrep = curstep->arg1; if (numrep <= 0) { numrep += result_max; @@ -1013,7 +1044,8 @@ int crush_do_rule(const struct crush_map *map, vary_r, stable, c+osize, - 0); + 0, + choose_args); } else { out_size = ((numrep < (result_max-osize)) ? numrep : (result_max-osize)); @@ -1030,7 +1062,8 @@ int crush_do_rule(const struct crush_map *map, choose_leaf_tries : 1, recurse_to_leaf, c+osize, - 0); + 0, + choose_args); osize += out_size; } } diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 71ba13927b3d..fa5233e0d01c 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p) } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; - int state = map->osd_state[i]; + u32 state = map->osd_state[i]; char sb[64]; seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", @@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, pg->pgid.seed, pg->primary_temp.osd); } + for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_upmap.len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->pg_upmap.osds[i]); + seq_printf(s, "]\n"); + } + for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_upmap_items.len; i++) + seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","), + pg->pg_upmap_items.from_to[i][0], + pg->pg_upmap_items.from_to[i][1]); + seq_printf(s, "]\n"); + } up_read(&osdc->lock); return 0; @@ -147,17 +170,26 @@ static int monc_show(struct seq_file *s, void *p) return 0; } +static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid) +{ + seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed); + if (spgid->shard != CEPH_SPG_NOSHARD) + seq_printf(s, "s%d", spgid->shard); +} + static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) { int i; - seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); + seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed); + dump_spgid(s, &t->spgid); + seq_puts(s, "\t["); for (i = 0; i < t->up.size; i++) seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); seq_printf(s, "]/%d\t[", t->up.primary); for (i = 0; i < t->acting.size; i++) seq_printf(s, "%s%d", (!i ? 
"" : ","), t->acting.osds[i]); - seq_printf(s, "]/%d\t", t->acting.primary); + seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch); if (t->target_oloc.pool_ns) { seq_printf(s, "%*pE/%*pE\t0x%x", (int)t->target_oloc.pool_ns->len, @@ -234,6 +266,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) mutex_unlock(&osd->lock); } +static void dump_snapid(struct seq_file *s, u64 snapid) +{ + if (snapid == CEPH_NOSNAP) + seq_puts(s, "head"); + else if (snapid == CEPH_SNAPDIR) + seq_puts(s, "snapdir"); + else + seq_printf(s, "%llx", snapid); +} + +static void dump_name_escaped(struct seq_file *s, unsigned char *name, + size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (name[i] == '%' || name[i] == ':' || name[i] == '/' || + name[i] < 32 || name[i] >= 127) { + seq_printf(s, "%%%02x", name[i]); + } else { + seq_putc(s, name[i]); + } + } +} + +static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid) +{ + if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max && + hoid->pool == S64_MIN) { + seq_puts(s, "MIN"); + return; + } + if (hoid->is_max) { + seq_puts(s, "MAX"); + return; + } + seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits); + dump_name_escaped(s, hoid->nspace, hoid->nspace_len); + seq_putc(s, ':'); + dump_name_escaped(s, hoid->key, hoid->key_len); + seq_putc(s, ':'); + dump_name_escaped(s, hoid->oid, hoid->oid_len); + seq_putc(s, ':'); + dump_snapid(s, hoid->snapid); +} + +static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) { + struct ceph_osd_backoff *backoff = + rb_entry(n, struct ceph_osd_backoff, id_node); + + seq_printf(s, "osd%d\t", osd->o_osd); + dump_spgid(s, &backoff->spgid); + seq_printf(s, "\t%llu\t", backoff->id); + dump_hoid(s, backoff->begin); + seq_putc(s, '\t'); + dump_hoid(s, backoff->end); + seq_putc(s, '\n'); + } + + mutex_unlock(&osd->lock); +} + static int osdc_show(struct seq_file *s, void *pp) { struct ceph_client *client = s->private; @@ -259,6 +358,13 @@ static int osdc_show(struct seq_file *s, void *pp) } dump_linger_requests(s, &osdc->homeless_osd); + seq_puts(s, "BACKOFFS\n"); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + dump_backoffs(s, osd); + } + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 588a91930051..a67298c7e0cd 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1287,14 +1287,17 @@ static void prepare_write_message(struct ceph_connection *con) if (m->needs_out_seq) { m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; + + if (con->ops->reencode_message) + con->ops->reencode_message(m); } - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), m->data_length); - BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); @@ -2033,8 +2036,7 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = from_msgr(con->msgr)->supported_features; u64 req_feat = from_msgr(con->msgr)->required_features; - u64 
server_feat = ceph_sanitize_features( - le64_to_cpu(con->in_reply.features)); + u64 server_feat = le64_to_cpu(con->in_reply.features); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -3201,8 +3203,10 @@ static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) return NULL; data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); - if (data) - data->type = type; + if (!data) + return NULL; + + data->type = type; INIT_LIST_HEAD(&data->links); return data; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 250f11f78609..875675765531 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -6,6 +6,7 @@ #include <linux/random.h> #include <linux/sched.h> +#include <linux/ceph/ceph_features.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/libceph.h> #include <linux/ceph/debugfs.h> @@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); if (monc->sub_renew_sent) { + /* + * This is only needed for legacy (infernalis or older) + * MONs -- see delayed_work(). + */ monc->sub_renew_after = monc->sub_renew_sent + (seconds >> 1) * HZ - 1; dout("%s sent %lu duration %d renew after %lu\n", __func__, @@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); } - if (is_auth) { + if (is_auth && + !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) { unsigned long now = jiffies; dout("%s renew subs? now %lu renew after %lu\n", diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 924f07c36ddb..dcfbdd74dfd1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -12,6 +12,7 @@ #include <linux/bio.h> #endif +#include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osd_client.h> #include <linux/ceph/messenger.h> @@ -49,6 +50,7 @@ static void link_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq); static void unlink_linger(struct ceph_osd *osd, struct ceph_osd_linger_request *lreq); +static void clear_backoffs(struct ceph_osd *osd); #if 1 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) @@ -373,6 +375,7 @@ static void target_copy(struct ceph_osd_request_target *dest, ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); dest->pgid = src->pgid; /* struct */ + dest->spgid = src->spgid; /* struct */ dest->pg_num = src->pg_num; dest->pg_num_mask = src->pg_num_mask; ceph_osds_copy(&dest->acting, &src->acting); @@ -384,6 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, dest->flags = src->flags; dest->paused = src->paused; + dest->epoch = src->epoch; + dest->last_force_resend = src->last_force_resend; + dest->osd = src->osd; } @@ -537,7 +543,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) +static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) { return 8 + 4 + 4 + 4 + (oloc->pool_ns ? 
oloc->pool_ns->len : 0); } @@ -552,17 +558,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); /* create request message */ - msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ - msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ + msg_size = CEPH_ENCODING_START_BLK_LEN + + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ + msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + sizeof(struct ceph_osd_reqid); /* reqid */ + msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */ + msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */ msg_size += CEPH_ENCODING_START_BLK_LEN + ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ - msg_size += 1 + 8 + 4 + 4; /* pgid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); - msg_size += 4; /* retry_attempt */ + msg_size += 4 + 8; /* retry_attempt, features */ if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); @@ -1010,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd) RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; + osd->o_backoff_mappings = RB_ROOT; + osd->o_backoffs_by_id = RB_ROOT; INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_keepalive_item); osd->o_incarnation = 1; @@ -1021,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd) WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id)); WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); @@ -1141,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd) unlink_linger(osd, lreq); link_linger(&osdc->homeless_osd, lreq); } + clear_backoffs(osd); __remove_osd_from_lru(osd); erase_osd(&osdc->osds, osd); @@ -1297,7 +1312,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || __pool_full(pi); - WARN_ON(pi->id != t->base_oloc.pool); + WARN_ON(pi->id != t->target_oloc.pool); return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || (osdc->osdmap->epoch < osdc->epoch_barrier); @@ -1311,19 +1326,23 @@ enum calc_target_result { static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_osd_request_target *t, - u32 *last_force_resend, + struct ceph_connection *con, bool any_change) { struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; bool force_resend = false; - bool need_check_tiering = false; - bool need_resend = false; + bool unpaused = false; + bool legacy_change; + bool split = false; bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); + bool recovery_deletes = ceph_osdmap_flag(osdc, + CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; int ret; + t->epoch = osdc->osdmap->epoch; pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); if (!pi) { t->osd = CEPH_HOMELESS_OSD; @@ -1332,33 +1351,33 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, } if (osdc->osdmap->epoch == pi->last_force_request_resend) { - if (last_force_resend && - *last_force_resend < 
pi->last_force_request_resend) { - *last_force_resend = pi->last_force_request_resend; + if (t->last_force_resend < pi->last_force_request_resend) { + t->last_force_resend = pi->last_force_request_resend; force_resend = true; - } else if (!last_force_resend) { + } else if (t->last_force_resend == 0) { force_resend = true; } } - if (ceph_oid_empty(&t->target_oid) || force_resend) { - ceph_oid_copy(&t->target_oid, &t->base_oid); - need_check_tiering = true; - } - if (ceph_oloc_empty(&t->target_oloc) || force_resend) { - ceph_oloc_copy(&t->target_oloc, &t->base_oloc); - need_check_tiering = true; - } - if (need_check_tiering && - (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + /* apply tiering */ + ceph_oid_copy(&t->target_oid, &t->base_oid); + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } } - ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, - &t->target_oloc, &pgid); + ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, + &pgid); if (ret) { WARN_ON(ret != -ENOENT); t->osd = CEPH_HOMELESS_OSD; @@ -1368,7 +1387,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, last_pgid.pool = pgid.pool; last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); - ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting); if (any_change && ceph_is_new_interval(&t->acting, &acting, @@ -1382,18 +1401,23 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, pi->pg_num, t->sort_bitwise, sort_bitwise, + t->recovery_deletes, + recovery_deletes, &last_pgid)) force_resend = true; if (t->paused && !target_should_be_paused(osdc, t, pi)) { t->paused = false; - need_resend = true; + unpaused = true; } + legacy_change = ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change); + if (t->pg_num) + split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); - if (ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change) || - force_resend) { + if (legacy_change || force_resend || split) { t->pgid = pgid; /* struct */ + ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid); ceph_osds_copy(&t->acting, &acting); ceph_osds_copy(&t->up, &up); t->size = pi->size; @@ -1401,17 +1425,345 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->pg_num = pi->pg_num; t->pg_num_mask = pi->pg_num_mask; t->sort_bitwise = sort_bitwise; + t->recovery_deletes = recovery_deletes; t->osd = acting.primary; - need_resend = true; } - ct_res = need_resend ? 
CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; + if (unpaused || legacy_change || force_resend || + (split && con && CEPH_HAVE_FEATURE(con->peer_features, + RESEND_ON_SPLIT))) + ct_res = CALC_TARGET_NEED_RESEND; + else + ct_res = CALC_TARGET_NO_ACTION; + out: dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); return ct_res; } +static struct ceph_spg_mapping *alloc_spg_mapping(void) +{ + struct ceph_spg_mapping *spg; + + spg = kmalloc(sizeof(*spg), GFP_NOIO); + if (!spg) + return NULL; + + RB_CLEAR_NODE(&spg->node); + spg->backoffs = RB_ROOT; + return spg; +} + +static void free_spg_mapping(struct ceph_spg_mapping *spg) +{ + WARN_ON(!RB_EMPTY_NODE(&spg->node)); + WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs)); + + kfree(spg); +} + +/* + * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to + * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is + * defined only within a specific spgid; it does not pass anything to + * children on split, or to another primary. + */ +DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare, + RB_BYPTR, const struct ceph_spg *, node) + +static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid) +{ + return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits; +} + +static void hoid_get_effective_key(const struct ceph_hobject_id *hoid, + void **pkey, size_t *pkey_len) +{ + if (hoid->key_len) { + *pkey = hoid->key; + *pkey_len = hoid->key_len; + } else { + *pkey = hoid->oid; + *pkey_len = hoid->oid_len; + } +} + +static int compare_names(const void *name1, size_t name1_len, + const void *name2, size_t name2_len) +{ + int ret; + + ret = memcmp(name1, name2, min(name1_len, name2_len)); + if (!ret) { + if (name1_len < name2_len) + ret = -1; + else if (name1_len > name2_len) + ret = 1; + } + return ret; +} + +static int hoid_compare(const struct ceph_hobject_id *lhs, + const struct ceph_hobject_id *rhs) +{ + void *effective_key1, *effective_key2; + size_t effective_key1_len, effective_key2_len; + int ret; + + if (lhs->is_max < rhs->is_max) + return -1; + if (lhs->is_max > rhs->is_max) + return 1; + + if (lhs->pool < rhs->pool) + return -1; + if (lhs->pool > rhs->pool) + return 1; + + if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs)) + return -1; + if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs)) + return 1; + + ret = compare_names(lhs->nspace, lhs->nspace_len, + rhs->nspace, rhs->nspace_len); + if (ret) + return ret; + + hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len); + hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len); + ret = compare_names(effective_key1, effective_key1_len, + effective_key2, effective_key2_len); + if (ret) + return ret; + + ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len); + if (ret) + return ret; + + if (lhs->snapid < rhs->snapid) + return -1; + if (lhs->snapid > rhs->snapid) + return 1; + + return 0; +} + +/* + * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX + * compat stuff here. + * + * Assumes @hoid is zero-initialized. 
+ */ +static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid) +{ + u8 struct_v; + u32 struct_len; + int ret; + + ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v, + &struct_len); + if (ret) + return ret; + + if (struct_v < 4) { + pr_err("got struct_v %d < 4 of hobject_t\n", struct_v); + goto e_inval; + } + + hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len, + GFP_NOIO); + if (IS_ERR(hoid->key)) { + ret = PTR_ERR(hoid->key); + hoid->key = NULL; + return ret; + } + + hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len, + GFP_NOIO); + if (IS_ERR(hoid->oid)) { + ret = PTR_ERR(hoid->oid); + hoid->oid = NULL; + return ret; + } + + ceph_decode_64_safe(p, end, hoid->snapid, e_inval); + ceph_decode_32_safe(p, end, hoid->hash, e_inval); + ceph_decode_8_safe(p, end, hoid->is_max, e_inval); + + hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len, + GFP_NOIO); + if (IS_ERR(hoid->nspace)) { + ret = PTR_ERR(hoid->nspace); + hoid->nspace = NULL; + return ret; + } + + ceph_decode_64_safe(p, end, hoid->pool, e_inval); + + ceph_hoid_build_hash_cache(hoid); + return 0; + +e_inval: + return -EINVAL; +} + +static int hoid_encoding_size(const struct ceph_hobject_id *hoid) +{ + return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */ + 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len; +} + +static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid) +{ + ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid)); + ceph_encode_string(p, end, hoid->key, hoid->key_len); + ceph_encode_string(p, end, hoid->oid, hoid->oid_len); + ceph_encode_64(p, hoid->snapid); + ceph_encode_32(p, hoid->hash); + ceph_encode_8(p, hoid->is_max); + ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len); + ceph_encode_64(p, hoid->pool); +} + +static void free_hoid(struct ceph_hobject_id *hoid) +{ + if (hoid) { + kfree(hoid->key); + kfree(hoid->oid); + kfree(hoid->nspace); + kfree(hoid); + } +} + +static struct ceph_osd_backoff *alloc_backoff(void) +{ + struct ceph_osd_backoff *backoff; + + backoff = kzalloc(sizeof(*backoff), GFP_NOIO); + if (!backoff) + return NULL; + + RB_CLEAR_NODE(&backoff->spg_node); + RB_CLEAR_NODE(&backoff->id_node); + return backoff; +} + +static void free_backoff(struct ceph_osd_backoff *backoff) +{ + WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node)); + WARN_ON(!RB_EMPTY_NODE(&backoff->id_node)); + + free_hoid(backoff->begin); + free_hoid(backoff->end); + kfree(backoff); +} + +/* + * Within a specific spgid, backoffs are managed by ->begin hoid. + */ +DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, + RB_BYVAL, spg_node); + +static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root, + const struct ceph_hobject_id *hoid) +{ + struct rb_node *n = root->rb_node; + + while (n) { + struct ceph_osd_backoff *cur = + rb_entry(n, struct ceph_osd_backoff, spg_node); + int cmp; + + cmp = hoid_compare(hoid, cur->begin); + if (cmp < 0) { + n = n->rb_left; + } else if (cmp > 0) { + if (hoid_compare(hoid, cur->end) < 0) + return cur; + + n = n->rb_right; + } else { + return cur; + } + } + + return NULL; +} + +/* + * Each backoff has a unique id within its OSD session. 
+ */ +DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) + +static void clear_backoffs(struct ceph_osd *osd) +{ + while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) { + struct ceph_spg_mapping *spg = + rb_entry(rb_first(&osd->o_backoff_mappings), + struct ceph_spg_mapping, node); + + while (!RB_EMPTY_ROOT(&spg->backoffs)) { + struct ceph_osd_backoff *backoff = + rb_entry(rb_first(&spg->backoffs), + struct ceph_osd_backoff, spg_node); + + erase_backoff(&spg->backoffs, backoff); + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); + free_backoff(backoff); + } + erase_spg_mapping(&osd->o_backoff_mappings, spg); + free_spg_mapping(spg); + } +} + +/* + * Set up a temporary, non-owning view into @t. + */ +static void hoid_fill_from_target(struct ceph_hobject_id *hoid, + const struct ceph_osd_request_target *t) +{ + hoid->key = NULL; + hoid->key_len = 0; + hoid->oid = t->target_oid.name; + hoid->oid_len = t->target_oid.name_len; + hoid->snapid = CEPH_NOSNAP; + hoid->hash = t->pgid.seed; + hoid->is_max = false; + if (t->target_oloc.pool_ns) { + hoid->nspace = t->target_oloc.pool_ns->str; + hoid->nspace_len = t->target_oloc.pool_ns->len; + } else { + hoid->nspace = NULL; + hoid->nspace_len = 0; + } + hoid->pool = t->target_oloc.pool; + ceph_hoid_build_hash_cache(hoid); +} + +static bool should_plug_request(struct ceph_osd_request *req) +{ + struct ceph_osd *osd = req->r_osd; + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct ceph_hobject_id hoid; + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid); + if (!spg) + return false; + + hoid_fill_from_target(&hoid, &req->r_t); + backoff = lookup_containing_backoff(&spg->backoffs, &hoid); + if (!backoff) + return false; + + dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n", + __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool, + backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id); + return true; +} + static void setup_request_data(struct ceph_osd_request *req, struct ceph_msg *msg) { @@ -1483,7 +1835,37 @@ static void setup_request_data(struct ceph_osd_request *req, WARN_ON(data_len != msg->data_length); } -static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +static void encode_pgid(void **p, const struct ceph_pg *pgid) +{ + ceph_encode_8(p, 1); + ceph_encode_64(p, pgid->pool); + ceph_encode_32(p, pgid->seed); + ceph_encode_32(p, -1); /* preferred */ +} + +static void encode_spgid(void **p, const struct ceph_spg *spgid) +{ + ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1); + encode_pgid(p, &spgid->pgid); + ceph_encode_8(p, spgid->shard); +} + +static void encode_oloc(void **p, void *end, + const struct ceph_object_locator *oloc) +{ + ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc)); + ceph_encode_64(p, oloc->pool); + ceph_encode_32(p, -1); /* preferred */ + ceph_encode_32(p, 0); /* key len */ + if (oloc->pool_ns) + ceph_encode_string(p, end, oloc->pool_ns->str, + oloc->pool_ns->len); + else + ceph_encode_32(p, 0); +} + +static void encode_request_partial(struct ceph_osd_request *req, + struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front_alloc_len; @@ -1500,38 +1882,27 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) setup_request_data(req, msg); - ceph_encode_32(&p, 1); /* client_inc, always 1 */ + encode_spgid(&p, &req->r_t.spgid); /* actual spg */ + ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ ceph_encode_32(&p, 
req->r_osdc->osdmap->epoch); ceph_encode_32(&p, req->r_flags); - ceph_encode_timespec(p, &req->r_mtime); - p += sizeof(struct ceph_timespec); - /* reassert_version */ - memset(p, 0, sizeof(struct ceph_eversion)); - p += sizeof(struct ceph_eversion); - - /* oloc */ - ceph_start_encoding(&p, 5, 4, - ceph_oloc_encoding_size(&req->r_t.target_oloc)); - ceph_encode_64(&p, req->r_t.target_oloc.pool); - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - if (req->r_t.target_oloc.pool_ns) - ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, - req->r_t.target_oloc.pool_ns->len); - else - ceph_encode_32(&p, 0); + /* reqid */ + ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid)); + memset(p, 0, sizeof(struct ceph_osd_reqid)); + p += sizeof(struct ceph_osd_reqid); - /* pgid */ - ceph_encode_8(&p, 1); - ceph_encode_64(&p, req->r_t.pgid.pool); - ceph_encode_32(&p, req->r_t.pgid.seed); - ceph_encode_32(&p, -1); /* preferred */ + /* trace */ + memset(p, 0, sizeof(struct ceph_blkin_trace_info)); + p += sizeof(struct ceph_blkin_trace_info); + + ceph_encode_32(&p, 0); /* client_inc, always 0 */ + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); - /* oid */ - ceph_encode_32(&p, req->r_t.target_oid.name_len); - memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); - p += req->r_t.target_oid.name_len; + encode_oloc(&p, end, &req->r_t.target_oloc); + ceph_encode_string(&p, end, req->r_t.target_oid.name, + req->r_t.target_oid.name_len); /* ops, can imply data */ ceph_encode_16(&p, req->r_num_ops); @@ -1552,10 +1923,11 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) } ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + BUG_ON(p > end - 8); /* space for features */ - BUG_ON(p > end); + msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ + /* front_len is finalized in encode_request_finish() */ msg->front.iov_len = p - msg->front.iov_base; - msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.data_len = cpu_to_le32(data_len); /* @@ -1565,9 +1937,100 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) */ msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, - req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, - msg->front.iov_len, data_len); + dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg, + req->r_t.target_oid.name, req->r_t.target_oid.name_len); +} + +static void encode_request_finish(struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const partial_end = p + msg->front.iov_len; + void *const end = p + msg->front_alloc_len; + + if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { + /* luminous OSD -- encode features and be done */ + p = partial_end; + ceph_encode_64(&p, msg->con->peer_features); + } else { + struct { + char spgid[CEPH_ENCODING_START_BLK_LEN + + CEPH_PGID_ENCODING_LEN + 1]; + __le32 hash; + __le32 epoch; + __le32 flags; + char reqid[CEPH_ENCODING_START_BLK_LEN + + sizeof(struct ceph_osd_reqid)]; + char trace[sizeof(struct ceph_blkin_trace_info)]; + __le32 client_inc; + struct ceph_timespec mtime; + } __packed head; + struct ceph_pg pgid; + void *oloc, *oid, *tail; + int oloc_len, oid_len, tail_len; + int len; + + /* + * Pre-luminous OSD -- reencode v8 into v4 using @head + * as a temporary buffer. 
Encode the raw PG; the rest + * is just a matter of moving oloc, oid and tail blobs + * around. + */ + memcpy(&head, p, sizeof(head)); + p += sizeof(head); + + oloc = p; + p += CEPH_ENCODING_START_BLK_LEN; + pgid.pool = ceph_decode_64(&p); + p += 4 + 4; /* preferred, key len */ + len = ceph_decode_32(&p); + p += len; /* nspace */ + oloc_len = p - oloc; + + oid = p; + len = ceph_decode_32(&p); + p += len; + oid_len = p - oid; + + tail = p; + tail_len = partial_end - p; + + p = msg->front.iov_base; + ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); + ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch)); + ceph_encode_copy(&p, &head.flags, sizeof(head.flags)); + ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime)); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); + + BUG_ON(p >= oloc); + memmove(p, oloc, oloc_len); + p += oloc_len; + + pgid.seed = le32_to_cpu(head.hash); + encode_pgid(&p, &pgid); /* raw pg */ + + BUG_ON(p >= oid); + memmove(p, oid, oid_len); + p += oid_len; + + /* tail -- ops, snapid, snapc, retry_attempt */ + BUG_ON(p >= tail); + memmove(p, tail, tail_len); + p += tail_len; + + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + } + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg, + le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), + le16_to_cpu(msg->hdr.version)); } /* @@ -1580,6 +2043,10 @@ static void send_request(struct ceph_osd_request *req) verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); + /* backoff? */ + if (should_plug_request(req)) + return; + /* * We may have a previously queued request message hanging * around. Cancel it to avoid corrupting the msgr. 
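The send_request() hunk above begins with the new should_plug_request() check: a request is held back when its target object falls inside a backoff range installed for the same spg. Below is a minimal user-space sketch of that containment decision, assuming a plain sorted array in place of the kernel's rbtree of struct ceph_osd_backoff, and a single integer key in place of the hobject_t compared with hoid_compare(); struct backoff_range, find_containing() and the numeric values are illustrative, not symbols from this commit.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct backoff_range {
	uint64_t begin;		/* inclusive, like backoff->begin */
	uint64_t end;		/* exclusive, like backoff->end */
	uint64_t id;
};

/*
 * Return the range containing @key, or NULL -- the same compare-with-begin,
 * then compare-with-end idea as lookup_containing_backoff(), minus the
 * rbtree walk.
 */
static const struct backoff_range *
find_containing(const struct backoff_range *ranges, size_t n, uint64_t key)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (key >= ranges[i].begin && key < ranges[i].end)
			return &ranges[i];
	}
	return NULL;
}

int main(void)
{
	const struct backoff_range ranges[] = {
		{ .begin = 100, .end = 200, .id = 1 },
		{ .begin = 500, .end = 800, .id = 2 },
	};
	const uint64_t keys[] = { 50, 150, 600, 800 };
	size_t i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		const struct backoff_range *b =
			find_containing(ranges, 2, keys[i]);

		if (b)
			printf("key %llu: plugged by backoff %llu\n",
			       (unsigned long long)keys[i],
			       (unsigned long long)b->id);
		else
			printf("key %llu: send\n",
			       (unsigned long long)keys[i]);
	}
	return 0;
}

As in the kernel code, an end-exclusive range means key 800 above is sent rather than plugged.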
@@ -1593,11 +2060,13 @@ static void send_request(struct ceph_osd_request *req) else WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); - encode_request(req, req->r_request); + encode_request_partial(req, req->r_request); - dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n", __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, - req->r_t.osd, req->r_flags, req->r_attempts); + req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed, + req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags, + req->r_attempts); req->r_t.paused = false; req->r_stamp = jiffies; @@ -1645,7 +2114,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, NULL, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -1737,13 +2206,12 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked) static void finish_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; - struct ceph_osd *osd = req->r_osd; - verify_osd_locked(osd); + WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); - unlink_request(osd, req); + if (req->r_osd) + unlink_request(req->r_osd, req); atomic_dec(&osdc->num_requests); /* @@ -2441,7 +2909,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; - calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); + calc_target(osdc, &lreq->t, NULL, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3059,7 +3527,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); + ct_res = calc_target(osdc, &lreq->t, NULL, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3117,6 +3585,7 @@ static void scan_requests(struct ceph_osd *osd, list_add_tail(&lreq->scan_item, need_resend_linger); break; case CALC_TARGET_POOL_DNE: + list_del_init(&lreq->scan_item); check_linger_pool_dne(lreq); break; } @@ -3130,8 +3599,8 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, - &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, + false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3229,8 +3698,25 @@ static void kick_requests(struct ceph_osd_client *osdc, struct list_head *need_resend_linger) { struct ceph_osd_linger_request *lreq, *nlreq; + enum calc_target_result ct_res; struct rb_node *n; + /* make sure need_resend targets reflect latest map */ + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + n = rb_next(n); + + if (req->r_t.epoch < osdc->osdmap->epoch) { + ct_res = calc_target(osdc, &req->r_t, NULL, false); + if (ct_res == CALC_TARGET_POOL_DNE) { + erase_request(need_resend, req); + check_pool_dne(req); + } + } + } + for 
(n = rb_first(need_resend); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -3239,8 +3725,6 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); erase_request(need_resend, req); /* before link_request() */ - WARN_ON(req->r_osd); - calc_target(osdc, &req->r_t, NULL, false); osd = lookup_create_osd(osdc, req->r_t.osd, true); link_request(osd, req); if (!req->r_linger) { @@ -3383,6 +3867,8 @@ static void kick_osd_requests(struct ceph_osd *osd) { struct rb_node *n; + clear_backoffs(osd); + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -3428,6 +3914,261 @@ out_unlock: up_write(&osdc->lock); } +struct MOSDBackoff { + struct ceph_spg spgid; + u32 map_epoch; + u8 op; + u64 id; + struct ceph_hobject_id *begin; + struct ceph_hobject_id *end; +}; + +static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + u8 struct_v; + u32 struct_len; + int ret; + + ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len); + if (ret) + return ret; + + ret = ceph_decode_pgid(&p, end, &m->spgid.pgid); + if (ret) + return ret; + + ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval); + ceph_decode_32_safe(&p, end, m->map_epoch, e_inval); + ceph_decode_8_safe(&p, end, m->op, e_inval); + ceph_decode_64_safe(&p, end, m->id, e_inval); + + m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO); + if (!m->begin) + return -ENOMEM; + + ret = decode_hoid(&p, end, m->begin); + if (ret) { + free_hoid(m->begin); + return ret; + } + + m->end = kzalloc(sizeof(*m->end), GFP_NOIO); + if (!m->end) { + free_hoid(m->begin); + return -ENOMEM; + } + + ret = decode_hoid(&p, end, m->end); + if (ret) { + free_hoid(m->begin); + free_hoid(m->end); + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static struct ceph_msg *create_backoff_message( + const struct ceph_osd_backoff *backoff, + u32 map_epoch) +{ + struct ceph_msg *msg; + void *p, *end; + int msg_size; + + msg_size = CEPH_ENCODING_START_BLK_LEN + + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ + msg_size += 4 + 1 + 8; /* map_epoch, op, id */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + hoid_encoding_size(backoff->begin); + msg_size += CEPH_ENCODING_START_BLK_LEN + + hoid_encoding_size(backoff->end); + + msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true); + if (!msg) + return NULL; + + p = msg->front.iov_base; + end = p + msg->front_alloc_len; + + encode_spgid(&p, &backoff->spgid); + ceph_encode_32(&p, map_epoch); + ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK); + ceph_encode_64(&p, backoff->id); + encode_hoid(&p, end, backoff->begin); + encode_hoid(&p, end, backoff->end); + BUG_ON(p != end); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + return msg; +} + +static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m) +{ + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct ceph_msg *msg; + + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid); + if (!spg) { + spg = alloc_spg_mapping(); + if (!spg) { + pr_err("%s failed to allocate spg\n", __func__); + return; + } + spg->spgid = m->spgid; /* struct */ + 
insert_spg_mapping(&osd->o_backoff_mappings, spg); + } + + backoff = alloc_backoff(); + if (!backoff) { + pr_err("%s failed to allocate backoff\n", __func__); + return; + } + backoff->spgid = m->spgid; /* struct */ + backoff->id = m->id; + backoff->begin = m->begin; + m->begin = NULL; /* backoff now owns this */ + backoff->end = m->end; + m->end = NULL; /* ditto */ + + insert_backoff(&spg->backoffs, backoff); + insert_backoff_by_id(&osd->o_backoffs_by_id, backoff); + + /* + * Ack with original backoff's epoch so that the OSD can + * discard this if there was a PG split. + */ + msg = create_backoff_message(backoff, m->map_epoch); + if (!msg) { + pr_err("%s failed to allocate msg\n", __func__); + return; + } + ceph_con_send(&osd->o_con, msg); +} + +static bool target_contained_by(const struct ceph_osd_request_target *t, + const struct ceph_hobject_id *begin, + const struct ceph_hobject_id *end) +{ + struct ceph_hobject_id hoid; + int cmp; + + hoid_fill_from_target(&hoid, t); + cmp = hoid_compare(&hoid, begin); + return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0); +} + +static void handle_backoff_unblock(struct ceph_osd *osd, + const struct MOSDBackoff *m) +{ + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct rb_node *n; + + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); + + backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id); + if (!backoff) { + pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n", + __func__, osd->o_osd, m->spgid.pgid.pool, + m->spgid.pgid.seed, m->spgid.shard, m->id); + return; + } + + if (hoid_compare(backoff->begin, m->begin) && + hoid_compare(backoff->end, m->end)) { + pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n", + __func__, osd->o_osd, m->spgid.pgid.pool, + m->spgid.pgid.seed, m->spgid.shard, m->id); + /* unblock it anyway... */ + } + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid); + BUG_ON(!spg); + + erase_backoff(&spg->backoffs, backoff); + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); + free_backoff(backoff); + + if (RB_EMPTY_ROOT(&spg->backoffs)) { + erase_spg_mapping(&osd->o_backoff_mappings, spg); + free_spg_mapping(spg); + } + + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) { + /* + * Match against @m, not @backoff -- the PG may + * have split on the OSD. + */ + if (target_contained_by(&req->r_t, m->begin, m->end)) { + /* + * If no other installed backoff applies, + * resend. 
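The target_contained_by() helper above treats an OSD backoff as the half-open range [begin, end): a request target is blocked when its hobject compares equal to begin, or sorts strictly between begin and end. A minimal userspace sketch of that comparison, using plain integers in place of ceph_hobject_id and a hypothetical hoid_compare() stand-in:

#include <stdio.h>

/* stand-in for hoid_compare(): returns <0, 0, >0 like memcmp() */
static int hoid_compare(int a, int b)
{
        return (a > b) - (a < b);
}

/* blocked iff hoid falls in the half-open range [begin, end) */
static int contained_by(int hoid, int begin, int end)
{
        int cmp = hoid_compare(hoid, begin);

        return !cmp || (cmp > 0 && hoid_compare(hoid, end) < 0);
}

int main(void)
{
        printf("%d %d %d %d\n",
               contained_by(5, 5, 9),   /* 1: equal to begin */
               contained_by(7, 5, 9),   /* 1: strictly inside */
               contained_by(9, 5, 9),   /* 0: end is exclusive */
               contained_by(3, 5, 9));  /* 0: before begin */
        return 0;
}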
+ */ + send_request(req); + } + } + } +} + +static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + struct MOSDBackoff m; + int ret; + + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + up_read(&osdc->lock); + return; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); + + mutex_lock(&osd->lock); + ret = decode_MOSDBackoff(msg, &m); + if (ret) { + pr_err("failed to decode MOSDBackoff: %d\n", ret); + ceph_msg_dump(msg); + goto out_unlock; + } + + switch (m.op) { + case CEPH_OSD_BACKOFF_OP_BLOCK: + handle_backoff_block(osd, &m); + break; + case CEPH_OSD_BACKOFF_OP_UNBLOCK: + handle_backoff_unblock(osd, &m); + break; + default: + pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op); + } + + free_hoid(m.begin); + free_hoid(m.end); + +out_unlock: + mutex_unlock(&osd->lock); + up_read(&osdc->lock); +} + /* * Process osd watch notifications */ @@ -4365,6 +5106,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_OSD_OPREPLY: handle_reply(osd, msg); break; + case CEPH_MSG_OSD_BACKOFF: + handle_backoff(osd, msg); + break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); break; @@ -4487,6 +5231,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: + case CEPH_MSG_OSD_BACKOFF: case CEPH_MSG_WATCH_NOTIFY: return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: @@ -4571,6 +5316,14 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static void osd_reencode_message(struct ceph_msg *msg) +{ + int type = le16_to_cpu(msg->hdr.type); + + if (type == CEPH_MSG_OSD_OP) + encode_request_finish(msg); +} + static int osd_sign_message(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; @@ -4595,6 +5348,7 @@ static const struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, + .reencode_message = osd_reencode_message, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .fault = osd_fault, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 55e3a477f92d..f358d0bfa76b 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -11,7 +11,7 @@ #include <linux/crush/hash.h> #include <linux/crush/mapper.h> -char *ceph_osdmap_state_str(char *str, int len, int state) +char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) return str; @@ -138,19 +138,179 @@ bad: return -EINVAL; } -static int skip_name_map(void **p, void *end) +static struct crush_choose_arg_map *alloc_choose_arg_map(void) { - int len; - ceph_decode_32_safe(p, end, len ,bad); - while (len--) { - int strlen; - *p += sizeof(u32); - ceph_decode_32_safe(p, end, strlen, bad); - *p += strlen; + struct crush_choose_arg_map *arg_map; + + arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); + if (!arg_map) + return NULL; + + RB_CLEAR_NODE(&arg_map->node); + return arg_map; } - return 0; -bad: - return -EINVAL; + +static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) +{ + if (arg_map) { + int i, j; + + WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); + + for (i = 0; i < arg_map->size; i++) { + struct crush_choose_arg *arg = &arg_map->args[i]; + + for (j = 0; j < arg->weight_set_size; j++) + kfree(arg->weight_set[j].weights); + kfree(arg->weight_set); + 
kfree(arg->ids); + } + kfree(arg_map->args); + kfree(arg_map); + } +} + +DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, + node); + +void clear_choose_args(struct crush_map *c) +{ + while (!RB_EMPTY_ROOT(&c->choose_args)) { + struct crush_choose_arg_map *arg_map = + rb_entry(rb_first(&c->choose_args), + struct crush_choose_arg_map, node); + + erase_choose_arg_map(&c->choose_args, arg_map); + free_choose_arg_map(arg_map); + } +} + +static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) +{ + u32 *a = NULL; + u32 len; + int ret; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len) { + u32 i; + + a = kmalloc_array(len, sizeof(u32), GFP_NOIO); + if (!a) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_need(p, end, len * sizeof(u32), e_inval); + for (i = 0; i < len; i++) + a[i] = ceph_decode_32(p); + } + + *plen = len; + return a; + +e_inval: + ret = -EINVAL; +fail: + kfree(a); + return ERR_PTR(ret); +} + +/* + * Assumes @arg is zero-initialized. + */ +static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) +{ + int ret; + + ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); + if (arg->weight_set_size) { + u32 i; + + arg->weight_set = kmalloc_array(arg->weight_set_size, + sizeof(*arg->weight_set), + GFP_NOIO); + if (!arg->weight_set) + return -ENOMEM; + + for (i = 0; i < arg->weight_set_size; i++) { + struct crush_weight_set *w = &arg->weight_set[i]; + + w->weights = decode_array_32_alloc(p, end, &w->size); + if (IS_ERR(w->weights)) { + ret = PTR_ERR(w->weights); + w->weights = NULL; + return ret; + } + } + } + + arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); + if (IS_ERR(arg->ids)) { + ret = PTR_ERR(arg->ids); + arg->ids = NULL; + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_choose_args(void **p, void *end, struct crush_map *c) +{ + struct crush_choose_arg_map *arg_map = NULL; + u32 num_choose_arg_maps, num_buckets; + int ret; + + ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); + while (num_choose_arg_maps--) { + arg_map = alloc_choose_arg_map(); + if (!arg_map) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_64_safe(p, end, arg_map->choose_args_index, + e_inval); + arg_map->size = c->max_buckets; + arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), + GFP_NOIO); + if (!arg_map->args) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_32_safe(p, end, num_buckets, e_inval); + while (num_buckets--) { + struct crush_choose_arg *arg; + u32 bucket_index; + + ceph_decode_32_safe(p, end, bucket_index, e_inval); + if (bucket_index >= arg_map->size) + goto e_inval; + + arg = &arg_map->args[bucket_index]; + ret = decode_choose_arg(p, end, arg); + if (ret) + goto fail; + + if (arg->ids_size && + arg->ids_size != c->buckets[bucket_index]->size) + goto e_inval; + } + + insert_choose_arg_map(&c->choose_args, arg_map); + } + + return 0; + +e_inval: + ret = -EINVAL; +fail: + free_choose_arg_map(arg_map); + return ret; } static void crush_finalize(struct crush_map *c) @@ -182,12 +342,11 @@ static void crush_finalize(struct crush_map *c) static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; - int err = -EINVAL; + int err; int i, j; void **p = &pbyval; void *start = pbyval; u32 magic; - u32 num_name_maps; dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -195,6 +354,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + c->choose_args = RB_ROOT; + /* 
set tunables to default values */ c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; @@ -250,7 +411,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) size = sizeof(struct crush_bucket_straw2); break; default: - err = -EINVAL; goto bad; } BUG_ON(size == 0); @@ -282,31 +442,31 @@ static struct crush_map *crush_decode(void *pbyval, void *end) err = crush_decode_uniform_bucket(p, end, (struct crush_bucket_uniform *)b); if (err < 0) - goto bad; + goto fail; break; case CRUSH_BUCKET_LIST: err = crush_decode_list_bucket(p, end, (struct crush_bucket_list *)b); if (err < 0) - goto bad; + goto fail; break; case CRUSH_BUCKET_TREE: err = crush_decode_tree_bucket(p, end, (struct crush_bucket_tree *)b); if (err < 0) - goto bad; + goto fail; break; case CRUSH_BUCKET_STRAW: err = crush_decode_straw_bucket(p, end, (struct crush_bucket_straw *)b); if (err < 0) - goto bad; + goto fail; break; case CRUSH_BUCKET_STRAW2: err = crush_decode_straw2_bucket(p, end, (struct crush_bucket_straw2 *)b); if (err < 0) - goto bad; + goto fail; break; } } @@ -317,7 +477,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) u32 yes; struct crush_rule *r; - err = -EINVAL; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", @@ -332,7 +491,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) /* len */ ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 - err = -EINVAL; if (yes > (ULONG_MAX - sizeof(*r)) / sizeof(struct crush_rule_step)) goto bad; @@ -353,12 +511,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } } - /* ignore trailing name maps. */ - for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { - err = skip_name_map(p, end); - if (err < 0) - goto done; - } + ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ + ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ ceph_decode_need(p, end, 3*sizeof(u32), done); @@ -391,6 +546,21 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + if (*p != end) { + /* class_map */ + ceph_decode_skip_map(p, end, 32, 32, bad); + /* class_name */ + ceph_decode_skip_map(p, end, 32, string, bad); + /* class_bucket */ + ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); + } + + if (*p != end) { + err = decode_choose_args(p, end, c); + if (err) + goto fail; + } + done: crush_finalize(c); dout("crush_decode success\n"); @@ -398,10 +568,14 @@ done: badmem: err = -ENOMEM; -bad: +fail: dout("crush_decode fail %d\n", err); crush_destroy(c); return ERR_PTR(err); + +bad: + err = -EINVAL; + goto fail; } int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) @@ -418,75 +592,49 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) return 0; } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int __insert_pg_mapping(struct ceph_pg_mapping *new, - struct rb_root *root) +int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_mapping *pg = NULL; - int c; + int ret; - dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); - while (*p) { - parent = *p; - pg = rb_entry(parent, struct ceph_pg_mapping, node); - 
c = ceph_pg_compare(&new->pgid, &pg->pgid); - if (c < 0) - p = &(*p)->rb_left; - else if (c > 0) - p = &(*p)->rb_right; - else - return -EEXIST; - } + ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); + if (ret) + return ret; + + if (lhs->shard < rhs->shard) + return -1; + if (lhs->shard > rhs->shard) + return 1; - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); return 0; } -static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, - struct ceph_pg pgid) +static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) { - struct rb_node *n = root->rb_node; struct ceph_pg_mapping *pg; - int c; - while (n) { - pg = rb_entry(n, struct ceph_pg_mapping, node); - c = ceph_pg_compare(&pgid, &pg->pgid); - if (c < 0) { - n = n->rb_left; - } else if (c > 0) { - n = n->rb_right; - } else { - dout("__lookup_pg_mapping %lld.%x got %p\n", - pgid.pool, pgid.seed, pg); - return pg; - } - } - return NULL; + pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); + if (!pg) + return NULL; + + RB_CLEAR_NODE(&pg->node); + return pg; } -static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) +static void free_pg_mapping(struct ceph_pg_mapping *pg) { - struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); + WARN_ON(!RB_EMPTY_NODE(&pg->node)); - if (pg) { - dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, - pg); - rb_erase(&pg->node, root); - kfree(pg); - return 0; - } - dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); - return -ENOENT; + kfree(pg); } /* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ +DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, + RB_BYPTR, const struct ceph_pg *, node) + +/* * rbtree of pg pool info */ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) @@ -682,11 +830,48 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } + /* + * last_force_op_resend_preluminous, will be overridden if the + * map was encoded with RESEND_ON_SPLIT + */ if (ev >= 15) pi->last_force_request_resend = ceph_decode_32(p); else pi->last_force_request_resend = 0; + if (ev >= 16) + *p += 4; /* skip min_read_recency_for_promote */ + + if (ev >= 17) + *p += 8; /* skip expected_num_objects */ + + if (ev >= 19) + *p += 4; /* skip cache_target_dirty_high_ratio_micro */ + + if (ev >= 20) + *p += 4; /* skip min_write_recency_for_promote */ + + if (ev >= 21) + *p += 1; /* skip use_gmt_hitset */ + + if (ev >= 22) + *p += 1; /* skip fast_read */ + + if (ev >= 23) { + *p += 4; /* skip hit_set_grade_decay_rate */ + *p += 4; /* skip hit_set_search_last_n */ + } + + if (ev >= 24) { + /* skip opts */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 25) + pi->last_force_request_resend = ceph_decode_32(p); + /* ignore the rest */ *p = pool_end; @@ -743,6 +928,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; + map->pg_upmap = RB_ROOT; + map->pg_upmap_items = RB_ROOT; mutex_init(&map->crush_workspace_mutex); return map; @@ -757,14 +944,28 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->pg_temp); - kfree(pg); + erase_pg_mapping(&map->pg_temp, pg); + free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->primary_temp)) { struct 
ceph_pg_mapping *pg = rb_entry(rb_first(&map->primary_temp), struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->primary_temp); + erase_pg_mapping(&map->primary_temp, pg); + free_pg_mapping(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_upmap)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap_items), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap_items); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_pools)) { @@ -788,7 +989,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { - u8 *state; + u32 *state; u32 *weight; struct ceph_entity_addr *addr; int i; @@ -964,47 +1165,40 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) return __decode_pools(p, end, map, true); } -static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, - bool incremental) +typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); + +static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, + decode_mapping_fn_t fn, bool incremental) { u32 n; + WARN_ON(!incremental && !fn); + ceph_decode_32_safe(p, end, n, e_inval); while (n--) { + struct ceph_pg_mapping *pg; struct ceph_pg pgid; - u32 len, i; int ret; ret = ceph_decode_pgid(p, end, &pgid); if (ret) return ret; - ceph_decode_32_safe(p, end, len, e_inval); - - ret = __remove_pg_mapping(&map->pg_temp, pgid); - BUG_ON(!incremental && ret != -ENOENT); - - if (!incremental || len > 0) { - struct ceph_pg_mapping *pg; - - ceph_decode_need(p, end, len*sizeof(u32), e_inval); - - if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - return -EINVAL; - - pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); - if (!pg) - return -ENOMEM; + pg = lookup_pg_mapping(mapping_root, &pgid); + if (pg) { + WARN_ON(!incremental); + erase_pg_mapping(mapping_root, pg); + free_pg_mapping(pg); + } - pg->pgid = pgid; - pg->pg_temp.len = len; - for (i = 0; i < len; i++) - pg->pg_temp.osds[i] = ceph_decode_32(p); + if (fn) { + pg = fn(p, end, incremental); + if (IS_ERR(pg)) + return PTR_ERR(pg); - ret = __insert_pg_mapping(pg, &map->pg_temp); - if (ret) { - kfree(pg); - return ret; + if (pg) { + pg->pgid = pgid; /* struct */ + insert_pg_mapping(mapping_root, pg); } } } @@ -1015,69 +1209,77 @@ e_inval: return -EINVAL; } +static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, + bool incremental) +{ + struct ceph_pg_mapping *pg; + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0 && incremental) + return NULL; /* new_pg_temp: [] to remove */ + if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) + return ERR_PTR(-EINVAL); + + ceph_decode_need(p, end, len * sizeof(u32), e_inval); + pg = alloc_pg_mapping(len * sizeof(u32)); + if (!pg) + return ERR_PTR(-ENOMEM); + + pg->pg_temp.len = len; + for (i = 0; i < len; i++) + pg->pg_temp.osds[i] = ceph_decode_32(p); + + return pg; + +e_inval: + return ERR_PTR(-EINVAL); +} + static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_pg_temp(p, end, map, false); + return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, + false); } static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_pg_temp(p, end, map, true); + return decode_pg_mapping(p, end, 
&map->pg_temp, __decode_pg_temp, + true); } -static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, - bool incremental) +static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, + bool incremental) { - u32 n; - - ceph_decode_32_safe(p, end, n, e_inval); - while (n--) { - struct ceph_pg pgid; - u32 osd; - int ret; - - ret = ceph_decode_pgid(p, end, &pgid); - if (ret) - return ret; - - ceph_decode_32_safe(p, end, osd, e_inval); - - ret = __remove_pg_mapping(&map->primary_temp, pgid); - BUG_ON(!incremental && ret != -ENOENT); - - if (!incremental || osd != (u32)-1) { - struct ceph_pg_mapping *pg; - - pg = kzalloc(sizeof(*pg), GFP_NOFS); - if (!pg) - return -ENOMEM; + struct ceph_pg_mapping *pg; + u32 osd; - pg->pgid = pgid; - pg->primary_temp.osd = osd; + ceph_decode_32_safe(p, end, osd, e_inval); + if (osd == (u32)-1 && incremental) + return NULL; /* new_primary_temp: -1 to remove */ - ret = __insert_pg_mapping(pg, &map->primary_temp); - if (ret) { - kfree(pg); - return ret; - } - } - } + pg = alloc_pg_mapping(0); + if (!pg) + return ERR_PTR(-ENOMEM); - return 0; + pg->primary_temp.osd = osd; + return pg; e_inval: - return -EINVAL; + return ERR_PTR(-EINVAL); } static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_primary_temp(p, end, map, false); + return decode_pg_mapping(p, end, &map->primary_temp, + __decode_primary_temp, false); } static int decode_new_primary_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_primary_temp(p, end, map, true); + return decode_pg_mapping(p, end, &map->primary_temp, + __decode_primary_temp, true); } u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) @@ -1168,6 +1370,75 @@ e_inval: return -EINVAL; } +static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, + bool __unused) +{ + return __decode_pg_temp(p, end, false); +} + +static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + false); +} + +static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + true); +} + +static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); +} + +static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, + bool __unused) +{ + struct ceph_pg_mapping *pg; + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) + return ERR_PTR(-EINVAL); + + ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); + pg = alloc_pg_mapping(2 * len * sizeof(u32)); + if (!pg) + return ERR_PTR(-ENOMEM); + + pg->pg_upmap_items.len = len; + for (i = 0; i < len; i++) { + pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); + pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); + } + + return pg; + +e_inval: + return ERR_PTR(-EINVAL); +} + +static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, false); +} + +static int decode_new_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, true); +} + +static int decode_old_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, 
&map->pg_upmap_items, NULL, true); +} + /* * decode a full map. */ @@ -1218,13 +1489,21 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + - map->max_osd*(1 + sizeof(*map->osd_weight) + + map->max_osd*((struct_v >= 5 ? sizeof(u32) : + sizeof(u8)) + + sizeof(*map->osd_weight) + sizeof(*map->osd_addr)), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; - ceph_decode_copy(p, map->osd_state, map->max_osd); + if (struct_v >= 5) { + for (i = 0; i < map->max_osd; i++) + map->osd_state[i] = ceph_decode_32(p); + } else { + for (i = 0; i < map->max_osd; i++) + map->osd_state[i] = ceph_decode_8(p); + } if (ceph_decode_32(p) != map->max_osd) goto e_inval; @@ -1257,9 +1536,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; } else { - /* XXX can this happen? */ - kfree(map->osd_primary_affinity); - map->osd_primary_affinity = NULL; + WARN_ON(map->osd_primary_affinity); } /* crush */ @@ -1268,6 +1545,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; + *p += len; + if (struct_v >= 3) { + /* erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + e_inval); + } + + if (struct_v >= 4) { + err = decode_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_pg_upmap_items(p, end, map); + if (err) + goto bad; + } else { + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); + } + /* ignore the rest */ *p = end; @@ -1314,7 +1611,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_up_client: { osd=6, addr=... } # set osd_state and addr * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ -static int decode_new_up_state_weight(void **p, void *end, +static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_osdmap *map) { void *new_up_client; @@ -1330,7 +1627,7 @@ static int decode_new_up_state_weight(void **p, void *end, new_state = *p; ceph_decode_32_safe(p, end, len, e_inval); - len *= sizeof(u32) + sizeof(u8); + len *= sizeof(u32) + (struct_v >= 5 ? 
sizeof(u32) : sizeof(u8)); ceph_decode_need(p, end, len, e_inval); *p += len; @@ -1366,11 +1663,14 @@ static int decode_new_up_state_weight(void **p, void *end, len = ceph_decode_32(p); while (len--) { s32 osd; - u8 xorstate; + u32 xorstate; int ret; osd = ceph_decode_32(p); - xorstate = ceph_decode_8(p); + if (struct_v >= 5) + xorstate = ceph_decode_32(p); + else + xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; BUG_ON(osd >= map->max_osd); @@ -1504,7 +1804,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, map); + err = decode_new_up_state_weight(p, end, struct_v, map); if (err) goto bad; @@ -1527,6 +1827,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto bad; } + if (struct_v >= 3) { + /* new_erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + e_inval); + /* old_erasure_code_profiles */ + ceph_decode_skip_set(p, end, string, e_inval); + } + + if (struct_v >= 4) { + err = decode_new_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_new_pg_upmap_items(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap_items(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; @@ -1547,12 +1873,13 @@ bad: void ceph_oloc_copy(struct ceph_object_locator *dest, const struct ceph_object_locator *src) { - WARN_ON(!ceph_oloc_empty(dest)); - WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + ceph_oloc_destroy(dest); dest->pool = src->pool; if (src->pool_ns) dest->pool_ns = ceph_get_string(src->pool_ns); + else + dest->pool_ns = NULL; } EXPORT_SYMBOL(ceph_oloc_copy); @@ -1565,14 +1892,15 @@ EXPORT_SYMBOL(ceph_oloc_destroy); void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { - WARN_ON(!ceph_oid_empty(dest)); + ceph_oid_destroy(dest); if (src->name != src->inline_name) { /* very rare, see ceph_object_id definition */ dest->name = kmalloc(src->name_len + 1, GFP_NOIO | __GFP_NOFAIL); + } else { + dest->name = dest->inline_name; } - memcpy(dest->name, src->name, src->name_len + 1); dest->name_len = src->name_len; } @@ -1714,9 +2042,8 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } -static bool is_split(const struct ceph_pg *pgid, - u32 old_pg_num, - u32 new_pg_num) +bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, + u32 new_pg_num) { int old_bits = calc_bits_of(old_pg_num); int old_mask = (1 << old_bits) - 1; @@ -1755,14 +2082,17 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, + bool old_recovery_deletes, + bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || !osds_equal(old_up, new_up) || old_size != new_size || old_min_size != new_min_size || - is_split(pgid, old_pg_num, new_pg_num) || - old_sort_bitwise != new_sort_bitwise; + ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise || + old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) @@ -1885,16 +2215,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * Should only be called with target_oid and target_oloc (as opposed to * base_oid and base_oloc), since tiering isn't taken into account. 
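The decode_new_up_state_weight() change above passes the encoder version down so the per-OSD state fields are read as 32-bit values when struct_v >= 5 and as single bytes otherwise. A minimal sketch of that width-by-version decode over a raw buffer; read_u8()/read_u32() are hypothetical little-endian helpers, not kernel APIs:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t read_u8(const uint8_t **p)
{
        return *(*p)++;
}

static uint32_t read_u32(const uint8_t **p)
{
        uint32_t v;

        memcpy(&v, *p, sizeof(v));      /* assume little-endian input */
        *p += sizeof(v);
        return v;
}

/* field width depends on how the peer encoded the incremental map */
static uint32_t read_xorstate(const uint8_t **p, int struct_v)
{
        return struct_v >= 5 ? read_u32(p) : read_u8(p);
}

int main(void)
{
        const uint8_t old_enc[] = { 0x01 };                   /* u8 state */
        const uint8_t new_enc[] = { 0x01, 0x00, 0x00, 0x00 }; /* u32 state */
        const uint8_t *p;

        p = old_enc;
        printf("v4 xorstate=%u\n", (unsigned)read_xorstate(&p, 4));
        p = new_enc;
        printf("v5 xorstate=%u\n", (unsigned)read_xorstate(&p, 5));
        return 0;
}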
*/ -int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_id *oid, - struct ceph_object_locator *oloc, - struct ceph_pg *raw_pgid) +int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { - struct ceph_pg_pool_info *pi; - - pi = ceph_pg_pool_by_id(osdmap, oloc->pool); - if (!pi) - return -ENOENT; + WARN_ON(pi->id != oloc->pool); if (!oloc->pool_ns) { raw_pgid->pool = oloc->pool; @@ -1926,6 +2252,20 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, } return 0; } + +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) +{ + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); + if (!pi) + return -ENOENT; + + return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); +} EXPORT_SYMBOL(ceph_object_locator_to_pg); /* @@ -1968,25 +2308,69 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, } } +/* + * Magic value used for a "default" fallback choose_args, used if the + * crush_choose_arg_map passed to do_crush() does not exist. If this + * also doesn't exist, fall back to canonical weights. + */ +#define CEPH_DEFAULT_CHOOSE_ARGS -1 + static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max) + const __u32 *weight, int weight_max, + s64 choose_args_index) { + struct crush_choose_arg_map *arg_map; int r; BUG_ON(result_max > CEPH_PG_MAX_SIZE); + arg_map = lookup_choose_arg_map(&map->crush->choose_args, + choose_args_index); + if (!arg_map) + arg_map = lookup_choose_arg_map(&map->crush->choose_args, + CEPH_DEFAULT_CHOOSE_ARGS); + mutex_lock(&map->crush_workspace_mutex); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_workspace); + weight, weight_max, map->crush_workspace, + arg_map ? arg_map->args : NULL); mutex_unlock(&map->crush_workspace_mutex); return r; } +static void remove_nonexistent_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + struct ceph_osds *set) +{ + int i; + + if (ceph_can_shift_osds(pi)) { + int removed = 0; + + /* shift left */ + for (i = 0; i < set->size; i++) { + if (!ceph_osd_exists(osdmap, set->osds[i])) { + removed++; + continue; + } + if (removed) + set->osds[i - removed] = set->osds[i]; + } + set->size -= removed; + } else { + /* set dne devices to NONE */ + for (i = 0; i < set->size; i++) { + if (!ceph_osd_exists(osdmap, set->osds[i])) + set->osds[i] = CRUSH_ITEM_NONE; + } + } +} + /* - * Calculate raw set (CRUSH output) for given PG. The result may - * contain nonexistent OSDs. ->primary is undefined for a raw set. + * Calculate raw set (CRUSH output) for given PG and filter out + * nonexistent OSDs. ->primary is undefined for a raw set. * * Placement seed (CRUSH input) is returned through @ppps. 
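The do_crush() change above looks up a crush_choose_arg_map keyed by the pool id, falls back to the CEPH_DEFAULT_CHOOSE_ARGS (-1) entry if the pool has no dedicated map, and finally passes NULL so CRUSH uses the canonical bucket weights. A compact sketch of that fallback chain; lookup() is a hypothetical stand-in for lookup_choose_arg_map() on the rbtree:

#include <stdio.h>

#define DEFAULT_CHOOSE_ARGS -1

struct arg_map { long long index; const char *args; };

/* hypothetical stand-in for lookup_choose_arg_map() */
static const struct arg_map *lookup(const struct arg_map *maps, int n,
                                    long long index)
{
        for (int i = 0; i < n; i++)
                if (maps[i].index == index)
                        return &maps[i];
        return NULL;
}

static const char *choose_args_for(const struct arg_map *maps, int n,
                                   long long pool)
{
        const struct arg_map *m = lookup(maps, n, pool);

        if (!m)
                m = lookup(maps, n, DEFAULT_CHOOSE_ARGS);
        return m ? m->args : NULL;      /* NULL -> canonical weights */
}

int main(void)
{
        const struct arg_map maps[] = {
                { DEFAULT_CHOOSE_ARGS, "default-args" },
                { 3, "pool3-args" },
        };

        printf("pool 3: %s\n", choose_args_for(maps, 2, 3));
        printf("pool 7: %s\n", choose_args_for(maps, 2, 7));
        return 0;
}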
*/ @@ -2020,7 +2404,7 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, } len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, - osdmap->osd_weight, osdmap->max_osd); + osdmap->osd_weight, osdmap->max_osd, pi->id); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", len, ruleno, pi->id, pi->crush_ruleset, pi->type, @@ -2029,6 +2413,53 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, } raw->size = len; + remove_nonexistent_osds(osdmap, pi, raw); +} + +/* apply pg_upmap[_items] mappings */ +static void apply_upmap(struct ceph_osdmap *osdmap, + const struct ceph_pg *pgid, + struct ceph_osds *raw) +{ + struct ceph_pg_mapping *pg; + int i, j; + + pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid); + if (pg) { + /* make sure targets aren't marked out */ + for (i = 0; i < pg->pg_upmap.len; i++) { + int osd = pg->pg_upmap.osds[i]; + + if (osd != CRUSH_ITEM_NONE && + osd < osdmap->max_osd && + osdmap->osd_weight[osd] == 0) { + /* reject/ignore explicit mapping */ + return; + } + } + for (i = 0; i < pg->pg_upmap.len; i++) + raw->osds[i] = pg->pg_upmap.osds[i]; + raw->size = pg->pg_upmap.len; + /* check and apply pg_upmap_items, if any */ + } + + pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); + if (pg) { + for (i = 0; i < raw->size; i++) { + for (j = 0; j < pg->pg_upmap_items.len; j++) { + int from = pg->pg_upmap_items.from_to[j][0]; + int to = pg->pg_upmap_items.from_to[j][1]; + + if (from == raw->osds[i]) { + if (!(to != CRUSH_ITEM_NONE && + to < osdmap->max_osd && + osdmap->osd_weight[to] == 0)) + raw->osds[i] = to; + break; + } + } + } + } } /* @@ -2151,18 +2582,16 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, */ static void get_temp_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, - const struct ceph_pg *raw_pgid, + const struct ceph_pg *pgid, struct ceph_osds *temp) { - struct ceph_pg pgid; struct ceph_pg_mapping *pg; int i; - raw_pg_to_pg(pi, raw_pgid, &pgid); ceph_osds_init(temp); /* pg_temp? */ - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + pg = lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { @@ -2185,7 +2614,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap, } /* primary_temp? */ - pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + pg = lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) temp->primary = pg->primary_temp.osd; } @@ -2198,43 +2627,75 @@ static void get_temp_osds(struct ceph_osdmap *osdmap, * resend a request. 
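The apply_upmap() function above applies two kinds of explicit overrides: pg_upmap replaces the whole raw set (but is ignored if any mapped target is marked out, i.e. has weight 0), and pg_upmap_items rewrites individual from->to entries in place, again skipping targets that are out. A small sketch of the per-item rewrite with plain arrays; osd_out() is a hypothetical stand-in for the osd_weight[] == 0 check:

#include <stdio.h>

static int osd_out(int osd)
{
        return osd == 4;        /* pretend osd.4 is marked out */
}

/* rewrite raw[] in place: each {from, to} pair replaces the first match */
static void apply_upmap_items(int *raw, int size, const int (*items)[2],
                              int nitems)
{
        for (int i = 0; i < size; i++) {
                for (int j = 0; j < nitems; j++) {
                        if (items[j][0] == raw[i]) {
                                if (!osd_out(items[j][1]))
                                        raw[i] = items[j][1];
                                break;
                        }
                }
        }
}

int main(void)
{
        int raw[] = { 0, 1, 2 };
        const int items[][2] = { { 1, 5 }, { 2, 4 } };  /* 2 -> 4 skipped: out */

        apply_upmap_items(raw, 3, items, 2);
        printf("%d %d %d\n", raw[0], raw[1], raw[2]);   /* 0 5 2 */
        return 0;
}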
*/ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting) { - struct ceph_pg_pool_info *pi; + struct ceph_pg pgid; u32 pps; - pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); - if (!pi) { - ceph_osds_init(up); - ceph_osds_init(acting); - goto out; - } + WARN_ON(pi->id != raw_pgid->pool); + raw_pg_to_pg(pi, raw_pgid, &pgid); pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); + apply_upmap(osdmap, &pgid, up); raw_to_up_osds(osdmap, pi, up); apply_primary_affinity(osdmap, pi, pps, up); - get_temp_osds(osdmap, pi, raw_pgid, acting); + get_temp_osds(osdmap, pi, &pgid, acting); if (!acting->size) { memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); acting->size = up->size; if (acting->primary == -1) acting->primary = up->primary; } -out: WARN_ON(!osds_valid(up) || !osds_valid(acting)); } +bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_spg *spgid) +{ + struct ceph_pg pgid; + struct ceph_osds up, acting; + int i; + + WARN_ON(pi->id != raw_pgid->pool); + raw_pg_to_pg(pi, raw_pgid, &pgid); + + if (ceph_can_shift_osds(pi)) { + spgid->pgid = pgid; /* struct */ + spgid->shard = CEPH_SPG_NOSHARD; + return true; + } + + ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting); + for (i = 0; i < acting.size; i++) { + if (acting.osds[i] == acting.primary) { + spgid->pgid = pgid; /* struct */ + spgid->shard = i; + return true; + } + } + + return false; +} + /* * Return acting primary for given PG, or -1 if none. */ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid) { + struct ceph_pg_pool_info *pi; struct ceph_osds up, acting; - ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); + if (!pi) + return -1; + + ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting); return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); diff --git a/net/compat.c b/net/compat.c index aba929e5250f..6ded6c821d7a 100644 --- a/net/compat.c +++ b/net/compat.c @@ -37,21 +37,16 @@ int get_compat_msghdr(struct msghdr *kmsg, struct sockaddr __user **save_addr, struct iovec **iov) { - compat_uptr_t uaddr, uiov, tmp3; - compat_size_t nr_segs; + struct compat_msghdr msg; ssize_t err; - if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || - __get_user(uaddr, &umsg->msg_name) || - __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || - __get_user(uiov, &umsg->msg_iov) || - __get_user(nr_segs, &umsg->msg_iovlen) || - __get_user(tmp3, &umsg->msg_control) || - __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || - __get_user(kmsg->msg_flags, &umsg->msg_flags)) + if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - if (!uaddr) + kmsg->msg_flags = msg.msg_flags; + kmsg->msg_namelen = msg.msg_namelen; + + if (!msg.msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -59,14 +54,16 @@ int get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(tmp3); + + kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_controllen = msg.msg_controllen; if (save_addr) - *save_addr = compat_ptr(uaddr); + *save_addr = compat_ptr(msg.msg_name); - if (uaddr && kmsg->msg_namelen) { + if (msg.msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = 
move_addr_to_kernel(compat_ptr(uaddr), + err = move_addr_to_kernel(compat_ptr(msg.msg_name), kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -77,13 +74,13 @@ int get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (nr_segs > UIO_MAXIOV) + if (msg.msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; return compat_import_iovec(save_addr ? READ : WRITE, - compat_ptr(uiov), nr_segs, + compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); } @@ -316,15 +313,15 @@ struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval) { struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); - compat_uptr_t ptr; - u16 len; - - if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) || - !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) || - __get_user(len, &fprog32->len) || - __get_user(ptr, &fprog32->filter) || - __put_user(len, &kfprog->len) || - __put_user(compat_ptr(ptr), &kfprog->filter)) + struct compat_sock_fprog f32; + struct sock_fprog f; + + if (copy_from_user(&f32, fprog32, sizeof(*fprog32))) + return NULL; + memset(&f, 0, sizeof(f)); + f.len = f32.len; + f.filter = compat_ptr(f32.filter); + if (copy_to_user(kfprog, &f, sizeof(struct sock_fprog))) return NULL; return kfprog; diff --git a/net/core/datagram.c b/net/core/datagram.c index 6877c43cc92d..8c2f4489ff8f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, int *peeked, int *off, int *err, struct sk_buff **last) { + bool peek_at_off = false; struct sk_buff *skb; - int _off = *off; + int _off = 0; + + if (unlikely(flags & MSG_PEEK && *off >= 0)) { + peek_at_off = true; + _off = *off; + } *last = queue->prev; skb_queue_walk(queue, skb) { if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { + if (peek_at_off && _off >= skb->len && + (_off || skb->peeked)) { _off -= skb->len; continue; } @@ -203,7 +209,7 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket - * @flags: MSG_ flags + * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset @@ -356,7 +362,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); - if (skb == skb_peek(sk_queue)) { + if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) @@ -375,7 +381,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff - * @flags: MSG_ flags + * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. The flags argument must match the one @@ -809,7 +815,7 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * sequenced packet sockets providing the socket receive queue * is only ever holding data ready to receive. * - * Note: when you _don't_ use this routine for this protocol, + * Note: when you *don't* use this routine for this protocol, * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. 
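The __skb_try_recv_from_queue() change above only honours a caller-supplied peek offset when MSG_PEEK is set and the offset is non-negative; otherwise the walk starts at offset 0. A simplified sketch of that skip-by-offset walk (it omits the skb->peeked special case) over an array standing in for the receive queue:

#include <stdio.h>

#define MSG_PEEK 0x2

struct pkt { int id; int len; };

/* return the packet a peek at offset @off lands in, or NULL */
static const struct pkt *peek_at(const struct pkt *q, int n, int flags, int off)
{
        int peek_at_off = (flags & MSG_PEEK) && off >= 0;
        int _off = peek_at_off ? off : 0;
        int i;

        for (i = 0; i < n; i++) {
                if (peek_at_off && _off >= q[i].len) {
                        _off -= q[i].len;       /* skip, keep walking the queue */
                        continue;
                }
                return &q[i];
        }
        return NULL;
}

int main(void)
{
        const struct pkt q[] = { { 1, 100 }, { 2, 50 }, { 3, 200 } };
        const struct pkt *p = peek_at(q, 3, MSG_PEEK, 120);

        if (p)
                printf("peek at offset 120 lands in pkt %d\n", p->id);  /* pkt 2 */
        return 0;
}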
*/ diff --git a/net/core/dev.c b/net/core/dev.c index 7098fba52be1..86b4b0a79e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } @@ -5289,6 +5289,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); @@ -5667,12 +5668,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. */ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device @@ -6765,7 +6767,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) } EXPORT_SYMBOL(dev_change_flags); -static int __dev_set_mtu(struct net_device *dev, int new_mtu) +int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6775,6 +6777,7 @@ static int __dev_set_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL(__dev_set_mtu); /** * dev_set_mtu - Change maximum transfer unit @@ -7383,7 +7386,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -7423,7 +7426,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -7964,7 +7967,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!p) return NULL; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 82fd4c9c4a1b..709a4e6fb447 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -28,6 +28,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); if (error) @@ -262,6 +263,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: + if (dev->addr_len > sizeof(struct sockaddr)) + return -EINVAL; return dev_set_mac_address(dev, &ifr->ifr_hwaddr); case SIOCSIFHWBROADCAST: @@ -424,6 +427,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (copy_from_user(&iwr, arg, sizeof(iwr))) return -EFAULT; + iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + return wext_handle_ioctl(net, &iwr, cmd, arg); } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a0093e1b0235..fdcb1bcd2afa 100644 --- 
a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -400,6 +400,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENOMEM; goto errout; } + refcount_set(&rule->refcnt, 1); rule->fr_net = net; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) @@ -517,8 +518,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, last = r; } - refcount_set(&rule->refcnt, 1); - if (last) list_add_rcu(&rule->list, &last->list); else diff --git a/net/core/filter.c b/net/core/filter.c index c7f737058d89..169974998c76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2248,7 +2248,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) bpf_skb_net_grow(skb, len_diff_abs); bpf_compute_data_end(skb); - return 0; + return ret; } BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, @@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; + bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false); - if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - /* replacing an existing ca */ - tcp_reinit_congestion_control(sk, - inet_csk(sk)->icsk_ca_ops); + ret = tcp_set_congestion_control(sk, name, false, reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -2872,7 +2869,6 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } } - ret = -EINVAL; #endif } else { ret = -EINVAL; @@ -3505,6 +3501,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else @@ -3520,6 +3517,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else + *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e31fc11a8000..d0713627deb6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -347,8 +347,7 @@ out_entries: static void neigh_get_hash_rnd(u32 *x) { - get_random_bytes(x, sizeof(*x)); - *x |= 1; + *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d3408a693166..912731bed7b7 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -277,7 +277,7 @@ static void zap_completion_queue(void) struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { - refcount_inc(&skb->users); + refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); @@ -666,7 +666,7 @@ int netpoll_setup(struct netpoll *np) int err; rtnl_lock(); - if (np->dev_name) { + if (np->dev_name[0]) { struct net *net = current->nsproxy->net_ns; ndev = __dev_get_by_name(net, np->dev_name); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d1ba90980be1..9201e3621351 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2031,7 +2031,8 @@ static int do_setlink(const struct sk_buff *skb, struct sockaddr *sa; int len; - len = sizeof(sa_family_t) + dev->addr_len; + len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, + sizeof(*sa)); sa = 
kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; @@ -4241,6 +4242,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi switch (event) { case NETDEV_REBOOT: + case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b11341ed69a..e07556606284 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1363,18 +1363,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, EXPORT_SYMBOL(skb_copy_expand); /** - * skb_pad - zero pad the tail of an skb + * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad + * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return error in out of memory cases. The skb is freed on error. + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. */ -int skb_pad(struct sk_buff *skb, int pad) +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; @@ -1403,10 +1405,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; free_skb: - kfree_skb(skb); + if (free_on_error) + kfree_skb(skb); return err; } -EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer @@ -4747,7 +4750,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, gfp_head = gfp_mask; if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_REPEAT; + gfp_head |= __GFP_RETRY_MAYFAIL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_head); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 1704948e6a12..f227f002c73d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1471,9 +1471,12 @@ int dccp_feat_init(struct sock *sk) * singleton values (which always leads to failure). * These settings can still (later) be overridden via sockopts. 
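The skb_pad() -> __skb_pad() split above lets callers that still need the buffer on failure pass free_on_error = false instead of always losing it. A small userspace sketch of that ownership contract, with a heap buffer standing in for the skb and a hypothetical grow() reallocation step that may fail:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct buf { unsigned char *data; size_t len; size_t cap; };

/* hypothetical reallocation step that may fail */
static int grow(struct buf *b, size_t need)
{
        unsigned char *p = realloc(b->data, need);

        if (!p)
                return -ENOMEM;
        b->data = p;
        b->cap = need;
        return 0;
}

/* zero-pad up to @len; on failure the buffer is freed only if @free_on_error */
static int pad_to(struct buf *b, size_t len, bool free_on_error)
{
        if (b->cap < len && grow(b, len) < 0) {
                if (free_on_error) {
                        free(b->data);
                        b->data = NULL;
                }
                return -ENOMEM;
        }
        memset(b->data + b->len, 0, len - b->len);
        b->len = len;
        return 0;
}

int main(void)
{
        struct buf b = { malloc(4), 4, 4 };

        memset(b.data, 0xab, 4);
        if (pad_to(&b, 16, false) == 0)         /* caller keeps ownership on error */
                printf("padded to %zu bytes\n", b.len);
        free(b.data);
        return 0;
}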
*/ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) + if (ccid_get_builtin_ccids(&tx.val, &tx.len)) return -ENOBUFS; + if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { + kfree(tx.val); + return -ENOBUFS; + } if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) diff --git a/net/dccp/input.c b/net/dccp/input.c index 4a05d7876850..fa6be9750bb4 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -126,7 +126,7 @@ static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) static u16 dccp_reset_code_convert(const u8 code) { - const u16 error_code[] = { + static const u16 error_code[] = { [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ [DCCP_RESET_CODE_ABORTED] = ECONNRESET, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f85d901f4e3f..1b202f16531f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -631,6 +631,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index c376af5bfdfb..1b58eac8aad3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -380,6 +380,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9fe25bf63296..b68168fcc06a 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,6 +24,7 @@ #include <net/checksum.h> #include <net/inet_sock.h> +#include <net/inet_common.h> #include <net/sock.h> #include <net/xfrm.h> @@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); +static void dccp_sk_destruct(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = NULL; + inet_sock_destruct(sk); +} + int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); @@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; @@ -201,10 +212,7 @@ void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - /* - * DCCP doesn't use sk_write_queue, just sk_send_head - * for retransmissions - */ + __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; @@ -222,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 56e46090526b..20bc9c56fca0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -509,21 +509,22 @@ static int dsa_cpu_parse(struct dsa_port *port, u32 index, dst->cpu_dp->netdev = ethernet_dev; } + /* Initialize cpu_port_mask now for drv->setup() + * to have access 
to a correct value, just like what + * net/dsa/dsa.c::dsa_switch_setup_one does. + */ + ds->cpu_port_mask |= BIT(index); + tag_protocol = ds->ops->get_tag_protocol(ds); dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); if (IS_ERR(dst->tag_ops)) { dev_warn(ds->dev, "No tagger for this switch\n"); + ds->cpu_port_mask &= ~BIT(index); return PTR_ERR(dst->tag_ops); } dst->rcv = dst->tag_ops->rcv; - /* Initialize cpu_port_mask now for drv->setup() - * to have access to a correct value, just like what - * net/dsa/dsa.c::dsa_switch_setup_one does. - */ - ds->cpu_port_mask |= BIT(index); - return 0; } @@ -576,7 +577,7 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) return err; } - if (!dst->cpu_dp->netdev) { + if (!dst->cpu_dp) { pr_warn("Tree has no master device\n"); return -EINVAL; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index fab41de8e983..fcd90f79458e 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -42,6 +42,10 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len; if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { + /* Let dsa_slave_xmit() free skb */ + if (__skb_put_padto(skb, skb->len + padlen, false)) + return NULL; + nskb = skb; } else { nskb = alloc_skb(NET_IP_ALIGN + skb->len + @@ -56,12 +60,15 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - kfree_skb(skb); - } - /* skb is freed when it fails */ - if (skb_put_padto(nskb, nskb->len + padlen)) - return NULL; + /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free + * skb + */ + if (skb_put_padto(nskb, nskb->len + padlen)) + return NULL; + + consume_skb(skb); + } tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); tag[0] = 0; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b09e56214005..9c7b1d74a5c6 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -40,7 +40,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_network_header(nskb, skb_network_header(skb) - skb->head); skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - kfree_skb(skb); + consume_skb(skb); if (padlen) { skb_put_zero(nskb, padlen); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 4e7bdb213cd0..172d8309f89e 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -314,7 +314,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); - skb_put_padto(skb, ETH_ZLEN + HSR_HLEN); + if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) + return; hsr_forward_skb(skb, master); return; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76c2077c3f5b..2e548eca3489 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1731,6 +1731,13 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif + /* Some igmp sysctl, whose values are always used */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; + return 0; } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 
c4c6e1969ed0..2ae8f54cb321 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1523,9 +1523,17 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { - if (optptr[0] == IPOPT_CIPSO) + switch (optptr[0]) { + case IPOPT_CIPSO: return optptr; - taglen = optptr[1]; + case IPOPT_END: + return NULL; + case IPOPT_NOOP: + taglen = 1; + break; + default: + taglen = optptr[1]; + } optlen -= taglen; optptr += taglen; } diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0cbee0a666ff..df68963dc90a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -258,7 +258,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp_output_udp_encap(x, skb, esp); if (!skb_cloned(skb)) { - if (tailen <= skb_availroom(skb)) { + if (tailen <= skb_tailroom(skb)) { nfrags = 1; trailer = skb; tail = skb_tail_pointer(trailer); @@ -292,8 +292,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * kunmap_atomic(vaddr); - spin_unlock_bh(&x->lock); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -301,6 +299,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb_shinfo(skb)->nr_frags = ++nfrags; pfrag->offset = pfrag->offset + allocsize; + + spin_unlock_bh(&x->lock); + nfrags++; skb->len += tailen; @@ -381,7 +382,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; if (!esp->inplace) { int allocsize; @@ -392,7 +393,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_lock_bh(&x->lock); if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { spin_unlock_bh(&x->lock); - goto error; + goto error_free; } skb_shinfo(skb)->nr_frags = 1; @@ -409,7 +410,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -442,8 +443,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); - kfree(tmp); +error_free: + kfree(tmp); error: return err; } @@ -695,8 +697,10 @@ skip_cow: sg_init_table(sg, nfrags); err = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + kfree(tmp); goto out; + } skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0666016a764..50112324fa5c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -257,7 +257,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); err = esp_output_tail(x, skb, &esp); - if (err < 0) + if (err) return err; secpath_reset(skb); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e678fa892dd..044d2a159a3c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 
NULL); + fib_trie_init(); register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_trie_init(); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 222100103808..ec3a9ce281a6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1083,15 +1083,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg, fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (!fi->fib_metrics) - goto failure; + if (unlikely(!fi->fib_metrics)) { + kfree(fi); + return ERR_PTR(err); + } atomic_set(&fi->fib_metrics->refcnt, 1); - } else + } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; - + } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; @@ -1452,7 +1454,7 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: - if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && + if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && fib_nh->nh_flags & RTNH_F_LINKDOWN) || (fib_nh->nh_flags & RTNH_F_DEAD)) return call_fib_notifiers(dev_net(fib_nh->nh_dev), diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 8e0257d01200..1540db65241a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -450,6 +450,7 @@ out_unlock: out: NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; return pp; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 28f14afd0dd3..caf2f1101d02 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + struct net_device *dev = skb->dev; + struct in_device *in_dev; int len = skb->len; bool dropped = true; + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); + if (!dev) + goto drop; + } + + in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; @@ -2974,12 +2982,6 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } - /* Sysctl initialization */ - net->ipv4.sysctl_igmp_max_memberships = 20; - net->ipv4.sysctl_igmp_max_msf = 10; - /* IGMP reports for link-local multicast groups are enabled by default */ - net->ipv4.sysctl_igmp_llm_reports = 1; - net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7eb252dcecee..e153c40c2436 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -599,6 +599,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; + ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. 
First, check its validity: * some transformers could create wrong frag_list or break existing @@ -614,14 +615,15 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ @@ -711,8 +713,6 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ - ll_rs = LL_RESERVED_SPACE(rt->dst.dev); - /* * Fragment the datagram. */ @@ -965,11 +965,12 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + (skb ? skb->len : fragheaderlen)) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); @@ -1288,6 +1289,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, return -EINVAL; if ((size + skb->len > mtu) && + (skb_queue_len(&sk->sk_write_queue) == 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { if (skb->ip_summed != CHECKSUM_PARTIAL) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bb909f1d7537..06863ea3fc5b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2431,8 +2431,8 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); - if (IS_ERR(mrt)) { - err = PTR_ERR(mrt); + if (!mrt) { + err = -ENOENT; goto errout_free; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 0bc3c3d73e61..9e9d9afd18f7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -268,14 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - arp = arp_hdr(skb); - - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. */ + arp = arp_hdr(skb); e = arpt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2a55a40211cb..622ed2887cd5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -352,13 +352,14 @@ ipt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - ip = ip_hdr(skb); - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. 
*/ + ip = ip_hdr(skb); e = ipt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 7d72decb80f9..efaa04dcc80e 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -117,7 +117,8 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS - proc_remove(c->pde); + if (cn->procdir) + proc_remove(c->pde); #endif return; } @@ -815,6 +816,7 @@ static void clusterip_net_exit(struct net *net) #ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); proc_remove(cn->procdir); + cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); } diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 805c8ddfe860..4bbc273b45e8 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -72,8 +72,7 @@ static const struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT) | - (1 << NF_ARP_FORWARD), + (1 << NF_ARP_OUT), }; static int __init nf_tables_arp_init(void) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c816cd53f7fc..2331de20ca50 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1267,7 +1267,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) @@ -2750,26 +2750,34 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); + else + skb_dst_set(skb, &rt->dst); } if (err) goto errout_free; - skb_dst_set(skb, &rt->dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = rt->rt_table_id; - if (rtm->rtm_flags & RTM_F_FIB_MATCH) + if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + if (!res.fi) { + err = fib_props[res.type].error; + if (!err) + err = -EHOSTUNREACH; + goto errout_free; + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, table_id, rt->rt_type, res.prefix, res.prefixlen, fl4.flowi4_tos, res.fi, 0); - else + } else { err = rt_fill_info(net, dst, src, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); + } if (err < 0) goto errout_free; @@ -2979,8 +2987,7 @@ static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); - get_random_bytes(&net->ipv4.dev_addr_genid, - sizeof(net->ipv4.dev_addr_genid)); + atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); return 0; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0905cf04c2a4..03ad8778c395 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -335,6 +335,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; + treq->txhash = net_tx_rndhash(); req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..a3e91b552edc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int 
do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index dbcc9352a48f..69ee877574d0 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -112,7 +112,8 @@ struct bbr { cwnd_gain:10, /* current gain for setting cwnd */ full_bw_cnt:3, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ - unused_b:6; + has_seen_rtt:1, /* have we seen an RTT sample yet? */ + unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ }; @@ -211,6 +212,35 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) return rate >> BW_SCALE; } +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); +} + /* Pace using current bw estimate and a gain factor. In order to help drive the * network toward lower queues while maintaining high utilization and low * latency, the average pacing rate aims to be slightly (~1%) lower than the @@ -220,12 +250,13 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { + struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 rate = bw; + u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); - rate = bbr_rate_bytes_per_sec(sk, rate, gain); - rate = min_t(u64, rate, sk->sk_max_pacing_rate); - if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } @@ -798,7 +829,6 @@ static void bbr_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 bw; bbr->prior_cwnd = 0; bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ @@ -814,11 +844,8 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ - /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ - bw = (u64)tp->snd_cwnd * BW_UNIT; - do_div(bw, (tp->srtt_us >> 3) ? 
: USEC_PER_MSEC); - sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ - bbr_set_pacing_rate(sk, bw, bbr_high_gain); + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; bbr->round_start = 0; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..421ea1b918da 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0cb09f8..bab7f0493098 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@ -2520,8 +2521,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } @@ -3004,21 +3005,24 @@ void tcp_rearm_rto(struct sock *sk) /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. 
*/ - if (delta_us > 0) - rto = usecs_to_jiffies(delta_us); + rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); } } +/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ +static void tcp_set_xmit_timer(struct sock *sk) +{ + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); +} + /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3180,7 +3184,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3208,7 +3212,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ } if (icsk->icsk_ca_ops->pkts_acked) { @@ -3580,9 +3584,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@ -3647,18 +3648,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. 
*/ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6ec6900eb300..e9252c7df809 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -943,9 +943,9 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, } EXPORT_SYMBOL(tcp_md5_do_lookup); -struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, - const union tcp_md5_addr *addr, - int family, u8 prefixlen) +static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, + const union tcp_md5_addr *addr, + int family, u8 prefixlen) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1722,6 +1722,8 @@ process: */ sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1729,8 +1731,6 @@ process: } if (nsk == sk) { reqsk_put(req); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v4_send_reset(nsk, skb); goto discard_and_relse; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 4e985dea1dd2..b7661a68d498 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2202,9 +2202,10 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) { const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; - if (tp->chrono_type > TCP_CHRONO_UNSPEC) - tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start; + if (old > TCP_CHRONO_UNSPEC) + tp->chrono_stat[old - 1] += now - tp->chrono_start; tp->chrono_start = now; tp->chrono_type = new; } @@ -2376,24 +2377,15 @@ bool tcp_schedule_loss_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + u32 timeout, rto_delta_us; - /* No consecutive loss probes. */ - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { - tcp_rearm_rto(sk); - return false; - } /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (tp->fastopen_rsk) return false; - /* TLP is only scheduled when next timer event is RTO. */ - if (icsk->icsk_pending != ICSK_TIME_RETRANS) - return false; - /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ @@ -2416,14 +2408,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) (rtt + (rtt >> 1) + TCP_DELACK_MAX)); timeout = max_t(u32, timeout, msecs_to_jiffies(10)); - /* If RTO is shorter, just schedule TLP in its place. 
*/ - tlp_time_stamp = tcp_jiffies32 + timeout; - rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; - if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_jiffies32; - if (delta > 0) - timeout = delta; - } + /* If the RTO formula yields an earlier time, then use that time. */ + rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); @@ -3448,6 +3436,10 @@ int tcp_connect(struct sock *sk) int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..e906014890b6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -652,7 +652,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c index 2417f55374c5..6bb9e14c710a 100644 --- a/net/ipv4/tcp_ulp.c +++ b/net/ipv4/tcp_ulp.c @@ -122,14 +122,14 @@ int tcp_set_ulp(struct sock *sk, const char *name) ulp_ops = __tcp_ulp_find_autoload(name); if (!ulp_ops) - err = -ENOENT; - else - err = ulp_ops->init(sk); + return -ENOENT; - if (err) - goto out; + err = ulp_ops->init(sk); + if (err) { + module_put(ulp_ops->owner); + return err; + } icsk->icsk_ulp_ops = ulp_ops; - out: - return err; + return 0; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25294d43e147..38e795e0c4bf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -802,7 +802,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; @@ -1163,34 +1163,32 @@ out: return ret; } -#if BITS_PER_LONG == 64 +#define UDP_SKB_IS_STATELESS 0x80000000 + static void udp_set_dev_scratch(struct sk_buff *skb) { - struct udp_dev_scratch *scratch; + struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); - scratch = (struct udp_dev_scratch *)&skb->dev_scratch; - scratch->truesize = skb->truesize; + scratch->_tsize_state = skb->truesize; +#if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); +#endif + if (likely(!skb->_skb_refdst && !skb_sec_path(skb))) + scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static int udp_skb_truesize(struct sk_buff *skb) { - return ((struct udp_dev_scratch *)&skb->dev_scratch)->truesize; -} -#else -static void udp_set_dev_scratch(struct sk_buff *skb) -{ - skb->dev_scratch = skb->truesize; + return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } -static int udp_skb_truesize(struct sk_buff *skb) +static bool udp_skb_has_head_state(struct sk_buff *skb) { - return skb->dev_scratch; + return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } -#endif /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int 
size, int partial, @@ -1388,6 +1386,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + /* In the more common cases we cleared the head states previously, + * see __udp_queue_rcv_skb(). + */ + if (unlikely(udp_skb_has_head_state(skb))) + skb_release_head_state(skb); consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); @@ -1571,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; @@ -1779,8 +1783,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* clear all pending head states while they are hot in the cache */ - skb_release_head_state(skb); + /* At recvmsg() time we may access skb->dst or skb->sp depending on + * the IP options and the cmsg flags, elsewhere can we clear all + * pending head states while they are hot in the cache + */ + if (likely(IPCB(skb)->opt.optlen == 0 && !skb_sec_path(skb))) + skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { @@ -1802,8 +1810,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) static struct static_key udp_encap_needed __read_mostly; void udp_encap_enable(void) { - if (!static_key_enabled(&udp_encap_needed)) - static_key_slow_inc(&udp_encap_needed); + static_key_enable(&udp_encap_needed); } EXPORT_SYMBOL(udp_encap_enable); @@ -1921,15 +1928,18 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg(&sk->sk_rx_dst, dst); dst_release(old); + return old != dst; } + return false; } +EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. 
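For illustration, the udp_set_dev_scratch()/udp_skb_truesize() hunk above packs the skb truesize and a "stateless" marker into one scratch word so recvmsg() can skip releasing head state it knows is absent. Below is a minimal standalone sketch of that bit-packing idea; the names are invented for the example and this is not the kernel code itself (truesize is assumed to stay below 2^31, as skb->truesize does in practice):

#include <stdbool.h>
#include <stdint.h>

#define SCRATCH_IS_STATELESS 0x80000000u	/* top bit: no dst/secpath attached */

static inline uint32_t scratch_pack(uint32_t truesize, bool stateless)
{
	/* truesize never reaches bit 31, so the flag cannot collide with it */
	return truesize | (stateless ? SCRATCH_IS_STATELESS : 0);
}

static inline uint32_t scratch_truesize(uint32_t scratch)
{
	return scratch & ~SCRATCH_IS_STATELESS;
}

static inline bool scratch_has_head_state(uint32_t scratch)
{
	return !(scratch & SCRATCH_IS_STATELESS);
}
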
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 781250151d40..0932c85b42af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3c46e9513a31..936e9ab4dda5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5556,7 +5556,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9ed35473dcb5..ab64f367d11c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -226,7 +226,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info int tailen = esp->tailen; if (!skb_cloned(skb)) { - if (tailen <= skb_availroom(skb)) { + if (tailen <= skb_tailroom(skb)) { nfrags = 1; trailer = skb; tail = skb_tail_pointer(trailer); @@ -260,8 +260,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info kunmap_atomic(vaddr); - spin_unlock_bh(&x->lock); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -269,6 +267,9 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info skb_shinfo(skb)->nr_frags = ++nfrags; pfrag->offset = pfrag->offset + allocsize; + + spin_unlock_bh(&x->lock); + nfrags++; skb->len += tailen; @@ -345,7 +346,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; if (!esp->inplace) { int allocsize; @@ -356,7 +357,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info spin_lock_bh(&x->lock); if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { spin_unlock_bh(&x->lock); - goto error; + goto error_free; } skb_shinfo(skb)->nr_frags = 1; @@ -373,7 +374,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -406,8 +407,9 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info if (sg != dsg) esp_ssg_unref(x, tmp); - kfree(tmp); +error_free: + kfree(tmp); error: return err; } diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index f02f131f6435..1cf437f75b0b 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -286,7 +286,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); err = esp6_output_tail(x, skb, &esp); - if (err < 0) + if (err) return err; secpath_reset(skb); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 4996d734f1d2..3cec529c6113 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -756,6 +756,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct 
ipv6hdr))) goto drop; + IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5477ba729c36..e1c85bb4eac0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) return fn; } -static void node_free(struct fib6_node *fn) +static void node_free_immediate(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } +static void node_free_rcu(struct rcu_head *head) +{ + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); + + kmem_cache_free(fib6_node_kmem, fn); +} + +static void node_free(struct fib6_node *fn) +{ + call_rcu(&fn->rcu, node_free_rcu); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -601,9 +613,9 @@ insert_above: if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(in); if (ln) - node_free(ln); + node_free_immediate(ln); return ERR_PTR(-ENOMEM); } @@ -786,10 +798,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, goto next_iter; } - if (iter->dst.dev == rt->dst.dev && - iter->rt6i_idev == rt->rt6i_idev && - ipv6_addr_equal(&iter->rt6i_gateway, - &rt->rt6i_gateway)) { + if (rt6_duplicate_nexthop(iter, rt)) { if (rt->rt6i_nsiblings) rt->rt6i_nsiblings = 0; if (!(iter->rt6i_flags & RTF_EXPIRES)) @@ -880,7 +889,7 @@ add: rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -906,7 +915,7 @@ add: return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) @@ -917,6 +926,8 @@ add: } nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { @@ -929,6 +940,8 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { @@ -1017,7 +1030,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Create subtree root node */ sfn = node_alloc(); if (!sfn) - goto st_failure; + goto failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); @@ -1034,12 +1047,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { /* If it is failed, discard just allocated - root, and then (in st_failure) stale node + root, and then (in failure) stale node in main tree. */ - node_free(sfn); + node_free_immediate(sfn); err = PTR_ERR(sn); - goto st_failure; + goto failure; } /* Now link new subtree to main tree */ @@ -1054,7 +1067,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { err = PTR_ERR(sn); - goto st_failure; + goto failure; } } @@ -1095,18 +1108,17 @@ out: atomic_inc(&pn->leaf->rt6i_ref); } #endif - /* Always release dst as dst->__refcnt is guaranteed - * to be taken before entering this function - */ - dst_release_immediate(&rt->dst); + goto failure; } return err; -#ifdef CONFIG_IPV6_SUBTREES - /* Subtree creation failed, probably main tree node - is orphan. If it is, shoot it. +failure: + /* fn->leaf could be NULL if fn is an intermediate node and we + * failed to add the new route to it in both subtree creation + * failure and fib6_add_rt2node() failure case. 
+ * In both cases, fib6_repair_tree() should be called to fix + * fn->leaf. */ -st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); /* Always release dst as dst->__refcnt is guaranteed @@ -1114,7 +1126,6 @@ st_failure: */ dst_release_immediate(&rt->dst); return err; -#endif } /* @@ -1469,8 +1480,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1659,7 +1671,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } @@ -1781,8 +1795,10 @@ static int fib6_age(struct rt6_info *rt, void *arg) } gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) + rt->dst.obsolete = DST_OBSOLETE_KILL; if (atomic_read(&rt->dst.__refcnt) == 1 && - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + rt->dst.obsolete == DST_OBSOLETE_KILL) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if (rt->rt6i_flags & RTF_GATEWAY) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1422d6c08377..2dfe50d8d609 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -673,8 +673,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } @@ -789,8 +787,6 @@ slow_path: frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + hroom + troom, GFP_ATOMIC); if (!frag) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } @@ -1385,11 +1381,12 @@ emsgsize: */ cork->length += length; - if ((((length + (skb ? skb->len : headersize)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + (skb ? skb->len : headersize)) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, transhdrlen, mtu, flags, fl6); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 02d795fe3d7f..a5e466d4e093 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -242,7 +242,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); - sk->sk_destruct = inet_sock_destruct; /* * ... and add it to the refcnt debug socks count * in the new family. 
-acme diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index e9065b8d3af8..a338bbc33cf3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { - u16 offset = sizeof(struct ipv6hdr); + unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; @@ -112,6 +112,8 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) + return -EINVAL; *nexthdr = &exthdr->nexthdr; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0488a24c2a44..2d0e7798c793 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -417,14 +417,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, struct net_device *loopback_dev = dev_net(dev)->loopback_dev; - if (dev != loopback_dev) { - if (idev && idev->dev == dev) { - struct inet6_dev *loopback_idev = - in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; - in6_dev_put(idev); - } + if (idev && idev->dev != loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); } } } @@ -443,7 +440,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (time_after(jiffies, rt->dst.expires)) return true; } else if (rt->dst.from) { - return rt6_check_expired((struct rt6_info *) rt->dst.from); + return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || + rt6_check_expired((struct rt6_info *)rt->dst.from); } return false; } @@ -1292,7 +1290,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + u32 rt_cookie = 0; + + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -1360,8 +1360,14 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) ip6_del_rt(rt); - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { - rt->rt6i_node->fn_sernum = -1; + } else { + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + rcu_read_unlock(); } } } @@ -1378,7 +1384,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, @@ -2351,6 +2358,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; + nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) @@ -2461,6 +2469,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -2513,6 +2522,7 @@ struct rt6_info *rt6_add_dflt_router(const 
struct in6_addr *gwaddr, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), @@ -3036,17 +3046,11 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, struct rt6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; - struct rt6_info *rtnh; int err = -EEXIST; list_for_each_entry(nh, rt6_nh_list, next) { /* check if rt6_info already exists */ - rtnh = nh->rt6_info; - - if (rtnh->dst.dev == rt->dst.dev && - rtnh->rt6i_idev == rt->rt6i_idev && - ipv6_addr_equal(&rtnh->rt6i_gateway, - &rt->rt6i_gateway)) + if (rt6_duplicate_nexthop(nh->rt6_info, rt)) return err; } @@ -3430,14 +3434,6 @@ static int rt6_fill_node(struct net *net, rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags & RTF_DYNAMIC) - rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) - rtm->rtm_protocol = RTPROT_RA; - else - rtm->rtm_protocol = RTPROT_KERNEL; - } if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; @@ -3735,10 +3731,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ - in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES - in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); - in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7b75b0620730..4e7817abc0b9 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -216,6 +216,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; + treq->txhash = net_tx_rndhash(); /* * We need to lookup the dst_entry to get the correct window size. 
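For illustration, the ip6_find_1stfragopt() hunk above widens the offset accumulator from u16 to unsigned int and rejects offsets past the 64 KiB IPv6 payload limit, so a crafted chain of extension headers can no longer wrap the counter. A small standalone sketch of that accumulate-and-bound pattern follows; the names are invented for the example and this is not the kernel function:

#include <stdint.h>

#define MAXPLEN 65535	/* IPv6 payload limit without jumbograms */

static int sum_ext_hdr_lengths(const uint16_t *hdrlen, int n, unsigned int *total)
{
	unsigned int off = 0;	/* wide accumulator: a u16 here could silently wrap */
	int i;

	for (i = 0; i < n; i++) {
		off += hdrlen[i];
		if (off > MAXPLEN)
			return -1;	/* malformed: exceeds the payload limit */
	}
	*total = off;
	return 0;
}
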
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2521690d62d6..206210125fd7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,8 @@ process: } sock_hold(sk); refcounted = true; + if (tcp_filter(sk, skb)) + goto discard_and_relse; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1464,8 +1466,6 @@ process: if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); - } else if (tcp_filter(sk, skb)) { - goto discard_and_relse; } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); goto discard_and_relse; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4a3e65626e8b..56030d45823a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -291,11 +291,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, struct udp_table *udptable) { const struct ipv6hdr *iph = ipv6_hdr(skb); - struct sock *sk; - sk = skb_steal_sock(skb); - if (unlikely(sk)) - return sk; return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), udptable, skb); @@ -332,6 +328,15 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif +/* do not use the scratch area len for jumbogram: their length execeeds the + * scratch area space; note that the IP6CB flags is still in the first + * cacheline, so checking for jumbograms is cheap + */ +static int udp6_skb_len(struct sk_buff *skb) +{ + return unlikely(inet6_is_jumbogram(skb)) ? skb->len : udp_skb_len(skb); +} + /* * This should be easy, if there is something there we * return it, otherwise we block. @@ -357,12 +362,13 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: - peeking = off = sk_peek_offset(sk, flags); + peeking = flags & MSG_PEEK; + off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err); if (!skb) return err; - ulen = udp_skb_len(skb); + ulen = udp6_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; @@ -569,8 +575,7 @@ static __inline__ void udpv6_err(struct sk_buff *skb, static struct static_key udpv6_encap_needed __read_mostly; void udpv6_encap_enable(void) { - if (!static_key_enabled(&udpv6_encap_needed)) - static_key_slow_inc(&udpv6_encap_needed); + static_key_enable(&udpv6_encap_needed); } EXPORT_SYMBOL(udpv6_encap_enable); @@ -762,6 +767,15 @@ start_lookup: return 0; } +static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + if (udp_sk_rx_dst_set(sk, dst)) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + } +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -804,6 +818,24 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp6_csum_init(skb, uh, proto)) goto csum_error; + /* Check if the socket is already available, e.g. due to early demux */ + sk = skb_steal_sock(skb); + if (sk) { + struct dst_entry *dst = skb_dst(skb); + int ret; + + if (unlikely(sk->sk_rx_dst != dst)) + udp6_sk_rx_dst_set(sk, dst); + + ret = udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input */ + if (ret > 0) + return ret; + return 0; + } + /* * Multicast receive code */ @@ -812,11 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, saddr, daddr, udptable, proto); /* Unicast */ - - /* - * check socket cache ... 
must talk to Alan about his plans - * for sock caches... i'll skip this for now. - */ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) { int ret; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a2267f80febb..e7d378c032cb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -72,7 +72,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 2e6990f8b80b..23fa7c8b09a5 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2213,7 +2213,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - struct irda_device_list list; + struct irda_device_list list = { 0 }; struct irda_device_info *discoveries; struct irda_ias_set * ias_opt; /* IAS get/query params */ struct ias_object * ias_obj; /* Object in IAS */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index da49191f7ad0..4abf6287d7e1 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1383,6 +1383,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; + /* We must prevent loops or risk deadlock ! */ + if (csk->sk_family == PF_KCM) + return -EOPNOTSUPP; + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; diff --git a/net/key/af_key.c b/net/key/af_key.c index ca9d3ae665e7..98f4d8211b9a 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, +static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1389,7 +1389,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } @@ -1476,7 +1476,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1589,7 
+1589,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1694,8 +1694,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); - + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, + sock_net(sk)); return 0; } @@ -1712,7 +1712,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, + sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1733,7 +1734,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1790,7 +1791,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1878,7 +1879,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2206,7 +2207,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2426,7 +2427,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2682,7 +2683,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2739,7 +2740,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2803,7 +2804,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), + pfkey_broadcast(skb_clone(skb, 
GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -3024,7 +3025,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); return 0; } @@ -3212,7 +3214,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3408,7 +3411,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3599,7 +3603,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index b0c2d4ae781d..90165a6874bc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -113,7 +113,6 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) { @@ -127,39 +126,6 @@ static inline struct l2tp_net *l2tp_pernet(const struct net *net) return net_generic(net, l2tp_net_id); } -/* Tunnel reference counts. Incremented per session that is added to - * the tunnel. - */ -static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) -{ - refcount_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) -{ - if (refcount_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_tunnel_inc_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - refcount_read(&_t->ref_count)); \ - l2tp_tunnel_inc_refcount_1(_t); \ -} while (0) -#define l2tp_tunnel_dec_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - refcount_read(&_t->ref_count)); \ - l2tp_tunnel_dec_refcount_1(_t); \ -} while (0) -#else -#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) -#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) -#endif - /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. So we do a real @@ -229,6 +195,27 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } +/* Lookup a tunnel. A new reference is held on the returned tunnel. 
*/ +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) +{ + const struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_tunnel *tunnel; + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + if (tunnel->tunnel_id == tunnel_id) { + l2tp_tunnel_inc_refcount(tunnel); + rcu_read_unlock_bh(); + + return tunnel; + } + } + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_get); + /* Lookup a session. A new reference is held on the returned session. * Optionally calls session->ref() too if do_ref is true. */ @@ -1348,17 +1335,6 @@ static void l2tp_udp_encap_destroy(struct sock *sk) } } -/* Really kill the tunnel. - * Come here only when all sessions have been cleared from the tunnel. - */ -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) -{ - BUG_ON(refcount_read(&tunnel->ref_count) != 0); - BUG_ON(tunnel->sock != NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: free...\n", tunnel->name); - kfree_rcu(tunnel, rcu); -} - /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { @@ -1844,6 +1820,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn l2tp_session_set_header_len(session, tunnel->version); + refcount_set(&session->ref_count, 1); + err = l2tp_session_add_to_tunnel(tunnel, session); if (err) { kfree(session); @@ -1851,10 +1829,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn return ERR_PTR(err); } - /* Bump the reference count. The session context is deleted - * only when this drops to zero. - */ - refcount_set(&session->ref_count, 1); l2tp_tunnel_inc_refcount(tunnel); /* Ensure tunnel socket isn't deleted */ diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index cdb6e3327f74..9101297f27ad 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -231,6 +231,8 @@ out: return tunnel; } +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); + struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, bool do_ref); @@ -269,6 +271,17 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) +{ + refcount_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +{ + if (refcount_dec_and_test(&tunnel->ref_count)) + kfree_rcu(tunnel, rcu); +} + /* Session reference counts. Incremented when code obtains a reference * to a session. 
*/ diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 12cfcd0ca807..57427d430f10 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -65,10 +65,12 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info, (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (tunnel) { session = l2tp_session_get(net, tunnel, session_id, do_ref); + l2tp_tunnel_dec_refcount(tunnel); + } } return session; @@ -271,8 +273,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -282,6 +284,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info (void) l2tp_tunnel_delete(tunnel); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } @@ -299,8 +303,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -311,6 +315,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } @@ -438,34 +444,37 @@ static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; - goto out; + goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { - ret = -ENODEV; - goto out; - } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; - goto out; + goto err; + } + + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { + ret = -ENODEV; + goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) - goto err_out; + goto err_nlmsg_tunnel; + + l2tp_tunnel_dec_refcount(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); -err_out: +err_nlmsg_tunnel: + l2tp_tunnel_dec_refcount(tunnel); +err_nlmsg: nlmsg_free(msg); - -out: +err: return ret; } @@ -509,8 +518,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf ret = -EINVAL; goto out; } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); + tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; @@ -518,24 +528,24 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { 
ret = -EINVAL; - goto out; + goto out_tunnel; } if (tunnel->version > 2) { @@ -557,7 +567,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); @@ -566,7 +576,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); @@ -609,7 +619,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { ret = -EPROTONOSUPPORT; - goto out; + goto out_tunnel; } /* Check that pseudowire-specific params are present */ @@ -619,7 +629,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf case L2TP_PWTYPE_ETH_VLAN: if (!info->attrs[L2TP_ATTR_VLAN_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } break; case L2TP_PWTYPE_ETH: @@ -647,6 +657,8 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } } +out_tunnel: + l2tp_tunnel_dec_refcount(tunnel); out: return ret; } diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 8708cbe8af5b..2b36eff5d97e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -7,7 +7,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -466,3 +466,23 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_manage_rx_ba_offl); + +void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif, + const u8 *addr, unsigned int tid) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + rcu_read_lock(); + sta = sta_info_get_bss(sdata, addr); + if (!sta) + goto unlock; + + set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired); + ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work); + + unlock: + rcu_read_unlock(); +} +EXPORT_SYMBOL(ieee80211_rx_ba_timer_expired); diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index bdcfb2d04cd2..ea4f481839dd 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2076,6 +2076,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, { struct net *net = sock_net(in_skb->sk); u32 portid = NETLINK_CB(in_skb).portid; + u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; @@ -2086,9 +2087,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlmsghdr *nlh; struct sk_buff *skb; struct mpls_nh *nh; - int err = -EINVAL; - u32 in_label; u8 n_labels; + int err; err = nlmsg_parse(in_nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); @@ -2101,11 +2101,15 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, u8 label_count; if (nla_get_labels(tb[RTA_DST], 1, 
&label_count, - &in_label, extack)) + &in_label, extack)) { + err = -EINVAL; goto errout; + } - if (in_label < MPLS_LABEL_FIRST_UNRESERVED) + if (!mpls_label_ok(net, in_label, extack)) { + err = -EINVAL; goto errout; + } } rt = mpls_route_input_rcu(net, in_label); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 552d606e57ca..974cf2a3795a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -227,114 +227,6 @@ void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, } EXPORT_SYMBOL(nf_unregister_net_hooks); -static LIST_HEAD(nf_hook_list); - -static int _nf_register_hook(struct nf_hook_ops *reg) -{ - struct net *net, *last; - int ret; - - for_each_net(net) { - ret = nf_register_net_hook(net, reg); - if (ret && ret != -ENOENT) - goto rollback; - } - list_add_tail(®->list, &nf_hook_list); - - return 0; -rollback: - last = net; - for_each_net(net) { - if (net == last) - break; - nf_unregister_net_hook(net, reg); - } - return ret; -} - -int nf_register_hook(struct nf_hook_ops *reg) -{ - int ret; - - rtnl_lock(); - ret = _nf_register_hook(reg); - rtnl_unlock(); - - return ret; -} -EXPORT_SYMBOL(nf_register_hook); - -static void _nf_unregister_hook(struct nf_hook_ops *reg) -{ - struct net *net; - - list_del(®->list); - for_each_net(net) - nf_unregister_net_hook(net, reg); -} - -void nf_unregister_hook(struct nf_hook_ops *reg) -{ - rtnl_lock(); - _nf_unregister_hook(reg); - rtnl_unlock(); -} -EXPORT_SYMBOL(nf_unregister_hook); - -int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(nf_register_hooks); - -/* Caller MUST take rtnl_lock() */ -int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = _nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - _nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(_nf_register_hooks); - -void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(nf_unregister_hooks); - -/* Caller MUST take rtnl_lock */ -void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - _nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(_nf_unregister_hooks); - /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. 
*/ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, @@ -450,40 +342,9 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif -static int nf_register_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - int ret; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) { - ret = nf_register_net_hook(net, elem); - if (ret && ret != -ENOENT) - goto out_undo; - } - rtnl_unlock(); - return 0; - -out_undo: - list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); - return ret; -} - -static void nf_unregister_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); -} - static int __net_init netfilter_net_init(struct net *net) { - int i, h, ret; + int i, h; for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) @@ -500,16 +361,12 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - ret = nf_register_hook_list(net); - if (ret) - remove_proc_entry("netfilter", net->proc_net); - return ret; + return 0; } static void __net_exit netfilter_net_exit(struct net *net) { - nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9979f46c81dc..51390febd5e3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -96,19 +96,26 @@ static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { + /* 1) Acquire the lock */ spin_lock(lock); - while (unlikely(nf_conntrack_locks_all)) { - spin_unlock(lock); - /* - * Order the 'nf_conntrack_locks_all' load vs. the - * spin_unlock_wait() loads below, to ensure - * that 'nf_conntrack_locks_all_lock' is indeed held: - */ - smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ - spin_unlock_wait(&nf_conntrack_locks_all_lock); - spin_lock(lock); - } + /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics + * It pairs with the smp_store_release() in nf_conntrack_all_unlock() + */ + if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) + return; + + /* fast path failed, unlock */ + spin_unlock(lock); + + /* Slow path 1) get global lock */ + spin_lock(&nf_conntrack_locks_all_lock); + + /* Slow path 2) get the lock we want */ + spin_lock(lock); + + /* Slow path 3) release the global lock */ + spin_unlock(&nf_conntrack_locks_all_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_lock); @@ -149,28 +156,27 @@ static void nf_conntrack_all_lock(void) int i; spin_lock(&nf_conntrack_locks_all_lock); - nf_conntrack_locks_all = true; - /* - * Order the above store of 'nf_conntrack_locks_all' against - * the spin_unlock_wait() loads below, such that if - * nf_conntrack_lock() observes 'nf_conntrack_locks_all' - * we must observe nf_conntrack_locks[] held: - */ - smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ + nf_conntrack_locks_all = true; for (i = 0; i < CONNTRACK_LOCKS; i++) { - spin_unlock_wait(&nf_conntrack_locks[i]); + spin_lock(&nf_conntrack_locks[i]); + + /* This spin_unlock provides the "release" to ensure that + * nf_conntrack_locks_all==true is visible to everyone that + * acquired spin_lock(&nf_conntrack_locks[]). 
+ */ + spin_unlock(&nf_conntrack_locks[i]); } } static void nf_conntrack_all_unlock(void) { - /* - * All prior stores must be complete before we clear + /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() * might observe the false value but not the entire - * critical section: + * critical section. + * It pairs with the smp_load_acquire() in nf_conntrack_lock() */ smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e03d16ed550d..899c2c36da13 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -422,7 +422,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { - if (nf_ct_remove_expect(expect)) + if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index b553fdd68816..4707d997558a 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -872,6 +872,11 @@ static int dccp_init_net(struct net *net, u_int16_t proto) return dccp_kmemdup_sysctl_table(net, pn, dn); } +static struct nf_proto_net *dccp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.dccp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .l3proto = AF_INET, .l4proto = IPPROTO_DCCP, @@ -904,6 +909,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ .init_net = dccp_init_net, + .get_net_proto = dccp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4); @@ -939,5 +945,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ .init_net = dccp_init_net, + .get_net_proto = dccp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp6); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 31c6c8ee9d5d..6eef29d2eec4 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -783,6 +783,11 @@ static int sctp_init_net(struct net *net, u_int16_t proto) return sctp_kmemdup_sysctl_table(pn, sn); } +static struct nf_proto_net *sctp_get_net_proto(struct net *net) +{ + return &net->ct.nf_ct_proto.sctp.pn; +} + struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .l3proto = PF_INET, .l4proto = IPPROTO_SCTP, @@ -816,6 +821,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { }, #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ .init_net = sctp_init_net, + .get_net_proto = sctp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4); @@ -852,5 +858,6 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ #endif .init_net = sctp_init_net, + .get_net_proto = sctp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp6); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 832c5a08d9a5..b1d3740ae36a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -222,20 +222,21 @@ find_appropriate_src(struct net *net, .tuple = tuple, .zone = zone 
}; - struct rhlist_head *hl; + struct rhlist_head *hl, *h; hl = rhltable_lookup(&nf_nat_bysource_table, &key, nf_nat_bysource_params); - if (!hl) - return 0; - ct = container_of(hl, typeof(*ct), nat_bysource); + rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; + if (in_range(l3proto, l4proto, result, range)) + return 1; + } - return in_range(l3proto, l4proto, result, range); + return 0; } /* For [FUTURE] fragmentation handling, we want the least-used @@ -440,7 +441,7 @@ nf_nat_setup_info(struct nf_conn *ct, else ct->status |= IPS_DST_NAT; - if (nfct_help(ct)) + if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 92b05e188fd1..733d3e4a30d8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -472,8 +472,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) if (msglen > skb->len) msglen = skb->len; - if (nlh->nlmsg_len < NLMSG_HDRLEN || - skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) + if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, @@ -500,7 +499,8 @@ static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < NLMSG_HDRLEN || + if (skb->len < NLMSG_HDRLEN || + nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f5a7cb68694e..b89f4f65b2a0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -305,7 +305,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; - if (!(hook_mask & target->hooks)) + if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(target->table, @@ -484,7 +484,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; - if (!(hook_mask & match->hooks)) + if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(match->table, diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 18dd57a52651..14538b1d4d11 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -65,19 +65,23 @@ static int nft_limit_init(struct nft_limit *limit, limit->nsecs = unit * NSEC_PER_SEC; if (limit->rate == 0 || limit->nsecs < unit) return -EOVERFLOW; - limit->tokens = limit->tokens_max = limit->nsecs; - - if (tb[NFTA_LIMIT_BURST]) { - u64 rate; + if (tb[NFTA_LIMIT_BURST]) limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + else + limit->burst = 0; + + if (limit->rate + limit->burst < limit->rate) + return -EOVERFLOW; - rate = limit->rate + limit->burst; - if (rate < limit->rate) - return -EOVERFLOW; + /* The token bucket size limits the number of tokens can be + * accumulated. tokens_max specifies the bucket size. + * tokens_max = unit * (rate + burst) / rate. 
+ */ + limit->tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), + limit->rate); + limit->tokens_max = limit->tokens; - limit->rate = rate; - } if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); @@ -95,9 +99,8 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, { u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0; u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); - u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate), + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 1770c1d9b37f..e1648238a9c9 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1003,14 +1003,10 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; - if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) - info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!info) { - info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); - if (!info) - return NULL; - } + info = kvmalloc(sz, GFP_KERNEL); + if (!info) + return NULL; + memset(info, 0, sizeof(*info)); info->size = size; return info; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index e4610676299b..a54a556fcdb5 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1337,6 +1337,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, goto out; } + OVS_CB(skb)->acts_origlen = acts->orig_len; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 08679ebb3068..03859e386b47 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -629,6 +629,34 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, return ct; } +static +struct nf_conn *ovs_ct_executed(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, + bool *ct_executed) +{ + struct nf_conn *ct = NULL; + + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. This happens when we lose a skb->_nfct + * due to an upcall, or if the direction is being forced. If the + * connection was not confirmed, it is not cached and needs to be run + * through conntrack again. + */ + *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) && + !(key->ct_state & OVS_CS_F_INVALID) && + (key->ct_zone == info->zone.id); + + if (*ct_executed || (!key->ct_state && info->force)) { + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, + !!(key->ct_state & + OVS_CS_F_NAT_MASK)); + } + + return ct; +} + /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool skb_nfct_cached(struct net *net, const struct sw_flow_key *key, @@ -637,24 +665,17 @@ static bool skb_nfct_cached(struct net *net, { enum ip_conntrack_info ctinfo; struct nf_conn *ct; + bool ct_executed = true; ct = nf_ct_get(skb, &ctinfo); - /* If no ct, check if we have evidence that an existing conntrack entry - * might be found for this skb. This happens when we lose a skb->_nfct - * due to an upcall. If the connection was not confirmed, it is not - * cached and needs to be run through conntrack again. 
- */ - if (!ct && key->ct_state & OVS_CS_F_TRACKED && - !(key->ct_state & OVS_CS_F_INVALID) && - key->ct_zone == info->zone.id) { - ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, - !!(key->ct_state - & OVS_CS_F_NAT_MASK)); - if (ct) - nf_ct_get(skb, &ctinfo); - } if (!ct) + ct = ovs_ct_executed(net, key, info, skb, &ct_executed); + + if (ct) + nf_ct_get(skb, &ctinfo); + else return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) return false; if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) @@ -679,7 +700,7 @@ static bool skb_nfct_cached(struct net *net, return false; } - return true; + return ct_executed; } #ifdef CONFIG_NF_NAT_NEEDED @@ -1289,8 +1310,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, nla_for_each_nested(a, attr, rem) { int type = nla_type(a); - int maxlen = ovs_ct_attr_lens[type].maxlen; - int minlen = ovs_ct_attr_lens[type].minlen; + int maxlen; + int minlen; if (type > OVS_CT_ATTR_MAX) { OVS_NLERR(log, @@ -1298,6 +1319,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, type, OVS_CT_ATTR_MAX); return -EINVAL; } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; if (nla_len(a) < minlen || nla_len(a) > maxlen) { OVS_NLERR(log, "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 45fe8c8a884d..6b44fe405282 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -381,7 +381,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, - unsigned int hdrlen) + unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ @@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, /* OVS_PACKET_ATTR_ACTIONS */ if (upcall_info->actions_len) - size += nla_total_size(upcall_info->actions_len); + size += nla_total_size(actions_attrlen); /* OVS_PACKET_ATTR_MRU */ if (upcall_info->mru) @@ -465,7 +465,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info, hlen - cutlen); + len = upcall_msg_size(upcall_info, hlen - cutlen, + OVS_CB(skb)->acts_origlen); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 5d8dcd88815f..480600649d0b 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -99,11 +99,13 @@ struct datapath { * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. + * @acts_origlen: The netlink size of the flow actions applied to this skb. * @cutlen: The number of bytes from the packet end to be removed. 
*/ struct ovs_skb_cb { struct vport *input_vport; u16 mru; + u16 acts_origlen; u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e3beb28203eb..1c61af9af67d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -214,6 +214,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); struct packet_skb_cb { union { @@ -260,6 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) if (skb != orig_skb) goto drop; + packet_pick_tx_queue(dev, skb); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); @@ -2189,6 +2191,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec ts; __u32 ts_status; bool is_drop_n_account = false; + bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2239,8 +2242,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; - if (po->has_vnet_hdr) + if (po->has_vnet_hdr) { netoff += sizeof(struct virtio_net_hdr); + do_vnet = true; + } macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2257,8 +2262,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, skb_set_owner_r(copy_skb, sk); } snaplen = po->rx_ring.frame_size - macoff; - if ((int)snaplen < 0) + if ((int)snaplen < 0) { snaplen = 0; + do_vnet = false; + } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { @@ -2271,6 +2278,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + do_vnet = false; } } spin_lock(&sk->sk_receive_queue.lock); @@ -2296,7 +2304,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); - if (po->has_vnet_hdr) { + if (do_vnet) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true)) { @@ -2747,8 +2755,6 @@ tpacket_error: goto tpacket_error; } - packet_pick_tx_queue(dev, skb); - skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); @@ -2931,8 +2937,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - packet_pick_tx_queue(dev, skb); - if (po->has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) @@ -3702,14 +3706,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; - po->tp_reserve = val; - return 0; + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_reserve = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_LOSS: { @@ -4331,7 +4340,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, register_prot_hook(sk); } 
spin_unlock(&po->bind_lock); - if (closing && (po->tp_version > TPACKET_V2)) { + if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e10624aa6959..9722bf839d9d 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1015,8 +1015,10 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - if (rds_ib_ring_low(&ic->i_recv_ring)) + if (rds_ib_ring_low(&ic->i_recv_ring)) { rds_ib_recv_refill(conn, 0, GFP_NOWAIT); + rds_ib_stats_inc(s_ib_rx_refill_from_cq); + } } int rds_ib_recv_path(struct rds_conn_path *cp) @@ -1029,6 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); rds_ib_recv_refill(conn, 0, GFP_KERNEL); + rds_ib_stats_inc(s_ib_rx_refill_from_thread); } return ret; diff --git a/net/rds/send.c b/net/rds/send.c index e81aa176f4e2..41b9f0f5bb9c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -170,8 +170,8 @@ restart: * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ - cp->cp_send_gen++; - send_gen = cp->cp_send_gen; + send_gen = READ_ONCE(cp->cp_send_gen) + 1; + WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, @@ -431,7 +431,7 @@ over_batch: smp_mb(); if ((test_bit(0, &conn->c_map_queued) || !list_empty(&cp->cp_send_queue)) && - send_gen == cp->cp_send_gen) { + send_gen == READ_ONCE(cp->cp_send_gen)) { rds_stats_inc(s_send_lock_queue_raced); if (batch_count < send_batch_count) goto restart; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c6dc8caaf5ca..c061d6eb465d 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -136,7 +136,7 @@ int rds_tcp_accept_one(struct socket *sock) if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; - ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index dd30d74824b0..ec3383f97d4c 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -223,6 +223,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; + call->socket = rx; if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aed6cf2e9fd8..f2e9ed34a963 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -835,7 +835,7 @@ out_nlmsg_trim: } static int -act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, +tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct list_head *actions, int event) { struct sk_buff *skb; @@ -1018,7 +1018,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } if (event == RTM_GETACTION) - ret = act_get_notify(net, portid, n, &actions, event); + ret = tcf_get_notify(net, portid, n, &actions, event); else { /* delete */ ret = tcf_del_notify(net, n, &actions, portid); if (ret) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 36f0ced9e60c..541707802a23 100644 --- a/net/sched/act_ipt.c +++ 
b/net/sched/act_ipt.c @@ -36,11 +36,12 @@ static struct tc_action_ops act_ipt_ops; static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; -static int ipt_init_target(struct xt_entry_target *t, char *table, - unsigned int hook) +static int ipt_init_target(struct net *net, struct xt_entry_target *t, + char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; + struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, @@ -49,8 +50,10 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, return PTR_ERR(target); t->u.kernel.target = target; + memset(&par, 0, sizeof(par)); + par.net = net; par.table = table; - par.entryinfo = NULL; + par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = hook; @@ -91,10 +94,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, +static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -159,7 +163,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (unlikely(!t)) goto err2; - err = ipt_init_target(t, tname, hook); + err = ipt_init_target(net, t, tname, hook); if (err < 0) goto err3; @@ -193,18 +197,16 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, + bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, xt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, + bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 39da0c5801c9..6c5ea84d2682 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -205,7 +205,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp; - if (*chain->p_filter_chain) + if (chain->p_filter_chain) RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); @@ -215,9 +215,15 @@ static void tcf_chain_flush(struct tcf_chain *chain) static void tcf_chain_destroy(struct tcf_chain *chain) { - list_del(&chain->list); - tcf_chain_flush(chain); - kfree(chain); + /* May be already removed from the list by the previous call. */ + if (!list_empty(&chain->list)) + list_del_init(&chain->list); + + /* There might still be a reference held when we got here from + * tcf_block_put. Wait for the user to drop reference before free. 
+ */ + if (!chain->refcnt) + kfree(chain); } struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, @@ -288,8 +294,10 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_flush(chain); tcf_chain_destroy(chain); + } kfree(block); } EXPORT_SYMBOL(tcf_block_put); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bd24a550e0f9..4fb5a3222d0d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -286,9 +286,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { - struct Qdisc *root = qdisc_dev(q)->qdisc; - - WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); if (invisible) @@ -839,7 +836,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); if (!ingress) qdisc_destroy(old); @@ -850,7 +847,7 @@ skip: notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); dev->qdisc = new ? : &noop_qdisc; if (new && new->ops->attach) @@ -1259,7 +1256,7 @@ replay: if (q == p || (p && check_loop(q, p, 0))) return -ELOOP; - refcount_inc(&q->refcnt); + qdisc_refcount_inc(q); goto graft; } else { if (!q) diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 572fe2584e48..c403c87aff7a 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -572,8 +572,10 @@ static void atm_tc_destroy(struct Qdisc *sch) struct atm_flow_data *flow, *tmp; pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) + list_for_each_entry(flow, &p->flows, list) { tcf_block_put(flow->block); + flow->block = NULL; + } list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 481036f6b54e..156c8a33c677 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1139,6 +1139,13 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) struct tc_ratespec *r; int err; + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + q->delay_timer.function = cbq_undelay; + + if (!opt) + return -EINVAL; + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); if (err < 0) return err; @@ -1177,9 +1184,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.avpkt = q->link.allot/2; q->link.minidle = -0x7FFFFFFF; - qdisc_watchdog_init(&q->watchdog, sch); - hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); @@ -1431,8 +1435,10 @@ static void cbq_destroy(struct Qdisc *sch) * be bound to classes which have been destroyed already. 
--TGR '04 */ for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 147fde73a0f5..263d16e3219e 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -648,7 +648,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_RETRY_MAYFAIL, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 337f2d6d81e4..2c0c05f2cc34 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -491,10 +491,8 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) if (!q->flows) return -ENOMEM; q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); - if (!q->backlogs) { - kvfree(q->flows); + if (!q->backlogs) return -ENOMEM; - } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 57ba406f1437..4ba6da5fb254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -785,7 +785,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; - refcount_inc(&dev->qdisc->refcnt); + qdisc_refcount_inc(dev->qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b52f74610dc7..11ab8dace901 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1418,6 +1418,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct tc_hfsc_qopt *qopt; int err; + qdisc_watchdog_init(&q->watchdog, sch); + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); @@ -1428,6 +1430,10 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) return err; q->eligible = RB_ROOT; + err = tcf_block_get(&q->root.block, &q->root.filter_list); + if (err) + return err; + q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; q->root.sched = q; @@ -1444,8 +1450,6 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); qdisc_class_hash_grow(sch, &q->clhash); - qdisc_watchdog_init(&q->watchdog, sch); - return 0; } @@ -1522,8 +1526,10 @@ hfsc_destroy_qdisc(struct Qdisc *sch) unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 51d3ba682af9..73a53c08091b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -477,6 +477,9 @@ static void hhf_destroy(struct Qdisc *sch) kvfree(q->hhf_valid_bits[i]); } + if (!q->hh_flows) + return; + for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; 
struct list_head *head = &q->hh_flows[i]; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 203286ab4427..5bf5177b2bd3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1017,6 +1017,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) int err; int i; + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); + if (!opt) return -EINVAL; @@ -1041,8 +1044,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); - qdisc_watchdog_init(&q->watchdog, sch); - INIT_WORK(&q->work, htb_work_func); qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) @@ -1258,8 +1259,10 @@ static void htb_destroy(struct Qdisc *sch) tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) + hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { tcf_block_put(cl->block); + cl->block = NULL; + } } for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index f143b7bbaa0d..9c454f5d6c38 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -257,12 +257,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - err = multiq_tune(sch, opt); - - if (err) - kfree(q->queues); - - return err; + return multiq_tune(sch, opt); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1b3dd6190e93..14d1724e0dc4 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -933,11 +933,11 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) struct netem_sched_data *q = qdisc_priv(sch); int ret; + qdisc_watchdog_init(&q->watchdog, sch); + if (!opt) return -EINVAL; - qdisc_watchdog_init(&q->watchdog, sch); - q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt); if (ret) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index f80ea2cc5f1f..fc69fc5956e9 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -437,6 +437,7 @@ congestion_drop: qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); + qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } @@ -468,8 +469,10 @@ enqueue: /* Return Congestion Notification only if we dropped a packet * from this flow. 
*/ - if (qlen != slot->qlen) + if (qlen != slot->qlen) { + qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; + } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); @@ -713,13 +716,13 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) int i; int err; + setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, + (unsigned long)sch); + err = tcf_block_get(&q->block, &q->filter_list); if (err) return err; - setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, - (unsigned long)sch); - for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; q->dep[i].prev = i + SFQ_MAX_FLOWS; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b2e4b6ad241a..493270f0d5b0 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -425,12 +425,13 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); + qdisc_watchdog_init(&q->watchdog, sch); + q->qdisc = &noop_qdisc; + if (opt == NULL) return -EINVAL; q->t_c = ktime_get_ns(); - qdisc_watchdog_init(&q->watchdog, sch); - q->qdisc = &noop_qdisc; return tbf_change(sch, opt); } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f5b45b8b8b16..a4b6ffb61495 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -252,6 +252,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl6->flowi6_oif = daddr->v6.sin6_scope_id; + else if (asoc) + fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); @@ -510,7 +512,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; + addr->v6.sin6_scope_id = 0; } /* Compare addresses exactly. 
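The sctp hunks on either side of this point share one theme: every byte of a fixed-size address buffer that can reach user space must be written, either by assigning all fields (as sctp_v6_to_addr() now does with sin6_flowinfo and sin6_scope_id) or by zeroing the unused tail explicitly. A minimal user-space sketch of the same pattern follows; the helper name and surrounding code are illustrative only and are not part of the patch.

/* Illustrative sketch, not from the patch: fill a sockaddr_in6 completely
 * before it is copied somewhere a caller can read it, so padding and the
 * otherwise-unused fields (flowinfo, scope_id) never carry stale stack
 * contents.
 */
#include <string.h>
#include <netinet/in.h>

static void fill_v6_addr(struct sockaddr_in6 *addr,
                         const struct in6_addr *saddr, unsigned short port)
{
        memset(addr, 0, sizeof(*addr));  /* clears flowinfo, scope_id, padding */
        addr->sin6_family = AF_INET6;
        addr->sin6_port = port;          /* assumed already in network byte order */
        addr->sin6_addr = *saddr;
}

The sctp_diag change that follows applies the same idea the other way around: memcpy() only the real address bytes, then memset() the remainder of the per-entry slot, which is equivalent when the destination slot is larger than the source address.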
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 9a647214a91e..e99518e79b52 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry_rcu(laddr, address_list, list) { - memcpy(info, &laddr->a, addrlen); + memcpy(info, &laddr->a, sizeof(laddr->a)); + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { - memcpy(info, &from->ipaddr, addrlen); + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); + memset(info + sizeof(from->ipaddr), 0, + addrlen - sizeof(from->ipaddr)); info += addrlen; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4e16b02ed832..6110447fe51d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_adaptation_ind_param_t aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; - __u8 extensions[3]; + __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, *auth_hmacs = NULL; @@ -396,7 +396,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_adaptation_ind_param_t aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; - __u8 extensions[3]; + __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, *auth_hmacs = NULL, *auth_random = NULL; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1db478e34520..8d760863bc41 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4538,8 +4538,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; prim = asoc->peer.primary_path; - memcpy(&info->sctpi_p_address, &prim->ipaddr, - sizeof(struct sockaddr_storage)); + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); info->sctpi_p_state = prim->state; info->sctpi_p_cwnd = prim->cwnd; info->sctpi_p_srtt = prim->srtt; diff --git a/net/socket.c b/net/socket.c index 59e902b9df09..ad22df1ffbd1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1910,22 +1910,18 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, struct sockaddr __user **save_addr, struct iovec **iov) { - struct sockaddr __user *uaddr; - struct iovec __user *uiov; - size_t nr_segs; + struct user_msghdr msg; ssize_t err; - if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || - __get_user(uaddr, &umsg->msg_name) || - __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || - __get_user(uiov, &umsg->msg_iov) || - __get_user(nr_segs, &umsg->msg_iovlen) || - __get_user(kmsg->msg_control, &umsg->msg_control) || - __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || - __get_user(kmsg->msg_flags, &umsg->msg_flags)) + if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - if (!uaddr) + kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_controllen = msg.msg_controllen; + kmsg->msg_flags = msg.msg_flags; + + kmsg->msg_namelen = msg.msg_namelen; + if (!msg.msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -1935,11 +1931,12 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) - *save_addr = uaddr; + *save_addr = msg.msg_name; - if (uaddr && kmsg->msg_namelen) { + if (msg.msg_name && kmsg->msg_namelen) { if 
(!save_addr) { - err = move_addr_to_kernel(uaddr, kmsg->msg_namelen, + err = move_addr_to_kernel(msg.msg_name, + kmsg->msg_namelen, kmsg->msg_name); if (err < 0) return err; @@ -1949,12 +1946,13 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (nr_segs > UIO_MAXIOV) + if (msg.msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - return import_iovec(save_addr ? READ : WRITE, uiov, nr_segs, + return import_iovec(save_addr ? READ : WRITE, + msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); } diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index fb39284ec174..12649c9fedab 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -34,6 +34,7 @@ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. */ +#include <crypto/algapi.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/err.h> @@ -927,7 +928,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, if (ret) goto out_err; - if (memcmp(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { + if (crypto_memneq(pkt_hmac, our_hmac, kctx->gk5e->cksumlength) != 0) { ret = GSS_S_BAD_SIG; goto out_err; } diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index f0c6a8c78a56..46b295e4f2b8 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -55,15 +55,15 @@ enum { #define PROC(proc, name) \ [GSSX_##proc] = { \ .p_proc = GSSX_##proc, \ - .p_encode = (kxdreproc_t)gssx_enc_##name, \ - .p_decode = (kxdrdproc_t)gssx_dec_##name, \ + .p_encode = gssx_enc_##name, \ + .p_decode = gssx_dec_##name, \ .p_arglen = GSSX_ARG_##name##_sz, \ .p_replen = GSSX_RES_##name##_sz, \ .p_statidx = GSSX_##proc, \ .p_name = #proc, \ } -static struct rpc_procinfo gssp_procedures[] = { +static const struct rpc_procinfo gssp_procedures[] = { PROC(INDICATE_MECHS, indicate_mechs), PROC(GET_CALL_CONTEXT, get_call_context), PROC(IMPORT_AND_CANON_NAME, import_and_canon_name), @@ -364,11 +364,12 @@ void gssp_free_upcall_data(struct gssp_upcall_data *data) /* * Initialization stuff */ - +static unsigned int gssp_version1_counts[ARRAY_SIZE(gssp_procedures)]; static const struct rpc_version gssp_version1 = { .number = GSSPROXY_VERS_1, .nrprocs = ARRAY_SIZE(gssp_procedures), .procs = gssp_procedures, + .counts = gssp_version1_counts, }; static const struct rpc_version *gssp_version[] = { diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 25d9a9cf7b66..c4778cae58ef 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -44,7 +44,7 @@ static int gssx_dec_bool(struct xdr_stream *xdr, u32 *v) } static int gssx_enc_buffer(struct xdr_stream *xdr, - gssx_buffer *buf) + const gssx_buffer *buf) { __be32 *p; @@ -56,7 +56,7 @@ static int gssx_enc_buffer(struct xdr_stream *xdr, } static int gssx_enc_in_token(struct xdr_stream *xdr, - struct gssp_in_token *in) + const struct gssp_in_token *in) { __be32 *p; @@ -130,7 +130,7 @@ static int gssx_dec_option(struct xdr_stream *xdr, } static int dummy_enc_opt_array(struct xdr_stream *xdr, - struct gssx_option_array *oa) + const struct gssx_option_array *oa) { __be32 *p; @@ -348,7 +348,7 @@ static int gssx_dec_status(struct xdr_stream *xdr, } static int gssx_enc_call_ctx(struct xdr_stream *xdr, - struct gssx_call_ctx *ctx) + const struct gssx_call_ctx *ctx) { struct gssx_option opt; __be32 *p; @@ -733,8 
+733,9 @@ static int gssx_enc_cb(struct xdr_stream *xdr, struct gssx_cb *cb) void gssx_enc_accept_sec_context(struct rpc_rqst *req, struct xdr_stream *xdr, - struct gssx_arg_accept_sec_context *arg) + const void *data) { + const struct gssx_arg_accept_sec_context *arg = data; int err; err = gssx_enc_call_ctx(xdr, &arg->call_ctx); @@ -789,8 +790,9 @@ done: int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct gssx_res_accept_sec_context *res) + void *data) { + struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; struct page *scratch; diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h index 9d88c6239f01..146c31032917 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.h +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h @@ -179,10 +179,10 @@ struct gssx_res_accept_sec_context { #define gssx_dec_init_sec_context NULL void gssx_enc_accept_sec_context(struct rpc_rqst *req, struct xdr_stream *xdr, - struct gssx_arg_accept_sec_context *args); + const void *data); int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct xdr_stream *xdr, - struct gssx_res_accept_sec_context *res); + void *data); #define gssx_enc_release_handle NULL #define gssx_dec_release_handle NULL #define gssx_enc_get_mic NULL diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a54a7a3d28f5..7b1ee5a0b03c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -838,6 +838,14 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g struct xdr_netobj mic; struct xdr_buf integ_buf; + /* NFS READ normally uses splice to send data in-place. However + * the data in cache can change after the reply's MIC is computed + * but before the RPC reply is sent. To prevent the client from + * rejecting the server-computed MIC in this somewhat rare case, + * do not use splice with the GSS integrity service. + */ + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b5cb921775a0..2e49d1f892b7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1517,14 +1517,16 @@ static void call_start(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; + int idx = task->tk_msg.rpc_proc->p_statidx; dprintk("RPC: %5u call_start %s%d proc %s (%s)\n", task->tk_pid, clnt->cl_program->name, clnt->cl_vers, rpc_proc_name(task), (RPC_IS_ASYNC(task) ? 
"async" : "sync")); - /* Increment call count */ - task->tk_msg.rpc_proc->p_count++; + /* Increment call count (version might not be valid for ping) */ + if (clnt->cl_program->version[clnt->cl_vers]) + clnt->cl_program->version[clnt->cl_vers]->counts[idx]++; clnt->cl_stats->rpccnt++; task->tk_action = call_reserve; } @@ -1672,7 +1674,7 @@ call_allocate(struct rpc_task *task) unsigned int slack = task->tk_rqstp->rq_cred->cr_auth->au_cslack; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - struct rpc_procinfo *proc = task->tk_msg.rpc_proc; + const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; int status; dprint_status(task); @@ -2476,16 +2478,18 @@ out_overflow: goto out_garbage; } -static void rpcproc_encode_null(void *rqstp, struct xdr_stream *xdr, void *obj) +static void rpcproc_encode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + const void *obj) { } -static int rpcproc_decode_null(void *rqstp, struct xdr_stream *xdr, void *obj) +static int rpcproc_decode_null(struct rpc_rqst *rqstp, struct xdr_stream *xdr, + void *obj) { return 0; } -static struct rpc_procinfo rpcproc_null = { +static const struct rpc_procinfo rpcproc_null = { .p_encode = rpcproc_encode_null, .p_decode = rpcproc_decode_null, }; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 5b30603596d0..ea0676f199c8 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -128,13 +128,13 @@ struct rpcbind_args { int r_status; }; -static struct rpc_procinfo rpcb_procedures2[]; -static struct rpc_procinfo rpcb_procedures3[]; -static struct rpc_procinfo rpcb_procedures4[]; +static const struct rpc_procinfo rpcb_procedures2[]; +static const struct rpc_procinfo rpcb_procedures3[]; +static const struct rpc_procinfo rpcb_procedures4[]; struct rpcb_info { u32 rpc_vers; - struct rpc_procinfo * rpc_proc; + const struct rpc_procinfo *rpc_proc; }; static const struct rpcb_info rpcb_next_version[]; @@ -620,7 +620,8 @@ int rpcb_v4_register(struct net *net, const u32 program, const u32 version, return -EAFNOSUPPORT; } -static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbind_args *map, struct rpc_procinfo *proc) +static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, + struct rpcbind_args *map, const struct rpc_procinfo *proc) { struct rpc_message msg = { .rpc_proc = proc, @@ -671,7 +672,7 @@ static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt) void rpcb_getport_async(struct rpc_task *task) { struct rpc_clnt *clnt; - struct rpc_procinfo *proc; + const struct rpc_procinfo *proc; u32 bind_version; struct rpc_xprt *xprt; struct rpc_clnt *rpcb_clnt; @@ -843,8 +844,9 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) */ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct rpcbind_args *rpcb) + const void *data) { + const struct rpcbind_args *rpcb = data; __be32 *p; dprintk("RPC: %5u encoding PMAP_%s call (%u, %u, %d, %u)\n", @@ -860,8 +862,9 @@ static void rpcb_enc_mapping(struct rpc_rqst *req, struct xdr_stream *xdr, } static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, - struct rpcbind_args *rpcb) + void *data) { + struct rpcbind_args *rpcb = data; unsigned long port; __be32 *p; @@ -882,8 +885,9 @@ static int rpcb_dec_getport(struct rpc_rqst *req, struct xdr_stream *xdr, } static int rpcb_dec_set(struct rpc_rqst *req, struct xdr_stream *xdr, - unsigned int *boolp) + void *data) { + unsigned int *boolp = data; __be32 *p; p = xdr_inline_decode(xdr, 
4); @@ -917,8 +921,9 @@ static void encode_rpcb_string(struct xdr_stream *xdr, const char *string, } static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, - const struct rpcbind_args *rpcb) + const void *data) { + const struct rpcbind_args *rpcb = data; __be32 *p; dprintk("RPC: %5u encoding RPCB_%s call (%u, %u, '%s', '%s')\n", @@ -937,8 +942,9 @@ static void rpcb_enc_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, } static int rpcb_dec_getaddr(struct rpc_rqst *req, struct xdr_stream *xdr, - struct rpcbind_args *rpcb) + void *data) { + struct rpcbind_args *rpcb = data; struct sockaddr_storage address; struct sockaddr *sap = (struct sockaddr *)&address; __be32 *p; @@ -989,11 +995,11 @@ out_fail: * since the Linux kernel RPC code requires only these. */ -static struct rpc_procinfo rpcb_procedures2[] = { +static const struct rpc_procinfo rpcb_procedures2[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_mapping, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, @@ -1002,8 +1008,8 @@ static struct rpc_procinfo rpcb_procedures2[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_mapping, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -1012,8 +1018,8 @@ static struct rpc_procinfo rpcb_procedures2[] = { }, [RPCBPROC_GETPORT] = { .p_proc = RPCBPROC_GETPORT, - .p_encode = (kxdreproc_t)rpcb_enc_mapping, - .p_decode = (kxdrdproc_t)rpcb_dec_getport, + .p_encode = rpcb_enc_mapping, + .p_decode = rpcb_dec_getport, .p_arglen = RPCB_mappingargs_sz, .p_replen = RPCB_getportres_sz, .p_statidx = RPCBPROC_GETPORT, @@ -1022,11 +1028,11 @@ static struct rpc_procinfo rpcb_procedures2[] = { }, }; -static struct rpc_procinfo rpcb_procedures3[] = { +static const struct rpc_procinfo rpcb_procedures3[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_SET, @@ -1035,8 +1041,8 @@ static struct rpc_procinfo rpcb_procedures3[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -1045,8 +1051,8 @@ static struct rpc_procinfo rpcb_procedures3[] = { }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, @@ -1055,11 +1061,11 @@ static struct rpc_procinfo rpcb_procedures3[] = { }, }; -static struct rpc_procinfo rpcb_procedures4[] = { +static const struct rpc_procinfo rpcb_procedures4[] = { [RPCBPROC_SET] = { .p_proc = RPCBPROC_SET, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, 
.p_statidx = RPCBPROC_SET, @@ -1068,8 +1074,8 @@ static struct rpc_procinfo rpcb_procedures4[] = { }, [RPCBPROC_UNSET] = { .p_proc = RPCBPROC_UNSET, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_set, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_set, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_setres_sz, .p_statidx = RPCBPROC_UNSET, @@ -1078,8 +1084,8 @@ static struct rpc_procinfo rpcb_procedures4[] = { }, [RPCBPROC_GETADDR] = { .p_proc = RPCBPROC_GETADDR, - .p_encode = (kxdreproc_t)rpcb_enc_getaddr, - .p_decode = (kxdrdproc_t)rpcb_dec_getaddr, + .p_encode = rpcb_enc_getaddr, + .p_decode = rpcb_dec_getaddr, .p_arglen = RPCB_getaddrargs_sz, .p_replen = RPCB_getaddrres_sz, .p_statidx = RPCBPROC_GETADDR, @@ -1112,22 +1118,28 @@ static const struct rpcb_info rpcb_next_version6[] = { }, }; +static unsigned int rpcb_version2_counts[ARRAY_SIZE(rpcb_procedures2)]; static const struct rpc_version rpcb_version2 = { .number = RPCBVERS_2, .nrprocs = ARRAY_SIZE(rpcb_procedures2), - .procs = rpcb_procedures2 + .procs = rpcb_procedures2, + .counts = rpcb_version2_counts, }; +static unsigned int rpcb_version3_counts[ARRAY_SIZE(rpcb_procedures3)]; static const struct rpc_version rpcb_version3 = { .number = RPCBVERS_3, .nrprocs = ARRAY_SIZE(rpcb_procedures3), - .procs = rpcb_procedures3 + .procs = rpcb_procedures3, + .counts = rpcb_version3_counts, }; +static unsigned int rpcb_version4_counts[ARRAY_SIZE(rpcb_procedures4)]; static const struct rpc_version rpcb_version4 = { .number = RPCBVERS_4, .nrprocs = ARRAY_SIZE(rpcb_procedures4), - .procs = rpcb_procedures4 + .procs = rpcb_procedures4, + .counts = rpcb_version4_counts, }; static const struct rpc_version *rpcb_version[] = { diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index caeb01ad2b5a..1e671333c3d5 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -55,8 +55,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) - seq_printf(seq, " %u", - vers->procs[j].p_count); + seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; @@ -78,9 +77,9 @@ static const struct file_operations rpc_proc_fops = { /* * Get RPC server stats */ -void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { +void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) +{ const struct svc_program *prog = statp->program; - const struct svc_procedure *proc; const struct svc_version *vers; unsigned int i, j; @@ -99,11 +98,12 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { - if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc)) + vers = prog->pg_vers[i]; + if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); - for (j = 0; j < vers->vs_nproc; j++, proc++) - seq_printf(seq, " %u", proc->pc_count); + for (j = 0; j < vers->vs_nproc; j++) + seq_printf(seq, " %u", vers->vs_count[j]); seq_putc(seq, '\n'); } } @@ -192,7 +192,7 @@ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, - struct rpc_procinfo *procs) + const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index bc0f5a0ecbdc..85ce0db5b0a6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ 
-1008,7 +1008,7 @@ int svc_register(const struct svc_serv *serv, struct net *net, const unsigned short port) { struct svc_program *progp; - struct svc_version *vers; + const struct svc_version *vers; unsigned int i; int error = 0; @@ -1151,10 +1151,9 @@ static int svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) { struct svc_program *progp; - struct svc_version *versp = NULL; /* compiler food */ - struct svc_procedure *procp = NULL; + const struct svc_version *versp = NULL; /* compiler food */ + const struct svc_procedure *procp = NULL; struct svc_serv *serv = rqstp->rq_server; - kxdrproc_t xdr; __be32 *statp; u32 prog, vers, proc; __be32 auth_stat, rpc_stat; @@ -1166,7 +1165,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) if (argv->iov_len < 6*4) goto err_short_len; - /* Will be turned off only in gss privacy case: */ + /* Will be turned off by GSS integrity and privacy services */ set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); @@ -1262,7 +1261,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) svc_putnl(resv, RPC_SUCCESS); /* Bump per-procedure stats counter */ - procp->pc_count++; + versp->vs_count[proc]++; /* Initialize storage for argp and resp */ memset(rqstp->rq_argp, 0, procp->pc_argsize); @@ -1276,28 +1275,30 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) /* Call the function that processes the request. */ if (!versp->vs_dispatch) { - /* Decode arguments */ - xdr = procp->pc_decode; - if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) + /* + * Decode arguments + * XXX: why do we ignore the return value? + */ + if (procp->pc_decode && + !procp->pc_decode(rqstp, argv->iov_base)) goto err_garbage; - *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + *statp = procp->pc_func(rqstp); /* Encode reply */ if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (procp->pc_release) - procp->pc_release(rqstp, NULL, rqstp->rq_resp); + procp->pc_release(rqstp); goto dropit; } if (*statp == rpc_autherr_badcred) { if (procp->pc_release) - procp->pc_release(rqstp, NULL, rqstp->rq_resp); + procp->pc_release(rqstp); goto err_bad_auth; } - if (*statp == rpc_success && - (xdr = procp->pc_encode) && - !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { + if (*statp == rpc_success && procp->pc_encode && + !procp->pc_encode(rqstp, resv->iov_base + resv->iov_len)) { dprintk("svc: failed to encode reply\n"); /* serv->sv_stats->rpcsystemerr++; */ *statp = rpc_system_err; @@ -1307,7 +1308,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) if (!versp->vs_dispatch(rqstp, statp)) { /* Release reply info */ if (procp->pc_release) - procp->pc_release(rqstp, NULL, rqstp->rq_resp); + procp->pc_release(rqstp); goto dropit; } } @@ -1318,7 +1319,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) /* Release reply info */ if (procp->pc_release) - procp->pc_release(rqstp, NULL, rqstp->rq_resp); + procp->pc_release(rqstp); if (procp->pc_encode == NULL) goto dropit; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 7bfe1fb42add..d16a8b423c20 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -659,11 +659,13 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) int i; /* now allocate needed pages. 
If we get a failure, sleep briefly */ - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - WARN_ON_ONCE(pages >= RPCSVC_MAXPAGES); - if (pages >= RPCSVC_MAXPAGES) + pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; + if (pages > RPCSVC_MAXPAGES) { + pr_warn_once("svc: warning: pages=%u > RPCSVC_MAXPAGES=%lu\n", + pages, RPCSVC_MAXPAGES); /* use as many pages as possible */ - pages = RPCSVC_MAXPAGES - 1; + pages = RPCSVC_MAXPAGES; + } for (i = 0; i < pages ; i++) while (rqstp->rq_pages[i] == NULL) { struct page *p = alloc_page(GFP_KERNEL); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2b720fa35c4f..e18500151236 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -421,6 +421,9 @@ static void svc_data_ready(struct sock *sk) dprintk("svc: socket %p(inet %p), busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_odata(sk); if (!test_and_set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags)) svc_xprt_enqueue(&svsk->sk_xprt); @@ -437,6 +440,9 @@ static void svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_owspace(sk); svc_xprt_enqueue(&svsk->sk_xprt); } @@ -760,8 +766,12 @@ static void svc_tcp_listen_data_ready(struct sock *sk) dprintk("svc: socket %p TCP (listen) state change %d\n", sk, sk->sk_state); - if (svsk) + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_odata(sk); + } + /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -794,6 +804,8 @@ static void svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { + /* Refer to svc_setup_socket() for details. */ + rmb(); svsk->sk_ostate(sk); if (sk->sk_state != TCP_ESTABLISHED) { set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@ -1381,12 +1393,18 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return ERR_PTR(err); } - inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; + /* + * This barrier is necessary in order to prevent race condition + * with svc_data_ready(), svc_listen_data_ready() and others + * when calling callbacks above. 
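Aside (illustration, not part of the patch): taken together, the svcsock.c hunks pair a wmb() issued before svc_setup_socket() publishes the svc_sock through sk_user_data with an rmb() in each socket callback that dereferences it, so a CPU that observes the new sk_user_data also observes the saved sk_odata/sk_owspace/sk_ostate pointers. The pairing in isolation (fragments drawn from the hunks, not compilable on their own):

    /* Publisher: svc_setup_socket() */
    svsk->sk_ostate  = inet->sk_state_change;
    svsk->sk_odata   = inet->sk_data_ready;
    svsk->sk_owspace = inet->sk_write_space;
    wmb();                          /* order the stores above ...          */
    inet->sk_user_data = svsk;      /* ... before the pointer is published */

    /* Observer, e.g. svc_data_ready() */
    struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
    if (svsk) {
            rmb();                  /* pairs with the wmb() above */
            svsk->sk_odata(sk);
    }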
+ */ + wmb(); + inet->sk_user_data = svsk; /* Initialize the socket */ if (sock->type == SOCK_DGRAM) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3e63c5e97ebe..4654a9934269 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1047,13 +1047,15 @@ out: return ret; } -static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt, gfp_t gfp_flags) +static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs)) goto out; - req = kzalloc(sizeof(struct rpc_rqst), gfp_flags); + spin_unlock(&xprt->reserve_lock); + req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); + spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; atomic_dec(&xprt->num_reqs); @@ -1081,7 +1083,7 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) list_del(&req->rq_list); goto out_init_req; } - req = xprt_dynamic_alloc_slot(xprt, GFP_NOWAIT|__GFP_NOWARN); + req = xprt_dynamic_alloc_slot(xprt); if (!IS_ERR(req)) goto out_init_req; switch (PTR_ERR(req)) { diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index c1ae8142ab73..b8213ddce2f2 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ - svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - svc_rdma_rw.o module.o + svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ + module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 59e64025ed96..d3f84bb1d443 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -91,7 +91,7 @@ __fmr_unmap(struct rpcrdma_mw *mw) list_add(&mw->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); - list_del_init(&mw->fmr.fm_mr->list); + list_del(&mw->fmr.fm_mr->list); return rc; } @@ -213,13 +213,11 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_nents = i; mw->mw_dir = rpcrdma_data_dir(writing); - if (i == 0) - goto out_dmamap_err; - if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir)) + mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, i, mw->mw_dir); + if (!mw->mw_nents) goto out_dmamap_err; for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) @@ -237,16 +235,18 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return mw->mw_nents; out_dmamap_err: - pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", - mw->mw_sg, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mw->mw_sg, i); + rpcrdma_put_mw(r_xprt, mw); return -EIO; out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], pageoff, mw->mw_nents, rc); - rpcrdma_defer_mr_recovery(mw); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); return -EIO; } @@ -255,24 +255,26 @@ out_maperr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that req->rl_registered is not empty. + * Caller ensures that @mws is not empty before the call. This + * function empties the list. 
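Aside (illustration, not part of the patch): the xprt_dynamic_alloc_slot() hunk above drops xprt->reserve_lock around the now-sleeping GFP_NOFS allocation and retakes it afterwards. The general shape of that idiom, with hypothetical pool/item names:

    /* Called with p->lock (a spinlock) held; returns with it held. */
    static struct item *pool_alloc_entry(struct pool *p)
    {
            struct item *it;

            spin_unlock(&p->lock);          /* GFP_NOFS may sleep */
            it = kzalloc(sizeof(*it), GFP_NOFS);
            spin_lock(&p->lock);
            /* anything guarded by p->lock must be revalidated here */
            return it;
    }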
*/ static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) { - struct rpcrdma_mw *mw, *tmp; + struct rpcrdma_mw *mw; LIST_HEAD(unmap_list); int rc; - dprintk("RPC: %s: req %p\n", __func__, req); - /* ORDER: Invalidate all of the req's MRs first * * ib_unmap_fmr() is slow, so use a single call instead * of one call per mapped FMR. */ - list_for_each_entry(mw, &req->rl_registered, mw_list) + list_for_each_entry(mw, mws, mw_list) { + dprintk("RPC: %s: unmapping fmr %p\n", + __func__, &mw->fmr); list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) @@ -281,9 +283,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. */ - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { - list_del_init(&mw->mw_list); - list_del_init(&mw->fmr.fm_mr->list); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); + dprintk("RPC: %s: DMA unmapping fmr %p\n", + __func__, &mw->fmr); + list_del(&mw->fmr.fm_mr->list); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -294,8 +298,9 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) out_reset: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { - list_del_init(&mw->fmr.fm_mr->list); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); + list_del(&mw->fmr.fm_mr->list); fmr_op_recover_mr(mw); } } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index f81dd93176c0..6aea36a38bfd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -277,7 +277,7 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) } /** - * frwr_wc_fastreg - Invoked by RDMA provider for each polled FastReg WC + * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -298,7 +298,7 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) } /** - * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -319,7 +319,7 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) } /** - * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -355,7 +355,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct ib_mr *mr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; - int rc, i, n, dma_nents; + int rc, i, n; u8 key; mw = NULL; @@ -391,14 +391,10 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_nents = i; mw->mw_dir = rpcrdma_data_dir(writing); - if (i == 0) - goto out_dmamap_err; - dma_nents = ib_dma_map_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - if (!dma_nents) + mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); + if (!mw->mw_nents) goto out_dmamap_err; n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); @@ -436,13 +432,14 @@ frwr_op_map(struct rpcrdma_xprt 
*r_xprt, struct rpcrdma_mr_seg *seg, return mw->mw_nents; out_dmamap_err: - pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", - mw->mw_sg, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mw->mw_sg, i); + frmr->fr_state = FRMR_IS_INVALID; + rpcrdma_put_mw(r_xprt, mw); return -EIO; out_mapmr_err: - pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", + pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frmr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); return -EIO; @@ -458,21 +455,19 @@ out_senderr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that req->rl_registered is not empty. + * Caller ensures that @mws is not empty before the call. This + * function empties the list. */ static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) { struct ib_send_wr *first, **prev, *last, *bad_wr; - struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_frmr *f; struct rpcrdma_mw *mw; int count, rc; - dprintk("RPC: %s: req %p\n", __func__, req); - - /* ORDER: Invalidate all of the req's MRs first + /* ORDER: Invalidate all of the MRs first * * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. @@ -480,11 +475,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) f = NULL; count = 0; prev = &first; - list_for_each_entry(mw, &req->rl_registered, mw_list) { + list_for_each_entry(mw, mws, mw_list) { mw->frmr.fr_state = FRMR_IS_INVALID; - if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) && - (mw->mw_handle == rep->rr_inv_rkey)) + if (mw->mw_flags & RPCRDMA_MW_F_RI) continue; f = &mw->frmr; @@ -524,18 +518,19 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless ri_id->qp is a valid pointer. */ r_xprt->rx_stats.local_inv_needed++; + bad_wr = NULL; rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); + if (bad_wr != first) + wait_for_completion(&f->fr_linv_done); if (rc) goto reset_mrs; - wait_for_completion(&f->fr_linv_done); - - /* ORDER: Now DMA unmap all of the req's MRs, and return + /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MW list. */ unmap: - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); dprintk("RPC: %s: DMA unmapping frmr %p\n", __func__, &mw->frmr); ib_dma_unmap_sg(ia->ri_device, @@ -546,17 +541,19 @@ unmap: reset_mrs: pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); - rdma_disconnect(ia->ri_id); /* Find and reset the MRs in the LOCAL_INV WRs that did not - * get posted. This is synchronous, and slow. + * get posted. 
*/ - list_for_each_entry(mw, &req->rl_registered, mw_list) { - f = &mw->frmr; - if (mw->mw_handle == bad_wr->ex.invalidate_rkey) { - __frwr_reset_mr(ia, mw); - bad_wr = bad_wr->next; - } + rpcrdma_init_cqcount(&r_xprt->rx_ep, -count); + while (bad_wr) { + f = container_of(bad_wr, struct rpcrdma_frmr, + fr_invwr); + mw = container_of(f, struct rpcrdma_mw, frmr); + + __frwr_reset_mr(ia, mw); + + bad_wr = bad_wr->next; } goto unmap; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 694e9b13ecf0..ca4d6e4528f3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -141,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, if (xdr->page_len) { remaining = xdr->page_len; - offset = xdr->page_base & ~PAGE_MASK; + offset = offset_in_page(xdr->page_base); count = 0; while (remaining) { remaining -= min_t(unsigned int, @@ -222,7 +222,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); - page_base = xdrbuf->page_base & ~PAGE_MASK; + page_base = offset_in_page(xdrbuf->page_base); p = 0; while (len && n < RPCRDMA_MAX_SEGS) { if (!ppages[p]) { @@ -540,7 +540,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, goto out; page = virt_to_page(xdr->tail[0].iov_base); - page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->tail[0].iov_base); /* If the content in the page list is an odd length, * xdr_write_pages() has added a pad at the beginning @@ -557,7 +557,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, */ if (xdr->page_len) { ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_base = xdr->page_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->page_base); remaining = xdr->page_len; while (remaining) { sge_no++; @@ -587,7 +587,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, */ if (xdr->tail[0].iov_len) { page = virt_to_page(xdr->tail[0].iov_base); - page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->tail[0].iov_base); len = xdr->tail[0].iov_len; map_tail: @@ -734,6 +734,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rpclen = 0; } + req->rl_xid = rqst->rq_xid; + rpcrdma_insert_req(&r_xprt->rx_buf, req); + /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -875,9 +878,9 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) srcp += curlen; copy_len -= curlen; - page_base = rqst->rq_rcv_buf.page_base; - ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); - page_base &= ~PAGE_MASK; + ppages = rqst->rq_rcv_buf.pages + + (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); + page_base = offset_in_page(rqst->rq_rcv_buf.page_base); fixup_copy_count = 0; if (copy_len && rqst->rq_rcv_buf.page_len) { int pagelist_len; @@ -928,6 +931,24 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } +/* Caller must guarantee @rep remains stable during this call. 
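Aside (illustration, not part of the patch): the rpc_rdma.c hunks above replace open-coded page masking with offset_in_page(), which linux/mm.h defines as the same expression, so the conversion is behavior-neutral:

    #define offset_in_page(p)   ((unsigned long)(p) & ~PAGE_MASK)

    /* e.g. with 4 KiB pages: offset_in_page(0x12345678) == 0x678 */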
+ */ +static void +rpcrdma_mark_remote_invalidation(struct list_head *mws, + struct rpcrdma_rep *rep) +{ + struct rpcrdma_mw *mw; + + if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) + return; + + list_for_each_entry(mw, mws, mw_list) + if (mw->mw_handle == rep->rr_inv_rkey) { + mw->mw_flags = RPCRDMA_MW_F_RI; + break; /* only one invalidated MR per RPC */ + } +} + #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes @@ -969,14 +990,16 @@ rpcrdma_reply_handler(struct work_struct *work) { struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; __be32 *iptr; int rdmalen, status, rmerr; unsigned long cwnd; + struct list_head mws; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); @@ -994,27 +1017,45 @@ rpcrdma_reply_handler(struct work_struct *work) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock_bh(&xprt->transport_lock); - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (!rqst) + spin_lock(&buf->rb_lock); + req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, + headerp->rm_xid); + if (!req) goto out_nomatch; - - req = rpcr_to_rdmar(rqst); if (req->rl_reply) goto out_duplicate; - /* Sanity checking has passed. We are now committed - * to complete this transaction. + list_replace_init(&req->rl_registered, &mws); + rpcrdma_mark_remote_invalidation(&mws, rep); + + /* Avoid races with signals and duplicate replies + * by marking this req as matched. */ - list_del_init(&rqst->rq_list); - spin_unlock_bh(&xprt->transport_lock); + req->rl_reply = rep; + spin_unlock(&buf->rb_lock); + dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(headerp->rm_xid)); - /* from here on, the reply is no longer an orphan */ - req->rl_reply = rep; - xprt->reestablish_timeout = 0; + /* Invalidate and unmap the data payloads before waking the + * waiting application. This guarantees the memory regions + * are properly fenced from the server before the application + * accesses the data. It also ensures proper send flow control: + * waking the next RPC waits until this RPC has relinquished + * all its Send Queue entries. + */ + if (!list_empty(&mws)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); + /* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. 
+ */ + spin_lock_bh(&xprt->transport_lock); + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); + if (!rqst) + goto out_norqst; + xprt->reestablish_timeout = 0; if (headerp->rm_vers != rpcrdma_version) goto out_badversion; @@ -1024,12 +1065,9 @@ rpcrdma_reply_handler(struct work_struct *work) case rdma_msg: /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ - /* never expect write chunks without having offered RDMA */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || (headerp->rm_body.rm_chunks[1] == xdr_zero && - headerp->rm_body.rm_chunks[2] != xdr_zero) || - (headerp->rm_body.rm_chunks[1] != xdr_zero && - list_empty(&req->rl_registered))) + headerp->rm_body.rm_chunks[2] != xdr_zero)) goto badheader; if (headerp->rm_body.rm_chunks[1] != xdr_zero) { /* count any expected write chunks in read reply */ @@ -1066,8 +1104,7 @@ rpcrdma_reply_handler(struct work_struct *work) /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || - headerp->rm_body.rm_chunks[2] != xdr_one || - list_empty(&req->rl_registered)) + headerp->rm_body.rm_chunks[2] != xdr_one) goto badheader; iptr = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); @@ -1093,17 +1130,6 @@ badheader: } out: - /* Invalidate and flush the data payloads before waking the - * waiting application. This guarantees the memory region is - * properly fenced from the server before the application - * accesses the data. It also ensures proper send flow - * control: waking the next RPC waits until this RPC has - * relinquished all its Send Queue entries. - */ - if (!list_empty(&req->rl_registered)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); - - spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) @@ -1112,7 +1138,7 @@ out: xprt_complete_rqst(rqst->rq_task, status); spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", - __func__, xprt, rqst, status); + __func__, xprt, rqst, status); return; out_badstatus: @@ -1161,26 +1187,37 @@ out_rdmaerr: r_xprt->rx_stats.bad_reply_count++; goto out; -/* If no pending RPC transaction was matched, post a replacement - * receive buffer before returning. +/* The req was still available, but by the time the transport_lock + * was acquired, the rqst and task had been released. Thus the RPC + * has already been terminated. */ +out_norqst: + spin_unlock_bh(&xprt->transport_lock); + rpcrdma_buffer_put(req); + dprintk("RPC: %s: race, no rqst left for req %p\n", + __func__, req); + return; + out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; out_nomatch: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&buf->rb_lock); dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); goto repost; out_duplicate: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&buf->rb_lock); dprintk("RPC: %s: " "duplicate reply %p to RPC request %p: xid 0x%08x\n", __func__, rep, req, be32_to_cpu(headerp->rm_xid)); +/* If no pending RPC transaction was matched, post a replacement + * receive buffer before returning. 
+ */ repost: r_xprt->rx_stats.bad_reply_count++; if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c deleted file mode 100644 index bdcf7d85a3dc..000000000000 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2016 Oracle. All rights reserved. - * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the BSD-type - * license below: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * Neither the name of the Network Appliance, Inc. nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Author: Tom Tucker <tom@opengridcomputing.com> - */ - -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/debug.h> -#include <asm/unaligned.h> -#include <linux/sunrpc/rpc_rdma.h> -#include <linux/sunrpc/svc_rdma.h> - -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - -static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) -{ - __be32 *next; - - while (*p++ != xdr_zero) { - next = p + rpcrdma_readchunk_maxsz - 1; - if (next > end) - return NULL; - p = next; - } - return p; -} - -static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) -{ - __be32 *next; - - while (*p++ != xdr_zero) { - next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; - if (next > end) - return NULL; - p = next; - } - return p; -} - -static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) -{ - __be32 *next; - - if (*p++ != xdr_zero) { - next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; - if (next > end) - return NULL; - p = next; - } - return p; -} - -/** - * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header - * @rq_arg: Receive buffer - * - * On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. - * - * On successful exit, head[0] points to first byte past the - * RPC-over-RDMA header. 
For RDMA_MSG, this is the RPC message. - * The length of the RPC-over-RDMA header is returned. - */ -int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) -{ - __be32 *p, *end, *rdma_argp; - unsigned int hdr_len; - - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; - - switch (*(rdma_argp + 3)) { - case rdma_msg: - case rdma_nomsg: - break; - - case rdma_done: - goto out_drop; - - case rdma_error: - goto out_drop; - - default: - goto out_proc; - } - - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) - goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) - goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) - goto out_inval; - if (p > end) - goto out_inval; - - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; - rq_arg->head[0].iov_len -= hdr_len; - return hdr_len; - -out_short: - dprintk("svcrdma: header too short = %d\n", rq_arg->len); - return -EINVAL; - -out_version: - dprintk("svcrdma: bad xprt version: %u\n", - be32_to_cpup(rdma_argp + 1)); - return -EPROTONOSUPPORT; - -out_drop: - dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); - return 0; - -out_proc: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpup(rdma_argp + 3)); - return -EINVAL; - -out_inval: - dprintk("svcrdma: failed to parse transport header\n"); - return -EINVAL; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 27a99bf5b1a6..ad4bd62eebf1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016, 2017 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -40,12 +41,66 @@ * Author: Tom Tucker <tom@opengridcomputing.com> */ -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> -#include <linux/spinlock.h> +/* Operation + * + * The main entry point is svc_rdma_recvfrom. This is called from + * svc_recv when the transport indicates there is incoming data to + * be read. "Data Ready" is signaled when an RDMA Receive completes, + * or when a set of RDMA Reads complete. + * + * An svc_rqst is passed in. This structure contains an array of + * free pages (rq_pages) that will contain the incoming RPC message. + * + * Short messages are moved directly into svc_rqst::rq_arg, and + * the RPC Call is ready to be processed by the Upper Layer. + * svc_rdma_recvfrom returns the length of the RPC Call message, + * completing the reception of the RPC Call. + * + * However, when an incoming message has Read chunks, + * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's + * data payload from the client. svc_rdma_recvfrom sets up the + * RDMA Reads using pages in svc_rqst::rq_pages, which are + * transferred to an svc_rdma_op_ctxt for the duration of the + * I/O. svc_rdma_recvfrom then returns zero, since the RPC message + * is still not yet ready. + * + * When the Read chunk payloads have become available on the + * server, "Data Ready" is raised again, and svc_recv calls + * svc_rdma_recvfrom again. 
This second call may use a different + * svc_rqst than the first one, thus any information that needs + * to be preserved across these two calls is kept in an + * svc_rdma_op_ctxt. + * + * The second call to svc_rdma_recvfrom performs final assembly + * of the RPC Call message, using the RDMA Read sink pages kept in + * the svc_rdma_op_ctxt. The xdr_buf is copied from the + * svc_rdma_op_ctxt to the second svc_rqst. The second call returns + * the length of the completed RPC Call message. + * + * Page Management + * + * Pages under I/O must be transferred from the first svc_rqst to an + * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. + * + * The first svc_rqst supplies pages for RDMA Reads. These are moved + * from rqstp::rq_pages into ctxt::pages. The consumed elements of + * the rq_pages array are set to NULL and refilled with the first + * svc_rdma_recvfrom call returns. + * + * During the second svc_rdma_recvfrom call, RDMA Read sink pages + * are transferred from the svc_rdma_op_ctxt to the second svc_rqst + * (see rdma_read_complete() below). + */ + #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> + +#include <linux/spinlock.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -59,7 +114,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt, u32 byte_count) { - struct rpcrdma_msg *rmsgp; struct page *page; u32 bc; int sge_no; @@ -83,20 +137,12 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.page_len = bc; rqstp->rq_arg.page_base = 0; - /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - if (rmsgp->rm_type == rdma_nomsg) - rqstp->rq_arg.pages = &rqstp->rq_pages[0]; - else - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; - sge_no = 1; while (bc && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; bc -= min_t(u32, bc, ctxt->sge[sge_no].length); - rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; @@ -115,406 +161,208 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Issue an RDMA_READ using the local lkey to map the data sink */ -int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - bool last) -{ - struct ib_rdma_wr read_wr; - int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; - struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); - int ret, read, pno; - u32 pg_off = *page_offset; - u32 pg_no = *page_no; - - ctxt->direction = DMA_FROM_DEVICE; - ctxt->read_hdr = head; - pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); - read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, - rs_length); - - for (pno = 0; pno < pages_needed; pno++) { - int len = min_t(int, rs_length, PAGE_SIZE - pg_off); - - head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; - head->arg.page_len += len; - - head->arg.len += len; - if (!pg_off) - head->count++; - rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - ctxt->sge[pno].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - 
head->arg.pages[pg_no], pg_off, - PAGE_SIZE - pg_off, - DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[pno].addr); - if (ret) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - - ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[pno].length = len; - ctxt->count++; - - /* adjust offset and wrap to next page if needed */ - pg_off += len; - if (pg_off == PAGE_SIZE) { - pg_off = 0; - pg_no++; - } - rs_length -= len; - } - - if (last && rs_length == 0) - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - else - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - - memset(&read_wr, 0, sizeof(read_wr)); - ctxt->cqe.done = svc_rdma_wc_read; - read_wr.wr.wr_cqe = &ctxt->cqe; - read_wr.wr.opcode = IB_WR_RDMA_READ; - read_wr.wr.send_flags = IB_SEND_SIGNALED; - read_wr.rkey = rs_handle; - read_wr.remote_addr = rs_offset; - read_wr.wr.sg_list = ctxt->sge; - read_wr.wr.num_sge = pages_needed; - - ret = svc_rdma_send(xprt, &read_wr.wr); - if (ret) { - pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - goto err; - } +/* This accommodates the largest possible Write chunk, + * in one segment. + */ +#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) - /* return current location in page array */ - *page_no = pg_no; - *page_offset = pg_off; - ret = read; - atomic_inc(&rdma_stat_read); - return ret; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return ret; -} +/* This accommodates the largest possible Position-Zero + * Read chunk or Reply chunk, in one segment. + */ +#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) -/* Issue an RDMA_READ using an FRMR to map the data sink */ -int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - bool last) +/* Sanity check the Read list. + * + * Implementation limits: + * - This implementation supports only one Read chunk. + * + * Sanity checks: + * - Read list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 40 Read segments for a 1KB inline + * threshold. + * + * Returns pointer to the following Write list. 
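Aside (illustration, not part of the patch): the pointer arithmetic in the new xdr_check_read_list() walks one Read-list entry at a time. On the wire (RPC-over-RDMA version 1) each entry is a one-word list discriminator followed by a read segment: one word of position, one word of handle, one word of length, and a two-word 64-bit offset. A hypothetical layout struct, for reference only:

    struct example_read_list_entry {        /* illustrative name */
            __be32 discrim;         /* non-zero: another entry follows */
            __be32 position;        /* must match across the chunk's segments */
            __be32 handle;          /* R_key; skipped by "p++" */
            __be32 length;          /* bounded by MAX_BYTES_SPECIAL_SEG */
            __be64 offset;          /* remote address; the "p += 2" */
    };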
+ */ +static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) { - struct ib_rdma_wr read_wr; - struct ib_send_wr inv_wr; - struct ib_reg_wr reg_wr; - u8 key; - int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; - struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); - struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); - int ret, read, pno, dma_nents, n; - u32 pg_off = *page_offset; - u32 pg_no = *page_no; - - if (IS_ERR(frmr)) - return -ENOMEM; - - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = frmr; - nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len); - read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length); - - frmr->direction = DMA_FROM_DEVICE; - frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->sg_nents = nents; - - for (pno = 0; pno < nents; pno++) { - int len = min_t(int, rs_length, PAGE_SIZE - pg_off); - - head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; - head->arg.page_len += len; - head->arg.len += len; - if (!pg_off) - head->count++; - - sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no], - len, pg_off); - - rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* adjust offset and wrap to next page if needed */ - pg_off += len; - if (pg_off == PAGE_SIZE) { - pg_off = 0; - pg_no++; + u32 position; + bool first; + + first = true; + while (*p++ != xdr_zero) { + if (first) { + position = be32_to_cpup(p++); + first = false; + } else if (be32_to_cpup(p++) != position) { + return NULL; } - rs_length -= len; - } + p++; /* handle */ + if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) + return NULL; + p += 2; /* offset */ - if (last && rs_length == 0) - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - else - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - - dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device, - frmr->sg, frmr->sg_nents, - frmr->direction); - if (!dma_nents) { - pr_err("svcrdma: failed to dma map sg %p\n", - frmr->sg); - return -ENOMEM; + if (p > end) + return NULL; } + return p; +} - n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); - if (unlikely(n != frmr->sg_nents)) { - pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n", - frmr->mr, n, frmr->sg_nents); - return n < 0 ? n : -EINVAL; - } +/* The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 60 Write segments for a 1KB inline + * threshold. 
+ */ +static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, + u32 maxlen) +{ + u32 i, segcount; - /* Bump the key */ - key = (u8)(frmr->mr->lkey & 0x000000FF); - ib_update_fast_reg_key(frmr->mr, ++key); - - ctxt->sge[0].addr = frmr->mr->iova; - ctxt->sge[0].lkey = frmr->mr->lkey; - ctxt->sge[0].length = frmr->mr->length; - ctxt->count = 1; - ctxt->read_hdr = head; - - /* Prepare REG WR */ - ctxt->reg_cqe.done = svc_rdma_wc_reg; - reg_wr.wr.wr_cqe = &ctxt->reg_cqe; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.send_flags = IB_SEND_SIGNALED; - reg_wr.wr.num_sge = 0; - reg_wr.mr = frmr->mr; - reg_wr.key = frmr->mr->lkey; - reg_wr.access = frmr->access_flags; - reg_wr.wr.next = &read_wr.wr; - - /* Prepare RDMA_READ */ - memset(&read_wr, 0, sizeof(read_wr)); - ctxt->cqe.done = svc_rdma_wc_read; - read_wr.wr.wr_cqe = &ctxt->cqe; - read_wr.wr.send_flags = IB_SEND_SIGNALED; - read_wr.rkey = rs_handle; - read_wr.remote_addr = rs_offset; - read_wr.wr.sg_list = ctxt->sge; - read_wr.wr.num_sge = 1; - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; - } else { - read_wr.wr.opcode = IB_WR_RDMA_READ; - read_wr.wr.next = &inv_wr; - /* Prepare invalidate */ - memset(&inv_wr, 0, sizeof(inv_wr)); - ctxt->inv_cqe.done = svc_rdma_wc_inv; - inv_wr.wr_cqe = &ctxt->inv_cqe; - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; - inv_wr.ex.invalidate_rkey = frmr->mr->lkey; - } + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + p++; /* handle */ + if (be32_to_cpup(p++) > maxlen) + return NULL; + p += 2; /* offset */ - /* Post the chain */ - ret = svc_rdma_send(xprt, &reg_wr.wr); - if (ret) { - pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - goto err; + if (p > end) + return NULL; } - /* return current location in page array */ - *page_no = pg_no; - *page_offset = pg_off; - ret = read; - atomic_inc(&rdma_stat_read); - return ret; - err: - svc_rdma_put_context(ctxt, 0); - svc_rdma_put_frmr(xprt, frmr); - return ret; -} - -static unsigned int -rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) -{ - unsigned int count; - - for (count = 0; ch->rc_discrim != xdr_zero; ch++) - count++; - return count; + return p; } -/* If there was additional inline content, append it to the end of arg.pages. - * Tail copy has to be done after the reader function has determined how many - * pages are needed for RDMA READ. +/* Sanity check the Write list. + * + * Implementation limits: + * - This implementation supports only one Write chunk. + * + * Sanity checks: + * - Write list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following Reply chunk. 
*/ -static int -rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - u32 position, u32 byte_count, u32 page_offset, int page_no) +static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) { - char *srcp, *destp; - - srcp = head->arg.head[0].iov_base + position; - byte_count = head->arg.head[0].iov_len - position; - if (byte_count > PAGE_SIZE) { - dprintk("svcrdma: large tail unsupported\n"); - return 0; - } - - /* Fit as much of the tail on the current page as possible */ - if (page_offset != PAGE_SIZE) { - destp = page_address(rqstp->rq_arg.pages[page_no]); - destp += page_offset; - while (byte_count--) { - *destp++ = *srcp++; - page_offset++; - if (page_offset == PAGE_SIZE && byte_count) - goto more; - } - goto done; + u32 chcount; + + chcount = 0; + while (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + if (!p) + return NULL; + if (chcount++ > 1) + return NULL; } - -more: - /* Fit the rest on the next page */ - page_no++; - destp = page_address(rqstp->rq_arg.pages[page_no]); - while (byte_count--) - *destp++ = *srcp++; - - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - -done: - byte_count = head->arg.head[0].iov_len - position; - head->arg.page_len += byte_count; - head->arg.len += byte_count; - head->arg.buflen += byte_count; - return 1; + return p; } -/* Returns the address of the first read chunk or <nul> if no read chunk - * is present +/* Sanity check the Reply chunk. + * + * Sanity checks: + * - Reply chunk does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following RPC header. */ -static struct rpcrdma_read_chunk * -svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) { - struct rpcrdma_read_chunk *ch = - (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - - if (ch->rc_discrim == xdr_zero) - return NULL; - return ch; + if (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); + if (!p) + return NULL; + } + return p; } -static int rdma_read_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head) +/* On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + * + * Assumptions: + * - The transport header is entirely contained in the head iovec. 
+ */ +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - int page_no, ret; - struct rpcrdma_read_chunk *ch; - u32 handle, page_offset, byte_count; - u32 position; - u64 rs_offset; - bool last; - - /* If no read list is present, return 0 */ - ch = svc_rdma_get_read_chunk(rmsgp); - if (!ch) - return 0; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; + char *proc; + + /* Verify that there's enough bytes for header + something */ + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; + + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; + + switch (*(rdma_argp + 3)) { + case rdma_msg: + proc = "RDMA_MSG"; + break; + case rdma_nomsg: + proc = "RDMA_NOMSG"; + break; + + case rdma_done: + goto out_drop; - if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) - return -EINVAL; - - /* The request is completed when the RDMA_READs complete. The - * head context keeps all the pages that comprise the - * request. - */ - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->hdr_count = head->count; - head->arg.page_base = 0; - head->arg.page_len = 0; - head->arg.len = rqstp->rq_arg.len; - head->arg.buflen = rqstp->rq_arg.buflen; - - /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ - position = be32_to_cpu(ch->rc_position); - if (position == 0) { - head->arg.pages = &head->pages[0]; - page_offset = head->byte_len; - } else { - head->arg.pages = &head->pages[head->count]; - page_offset = 0; - } + case rdma_error: + goto out_drop; - ret = 0; - page_no = 0; - for (; ch->rc_discrim != xdr_zero; ch++) { - if (be32_to_cpu(ch->rc_position) != position) - goto err; - - handle = be32_to_cpu(ch->rc_target.rs_handle), - byte_count = be32_to_cpu(ch->rc_target.rs_length); - xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, - &rs_offset); - - while (byte_count > 0) { - last = (ch + 1)->rc_discrim == xdr_zero; - ret = xprt->sc_reader(xprt, rqstp, head, - &page_no, &page_offset, - handle, byte_count, - rs_offset, last); - if (ret < 0) - goto err; - byte_count -= ret; - rs_offset += ret; - head->arg.buflen += ret; - } + default: + goto out_proc; } - /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */ - if (page_offset & 3) { - u32 pad = 4 - (page_offset & 3); - - head->arg.tail[0].iov_len += pad; - head->arg.len += pad; - head->arg.buflen += pad; - page_offset += pad; - } + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_len -= hdr_len; + rq_arg->len -= hdr_len; + dprintk("svcrdma: received %s request for XID 0x%08x, hdr_len=%u\n", + proc, be32_to_cpup(rdma_argp), hdr_len); + return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; - ret = 1; - if (position && position < head->arg.head[0].iov_len) - ret = rdma_copy_tail(rqstp, head, position, - byte_count, page_offset, page_no); - head->arg.head[0].iov_len = position; - head->position = position; +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; - err: - /* Detach arg pages. 
svc_recv will replenish them */ - for (page_no = 0; - &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) - rqstp->rq_pages[page_no] = NULL; +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; - return ret; +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } static void rdma_read_complete(struct svc_rqst *rqstp, @@ -528,24 +376,9 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } - /* Adjustments made for RDMA_NOMSG type requests */ - if (head->position == 0) { - if (head->arg.len <= head->sge[0].length) { - head->arg.head[0].iov_len = head->arg.len - - head->byte_len; - head->arg.page_len = 0; - } else { - head->arg.head[0].iov_len = head->sge[0].length - - head->byte_len; - head->arg.page_len = head->arg.len - - head->sge[0].length; - } - } - /* Point rq_arg.pages past header */ rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; - rqstp->rq_arg.page_base = head->arg.page_base; /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_pages[page_no]; @@ -642,21 +475,44 @@ static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, return true; } -/* - * Set up the rqstp thread context to point to the RQ buffer. If - * necessary, pull additional data from the client with an RDMA_READ - * request. +/** + * svc_rdma_recvfrom - Receive an RPC call + * @rqstp: request structure into which to receive an RPC Call + * + * Returns: + * The positive number of bytes in the RPC Call message, + * %0 if there were no Calls ready to return, + * %-EINVAL if the Read chunk data is too large, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only + * when there are no remaining ctxt's to process. + * + * The next ctxt is removed from the "receive" lists. + * + * - If the ctxt completes a Read, then finish assembling the Call + * message and return the number of bytes in the message. + * + * - If the ctxt completes a Receive, then construct the Call + * message from the contents of the Receive buffer. + * + * - If there are no Read chunks in this message, then finish + * assembling the Call message and return the number of bytes + * in the message. + * + * - If there are Read chunks in this message, post Read WRs to + * pull that payload and return 0. */ int svc_rdma_recvfrom(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct svc_rdma_op_ctxt *ctxt = NULL; - struct rpcrdma_msg *rmsgp; - int ret = 0; - - dprintk("svcrdma: rqstp=%p\n", rqstp); + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { @@ -671,22 +527,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, list); list_del(&ctxt->list); } else { - atomic_inc(&rdma_stat_rq_starve); + /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); - ctxt = NULL; + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + return 0; } spin_unlock(&rdma_xprt->sc_rq_dto_lock); - if (!ctxt) { - /* This is the EAGAIN path. 
The svc_recv routine will - * return -EAGAIN, the nfsd thread will go to call into - * svc_recv again and we shouldn't be on the active - * transport list - */ - if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) - goto defer; - goto out; - } - dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n", + + dprintk("svcrdma: recvfrom: ctxt=%p on xprt=%p, rqstp=%p\n", ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); @@ -694,7 +542,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); /* Decode the RDMA header. */ - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) goto out_err; @@ -702,9 +550,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, - &rmsgp->rm_xid, + if (svc_rdma_is_backchannel_reply(xprt, p)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) @@ -712,39 +559,34 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return ret; } - /* Read read-list data. */ - ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); - if (ret > 0) { - /* read-list posted, defer until data received from client. */ - goto defer; - } else if (ret < 0) { - /* Post of read-list failed, free context. */ - svc_rdma_put_context(ctxt, 1); - return 0; - } + p += rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + goto out_readchunk; complete: - ret = rqstp->rq_arg.head[0].iov_len - + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len; svc_rdma_put_context(ctxt, 0); - out: - dprintk("svcrdma: ret=%d, rq_arg.len=%u, " - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n", - ret, rqstp->rq_arg.len, - rqstp->rq_arg.head[0].iov_base, - rqstp->rq_arg.head[0].iov_len); + dprintk("svcrdma: recvfrom: xprt=%p, rqstp=%p, rq_arg.len=%u\n", + rdma_xprt, rqstp, rqstp->rq_arg.len); rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); - return ret; + return rqstp->rq_arg.len; + +out_readchunk: + ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); + if (ret < 0) + goto out_postfail; + return 0; out_err: - svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); + svc_rdma_send_error(rdma_xprt, p, ret); svc_rdma_put_context(ctxt, 0); return 0; -defer: - return 0; +out_postfail: + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_put_context(ctxt, 1); + return ret; out_drop: svc_rdma_put_context(ctxt, 1); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 0cf620277693..933f79bed270 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -12,6 +12,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); + /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. 
* @@ -113,22 +116,20 @@ struct svc_rdma_chunk_ctxt { struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; int cc_sqecount; - enum dma_data_direction cc_dir; }; static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc, - enum dma_data_direction dir) + struct svc_rdma_chunk_ctxt *cc) { cc->cc_rdma = rdma; svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; - cc->cc_dir = dir; } -static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; struct svc_rdma_rw_ctxt *ctxt; @@ -138,7 +139,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, cc->cc_dir); + ctxt->rw_nents, dir); svc_rdma_put_rw_ctxt(rdma, ctxt); } svc_xprt_put(&rdma->sc_xprt); @@ -176,13 +177,14 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) info->wi_seg_no = 0; info->wi_nsegs = be32_to_cpup(++chunk); info->wi_segs = ++chunk; - svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + svc_rdma_cc_init(rdma, &info->wi_cc); + info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; } static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) { - svc_rdma_cc_release(&info->wi_cc); + svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); kfree(info); } @@ -216,6 +218,76 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_write_info_free(info); } +/* State for pulling a Read chunk. + */ +struct svc_rdma_read_info { + struct svc_rdma_op_ctxt *ri_readctxt; + unsigned int ri_position; + unsigned int ri_pageno; + unsigned int ri_pageoff; + unsigned int ri_chunklen; + + struct svc_rdma_chunk_ctxt ri_cc; +}; + +static struct svc_rdma_read_info * +svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_read_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + svc_rdma_cc_init(rdma, &info->ri_cc); + info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; + return info; +} + +static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) +{ + svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); + kfree(info); +} + +/** + * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx + * @cq: controlling Completion Queue + * @wc: Work Completion + * + */ +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_read_info *info = + container_of(cc, struct svc_rdma_read_info, ri_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + svc_rdma_put_context(info->ri_readctxt, 1); + } else { + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&info->ri_readctxt->list, + &rdma->sc_read_complete_q); + spin_unlock(&rdma->sc_rq_dto_lock); + + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + } + + svc_rdma_read_info_free(info); +} + /* This function sleeps when the transport's Send Queue is congested. 
* * Assumptions: @@ -232,6 +304,9 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) struct ib_cqe *cqe; int ret; + if (cc->cc_sqecount > rdma->sc_sq_depth) + return -EINVAL; + first_wr = NULL; cqe = &cc->cc_cqe; list_for_each(tmp, &cc->cc_rwctxts) { @@ -295,8 +370,9 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, struct scatterlist *sg; struct page **page; - page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; - page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page_off = info->wi_next_off + xdr->page_base; + page_no = page_off >> PAGE_SHIFT; + page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; sg = ctxt->rw_sg_table.sgl; @@ -332,7 +408,6 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, __be32 *seg; int ret; - cc->cc_cqe.done = svc_rdma_write_done; seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; @@ -425,6 +500,7 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Write chunk, + * %-EINVAL if client provided too many segments, * %-ENOMEM if rdma_rw context pool was exhausted, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). @@ -465,6 +541,7 @@ out_err: * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Reply chunk, + * %-EINVAL if client provided too many segments, * %-ENOMEM if rdma_rw context pool was exhausted, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). @@ -510,3 +587,353 @@ out_err: svc_rdma_write_info_free(info); return ret; } + +static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, + struct svc_rqst *rqstp, + u32 rkey, u32 len, u64 offset) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int sge_no, seg_len; + struct scatterlist *sg; + int ret; + + sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + if (!ctxt) + goto out_noctx; + ctxt->rw_nents = sge_no; + + dprintk("svcrdma: reading segment %u@0x%016llx:0x%08x (%u sges)\n", + len, offset, rkey, sge_no); + + sg = ctxt->rw_sg_table.sgl; + for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + seg_len = min_t(unsigned int, len, + PAGE_SIZE - info->ri_pageoff); + + head->arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + if (!info->ri_pageoff) + head->count++; + + sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], + seg_len, info->ri_pageoff); + sg = sg_next(sg); + + info->ri_pageoff += seg_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + len -= seg_len; + + /* Safety check */ + if (len && + &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + goto out_overrun; + } + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, + cc->cc_rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, rkey, DMA_FROM_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_overrun: + dprintk("svcrdma: request overruns rq_pages\n"); + return 
-EINVAL; + +out_initerr: + svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + int ret; + + info->ri_chunklen = 0; + while (*p++ != xdr_zero) { + u32 rs_handle, rs_length; + u64 rs_offset; + + if (be32_to_cpup(p++) != info->ri_position) + break; + rs_handle = be32_to_cpup(p++); + rs_length = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &rs_offset); + + ret = svc_rdma_build_read_segment(info, rqstp, + rs_handle, rs_length, + rs_offset); + if (ret < 0) + break; + + info->ri_chunklen += rs_length; + } + + return ret; +} + +/* If there is inline content following the Read chunk, append it to + * the page list immediately following the data payload. This has to + * be done after the reader function has determined how many pages + * were consumed for RDMA Read. + * + * On entry, ri_pageno and ri_pageoff point directly to the end of the + * page list. On exit, both have been updated to the new "next byte". + * + * Assumptions: + * - Inline content fits entirely in rq_pages[0] + * - Trailing content is only a handful of bytes + */ +static int svc_rdma_copy_tail(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + unsigned int tail_length, remaining; + u8 *srcp, *destp; + + /* Assert that all inline content fits in page 0. This is an + * implementation limit, not a protocol limit. + */ + if (head->arg.head[0].iov_len > PAGE_SIZE) { + pr_warn_once("svcrdma: too much trailing inline content\n"); + return -EINVAL; + } + + srcp = head->arg.head[0].iov_base; + srcp += info->ri_position; + tail_length = head->arg.head[0].iov_len - info->ri_position; + remaining = tail_length; + + /* If there is room on the last page in the page list, try to + * fit the trailing content there. + */ + if (info->ri_pageoff > 0) { + unsigned int len; + + len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + destp = page_address(rqstp->rq_pages[info->ri_pageno]); + destp += info->ri_pageoff; + + memcpy(destp, srcp, len); + srcp += len; + destp += len; + info->ri_pageoff += len; + remaining -= len; + + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + } + + /* Otherwise, a fresh page is needed. */ + if (remaining) { + head->arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + head->count++; + + destp = page_address(rqstp->rq_pages[info->ri_pageno]); + memcpy(destp, srcp, remaining); + info->ri_pageoff += remaining; + } + + head->arg.page_len += tail_length; + head->arg.len += tail_length; + head->arg.buflen += tail_length; + return 0; +} + +/* Construct RDMA Reads to pull over a normal Read chunk. The chunk + * data lands in the page list of head->arg.pages. + * + * Currently NFSD does not look at the head->arg.tail[0] iovec. + * Therefore, XDR round-up of the Read chunk and trailing + * inline content must both be added at the end of the pagelist. + */ +static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Read chunk at position %u\n", + info->ri_position); + + info->ri_pageno = head->hdr_count; + info->ri_pageoff = 0; + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + /* Read chunk may need XDR round-up (see RFC 5666, s. 3.7). 
+ */ + if (info->ri_chunklen & 3) { + u32 padlen = 4 - (info->ri_chunklen & 3); + + info->ri_chunklen += padlen; + + /* NB: data payload always starts on XDR alignment, + * thus the pad can never contain a page boundary. + */ + info->ri_pageoff += padlen; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + } + + head->arg.page_len = info->ri_chunklen; + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + + if (info->ri_position < head->arg.head[0].iov_len) { + ret = svc_rdma_copy_tail(rqstp, info); + if (ret < 0) + goto out; + } + head->arg.head[0].iov_len = info->ri_position; + +out: + return ret; +} + +/* Construct RDMA Reads to pull over a Position Zero Read chunk. + * The start of the data lands in the first page just after + * the Transport header, and the rest lands in the page list of + * head->arg.pages. + * + * Assumptions: + * - A PZRC has an XDR-aligned length (no implicit round-up). + * - There can be no trailing inline content (IOW, we assume + * a PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec). + */ +static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Position Zero Read chunk\n"); + + info->ri_pageno = head->hdr_count - 1; + info->ri_pageoff = offset_in_page(head->byte_len); + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + + if (head->arg.buflen <= head->sge[0].length) { + /* Transport header and RPC message fit entirely + * in page where head iovec resides. + */ + head->arg.head[0].iov_len = info->ri_chunklen; + } else { + /* Transport header and part of RPC message reside + * in the head iovec's page. + */ + head->arg.head[0].iov_len = + head->sge[0].length - head->byte_len; + head->arg.page_len = + info->ri_chunklen - head->arg.head[0].iov_len; + } + +out: + return ret; +} + +/** + * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * @rdma: controlling RDMA transport + * @rqstp: set of pages to use as Read sink buffers + * @head: pages under I/O collect here + * @p: pointer to start of Read chunk + * + * Returns: + * %0 if all needed RDMA Reads were posted successfully, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Assumptions: + * - All Read segments in @p have the same Position value. + */ +int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, __be32 *p) +{ + struct svc_rdma_read_info *info; + struct page **page; + int ret; + + /* The request (with page list) is constructed in + * head->arg. Pages involved with RDMA Read I/O are + * transferred there. 
+ */ + head->hdr_count = head->count; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = head->pages; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; + + info = svc_rdma_read_info_alloc(rdma); + if (!info) + return -ENOMEM; + info->ri_readctxt = head; + + info->ri_position = be32_to_cpup(p + 1); + if (info->ri_position) + ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); + else + ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + + /* Mark the start of the pages that can be used for the reply */ + if (info->ri_pageoff > 0) + info->ri_pageno++; + rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + if (ret < 0) + goto out; + + ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + +out: + /* Read sink pages have been moved from rqstp->rq_pages to + * head->arg.pages. Force svc_recv to refill those slots + * in rq_pages. + */ + for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) + *page = NULL; + + if (ret < 0) + svc_rdma_read_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1736337f3a55..7c3a211e0e9a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -313,13 +313,17 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, dma_addr = ib_dma_map_page(dev, virt_to_page(base), offset, len, DMA_TO_DEVICE); if (ib_dma_mapping_error(dev, dma_addr)) - return -EIO; + goto out_maperr; ctxt->sge[sge_no].addr = dma_addr; ctxt->sge[sge_no].length = len; ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; svc_rdma_count_mappings(rdma, ctxt); return 0; + +out_maperr: + pr_err("svcrdma: failed to map buffer\n"); + return -EIO; } static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, @@ -334,13 +338,17 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); if (ib_dma_mapping_error(dev, dma_addr)) - return -EIO; + goto out_maperr; ctxt->sge[sge_no].addr = dma_addr; ctxt->sge[sge_no].length = len; ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; svc_rdma_count_mappings(rdma, ctxt); return 0; + +out_maperr: + pr_err("svcrdma: failed to map page\n"); + return -EIO; } /** @@ -547,7 +555,6 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, return 0; err: - pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -677,7 +684,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) return 0; err2: - if (ret != -E2BIG) + if (ret != -E2BIG && ret != -EINVAL) goto err1; ret = svc_rdma_post_recv(rdma, GFP_KERNEL); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a9d9cb1ba4c6..e660d4965b18 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -202,7 +202,6 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; ctxt->mapped_sges = 0; - ctxt->frmr = NULL; return ctxt; out_empty: @@ -226,22 +225,13 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; struct ib_device *device = xprt->sc_cm_id->device; - u32 lkey = xprt->sc_pd->local_dma_lkey; unsigned int i; - for (i = 0; i < ctxt->mapped_sges; i++) { - /* - * Unmap the DMA addr in 
the SGE if the lkey matches - * the local_dma_lkey, otherwise, ignore it since it is - * an FRMR lkey and will be unmapped later when the - * last WR that uses it completes. - */ - if (ctxt->sge[i].lkey == lkey) - ib_dma_unmap_page(device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - } + for (i = 0; i < ctxt->mapped_sges; i++) + ib_dma_unmap_page(device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); ctxt->mapped_sges = 0; } @@ -346,36 +336,6 @@ out: svc_xprt_put(&xprt->sc_xprt); } -static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, - struct ib_wc *wc, - const char *opname) -{ - if (wc->status != IB_WC_SUCCESS) - goto err; - -out: - atomic_inc(&xprt->sc_sq_avail); - wake_up(&xprt->sc_send_wait); - return; - -err: - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: %s: %s (%u/0x%x)\n", - opname, ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - goto out; -} - -static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc, - const char *opname) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - - svc_rdma_send_wc_common(xprt, wc, opname); - svc_xprt_put(&xprt->sc_xprt); -} - /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue @@ -384,73 +344,28 @@ static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc, */ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "send"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); -} - -/** - * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_reg(struct ib_cq *cq, struct ib_wc *wc) -{ - svc_rdma_send_wc_common_put(cq, wc, "fastreg"); -} - -/** - * svc_rdma_wc_read - Invoked by RDMA provider for each polled Read WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) -{ struct svcxprt_rdma *xprt = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_op_ctxt *ctxt; - svc_rdma_send_wc_common(xprt, wc, "read"); + atomic_inc(&xprt->sc_sq_avail); + wake_up(&xprt->sc_send_wait); ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, ctxt->frmr); - - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr; - - read_hdr = ctxt->read_hdr; - spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->list, - &xprt->sc_read_complete_q); - spin_unlock(&xprt->sc_rq_dto_lock); + svc_rdma_put_context(ctxt, 1); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - svc_xprt_enqueue(&xprt->sc_xprt); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); } - svc_rdma_put_context(ctxt, 0); svc_xprt_put(&xprt->sc_xprt); } -/** - * svc_rdma_wc_inv - Invoked by RDMA provider for each polled LOCAL_INV WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_inv(struct ib_cq *cq, struct ib_wc *wc) -{ - svc_rdma_send_wc_common_put(cq, wc, "localInv"); -} - static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, int listener) { @@ -462,14 +377,12 
@@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); - spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); @@ -780,86 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } -static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) -{ - struct ib_mr *mr; - struct scatterlist *sg; - struct svc_rdma_fastreg_mr *frmr; - u32 num_sg; - - frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); - if (!frmr) - goto err; - - num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len); - mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg); - if (IS_ERR(mr)) - goto err_free_frmr; - - sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL); - if (!sg) - goto err_free_mr; - - sg_init_table(sg, RPCSVC_MAXPAGES); - - frmr->mr = mr; - frmr->sg = sg; - INIT_LIST_HEAD(&frmr->frmr_list); - return frmr; - - err_free_mr: - ib_dereg_mr(mr); - err_free_frmr: - kfree(frmr); - err: - return ERR_PTR(-ENOMEM); -} - -static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_fastreg_mr *frmr; - - while (!list_empty(&xprt->sc_frmr_q)) { - frmr = list_entry(xprt->sc_frmr_q.next, - struct svc_rdma_fastreg_mr, frmr_list); - list_del_init(&frmr->frmr_list); - kfree(frmr->sg); - ib_dereg_mr(frmr->mr); - kfree(frmr); - } -} - -struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_fastreg_mr *frmr = NULL; - - spin_lock(&rdma->sc_frmr_q_lock); - if (!list_empty(&rdma->sc_frmr_q)) { - frmr = list_entry(rdma->sc_frmr_q.next, - struct svc_rdma_fastreg_mr, frmr_list); - list_del_init(&frmr->frmr_list); - frmr->sg_nents = 0; - } - spin_unlock(&rdma->sc_frmr_q_lock); - if (frmr) - return frmr; - - return rdma_alloc_frmr(rdma); -} - -void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, - struct svc_rdma_fastreg_mr *frmr) -{ - if (frmr) { - ib_dma_unmap_sg(rdma->sc_cm_id->device, - frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock(&rdma->sc_frmr_q_lock); - WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); - list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock(&rdma->sc_frmr_q_lock); - } -} - /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. 
The CMA callback handler @@ -908,8 +741,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * capabilities of this particular device */ newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, - RPCSVC_MAXPAGES); newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); @@ -952,7 +783,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; - qp_attr.port_num = newxprt->sc_cm_id->port_num; + qp_attr.port_num = newxprt->sc_port_num; qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; @@ -976,47 +807,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* - * Use the most secure set of MR resources based on the - * transport type and available memory management features in - * the device. Here's the table implemented below: - * - * Fast Global DMA Remote WR - * Reg LKEY MR Access - * Sup'd Sup'd Needed Needed - * - * IWARP N N Y Y - * N Y Y Y - * Y N Y N - * Y Y N - - * - * IB N N Y N - * N Y N - - * Y N Y N - * Y Y N - - * - * NB: iWARP requires remote write access for the data sink - * of an RDMA_READ. IB does not. - */ - newxprt->sc_reader = rdma_read_chunk_lcl; - if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - newxprt->sc_frmr_pg_list_len = - dev->attrs.max_fast_reg_page_list_len; - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; - newxprt->sc_reader = rdma_read_chunk_frmr; - } else + if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) newxprt->sc_snd_w_inv = false; - - /* - * Determine if a DMA MR is required and if so, what privs are required - */ - if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) goto errout; - if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); @@ -1056,7 +852,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); dprintk(" max_sge : %d\n", newxprt->sc_max_sge); - dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); dprintk(" ord : %d\n", newxprt->sc_ord); @@ -1117,12 +912,6 @@ static void __svc_rdma_free(struct work_struct *work) pr_err("svcrdma: sc_xprt still in use? (%d)\n", kref_read(&xprt->xpt_ref)); - /* - * Destroy queued, but not processed read completions. Note - * that this cleanup has to be done before destroying the - * cm_id because the device ptr is needed to unmap the dma in - * svc_rdma_put_context. 
- */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&rdma->sc_read_complete_q, @@ -1130,8 +919,6 @@ static void __svc_rdma_free(struct work_struct *work) list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } - - /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&rdma->sc_rq_dto_q, @@ -1151,7 +938,6 @@ static void __svc_rdma_free(struct work_struct *work) xprt->xpt_bc_xprt = NULL; } - rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 62ecbccd9748..d1c458e5ec4d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -684,7 +684,8 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (unlikely(!list_empty(&req->rl_registered))) + rpcrdma_remove_req(&r_xprt->rx_buf, req); + if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3dbce9ac4327..e4171f2abe37 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -243,8 +243,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif - struct ib_qp_attr *attr = &ia->ri_qp_attr; - struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; int connstate = 0; switch (event->event) { @@ -267,7 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device for %pIS:%u\n", + pr_info("rpcrdma: removing device %s for %pIS:%u\n", + ia->ri_device->name, sap, rpc_get_port(sap)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); @@ -282,13 +281,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) return 1; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; - ib_query_qp(ia->ri_id->qp, attr, - IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, - iattr); - dprintk("RPC: %s: %d responder resources" - " (%d initiator)\n", - __func__, attr->max_dest_rd_atomic, - attr->max_rd_atomic); rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -298,11 +290,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n", - sap, rpc_get_port(sap), ia->ri_device->name, + dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", + sap, rpc_get_port(sap), rdma_reject_msg(id, event->status)); -#endif connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) connstate = -EAGAIN; @@ -310,37 +300,19 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; connected: - dprintk("RPC: %s: %sconnected\n", - __func__, connstate > 0 ? 
"" : "dis"); atomic_set(&xprt->rx_buf.rb_credits, 1); ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", - __func__, sap, rpc_get_port(sap), ep, - rdma_event_msg(event->event)); + dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), + ia->ri_device->name, ia->ri_ops->ro_displayname, + ep, rdma_event_msg(event->event)); break; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - if (connstate == 1) { - int ird = attr->max_dest_rd_atomic; - int tird = ep->rep_remote_cma.responder_resources; - - pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", - sap, rpc_get_port(sap), - ia->ri_device->name, - ia->ri_ops->ro_displayname, - xprt->rx_buf.rb_max_requests, - ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); - } else if (connstate < 0) { - pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", - sap, rpc_get_port(sap), connstate); - } -#endif - return 0; } @@ -971,7 +943,6 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) if (req == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&req->rl_free); spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); @@ -1033,6 +1004,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_pending); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); @@ -1055,7 +1027,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) goto out; } req->rl_backchannel = false; - list_add(&req->rl_free, &buf->rb_send_bufs); + list_add(&req->rl_list, &buf->rb_send_bufs); } INIT_LIST_HEAD(&buf->rb_recv_bufs); @@ -1084,8 +1056,8 @@ rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) struct rpcrdma_req *req; req = list_first_entry(&buf->rb_send_bufs, - struct rpcrdma_req, rl_free); - list_del(&req->rl_free); + struct rpcrdma_req, rl_list); + list_del_init(&req->rl_list); return req; } @@ -1187,6 +1159,7 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) if (!mw) goto out_nomws; + mw->mw_flags = 0; return mw; out_nomws: @@ -1267,7 +1240,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) spin_lock(&buffers->rb_lock); buffers->rb_send_count--; - list_add_tail(&req->rl_free, &buffers->rb_send_bufs); + list_add_tail(&req->rl_list, &buffers->rb_send_bufs); if (rep) { buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1d66acf1a723..b282d3f8cdd8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -271,6 +271,7 @@ struct rpcrdma_mw { struct scatterlist *mw_sg; int mw_nents; enum dma_data_direction mw_dir; + unsigned long mw_flags; union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; @@ -282,6 +283,11 @@ struct rpcrdma_mw { struct list_head mw_all; }; +/* mw_flags */ +enum { + RPCRDMA_MW_F_RI = 1, +}; + /* * struct rpcrdma_req -- structure central to the request/reply sequence. 
* @@ -334,7 +340,8 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { - struct list_head rl_free; + struct list_head rl_list; + __be32 rl_xid; unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; @@ -396,6 +403,7 @@ struct rpcrdma_buffer { int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; + struct list_head rb_pending; u32 rb_max_requests; atomic_t rb_credits; /* most recent credit grant */ @@ -461,7 +469,7 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, - struct rpcrdma_req *); + struct list_head *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); void (*ro_recover_mr)(struct rpcrdma_mw *); @@ -544,6 +552,34 @@ void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +static inline void +rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) +{ + spin_lock(&buffers->rb_lock); + if (list_empty(&req->rl_list)) + list_add_tail(&req->rl_list, &buffers->rb_pending); + spin_unlock(&buffers->rb_lock); +} + +static inline struct rpcrdma_req * +rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) +{ + struct rpcrdma_req *pos; + + list_for_each_entry(pos, &buffers->rb_pending, rl_list) + if (pos->rl_xid == xid) + return pos; + return NULL; +} + +static inline void +rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) +{ + spin_lock(&buffers->rb_lock); + list_del(&req->rl_list); + spin_unlock(&buffers->rb_lock); +} + struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d5b54c020dec..4f154d388748 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1624,6 +1624,8 @@ static void xs_tcp_state_change(struct sock *sk) if (test_and_clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state)) xprt_clear_connecting(xprt); + if (sk->sk_err) + xprt_wake_pending_tasks(xprt, -sk->sk_err); xs_sock_mark_closed(xprt); } out: diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d174ee3254ee..89cd061c4468 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -65,6 +65,8 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) } static void bearer_disable(struct net *net, struct tipc_bearer *b); +static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); /** * tipc_media_find - locates specified media object by name @@ -428,6 +430,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); + b->pt.dev = dev; + b->pt.type = htons(ETH_P_TIPC); + b->pt.func = tipc_l2_rcv_msg; + dev_add_pack(&b->pt); memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); b->bcast_addr.media_id = b->media->type_id; @@ -447,6 +453,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) struct net_device *dev; dev = (struct net_device *)rtnl_dereference(b->media_ptr); + dev_remove_pack(&b->pt); RCU_INIT_POINTER(dev->tipc_ptr, NULL); synchronize_net(); dev_put(dev); @@ -594,11 +601,12 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, 
struct net_device *dev, struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(dev->tipc_ptr); + b = rcu_dereference_rtnl(dev->tipc_ptr) ?: + rcu_dereference_rtnl(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && - (skb->pkt_type <= PACKET_BROADCAST))) { + (skb->pkt_type <= PACKET_MULTICAST))) { skb->next = NULL; - tipc_rcv(dev_net(dev), skb, b); + tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; } @@ -659,11 +667,6 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, return NOTIFY_OK; } -static struct packet_type tipc_packet_type __read_mostly = { - .type = htons(ETH_P_TIPC), - .func = tipc_l2_rcv_msg, -}; - static struct notifier_block notifier = { .notifier_call = tipc_l2_device_event, .priority = 0, @@ -671,19 +674,12 @@ static struct notifier_block notifier = { int tipc_bearer_setup(void) { - int err; - - err = register_netdevice_notifier(&notifier); - if (err) - return err; - dev_add_pack(&tipc_packet_type); - return 0; + return register_netdevice_notifier(&notifier); } void tipc_bearer_cleanup(void) { unregister_netdevice_notifier(&notifier); - dev_remove_pack(&tipc_packet_type); } void tipc_bearer_stop(struct net *net) diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 635c9086e19a..e07a55a80c18 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -131,6 +131,7 @@ struct tipc_media { * @name: bearer name (format = media:interface) * @media: ptr to media structure associated with bearer * @bcast_addr: media address used in broadcasting + * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer * @window: default window size for bearer @@ -151,6 +152,7 @@ struct tipc_bearer { char name[TIPC_MAX_BEARER_NAME]; struct tipc_media *media; struct tipc_media_addr bcast_addr; + struct packet_type pt; struct rcu_head rcu; u32 priority; u32 window; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index ab3087687a32..6ef379f004ac 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -479,13 +479,14 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; - struct tipc_msg *hdr = buf_msg(_skb); + struct tipc_msg *hdr; struct tipc_msg ohdr; - int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); + int dlen; if (skb_linearize(_skb)) goto exit; hdr = buf_msg(_skb); + dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); if (msg_dest_droppable(hdr)) goto exit; if (msg_errcode(hdr)) @@ -511,8 +512,11 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) goto exit; + /* reassign after skb header modifications */ + hdr = buf_msg(_skb); /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); + msg_set_non_seq(hdr, 0); msg_set_origport(hdr, msg_destport(&ohdr)); msg_set_destport(hdr, msg_origport(&ohdr)); msg_set_destnode(hdr, msg_prevnode(&ohdr)); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 9bfe886ab330..750949dfc1d7 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { kfree_skb(msg->rep); + msg->rep = NULL; return -ENOMEM; } err = __tipc_nl_compat_dumpit(cmd, msg, arg); - if (err) { + kfree_skb(msg->rep); - + msg->rep = NULL; + } kfree_skb(arg); return err; diff --git 
diff --git a/net/tipc/node.c b/net/tipc/node.c
index aeef8011ac7d..7dd22330a6b4 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1126,8 +1126,8 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
 		strncpy(linkname, tipc_link_name(link), len);
 		err = 0;
 	}
-exit:
 	tipc_node_read_unlock(node);
+exit:
 	tipc_node_put(node);
 	return err;
 }
@@ -1455,10 +1455,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
 	/* Initiate synch mode if applicable */
 	if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {
 		syncpt = iseqno + exp_pkts - 1;
-		if (!tipc_link_is_up(l)) {
-			tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);
+		if (!tipc_link_is_up(l))
 			__tipc_node_link_up(n, bearer_id, xmitq);
-		}
 		if (n->state == SELF_UP_PEER_UP) {
 			n->sync_point = syncpt;
 			tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
@@ -1559,6 +1557,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 
 	/* Check/update node state before receiving */
 	if (unlikely(skb)) {
+		if (unlikely(skb_linearize(skb)))
+			goto discard;
 		tipc_node_write_lock(n);
 		if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
 			if (le->link) {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 101e3597338f..d50edd6e0019 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2255,8 +2255,8 @@ void tipc_sk_reinit(struct net *net)
 
 	do {
 		tsk = ERR_PTR(rhashtable_walk_start(&iter));
-		if (tsk)
-			continue;
+		if (IS_ERR(tsk))
+			goto walk_stop;
 
 		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
@@ -2265,7 +2265,7 @@ void tipc_sk_reinit(struct net *net)
 			msg_set_orignode(msg, tn->own_addr);
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
-
+walk_stop:
 		rhashtable_walk_stop(&iter);
 	} while (tsk == ERR_PTR(-EAGAIN));
 }
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 0bf91cd3733c..be3d9e3183dc 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -52,7 +52,6 @@ struct tipc_subscriber {
 	struct list_head subscrp_list;
 };
 
-static void tipc_subscrp_delete(struct tipc_subscription *sub);
 static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
 
 /**
@@ -197,15 +196,19 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
 {
 	struct list_head *subscription_list = &subscriber->subscrp_list;
 	struct tipc_subscription *sub, *temp;
+	u32 timeout;
 
 	spin_lock_bh(&subscriber->lock);
 	list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) {
 		if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
			continue;
 
-		tipc_nametbl_unsubscribe(sub);
-		list_del(&sub->subscrp_list);
-		tipc_subscrp_delete(sub);
+		timeout = htohl(sub->evt.s.timeout, sub->swap);
+		if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) {
+			tipc_nametbl_unsubscribe(sub);
+			list_del(&sub->subscrp_list);
+			tipc_subscrp_put(sub);
+		}
 
 		if (s)
 			break;
@@ -236,18 +239,12 @@ static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
 	tipc_subscrb_put(subscriber);
 }
 
-static void tipc_subscrp_delete(struct tipc_subscription *sub)
-{
-	u32 timeout = htohl(sub->evt.s.timeout, sub->swap);
-
-	if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer))
-		tipc_subscrp_put(sub);
-}
-
 static void tipc_subscrp_cancel(struct tipc_subscr *s,
 				struct tipc_subscriber *subscriber)
 {
+	tipc_subscrb_get(subscriber);
 	tipc_subscrb_subscrp_delete(subscriber, s);
+	tipc_subscrb_put(subscriber);
 }
 
 static struct tipc_subscription *tipc_subscrp_create(struct net *net,
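The tipc_sk_reinit() hunk above fixes the rhashtable walk so that a successful rhashtable_walk_start() (which returns 0) is no longer treated as an error, and the walk is retried on -EAGAIN. Below is a hedged, generic sketch of that walk-and-retry pattern; my_obj, my_rht and my_visit are placeholder names, not part of the patch.

/* illustrative sketch of the rhashtable walk/retry pattern */
#include <linux/err.h>
#include <linux/rhashtable.h>

struct my_obj;	/* opaque table entry type for this sketch */

static void my_walk(struct rhashtable *my_rht,
		    void (*my_visit)(struct my_obj *obj))
{
	struct rhashtable_iter iter;
	struct my_obj *obj;

	rhashtable_walk_enter(my_rht, &iter);
	do {
		obj = ERR_PTR(rhashtable_walk_start(&iter));
		if (IS_ERR(obj))
			goto walk_stop;		/* start failed: stop and maybe retry */

		while ((obj = rhashtable_walk_next(&iter)) && !IS_ERR(obj))
			my_visit(obj);		/* visit each live entry */
walk_stop:
		rhashtable_walk_stop(&iter);
	} while (obj == ERR_PTR(-EAGAIN));	/* table resized under us: walk again */
	rhashtable_walk_exit(&iter);
}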
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index a03130a47b85..60aff60e30ad 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -272,7 +272,7 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
 		goto out;
 	}
 
-	if (len == sizeof(crypto_info)) {
+	if (len == sizeof(*crypto_info)) {
 		if (copy_to_user(optval, crypto_info, sizeof(*crypto_info)))
 			rc = -EFAULT;
 		goto out;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..be8982b4f8c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	 */
 	mutex_lock(&u->iolock);
 
-	if (flags & MSG_PEEK)
-		skip = sk_peek_offset(sk, flags);
-	else
-		skip = 0;
+	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
 		int chunk;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 45ba3d0872cc..8ce85420ecb0 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -291,8 +291,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
 	[NL80211_ATTR_PID] = { .type = NLA_U32 },
 	[NL80211_ATTR_4ADDR] = { .type = NLA_U8 },
-	[NL80211_ATTR_PMKID] = { .type = NLA_BINARY,
-				 .len = WLAN_PMKID_LEN },
+	[NL80211_ATTR_PMKID] = { .len = WLAN_PMKID_LEN },
 	[NL80211_ATTR_DURATION] = { .type = NLA_U32 },
 	[NL80211_ATTR_COOKIE] = { .type = NLA_U64 },
 	[NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED },
@@ -348,6 +347,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 },
 	[NL80211_ATTR_P2P_CTWINDOW] = { .type = NLA_U8 },
 	[NL80211_ATTR_P2P_OPPPS] = { .type = NLA_U8 },
+	[NL80211_ATTR_LOCAL_MESH_POWER_MODE] = {. type = NLA_U32 },
 	[NL80211_ATTR_ACL_POLICY] = {. type = NLA_U32 },
 	[NL80211_ATTR_MAC_ADDRS] = { .type = NLA_NESTED },
 	[NL80211_ATTR_STA_CAPABILITY] = { .type = NLA_U16 },
@@ -520,7 +520,7 @@ nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = {
 static const struct nla_policy
 nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = {
 	[NL80211_NAN_FUNC_TYPE] = { .type = NLA_U8 },
-	[NL80211_NAN_FUNC_SERVICE_ID] = { .type = NLA_BINARY,
+	[NL80211_NAN_FUNC_SERVICE_ID] = {
 				    .len = NL80211_NAN_FUNC_SERVICE_ID_LEN },
 	[NL80211_NAN_FUNC_PUBLISH_TYPE] = { .type = NLA_U8 },
 	[NL80211_NAN_FUNC_PUBLISH_BCAST] = { .type = NLA_FLAG },
@@ -6469,6 +6469,10 @@ static int validate_scan_freqs(struct nlattr *freqs)
 	struct nlattr *attr1, *attr2;
 	int n_channels = 0, tmp1, tmp2;
 
+	nla_for_each_nested(attr1, freqs, tmp1)
+		if (nla_len(attr1) != sizeof(u32))
+			return 0;
+
 	nla_for_each_nested(attr1, freqs, tmp1) {
 		n_channels++;
 		/*
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ff61d8557929..69b16ee327d9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2226,7 +2226,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 				goto no_transform;
 			}
 
-			dst_hold(&xdst->u.dst);
 			route = xdst->route;
 		}
 	}
@@ -3308,9 +3307,15 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 	struct xfrm_state *x_new[XFRM_MAX_DEPTH];
 	struct xfrm_migrate *mp;
 
+	/* Stage 0 - sanity checks */
 	if ((err = xfrm_migrate_check(m, num_migrate)) < 0)
 		goto out;
 
+	if (dir >= XFRM_POLICY_MAX) {
+		err = -EINVAL;
+		goto out;
+	}
+
	/* Stage 1 - find policy */
 	if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) {
 		err = -ENOENT;
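The xfrm_migrate() and validate_scan_freqs() hunks above both tighten validation of user-controlled netlink input before it is used as an array index or trusted length. A hedged sketch of that pattern follows; MY_DIR_MAX and my_parse_dir() are made-up names used only for illustration.

/* illustrative sketch: validate netlink input before trusting it */
#include <linux/errno.h>
#include <net/netlink.h>

#define MY_DIR_MAX 3	/* size of the hypothetical per-direction table */

static int my_parse_dir(const struct nlattr *attr, u32 *dir)
{
	/* reject truncated attributes instead of reading past the payload */
	if (nla_len(attr) != sizeof(u32))
		return -EINVAL;

	*dir = nla_get_u32(attr);

	/* reject out-of-range values before they index a fixed-size array */
	if (*dir >= MY_DIR_MAX)
		return -EINVAL;

	return 0;
}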
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 6c0956d10db6..a792effdb0b5 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1620,6 +1620,7 @@ int
 xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
	       unsigned short family, struct net *net)
 {
+	int i;
 	int err = 0;
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
 	if (!afinfo)
@@ -1628,6 +1629,9 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n,
 	spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/
 	if (afinfo->tmpl_sort)
 		err = afinfo->tmpl_sort(dst, src, n);
+	else
+		for (i = 0; i < n; i++)
+			dst[i] = src[i];
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 	rcu_read_unlock();
 	return err;
@@ -1638,6 +1642,7 @@ int
 xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
		unsigned short family)
 {
+	int i;
 	int err = 0;
 	struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family);
 	struct net *net = xs_net(*src);
@@ -1648,6 +1653,9 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n,
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
 	if (afinfo->state_sort)
 		err = afinfo->state_sort(dst, src, n);
+	else
+		for (i = 0; i < n; i++)
+			dst[i] = src[i];
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 	rcu_read_unlock();
 	return err;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 2be4c6af008a..9391ced05259 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -796,7 +796,7 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb
 		return -EMSGSIZE;
 
 	xuo = nla_data(attr);
-
+	memset(xuo, 0, sizeof(*xuo));
 	xuo->ifindex = xso->dev->ifindex;
 	xuo->flags = xso->flags;
 
@@ -1869,6 +1869,7 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 		return -EMSGSIZE;
 
 	id = nlmsg_data(nlh);
+	memset(&id->sa_id, 0, sizeof(id->sa_id));
 	memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr));
 	id->sa_id.spi = x->id.spi;
 	id->sa_id.family = x->props.family;
@@ -2578,6 +2579,8 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	ue = nlmsg_data(nlh);
 	copy_to_user_state(x, &ue->state);
 	ue->hard = (c->data.hard != 0) ? 1 : 0;
+	/* clear the padding bytes */
+	memset(&ue->hard + 1, 0, sizeof(*ue) - offsetofend(typeof(*ue), hard));
 
 	err = xfrm_mark_put(skb, &x->mark);
 	if (err)
@@ -2715,6 +2718,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
 		struct nlattr *attr;
 
 		id = nlmsg_data(nlh);
+		memset(id, 0, sizeof(*id));
 		memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr));
 		id->spi = x->id.spi;
 		id->family = x->props.family;
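The xfrm_user hunks above all apply the same fix: zero the netlink payload before filling individual fields so that structure holes and padding never carry stale kernel memory to user space. A hedged sketch of that pattern, with a made-up struct and function name, is shown below.

/* illustrative sketch: zero-fill a netlink payload before populating it */
#include <linux/string.h>
#include <net/netlink.h>

struct my_sa_id {
	__be32	spi;
	u16	family;
	u8	proto;
	/* one byte of implicit padding here unless explicitly cleared */
};

static void my_fill_sa_id(struct nlmsghdr *nlh, __be32 spi, u16 family, u8 proto)
{
	struct my_sa_id *id = nlmsg_data(nlh);

	memset(id, 0, sizeof(*id));	/* also clears the padding byte */
	id->spi = spi;
	id->family = family;
	id->proto = proto;
}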