71 files changed, 1778 insertions, 971 deletions
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index e9d0f0c1a048..16d287565987 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -275,7 +275,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
 			}
 			break;
 		case 'R':{
-				int16_t *nwqid = va_arg(ap, int16_t *);
+				uint16_t *nwqid = va_arg(ap, uint16_t *);
 				struct p9_qid **wqids =
 				    va_arg(ap, struct p9_qid **);
 
@@ -440,7 +440,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 						 stbuf->n_gid, stbuf->n_muid);
 			} break;
 		case 'V':{
-				int32_t count = va_arg(ap, int32_t);
+				uint32_t count = va_arg(ap, uint32_t);
 				struct iov_iter *from =
 						va_arg(ap, struct iov_iter *);
 				errcode = p9pdu_writef(pdu, proto_version, "d",
@@ -471,7 +471,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
 			}
 			break;
 		case 'R':{
-				int16_t nwqid = va_arg(ap, int);
+				uint16_t nwqid = va_arg(ap, int);
 				struct p9_qid *wqids =
 				    va_arg(ap, struct p9_qid *);
 
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 3e3d82d8ff70..bced8c074c12 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -734,6 +734,7 @@ static int parse_opts(char *params, struct p9_fd_opts *opts)
 	opts->port = P9_PORT;
 	opts->rfd = ~0;
 	opts->wfd = ~0;
+	opts->privport = 0;
 
 	if (!params)
 		return 0;
@@ -1013,7 +1014,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
 {
 	int err;
 	struct p9_fd_opts opts;
-	struct p9_trans_fd *p;
 
 	parse_opts(args, &opts);
 
@@ -1026,7 +1026,6 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
 	if (err < 0)
 		return err;
 
-	p = (struct p9_trans_fd *) client->trans;
 	p9_conn_create(client);
 
 	return 0;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 14ad43b5cf89..3533d2a53ab6 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -139,6 +139,7 @@ struct p9_rdma_opts {
 	int sq_depth;
 	int rq_depth;
 	long timeout;
+	int privport;
 };
 
 /*
@@ -146,7 +147,10 @@ struct p9_rdma_opts {
  */
 enum {
 	/* Options that take integer arguments */
-	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err,
+	Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
+	/* Options that take no argument */
+	Opt_privport,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -154,6 +158,7 @@ static match_table_t tokens = {
 	{Opt_sq_depth, "sq=%u"},
 	{Opt_rq_depth, "rq=%u"},
 	{Opt_timeout, "timeout=%u"},
+	{Opt_privport, "privport"},
 	{Opt_err, NULL},
 };
 
@@ -175,6 +180,7 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 	opts->sq_depth = P9_RDMA_SQ_DEPTH;
 	opts->rq_depth = P9_RDMA_RQ_DEPTH;
 	opts->timeout = P9_RDMA_TIMEOUT;
+	opts->privport = 0;
 
 	if (!params)
 		return 0;
@@ -193,13 +199,13 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 		if (!*p)
 			continue;
 		token = match_token(p, tokens, args);
-		if (token == Opt_err)
-			continue;
-		r = match_int(&args[0], &option);
-		if (r < 0) {
-			p9_debug(P9_DEBUG_ERROR,
-				 "integer field, but no integer?\n");
-			continue;
+		if ((token != Opt_err) && (token != Opt_privport)) {
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				continue;
+			}
 		}
 		switch (token) {
 		case Opt_port:
@@ -214,6 +220,9 @@ static int parse_opts(char *params, struct p9_rdma_opts *opts)
 		case Opt_timeout:
 			opts->timeout = option;
 			break;
+		case Opt_privport:
+			opts->privport = 1;
+			break;
 		default:
 			continue;
 		}
@@ -607,6 +616,23 @@ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
 	return 0;
 }
 
+static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
+{
+	struct sockaddr_in cl = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+	};
+	int port, err = -EINVAL;
+
+	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
+		cl.sin_port = htons((ushort)port);
+		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
+		if (err != -EADDRINUSE)
+			break;
+	}
+	return err;
+}
+
 /**
  * trans_create_rdma - Transport method for creating atransport instance
  * @client: client instance
@@ -642,6 +668,16 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
 	/* Associate the client with the transport */
 	client->trans = rdma;
 
+	/* Bind to a privileged port if we need to */
+	if (opts.privport) {
+		err = p9_rdma_bind_privport(rdma);
+		if (err < 0) {
+			pr_err("%s (%d): problem binding to privport: %d\n",
+			       __func__, task_pid_nr(current), -err);
+			goto error;
+		}
+	}
+
 	/* Resolve the server's address */
 	rdma->addr.sin_family = AF_INET;
 	rdma->addr.sin_addr.s_addr = in_aton(addr);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index e62bcbbabb5e..9dd49ca67dbc 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -525,7 +525,10 @@ static ssize_t p9_mount_tag_show(struct device *dev,
 	vdev = dev_to_virtio(dev);
 	chan = vdev->priv;
 
-	return snprintf(buf, chan->tag_len + 1, "%s", chan->tag);
+	memcpy(buf, chan->tag, chan->tag_len);
+	buf[chan->tag_len] = 0;
+
+	return chan->tag_len + 1;
 }
 
 static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL);
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index a05b9dbf14c9..9070dfd6b4ad 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -1313,7 +1313,8 @@ int hidp_connection_add(struct hidp_connadd_req *req,
 			struct socket *ctrl_sock,
 			struct socket *intr_sock)
 {
-	u32 valid_flags = 0;
+	u32 valid_flags = BIT(HIDP_VIRTUAL_CABLE_UNPLUG) |
+			  BIT(HIDP_BOOT_PROTOCOL_MODE);
 	struct hidp_session *session;
 	struct l2cap_conn *conn;
 	struct l2cap_chan *chan;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 409608960899..e29ad70b3000 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -170,7 +170,7 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
 	struct br_port_msg *bpm;
 	struct nlattr *nest, *nest2;
 
-	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
 	if (!nlh)
 		return -EMSGSIZE;
 
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 0e4ddb81610d..4b5c236998ff 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -394,7 +394,7 @@ errout:
  * Dump information about all ports, in response to GETLINK
  */
 int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
-	       struct net_device *dev, u32 filter_mask)
+	       struct net_device *dev, u32 filter_mask, int nlflags)
 {
 	struct net_bridge_port *port = br_port_get_rtnl(dev);
 
@@ -402,7 +402,7 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	    !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
 		return 0;
 
-	return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, NLM_F_MULTI,
+	return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
 			      filter_mask, dev);
 }
 
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 6ca0251cb478..3362c29400f1 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -828,7 +828,7 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port);
 int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
 int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
 int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
-	       u32 filter_mask);
+	       u32 filter_mask, int nlflags);
 
 #ifdef CONFIG_SYSFS
 /* br_sysfs_if.c */
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ec565508e904..79e8f71aef5b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -490,6 +490,43 @@ out:
 }
 EXPORT_SYMBOL(ceph_parse_options);
 
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+{
+	struct ceph_options *opt = client->options;
+	size_t pos = m->count;
+
+	if (opt->name)
+		seq_printf(m, "name=%s,", opt->name);
+	if (opt->key)
+		seq_puts(m, "secret=<hidden>,");
+
+	if (opt->flags & CEPH_OPT_FSID)
+		seq_printf(m, "fsid=%pU,", &opt->fsid);
+	if (opt->flags & CEPH_OPT_NOSHARE)
+		seq_puts(m, "noshare,");
+	if (opt->flags & CEPH_OPT_NOCRC)
+		seq_puts(m, "nocrc,");
+	if (opt->flags & CEPH_OPT_NOMSGAUTH)
+		seq_puts(m, "nocephx_require_signatures,");
+	if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+		seq_puts(m, "notcp_nodelay,");
+
+	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+		seq_printf(m, "osdkeepalivetimeout=%d,",
+			   opt->osd_keepalive_timeout);
+
+	/* drop redundant comma */
+	if (m->count != pos)
+		m->count--;
+
+	return 0;
+}
+EXPORT_SYMBOL(ceph_print_client_options);
+
 u64 ceph_client_id(struct ceph_client *client)
 {
 	return client->monc.auth->global_id;
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 16bc199d9a62..9d84ce4ea0df 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -17,6 +17,7 @@ const char *crush_bucket_alg_name(int alg)
 	case CRUSH_BUCKET_LIST: return "list";
 	case CRUSH_BUCKET_TREE: return "tree";
 	case CRUSH_BUCKET_STRAW: return "straw";
+	case CRUSH_BUCKET_STRAW2: return "straw2";
 	default: return "unknown";
 	}
 }
@@ -40,6 +41,8 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)
 		return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)];
 	case CRUSH_BUCKET_STRAW:
 		return ((struct crush_bucket_straw *)b)->item_weights[p];
+	case CRUSH_BUCKET_STRAW2:
+		return ((struct crush_bucket_straw2 *)b)->item_weights[p];
 	}
 	return 0;
 }
@@ -77,6 +80,14 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
 	kfree(b);
 }
 
+void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
+{
+	kfree(b->item_weights);
+	kfree(b->h.perm);
+	kfree(b->h.items);
+	kfree(b);
+}
+
 void crush_destroy_bucket(struct crush_bucket *b)
 {
 	switch (b->alg) {
@@ -92,6 +103,9 @@ void crush_destroy_bucket(struct crush_bucket *b)
 	case CRUSH_BUCKET_STRAW:
 		crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
 		break;
+	case CRUSH_BUCKET_STRAW2:
+		crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b);
+		break;
 	}
 }
 
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
new file mode 100644
index 000000000000..6192c7fc958c
--- /dev/null
+++ b/net/ceph/crush/crush_ln_table.h
@@ -0,0 +1,166 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#if defined(__linux__)
+#include <linux/types.h>
+#elif defined(__FreeBSD__)
+#include <sys/types.h>
+#endif
+
+#ifndef CEPH_CRUSH_LN_H
+#define CEPH_CRUSH_LN_H
+
+
+// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+
+static int64_t __RH_LH_tbl[128*2+2] = {
+  0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
+  0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
+  0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
+  0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll,
+  0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll,
+  0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll,
+  0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll,
+  0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell,
+  0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll,
+  0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll,
+  0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll,
+  0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll,
+  0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll,
+  0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll,
+  0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all,
+  0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll,
+  0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all,
+  0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell,
+  0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll,
+  0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll,
+  0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll,
+  0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll,
+  0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll,
+  0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll,
+  0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll,
+  0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll,
+  0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell,
+  0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll,
+  0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll,
+  0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll,
+  0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll,
+  0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll,
+  0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll,
+  0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll,
+  0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll,
+  0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll,
+  0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll,
+  0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll,
+  0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll,
+  0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll,
+  0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll,
+  0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll,
+  0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll,
+  0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll,
+  0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll,
+  0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll,
+  0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll,
+  0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll,
+  0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll,
+  0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll,
+  0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll,
+  0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll,
+  0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll,
+  0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell,
+  0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell,
+  0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll,
+  0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell,
+  0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll,
+  0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll,
+  0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll,
+  0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll,
+  0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll,
+  0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
+  0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
+  0x0000800000000000ll, 0x0000ffff00000000ll,
+  };
+
+
+    // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
+static int64_t __LL_tbl[256] = {
+  0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
+  0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
+  0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
+  0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull,
+  0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull,
+  0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull,
+  0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull,
+  0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull,
+  0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull,
+  0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull,
+  0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull,
+  0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull,
+  0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull,
+  0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull,
+  0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull,
+  0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull,
+  0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull,
+  0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull,
+  0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 0x000000d96f9fbbdbull,
+  0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull,
+  0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull,
+  0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull,
+  0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull,
+  0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull,
+  0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull,
+  0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull,
+  0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull,
+  0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull,
+  0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull,
+  0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull,
+  0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull,
+  0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull,
+  0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull,
+  0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull,
+  0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull,
+  0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull,
+  0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull,
+  0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull,
+  0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull,
+  0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull,
+  0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull,
+  0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull,
+  0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull,
+  0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull,
+  0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull,
+  0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull,
+  0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull,
+  0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull,
+  0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull,
+  0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull,
+  0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull,
+  0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull,
+  0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull,
+  0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull,
+  0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull,
+  0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull,
+  0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull,
+  0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull,
+  0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull,
+  0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull,
+  0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull,
+  0x000002bd6d4901ccull, 0x000002c04a796cf6ull, 0x000002c327a428a6ull, 0x000002c61a5e8f4cull,
+  0x000002c8e1e891f6ull, 0x000002cbbf023fc2ull, 0x000002ce9c163e6eull, 0x000002d179248e13ull,
+  0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
+};
+
+
+
+
+#endif
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a1ef53c04415..5b47736d27d9 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,7 +20,7 @@
 
 #include <linux/crush/crush.h>
 #include <linux/crush/hash.h>
-#include <linux/crush/mapper.h>
+#include "crush_ln_table.h"
 
 /*
  * Implement the core CRUSH mapping algorithm.
@@ -238,6 +238,102 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
+// compute 2^44*log2(input+1)
+uint64_t crush_ln(unsigned xin)
+{
+    unsigned x=xin, x1;
+    int iexpon, index1, index2;
+    uint64_t RH, LH, LL, xl64, result;
+
+    x++;
+
+    // normalize input
+    iexpon = 15;
+    while(!(x&0x18000)) { x<<=1; iexpon--; }
+
+    index1 = (x>>8)<<1;
+    // RH ~ 2^56/index1
+    RH = __RH_LH_tbl[index1 - 256];
+    // LH ~ 2^48 * log2(index1/256)
+    LH = __RH_LH_tbl[index1 + 1 - 256];
+
+    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
+    xl64 = (int64_t)x * RH;
+    xl64 >>= 48;
+    x1 = xl64;
+
+    result = iexpon;
+    result <<= (12 + 32);
+
+    index2 = x1 & 0xff;
+    // LL ~ 2^48*log2(1.0+index2/2^15)
+    LL = __LL_tbl[index2];
+
+    LH = LH + LL;
+
+    LH >>= (48-12 - 32);
+    result += LH;
+
+    return result;
+}
+
+
+/*
+ * straw2
+ *
+ * for reference, see:
+ *
+ * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ *
+ */
+
+static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
+				int x, int r)
+{
+	unsigned i, high = 0;
+	unsigned u;
+	unsigned w;
+	__s64 ln, draw, high_draw = 0;
+
+	for (i = 0; i < bucket->h.size; i++) {
+		w = bucket->item_weights[i];
+		if (w) {
+			u = crush_hash32_3(bucket->h.hash, x,
+					   bucket->h.items[i], r);
+			u &= 0xffff;
+
+			/*
+			 * for some reason slightly less than 0x10000 produces
+			 * a slightly more accurate distribution... probably a
+			 * rounding effect.
+			 *
+			 * the natural log lookup table maps [0,0xffff]
+			 * (corresponding to real numbers [1/0x10000, 1] to
+			 * [0, 0xffffffffffff] (corresponding to real numbers
+			 * [-11.090355,0]).
+			 */
+			ln = crush_ln(u) - 0x1000000000000ll;
+
+			/*
+			 * divide by 16.16 fixed-point weight.  note
+			 * that the ln value is negative, so a larger
+			 * weight means a larger (less negative) value
+			 * for draw.
+			 */
+			draw = div64_s64(ln, w);
+		} else {
+			draw = S64_MIN;
+		}
+
+		if (i == 0 || draw > high_draw) {
+			high = i;
+			high_draw = draw;
+		}
+	}
+	return bucket->h.items[high];
+}
+
+
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
@@ -255,12 +351,16 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 	case CRUSH_BUCKET_STRAW:
 		return bucket_straw_choose((struct crush_bucket_straw *)in,
 					   x, r);
+	case CRUSH_BUCKET_STRAW2:
+		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
+					    x, r);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
 	}
 }
 
+
 /*
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
@@ -290,6 +390,7 @@ static int is_out(const struct crush_map *map,
  * @type: the type of item to choose
  * @out: pointer to output vector
  * @outpos: our position in that vector
+ * @out_size: size of the out vector
  * @tries: number of attempts to make
  * @recurse_tries: number of attempts to have recursive chooseleaf make
  * @local_retries: localized retries
@@ -304,6 +405,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       const __u32 *weight, int weight_max,
 			       int x, int numrep, int type,
 			       int *out, int outpos,
+			       int out_size,
 			       unsigned int tries,
 			       unsigned int recurse_tries,
 			       unsigned int local_retries,
@@ -322,6 +424,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
+	int count = out_size;
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
 		recurse_to_leaf ? "_LEAF" : "",
@@ -329,7 +432,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 		tries, recurse_tries, local_retries, local_fallback_retries,
 		parent_r);
 
-	for (rep = outpos; rep < numrep; rep++) {
+	for (rep = outpos; rep < numrep && count > 0 ; rep++) {
 		/* keep trying until we get a non-out, non-colliding item */
 		ftotal = 0;
 		skip_rep = 0;
@@ -403,7 +506,7 @@ static int crush_choose_firstn(const struct crush_map *map,
 							 map->buckets[-1-item],
 							 weight, weight_max,
 							 x, outpos+1, 0,
-							 out2, outpos,
+							 out2, outpos, count,
 							 recurse_tries, 0,
 							 local_retries,
 							 local_fallback_retries,
@@ -463,6 +566,7 @@ reject:
 		dprintk("CHOOSE got %d\n", item);
 		out[outpos] = item;
 		outpos++;
+		count--;
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -654,6 +758,7 @@ int crush_do_rule(const struct crush_map *map,
 	__u32 step;
 	int i, j;
 	int numrep;
+	int out_size;
 	/*
 	 * the original choose_total_tries value was off by one (it
 	 * counted "retries" and not "tries").  add one.
@@ -761,6 +866,7 @@ int crush_do_rule(const struct crush_map *map,
 						x, numrep,
 						curstep->arg2,
 						o+osize, j,
+						result_max-osize,
 						choose_tries,
 						recurse_tries,
 						choose_local_retries,
@@ -770,11 +876,13 @@ int crush_do_rule(const struct crush_map *map,
 						c+osize,
 						0);
 				} else {
+					out_size = ((numrep < (result_max-osize)) ?
+                                                    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
 						map->buckets[-1-w[i]],
 						weight, weight_max,
-						x, numrep, numrep,
+						x, out_size, numrep,
 						curstep->arg2,
 						o+osize, j,
 						choose_tries,
@@ -783,7 +891,7 @@ int crush_do_rule(const struct crush_map *map,
 						recurse_to_leaf,
 						c+osize,
 						0);
-					osize += numrep;
+					osize += out_size;
 				}
 			}
 
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 14d9995097cc..593dc2eabcc8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -22,6 +22,7 @@
  *      .../monmap      - current monmap
  *      .../osdc        - active osd requests
  *      .../monc        - mon client state
+ *      .../client_options - libceph-only (i.e. not rbd or cephfs) options
  *      .../dentry_lru  - dump contents of dentry lru
  *      .../caps        - expose cap (reservation) stats
  *      .../bdi         - symlink to ../../bdi/something
@@ -177,10 +178,24 @@ static int osdc_show(struct seq_file *s, void *pp)
 	return 0;
 }
 
+static int client_options_show(struct seq_file *s, void *p)
+{
+	struct ceph_client *client = s->private;
+	int ret;
+
+	ret = ceph_print_client_options(s, client);
+	if (ret)
+		return ret;
+
+	seq_putc(s, '\n');
+	return 0;
+}
+
 CEPH_DEFINE_SHOW_FUNC(monmap_show)
 CEPH_DEFINE_SHOW_FUNC(osdmap_show)
 CEPH_DEFINE_SHOW_FUNC(monc_show)
 CEPH_DEFINE_SHOW_FUNC(osdc_show)
+CEPH_DEFINE_SHOW_FUNC(client_options_show)
 
 int ceph_debugfs_init(void)
 {
@@ -242,6 +257,14 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	if (!client->debugfs_osdmap)
 		goto out;
 
+	client->debugfs_options = debugfs_create_file("client_options",
+					0600,
+					client->debugfs_dir,
+					client,
+					&client_options_show_fops);
+	if (!client->debugfs_options)
+		goto out;
+
 	return 0;
 
 out:
@@ -252,6 +275,7 @@ out:
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
 	dout("ceph_debugfs_client_cleanup %p\n", client);
+	debugfs_remove(client->debugfs_options);
 	debugfs_remove(client->debugfs_osdmap);
 	debugfs_remove(client->debugfs_monmap);
 	debugfs_remove(client->osdc.debugfs_file);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a9f4ae45b7fb..967080a9f043 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -505,8 +505,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		pr_err("connect %s error %d\n",
 		       ceph_pr_addr(&con->peer_addr.in_addr), ret);
 		sock_release(sock);
-		con->error_msg = "connect error";
-
 		return ret;
 	}
 
@@ -2145,12 +2143,10 @@ static int process_connect(struct ceph_connection *con)
 		 * to WAIT.  This shouldn't happen if we are the
 		 * client.
 		 */
-		pr_err("process_connect got WAIT as client\n");
 		con->error_msg = "protocol error, got WAIT as client";
 		return -1;
 
 	default:
-		pr_err("connect protocol error, will retry\n");
 		con->error_msg = "protocol error, garbage tag during connect";
 		return -1;
 	}
@@ -2282,8 +2278,7 @@ static int read_partial_message(struct ceph_connection *con)
 
 	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
 	if (cpu_to_le32(crc) != con->in_hdr.crc) {
-		pr_err("read_partial_message bad hdr "
-		       " crc %u != expected %u\n",
+		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
 		       crc, con->in_hdr.crc);
 		return -EBADMSG;
 	}
@@ -2313,7 +2308,7 @@ static int read_partial_message(struct ceph_connection *con)
 		pr_err("read_partial_message bad seq %lld expected %lld\n",
 		       seq, con->in_seq + 1);
 		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADMSG;
+		return -EBADE;
 	}
 
 	/* allocate message? */
@@ -2660,6 +2655,8 @@ more:
 			switch (ret) {
 			case -EBADMSG:
 				con->error_msg = "bad crc";
+				/* fall through */
+			case -EBADE:
 				ret = -EIO;
 				break;
 			case -EIO:
@@ -2838,7 +2835,8 @@ static void con_work(struct work_struct *work)
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
-			con->error_msg = "socket error on read";
+			if (!con->error_msg)
+				con->error_msg = "socket error on read";
 			fault = true;
 			break;
 		}
@@ -2847,7 +2845,8 @@ static void con_work(struct work_struct *work)
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
-			con->error_msg = "socket error on write";
+			if (!con->error_msg)
+				con->error_msg = "socket error on write";
 			fault = true;
 		}
 
@@ -2869,11 +2868,13 @@ static void con_work(struct work_struct *work)
  */
 static void con_fault(struct ceph_connection *con)
 {
-	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
-		ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
 	dout("fault %p state %lu to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
 
+	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+		ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	con->error_msg = NULL;
+
 	WARN_ON(con->state != CON_STATE_CONNECTING &&
 	       con->state != CON_STATE_NEGOTIATING &&
 	       con->state != CON_STATE_OPEN);
@@ -3295,8 +3296,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 		 */
 		if (*skip)
 			return 0;
-		con->error_msg = "error allocating memory for incoming message";
 
+		con->error_msg = "error allocating memory for incoming message";
 		return -ENOMEM;
 	}
 	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index b8c3fde5b04f..15796696d64e 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -122,6 +122,22 @@ bad:
 	return -EINVAL;
 }
 
+static int crush_decode_straw2_bucket(void **p, void *end,
+				      struct crush_bucket_straw2 *b)
+{
+	int j;
+	dout("crush_decode_straw2_bucket %p to %p\n", *p, end);
+	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+	if (b->item_weights == NULL)
+		return -ENOMEM;
+	ceph_decode_need(p, end, b->h.size * sizeof(u32), bad);
+	for (j = 0; j < b->h.size; j++)
+		b->item_weights[j] = ceph_decode_32(p);
+	return 0;
+bad:
+	return -EINVAL;
+}
+
 static int skip_name_map(void **p, void *end)
 {
         int len;
@@ -204,6 +220,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		case CRUSH_BUCKET_STRAW:
 			size = sizeof(struct crush_bucket_straw);
 			break;
+		case CRUSH_BUCKET_STRAW2:
+			size = sizeof(struct crush_bucket_straw2);
+			break;
 		default:
 			err = -EINVAL;
 			goto bad;
@@ -261,6 +280,12 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 			if (err < 0)
 				goto bad;
 			break;
+		case CRUSH_BUCKET_STRAW2:
+			err = crush_decode_straw2_bucket(p, end,
+				(struct crush_bucket_straw2 *)b);
+			if (err < 0)
+				goto bad;
+			break;
 		}
 	}
 
diff --git a/net/core/dev.c b/net/core/dev.c
index af4a1b0adc10..c7ba0388f1be 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2713,7 +2713,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 	if (unlikely(!skb))
 		goto out_null;
 
-	if (netif_needs_gso(dev, skb, features)) {
+	if (netif_needs_gso(skb, features)) {
 		struct sk_buff *segs;
 
 		segs = skb_gso_segment(skb, features);
@@ -3079,7 +3079,7 @@ static struct rps_dev_flow *
 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	    struct rps_dev_flow *rflow, u16 next_cpu)
 {
-	if (next_cpu != RPS_NO_CPU) {
+	if (next_cpu < nr_cpu_ids) {
 #ifdef CONFIG_RFS_ACCEL
 		struct netdev_rx_queue *rxqueue;
 		struct rps_dev_flow_table *flow_table;
@@ -3184,7 +3184,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
 		 * table entry), switch if one of the following holds:
-		 *   - Current CPU is unset (equal to RPS_NO_CPU).
+		 *   - Current CPU is unset (>= nr_cpu_ids).
 		 *   - Current CPU is offline.
 		 *   - The current CPU's queue tail has advanced beyond the
 		 *     last packet that was enqueued using this table entry.
@@ -3192,14 +3192,14 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 *     have been dequeued, thus preserving in order delivery.
 		 */
 		if (unlikely(tcpu != next_cpu) &&
-		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
 		      rflow->last_qtail)) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 		}
 
-		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
 			*rflowp = rflow;
 			cpu = tcpu;
 			goto done;
@@ -3240,14 +3240,14 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 	struct rps_dev_flow_table *flow_table;
 	struct rps_dev_flow *rflow;
 	bool expire = true;
-	int cpu;
+	unsigned int cpu;
 
 	rcu_read_lock();
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	if (flow_table && flow_id <= flow_table->mask) {
 		rflow = &flow_table->flows[flow_id];
 		cpu = ACCESS_ONCE(rflow->cpu);
-		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 			   rflow->last_qtail) <
 		     (int)(10 * flow_table->mask)))
diff --git a/net/core/filter.c b/net/core/filter.c
index b669e75d2b36..bf831a85c315 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1175,12 +1175,27 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	return 0;
 }
 
+/**
+ *	bpf_skb_clone_not_writable - is the header of a clone not writable
+ *	@skb: buffer to check
+ *	@len: length up to which to write, can be negative
+ *
+ *	Returns true if modifying the header part of the cloned buffer
+ *	does require the data to be copied. I.e. this version works with
+ *	negative lengths needed for eBPF case!
+ */
+static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
+{
+	return skb_header_cloned(skb) ||
+	       (int) skb_headroom(skb) + len > skb->hdr_len;
+}
+
 #define BPF_RECOMPUTE_CSUM(flags)	((flags) & 1)
 
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
-	unsigned int offset = (unsigned int) r2;
+	int offset = (int) r2;
 	void *from = (void *) (long) r3;
 	unsigned int len = (unsigned int) r4;
 	char buf[16];
@@ -1194,10 +1209,12 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	 *
 	 * so check for invalid 'offset' and too large 'len'
 	 */
-	if (unlikely(offset > 0xffff || len > sizeof(buf)))
+	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
 		return -EFAULT;
 
-	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len))
+	offset -= skb->data - skb_mac_header(skb);
+	if (unlikely(skb_cloned(skb) &&
+		     bpf_skb_clone_unwritable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, buf);
@@ -1232,15 +1249,18 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = {
 #define BPF_HEADER_FIELD_SIZE(flags)	((flags) & 0x0f)
 #define BPF_IS_PSEUDO_HEADER(flags)	((flags) & 0x10)
 
-static u64 bpf_l3_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags)
+static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(offset > 0xffff))
+	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
 
-	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum)))
+	offset -= skb->data - skb_mac_header(skb);
+	if (unlikely(skb_cloned(skb) &&
+		     bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1276,16 +1296,19 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_l4_csum_replace(u64 r1, u64 offset, u64 from, u64 to, u64 flags)
+static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
+	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(offset > 0xffff))
+	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
 
-	if (skb_cloned(skb) && !skb_clone_writable(skb, offset + sizeof(sum)))
+	offset -= skb->data - skb_mac_header(skb);
+	if (unlikely(skb_cloned(skb) &&
+		     bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a3abb719221f..78fc04ad36fc 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -16,7 +16,6 @@
 #include <linux/export.h>
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
-#include <linux/rtnetlink.h>
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <net/net_namespace.h>
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 358d52a38533..666e0928ba40 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2854,7 +2854,7 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
 
 int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 			    struct net_device *dev, u16 mode,
-			    u32 flags, u32 mask)
+			    u32 flags, u32 mask, int nlflags)
 {
 	struct nlmsghdr *nlh;
 	struct ifinfomsg *ifm;
@@ -2863,7 +2863,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 	u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
 	struct net_device *br_dev = netdev_master_upper_dev_get(dev);
 
-	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), NLM_F_MULTI);
+	nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
 	if (nlh == NULL)
 		return -EMSGSIZE;
 
@@ -2969,7 +2969,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 		if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) {
 			if (idx >= cb->args[0] &&
 			    br_dev->netdev_ops->ndo_bridge_getlink(
-				    skb, portid, seq, dev, filter_mask) < 0)
+				    skb, portid, seq, dev, filter_mask,
+				    NLM_F_MULTI) < 0)
 				break;
 			idx++;
 		}
@@ -2977,7 +2978,8 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
 		if (ops->ndo_bridge_getlink) {
 			if (idx >= cb->args[0] &&
 			    ops->ndo_bridge_getlink(skb, portid, seq, dev,
-						    filter_mask) < 0)
+						    filter_mask,
+						    NLM_F_MULTI) < 0)
 				break;
 			idx++;
 		}
@@ -3018,7 +3020,7 @@ static int rtnl_bridge_notify(struct net_device *dev)
 		goto errout;
 	}
 
-	err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0);
+	err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
 	if (err < 0)
 		goto errout;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3b6e5830256e..3cfff2a3d651 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -280,13 +280,14 @@ nodata:
 EXPORT_SYMBOL(__alloc_skb);
 
 /**
- * build_skb - build a network buffer
+ * __build_skb - build a network buffer
  * @data: data buffer provided by caller
- * @frag_size: size of fragment, or 0 if head was kmalloced
+ * @frag_size: size of data, or 0 if head was kmalloced
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
  * skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator.
+ * @frag_size is 0, otherwise data should come from the page allocator
+ *  or vmalloc()
  * The return is the new skb buffer.
  * On a failure the return is %NULL, and @data is not freed.
  * Notes :
@@ -297,7 +298,7 @@ EXPORT_SYMBOL(__alloc_skb);
  *  before giving packet to stack.
  *  RX rings only contains data buffers, not full skbs.
  */
-struct sk_buff *build_skb(void *data, unsigned int frag_size)
+struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 {
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
@@ -311,7 +312,6 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
-	skb->head_frag = frag_size != 0;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -328,6 +328,23 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 
 	return skb;
 }
+
+/* build_skb() is wrapper over __build_skb(), that specifically
+ * takes care of skb->head and skb->pfmemalloc
+ * This means that if @frag_size is not zero, then @data must be backed
+ * by a page fragment, not kmalloc() or vmalloc()
+ */
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
+{
+	struct sk_buff *skb = __build_skb(data, frag_size);
+
+	if (skb && frag_size) {
+		skb->head_frag = 1;
+		if (virt_to_head_page(data)->pfmemalloc)
+			skb->pfmemalloc = 1;
+	}
+	return skb;
+}
 EXPORT_SYMBOL(build_skb);
 
 struct netdev_alloc_cache {
@@ -348,7 +365,8 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
 	gfp_t gfp = gfp_mask;
 
 	if (order) {
-		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
+		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+			    __GFP_NOMEMALLOC;
 		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
 		nc->frag.size = PAGE_SIZE << (page ? order : 0);
 	}
@@ -4124,19 +4142,21 @@ EXPORT_SYMBOL(skb_try_coalesce);
  */
 void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 {
-	if (xnet)
-		skb_orphan(skb);
 	skb->tstamp.tv64 = 0;
 	skb->pkt_type = PACKET_HOST;
 	skb->skb_iif = 0;
 	skb->ignore_df = 0;
 	skb_dst_drop(skb);
-	skb->mark = 0;
 	skb_sender_cpu_clear(skb);
-	skb_init_secmark(skb);
 	secpath_reset(skb);
 	nf_reset(skb);
 	nf_reset_trace(skb);
+
+	if (!xnet)
+		return;
+
+	skb_orphan(skb);
+	skb->mark = 0;
 }
 EXPORT_SYMBOL_GPL(skb_scrub_packet);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 2b4f21d34df6..ccf4c5629b3c 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -453,7 +453,8 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 						       iph->saddr, iph->daddr);
 	if (req) {
 		nsk = dccp_check_req(sk, skb, req);
-		reqsk_put(req);
+		if (!nsk)
+			reqsk_put(req);
 		return nsk;
 	}
 	nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9d0551092c6c..5165571f397a 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -301,7 +301,8 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
 				   &iph->daddr, inet6_iif(skb));
 	if (req) {
 		nsk = dccp_check_req(sk, skb, req);
-		reqsk_put(req);
+		if (!nsk)
+			reqsk_put(req);
 		return nsk;
 	}
 	nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 5f566663e47f..30addee2dd03 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -186,8 +186,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (child == NULL)
 		goto listen_overflow;
 
-	inet_csk_reqsk_queue_unlink(sk, req);
-	inet_csk_reqsk_queue_removed(sk, req);
+	inet_csk_reqsk_queue_drop(sk, req);
 	inet_csk_reqsk_queue_add(sk, req, child);
 out:
 	return child;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 5eaadabe23a1..e6f6cc3a1bcf 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -124,7 +124,7 @@ static ssize_t temp1_max_store(struct device *dev,
 
 	return count;
 }
-static DEVICE_ATTR(temp1_max, S_IRUGO, temp1_max_show, temp1_max_store);
+static DEVICE_ATTR_RW(temp1_max);
 
 static ssize_t temp1_max_alarm_show(struct device *dev,
 				    struct device_attribute *attr, char *buf)
@@ -159,8 +159,8 @@ static umode_t dsa_hwmon_attrs_visible(struct kobject *kobj,
 	if (index == 1) {
 		if (!drv->get_temp_limit)
 			mode = 0;
-		else if (drv->set_temp_limit)
-			mode |= S_IWUSR;
+		else if (!drv->set_temp_limit)
+			mode &= ~S_IWUSR;
 	} else if (index == 2 && !drv->get_temp_alarm) {
 		mode = 0;
 	}
@@ -633,7 +633,7 @@ static int dsa_of_probe(struct device *dev)
 		if (cd->sw_addr > PHY_MAX_ADDR)
 			continue;
 
-		if (!of_property_read_u32(np, "eeprom-length", &eeprom_len))
+		if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
 			cd->eeprom_len = eeprom_len;
 
 		for_each_available_child_of_node(child, port) {
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index af150b43b214..34968cd5c146 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -711,11 +711,10 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 				    cb->nlh->nlmsg_seq, NLM_F_MULTI,
 				    skb, FOU_CMD_GET);
 		if (ret)
-			goto done;
+			break;
 	}
 	mutex_unlock(&fn->fou_lock);
 
-done:
 	cb->args[0] = idx;
 	return skb->len;
 }
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5c3dd6267ed3..8976ca423a07 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -564,6 +564,40 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+/* return true if req was found in the syn_table[] */
+static bool reqsk_queue_unlink(struct request_sock_queue *queue,
+			       struct request_sock *req)
+{
+	struct listen_sock *lopt = queue->listen_opt;
+	struct request_sock **prev;
+	bool found = false;
+
+	spin_lock(&queue->syn_wait_lock);
+
+	for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
+	     prev = &(*prev)->dl_next) {
+		if (*prev == req) {
+			*prev = req->dl_next;
+			found = true;
+			break;
+		}
+	}
+
+	spin_unlock(&queue->syn_wait_lock);
+	if (del_timer(&req->rsk_timer))
+		reqsk_put(req);
+	return found;
+}
+
+void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+	if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+		reqsk_put(req);
+	}
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+
 static void reqsk_timer_handler(unsigned long data)
 {
 	struct request_sock *req = (struct request_sock *)data;
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 70e8b3c308ec..bb77ebdae3b3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -111,6 +111,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		      const struct nlmsghdr *unlh)
 {
 	const struct inet_sock *inet = inet_sk(sk);
+	const struct tcp_congestion_ops *ca_ops;
 	const struct inet_diag_handler *handler;
 	int ext = req->idiag_ext;
 	struct inet_diag_msg *r;
@@ -208,16 +209,31 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		info = nla_data(attr);
 	}
 
-	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
-		if (nla_put_string(skb, INET_DIAG_CONG,
-				   icsk->icsk_ca_ops->name) < 0)
+	if (ext & (1 << (INET_DIAG_CONG - 1))) {
+		int err = 0;
+
+		rcu_read_lock();
+		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+		if (ca_ops)
+			err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name);
+		rcu_read_unlock();
+		if (err < 0)
 			goto errout;
+	}
 
 	handler->idiag_get_info(sk, r, info);
 
-	if (sk->sk_state < TCP_TIME_WAIT &&
-	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
-		icsk->icsk_ca_ops->get_info(sk, ext, skb);
+	if (sk->sk_state < TCP_TIME_WAIT) {
+		int err = 0;
+
+		rcu_read_lock();
+		ca_ops = READ_ONCE(icsk->icsk_ca_ops);
+		if (ca_ops && ca_ops->get_info)
+			err = ca_ops->get_info(sk, ext, skb);
+		rcu_read_unlock();
+		if (err < 0)
+			goto errout;
+	}
 
 out:
 	nlmsg_end(skb, nlh);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 939992c456f3..3674484946a5 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -82,6 +82,9 @@ int ip_forward(struct sk_buff *skb)
 	if (skb->pkt_type != PACKET_HOST)
 		goto drop;
 
+	if (unlikely(skb->sk))
+		goto drop;
+
 	if (skb_warn_if_lro(skb))
 		goto drop;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index a93f260cf24c..05ff44b758df 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -158,6 +158,7 @@ void ping_unhash(struct sock *sk)
 	if (sk_hashed(sk)) {
 		write_lock_bh(&ping_table.lock);
 		hlist_nulls_del(&sk->sk_nulls_node);
+		sk_nulls_node_init(&sk->sk_nulls_node);
 		sock_put(sk);
 		isk->inet_num = 0;
 		isk->inet_sport = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a78540f28276..bff62fc87b8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -962,10 +962,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (dst_metric_locked(dst, RTAX_MTU))
 		return;
 
-	if (dst->dev->mtu < mtu)
-		return;
-
-	if (rt->rt_pmtu && rt->rt_pmtu < mtu)
+	if (ipv4_mtu(dst) < mtu)
 		return;
 
 	if (mtu < ip_rt_min_pmtu)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 18e3a12eb1b2..8c5cd9efebbc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -520,8 +520,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 
 				/* Race breaker. If space is freed after
 				 * wspace test but before the flags are set,
-				 * IO signal will be lost.
+				 * IO signal will be lost. Memory barrier
+				 * pairs with the input side.
 				 */
+				smp_mb__after_atomic();
 				if (sk_stream_is_writeable(sk))
 					mask |= POLLOUT | POLLWRNORM;
 			}
@@ -2595,6 +2597,7 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
+	u32 rate;
 
 	memset(info, 0, sizeof(*info));
 
@@ -2655,10 +2658,11 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	info->tcpi_pacing_rate = sk->sk_pacing_rate != ~0U ?
-					sk->sk_pacing_rate : ~0ULL;
-	info->tcpi_max_pacing_rate = sk->sk_max_pacing_rate != ~0U ?
-					sk->sk_max_pacing_rate : ~0ULL;
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+
+	rate = READ_ONCE(sk->sk_max_pacing_rate);
+	info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index b504371af742..4376016f7fa5 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -277,7 +277,7 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
 	}
 }
 
-static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+static int dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct dctcp *ca = inet_csk_ca(sk);
 
@@ -297,8 +297,9 @@ static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 			info.dctcp_ab_tot = ca->acked_bytes_total;
 		}
 
-		nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 
 static struct tcp_congestion_ops dctcp __read_mostly = {
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 1d5a30a90adf..67476f085e48 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -300,8 +300,7 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_illinois_info(struct sock *sk, u32 ext,
-			      struct sk_buff *skb)
+static int tcp_illinois_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct illinois *ca = inet_csk_ca(sk);
 
@@ -318,8 +317,9 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
 			do_div(t, info.tcpv_rttcnt);
 			info.tcpv_rtt = t;
 		}
-		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 
 static struct tcp_congestion_ops tcp_illinois __read_mostly = {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a7ef679dd3ea..3a4d9b34bed4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4845,6 +4845,8 @@ static void tcp_check_space(struct sock *sk)
 {
 	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
 		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+		/* pairs with tcp_poll() */
+		smp_mb__after_atomic();
 		if (sk->sk_socket &&
 		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
 			tcp_new_space(sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3571f2be4470..fc1c658ec6c1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1348,7 +1348,8 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 	req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
 	if (req) {
 		nsk = tcp_check_req(sk, skb, req, false);
-		reqsk_put(req);
+		if (!nsk)
+			reqsk_put(req);
 		return nsk;
 	}
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 63d6311b5365..e5d7649136fc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -755,10 +755,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (!child)
 		goto listen_overflow;
 
-	inet_csk_reqsk_queue_unlink(sk, req);
-	inet_csk_reqsk_queue_removed(sk, req);
-
+	inet_csk_reqsk_queue_drop(sk, req);
 	inet_csk_reqsk_queue_add(sk, req, child);
+	/* Warning: caller must not call reqsk_put(req);
+	 * child stole last reference on it.
+	 */
 	return child;
 
 listen_overflow:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8c8d7e06b72f..a369e8a70b2c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2812,39 +2812,65 @@ begin_fwd:
 	}
 }
 
-/* Send a fin.  The caller locks the socket for us.  This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* We allow to exceed memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
+ */
+static void sk_forced_wmem_schedule(struct sock *sk, int size)
+{
+	int amt, status;
+
+	if (size <= sk->sk_forward_alloc)
+		return;
+	amt = sk_mem_pages(size);
+	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+	sk_memory_allocated_add(sk, amt, &status);
+}
+
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
  */
 void tcp_send_fin(struct sock *sk)
 {
+	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = tcp_write_queue_tail(sk);
-	int mss_now;
 
-	/* Optimization, tack on the FIN if we have a queue of
-	 * unsent frames.  But be careful about outgoing SACKS
-	 * and IP options.
+	/* Optimization, tack on the FIN if we have one skb in write queue and
+	 * this skb was not yet sent, or we are under memory pressure.
+	 * Note: in the latter case, FIN packet will be sent after a timeout,
+	 * as TCP stack thinks it has already been transmitted.
 	 */
-	mss_now = tcp_current_mss(sk);
-
-	if (tcp_send_head(sk)) {
-		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
-		TCP_SKB_CB(skb)->end_seq++;
+	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
+		if (!tcp_send_head(sk)) {
+			/* This means tskb was already sent.
+			 * Pretend we included the FIN on previous transmit.
+			 * We need to set tp->snd_nxt to the value it would have
+			 * if FIN had been sent. This is because retransmit path
+			 * does not change tp->snd_nxt.
+			 */
+			tp->snd_nxt++;
+			return;
+		}
 	} else {
-		/* Socket is locked, keep trying until memory is available. */
-		for (;;) {
-			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
-			if (skb)
-				break;
-			yield();
+		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+		if (unlikely(!skb)) {
+			if (tskb)
+				goto coalesce;
+			return;
 		}
+		skb_reserve(skb, MAX_TCP_HEADER);
+		sk_forced_wmem_schedule(sk, skb->truesize);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
 				     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
-	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
 }
 
 /* We get here when a process closes a file descriptor (either due to
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index a6afde666ab1..c71a1b8f7bde 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -286,7 +286,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
+int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct vegas *ca = inet_csk_ca(sk);
 	if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
@@ -297,8 +297,9 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 			.tcpv_minrtt = ca->minRTT,
 		};
 
-		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
 
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index 0531b99d8637..e8a6b33cc61d 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -19,6 +19,6 @@ void tcp_vegas_init(struct sock *sk);
 void tcp_vegas_state(struct sock *sk, u8 ca_state);
 void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
 void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
-void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
+int tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb);
 
 #endif	/* __TCP_VEGAS_H */
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bb63fba47d47..b3c57cceb990 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -256,8 +256,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
 }
 
 /* Extract info for Tcp socket info provided via netlink. */
-static void tcp_westwood_info(struct sock *sk, u32 ext,
-			      struct sk_buff *skb)
+static int tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb)
 {
 	const struct westwood *ca = inet_csk_ca(sk);
 
@@ -268,8 +267,9 @@ static void tcp_westwood_info(struct sock *sk, u32 ext,
 			.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min),
 		};
 
-		nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
+		return nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
 	}
+	return 0;
 }
 
 static struct tcp_congestion_ops tcp_westwood __read_mostly = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index b5e6cc1d4a73..a38d3ac0f18f 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1246,7 +1246,6 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 static int ip6gre_tunnel_init(struct net_device *dev)
 {
 	struct ip6_tnl *tunnel;
-	int i;
 
 	tunnel = netdev_priv(dev);
 
@@ -1260,16 +1259,10 @@ static int ip6gre_tunnel_init(struct net_device *dev)
 	if (ipv6_addr_any(&tunnel->parms.raddr))
 		dev->header_ops = &ip6gre_header_ops;
 
-	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
+	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
 
-	for_each_possible_cpu(i) {
-		struct pcpu_sw_netstats *ip6gre_tunnel_stats;
-		ip6gre_tunnel_stats = per_cpu_ptr(dev->tstats, i);
-		u64_stats_init(&ip6gre_tunnel_stats->syncp);
-	}
-
 	return 0;
 }
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad51df85aa00..b6575d665568 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -946,7 +946,8 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
 				   &ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
 	if (req) {
 		nsk = tcp_check_req(sk, skb, req, false);
-		reqsk_put(req);
+		if (!nsk)
+			reqsk_put(req);
 		return nsk;
 	}
 	nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index db8a2ea6d4de..954810c76a86 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -53,6 +53,11 @@ static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
 	return rt;
 }
 
+static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
+{
+	return rcu_dereference_rtnl(dev->mpls_ptr);
+}
+
 static bool mpls_output_possible(const struct net_device *dev)
 {
 	return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -136,6 +141,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	struct mpls_route *rt;
 	struct mpls_entry_decoded dec;
 	struct net_device *out_dev;
+	struct mpls_dev *mdev;
 	unsigned int hh_len;
 	unsigned int new_header_size;
 	unsigned int mtu;
@@ -143,6 +149,10 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 
 	/* Careful this entire function runs inside of an rcu critical section */
 
+	mdev = mpls_dev_get(dev);
+	if (!mdev || !mdev->input_enabled)
+		goto drop;
+
 	if (skb->pkt_type != PACKET_HOST)
 		goto drop;
 
@@ -352,9 +362,9 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	if (!dev)
 		goto errout;
 
-	/* For now just support ethernet devices */
+	/* Ensure this is a supported device */
 	err = -EINVAL;
-	if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
+	if (!mpls_dev_get(dev))
 		goto errout;
 
 	err = -EINVAL;
@@ -428,10 +438,89 @@ errout:
 	return err;
 }
 
+#define MPLS_PERDEV_SYSCTL_OFFSET(field)	\
+	(&((struct mpls_dev *)0)->field)
+
+static const struct ctl_table mpls_dev_table[] = {
+	{
+		.procname	= "input",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.data		= MPLS_PERDEV_SYSCTL_OFFSET(input_enabled),
+	},
+	{ }
+};
+
+static int mpls_dev_sysctl_register(struct net_device *dev,
+				    struct mpls_dev *mdev)
+{
+	char path[sizeof("net/mpls/conf/") + IFNAMSIZ];
+	struct ctl_table *table;
+	int i;
+
+	table = kmemdup(&mpls_dev_table, sizeof(mpls_dev_table), GFP_KERNEL);
+	if (!table)
+		goto out;
+
+	/* Table data contains only offsets relative to the base of
+	 * the mdev at this point, so make them absolute.
+	 */
+	for (i = 0; i < ARRAY_SIZE(mpls_dev_table); i++)
+		table[i].data = (char *)mdev + (uintptr_t)table[i].data;
+
+	snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name);
+
+	mdev->sysctl = register_net_sysctl(dev_net(dev), path, table);
+	if (!mdev->sysctl)
+		goto free;
+
+	return 0;
+
+free:
+	kfree(table);
+out:
+	return -ENOBUFS;
+}
+
+static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev)
+{
+	struct ctl_table *table;
+
+	table = mdev->sysctl->ctl_table_arg;
+	unregister_net_sysctl_table(mdev->sysctl);
+	kfree(table);
+}
+
+static struct mpls_dev *mpls_add_dev(struct net_device *dev)
+{
+	struct mpls_dev *mdev;
+	int err = -ENOMEM;
+
+	ASSERT_RTNL();
+
+	mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev)
+		return ERR_PTR(err);
+
+	err = mpls_dev_sysctl_register(dev, mdev);
+	if (err)
+		goto free;
+
+	rcu_assign_pointer(dev->mpls_ptr, mdev);
+
+	return mdev;
+
+free:
+	kfree(mdev);
+	return ERR_PTR(err);
+}
+
 static void mpls_ifdown(struct net_device *dev)
 {
 	struct mpls_route __rcu **platform_label;
 	struct net *net = dev_net(dev);
+	struct mpls_dev *mdev;
 	unsigned index;
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
@@ -443,14 +532,35 @@ static void mpls_ifdown(struct net_device *dev)
 			continue;
 		rt->rt_dev = NULL;
 	}
+
+	mdev = mpls_dev_get(dev);
+	if (!mdev)
+		return;
+
+	mpls_dev_sysctl_unregister(mdev);
+
+	RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+
+	kfree(mdev);
 }
 
 static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct mpls_dev *mdev;
 
 	switch(event) {
+	case NETDEV_REGISTER:
+		/* For now just support ethernet devices */
+		if ((dev->type == ARPHRD_ETHER) ||
+		    (dev->type == ARPHRD_LOOPBACK)) {
+			mdev = mpls_add_dev(dev);
+			if (IS_ERR(mdev))
+				return notifier_from_errno(PTR_ERR(mdev));
+		}
+		break;
+
 	case NETDEV_UNREGISTER:
 		mpls_ifdown(dev);
 		break;
@@ -536,6 +646,15 @@ int nla_get_labels(const struct nlattr *nla,
 		if ((dec.bos != bos) || dec.ttl || dec.tc)
 			return -EINVAL;
 
+		switch (dec.label) {
+		case LABEL_IMPLICIT_NULL:
+			/* RFC3032: This is a label that an LSR may
+			 * assign and distribute, but which never
+			 * actually appears in the encapsulation.
+			 */
+			return -EINVAL;
+		}
+
 		label[i] = dec.label;
 	}
 	*labels = nla_labels;
@@ -912,7 +1031,7 @@ static int mpls_platform_labels(struct ctl_table *table, int write,
 	return ret;
 }
 
-static struct ctl_table mpls_table[] = {
+static const struct ctl_table mpls_table[] = {
 	{
 		.procname	= "platform_labels",
 		.data		= NULL,
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index fb6de92052c4..693877d69606 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -22,6 +22,12 @@ struct mpls_entry_decoded {
 	u8 bos;
 };
 
+struct mpls_dev {
+	int			input_enabled;
+
+	struct ctl_table_header *sysctl;
+};
+
 struct sk_buff;
 
 static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 78af83bc9c8e..ad9d11fb29fd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4340,7 +4340,6 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 	case NFT_CONTINUE:
 	case NFT_BREAK:
 	case NFT_RETURN:
-		desc->len = sizeof(data->verdict);
 		break;
 	case NFT_JUMP:
 	case NFT_GOTO:
@@ -4355,10 +4354,10 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 
 		chain->use++;
 		data->verdict.chain = chain;
-		desc->len = sizeof(data);
 		break;
 	}
 
+	desc->len = sizeof(data->verdict);
 	desc->type = NFT_DATA_VERDICT;
 	return 0;
 }
diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c
index 57d3e1af5630..0522fc9bfb0a 100644
--- a/net/netfilter/nft_reject.c
+++ b/net/netfilter/nft_reject.c
@@ -63,6 +63,8 @@ int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr)
 		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
 			goto nla_put_failure;
 		break;
+	default:
+		break;
 	}
 
 	return 0;
diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c
index 62cabee42fbe..635dbba93d01 100644
--- a/net/netfilter/nft_reject_inet.c
+++ b/net/netfilter/nft_reject_inet.c
@@ -108,6 +108,8 @@ static int nft_reject_inet_dump(struct sk_buff *skb,
 		if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
 			goto nla_put_failure;
 		break;
+	default:
+		break;
 	}
 
 	return 0;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 19909d0786a2..ec4adbdcb9b4 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1629,13 +1629,11 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
 	if (data == NULL)
 		return NULL;
 
-	skb = build_skb(data, size);
+	skb = __build_skb(data, size);
 	if (skb == NULL)
 		vfree(data);
-	else {
-		skb->head_frag = 0;
+	else
 		skb->destructor = netlink_skb_destructor;
-	}
 
 	return skb;
 }
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 4d2cede17468..dc6a2d324bd8 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -38,6 +38,9 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	struct tcf_bpf *prog = act->priv;
 	int action, filter_res;
 
+	if (unlikely(!skb_mac_header_was_set(skb)))
+		return TC_ACT_UNSPEC;
+
 	spin_lock(&prog->tcf_lock);
 
 	prog->tcf_tm.lastuse = jiffies;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 8e472518f9f6..295d14bd6c67 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -63,7 +63,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 		skb->mark = c->mark;
 		/* using overlimits stats to count how many packets marked */
 		ca->tcf_qstats.overlimits++;
-		nf_ct_put(c);
 		goto out;
 	}
 
@@ -82,7 +81,6 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
 	nf_ct_put(c);
 
 out:
-	skb->nfct = NULL;
 	spin_unlock(&ca->tcf_lock);
 	return ca->tcf_action;
 }
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5953517ec059..3f63ceac8e01 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -157,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 
 	if (!(at & AT_EGRESS)) {
 		if (m->tcfm_ok_push)
-			skb_push(skb2, skb2->dev->hard_header_len);
+			skb_push(skb2, skb->mac_len);
 	}
 
 	/* mirror is always swallowed */
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 5c4171c5d2bd..91bd9c19471d 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -66,6 +66,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	struct cls_bpf_prog *prog;
 	int ret = -1;
 
+	if (unlikely(!skb_mac_header_was_set(skb)))
+		return -1;
+
 	/* Needed here for accessing maps. */
 	rcu_read_lock();
 	list_for_each_entry_rcu(prog, &head->plist, link) {
diff --git a/net/socket.c b/net/socket.c
index 3e33959f3ce5..884e32997698 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -312,7 +312,7 @@ static const struct super_operations sockfs_ops = {
 static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]",
-				dentry->d_inode->i_ino);
+				d_inode(dentry)->i_ino);
 }
 
 static const struct dentry_operations sockfs_dentry_operations = {
@@ -375,7 +375,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
 		  &socket_file_ops);
 	if (unlikely(IS_ERR(file))) {
 		/* drop dentry, keep inode */
-		ihold(path.dentry->d_inode);
+		ihold(d_inode(path.dentry));
 		path_put(&path);
 		return file;
 	}
@@ -497,7 +497,7 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
 	ssize_t len;
 	ssize_t used = 0;
 
-	len = security_inode_listsecurity(dentry->d_inode, buffer, size);
+	len = security_inode_listsecurity(d_inode(dentry), buffer, size);
 	if (len < 0)
 		return len;
 	used += len;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 2d12b76b5a64..d81186d34558 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -94,7 +94,7 @@ rpc_timeout_upcall_queue(struct work_struct *work)
 	}
 	dentry = dget(pipe->dentry);
 	spin_unlock(&pipe->lock);
-	rpc_purge_list(dentry ? &RPC_I(dentry->d_inode)->waitq : NULL,
+	rpc_purge_list(dentry ? &RPC_I(d_inode(dentry))->waitq : NULL,
 			&free_list, destroy_msg, -ETIMEDOUT);
 	dput(dentry);
 }
@@ -152,7 +152,7 @@ rpc_queue_upcall(struct rpc_pipe *pipe, struct rpc_pipe_msg *msg)
 	dentry = dget(pipe->dentry);
 	spin_unlock(&pipe->lock);
 	if (dentry) {
-		wake_up(&RPC_I(dentry->d_inode)->waitq);
+		wake_up(&RPC_I(d_inode(dentry))->waitq);
 		dput(dentry);
 	}
 	return res;
@@ -591,7 +591,7 @@ static int __rpc_mkpipe_dentry(struct inode *dir, struct dentry *dentry,
 	err = __rpc_create_common(dir, dentry, S_IFIFO | mode, i_fop, private);
 	if (err)
 		return err;
-	rpci = RPC_I(dentry->d_inode);
+	rpci = RPC_I(d_inode(dentry));
 	rpci->private = private;
 	rpci->pipe = pipe;
 	fsnotify_create(dir, dentry);
@@ -616,7 +616,7 @@ int rpc_rmdir(struct dentry *dentry)
 	int error;
 
 	parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
 	error = __rpc_rmdir(dir, dentry);
 	mutex_unlock(&dir->i_mutex);
@@ -638,7 +638,7 @@ static int __rpc_unlink(struct inode *dir, struct dentry *dentry)
 
 static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = d_inode(dentry);
 
 	rpc_close_pipes(inode);
 	return __rpc_unlink(dir, dentry);
@@ -654,7 +654,7 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
 		if (!dentry)
 			return ERR_PTR(-ENOMEM);
 	}
-	if (dentry->d_inode == NULL)
+	if (d_really_is_negative(dentry))
 		return dentry;
 	dput(dentry);
 	return ERR_PTR(-EEXIST);
@@ -667,7 +667,7 @@ static void __rpc_depopulate(struct dentry *parent,
 			     const struct rpc_filelist *files,
 			     int start, int eof)
 {
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
 	struct dentry *dentry;
 	struct qstr name;
 	int i;
@@ -679,9 +679,9 @@ static void __rpc_depopulate(struct dentry *parent,
 
 		if (dentry == NULL)
 			continue;
-		if (dentry->d_inode == NULL)
+		if (d_really_is_negative(dentry))
 			goto next;
-		switch (dentry->d_inode->i_mode & S_IFMT) {
+		switch (d_inode(dentry)->i_mode & S_IFMT) {
 			default:
 				BUG();
 			case S_IFREG:
@@ -699,7 +699,7 @@ static void rpc_depopulate(struct dentry *parent,
 			   const struct rpc_filelist *files,
 			   int start, int eof)
 {
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
 
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
 	__rpc_depopulate(parent, files, start, eof);
@@ -711,7 +711,7 @@ static int rpc_populate(struct dentry *parent,
 			int start, int eof,
 			void *private)
 {
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
 	struct dentry *dentry;
 	int i, err;
 
@@ -754,7 +754,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
 		int (*populate)(struct dentry *, void *), void *args_populate)
 {
 	struct dentry *dentry;
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
 	int error;
 
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
@@ -787,7 +787,7 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
 	int error;
 
 	parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
 	if (depopulate != NULL)
 		depopulate(dentry);
@@ -819,7 +819,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
 				 void *private, struct rpc_pipe *pipe)
 {
 	struct dentry *dentry;
-	struct inode *dir = parent->d_inode;
+	struct inode *dir = d_inode(parent);
 	umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
 	int err;
 
@@ -864,7 +864,7 @@ rpc_unlink(struct dentry *dentry)
 	int error = 0;
 
 	parent = dget_parent(dentry);
-	dir = parent->d_inode;
+	dir = d_inode(parent);
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
 	error = __rpc_rmpipe(dir, dentry);
 	mutex_unlock(&dir->i_mutex);
@@ -1375,7 +1375,7 @@ rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
 	struct dentry *clnt_dir = pipe_dentry->d_parent;
 	struct dentry *gssd_dir = clnt_dir->d_parent;
 
-	__rpc_rmpipe(clnt_dir->d_inode, pipe_dentry);
+	__rpc_rmpipe(d_inode(clnt_dir), pipe_dentry);
 	__rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
 	__rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
 	dput(pipe_dentry);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index b91fd9c597b4..337ca851a350 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
 	if (!task->tk_timeout)
 		return;
 
-	dprintk("RPC: %5u setting alarm for %lu ms\n",
-			task->tk_pid, task->tk_timeout * 1000 / HZ);
+	dprintk("RPC: %5u setting alarm for %u ms\n",
+		task->tk_pid, jiffies_to_msecs(task->tk_timeout));
 
 	task->u.tk_wait.expires = jiffies + task->tk_timeout;
 	if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9949722d99ce..1d4fe24af06a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -326,6 +326,15 @@ out_unlock:
 	xprt_clear_locked(xprt);
 }
 
+static void xprt_task_clear_bytes_sent(struct rpc_task *task)
+{
+	if (task != NULL) {
+		struct rpc_rqst *req = task->tk_rqstp;
+		if (req != NULL)
+			req->rq_bytes_sent = 0;
+	}
+}
+
 /**
  * xprt_release_xprt - allow other requests to use a transport
  * @xprt: transport with other tasks potentially waiting
@@ -336,11 +345,7 @@ out_unlock:
 void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
-		if (task != NULL) {
-			struct rpc_rqst *req = task->tk_rqstp;
-			if (req != NULL)
-				req->rq_bytes_sent = 0;
-		}
+		xprt_task_clear_bytes_sent(task);
 		xprt_clear_locked(xprt);
 		__xprt_lock_write_next(xprt);
 	}
@@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt);
 void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
 {
 	if (xprt->snd_task == task) {
-		if (task != NULL) {
-			struct rpc_rqst *req = task->tk_rqstp;
-			if (req != NULL)
-				req->rq_bytes_sent = 0;
-		}
+		xprt_task_clear_bytes_sent(task);
 		xprt_clear_locked(xprt);
 		__xprt_lock_write_next_cong(xprt);
 	}
@@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
 		goto out;
 	if (xprt->snd_task != task)
 		goto out;
+	xprt_task_clear_bytes_sent(task);
 	xprt->snd_task = cookie;
 	ret = true;
 out:
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index da5136fd5694..579f72bbcf4b 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,6 +1,7 @@
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
 
-xprtrdma-y := transport.o rpc_rdma.o verbs.o
+xprtrdma-y := transport.o rpc_rdma.o verbs.o \
+	fmr_ops.o frwr_ops.o physical_ops.o
 
 obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
 
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
new file mode 100644
index 000000000000..302d4ebf6fbf
--- /dev/null
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Memory Regions (FMR).
+ * Referred to sometimes as MTHCAFMR mode.
+ *
+ * FMR uses synchronous memory registration and deregistration.
+ * FMR registration is known to be fast, but FMR deregistration
+ * can take tens of usecs to complete.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/* Maximum scatter/gather per FMR */
+#define RPCRDMA_MAX_FMR_SGES	(64)
+
+static int
+fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+	    struct rpcrdma_create_data_internal *cdata)
+{
+	return 0;
+}
+
+/* FMR mode conveys up to 64 pages of payload per chunk segment.
+ */
+static size_t
+fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES);
+}
+
+static int
+fmr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
+	struct ib_fmr_attr fmr_attr = {
+		.max_pages	= RPCRDMA_MAX_FMR_SGES,
+		.max_maps	= 1,
+		.page_shift	= PAGE_SHIFT
+	};
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	struct rpcrdma_mw *r;
+	int i, rc;
+
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
+
+	while (i--) {
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (!r)
+			return -ENOMEM;
+
+		r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+		if (IS_ERR(r->r.fmr))
+			goto out_fmr_err;
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+	}
+	return 0;
+
+out_fmr_err:
+	rc = PTR_ERR(r->r.fmr);
+	dprintk("RPC:       %s: ib_alloc_fmr status %i\n", __func__, rc);
+	kfree(r);
+	return rc;
+}
+
+/* Use the ib_map_phys_fmr() verb to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+	   int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_device *device = ia->ri_id->device;
+	enum dma_data_direction direction = rpcrdma_data_dir(writing);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
+	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
+	int len, pageoff, i, rc;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (nsegs > RPCRDMA_MAX_FMR_SGES)
+		nsegs = RPCRDMA_MAX_FMR_SGES;
+	for (i = 0; i < nsegs;) {
+		rpcrdma_map_one(device, seg, direction);
+		physaddrs[i] = seg->mr_dma;
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+
+	rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
+	if (rc)
+		goto out_maperr;
+
+	seg1->mr_rkey = mw->r.fmr->rkey;
+	seg1->mr_base = seg1->mr_dma + pageoff;
+	seg1->mr_nsegs = i;
+	seg1->mr_len = len;
+	return i;
+
+out_maperr:
+	dprintk("RPC:       %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
+		__func__, len, (unsigned long long)seg1->mr_dma,
+		pageoff, i, rc);
+	while (i--)
+		rpcrdma_unmap_one(device, --seg);
+	return rc;
+}
+
+/* Use the ib_unmap_fmr() verb to prevent further remote
+ * access via RDMA READ or RDMA WRITE.
+ */
+static int
+fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct ib_device *device;
+	int rc, nsegs = seg->mr_nsegs;
+	LIST_HEAD(l);
+
+	list_add(&seg1->rl_mw->r.fmr->list, &l);
+	rc = ib_unmap_fmr(&l);
+	read_lock(&ia->ri_qplock);
+	device = ia->ri_id->device;
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(device, seg++);
+	read_unlock(&ia->ri_qplock);
+	if (rc)
+		goto out_err;
+	return nsegs;
+
+out_err:
+	dprintk("RPC:       %s: ib_unmap_fmr status %i\n", __func__, rc);
+	return nsegs;
+}
+
+/* After a disconnect, unmap all FMRs.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_fmr_external().
+ */
+static void
+fmr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mw *r;
+	LIST_HEAD(list);
+	int rc;
+
+	list_for_each_entry(r, &buf->rb_all, mw_all)
+		list_add(&r->r.fmr->list, &list);
+
+	rc = ib_unmap_fmr(&list);
+	if (rc)
+		dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
+			__func__, rc);
+}
+
+static void
+fmr_op_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+	int rc;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		rc = ib_dealloc_fmr(r->r.fmr);
+		if (rc)
+			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
+				__func__, rc);
+		kfree(r);
+	}
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
+	.ro_map				= fmr_op_map,
+	.ro_unmap			= fmr_op_unmap,
+	.ro_open			= fmr_op_open,
+	.ro_maxpages			= fmr_op_maxpages,
+	.ro_init			= fmr_op_init,
+	.ro_reset			= fmr_op_reset,
+	.ro_destroy			= fmr_op_destroy,
+	.ro_displayname			= "fmr",
+};
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000000..dff0481dbcf8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR). Also referred to sometimes as FRMR mode.
+ *
+ * FRWR features ordered asynchronous registration and deregistration
+ * of arbitrarily sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+static int
+__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
+	    unsigned int depth)
+{
+	struct rpcrdma_frmr *f = &r->r.frmr;
+	int rc;
+
+	f->fr_mr = ib_alloc_fast_reg_mr(pd, depth);
+	if (IS_ERR(f->fr_mr))
+		goto out_mr_err;
+	f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+	if (IS_ERR(f->fr_pgl))
+		goto out_list_err;
+	return 0;
+
+out_mr_err:
+	rc = PTR_ERR(f->fr_mr);
+	dprintk("RPC:       %s: ib_alloc_fast_reg_mr status %i\n",
+		__func__, rc);
+	return rc;
+
+out_list_err:
+	rc = PTR_ERR(f->fr_pgl);
+	dprintk("RPC:       %s: ib_alloc_fast_reg_page_list status %i\n",
+		__func__, rc);
+	ib_dereg_mr(f->fr_mr);
+	return rc;
+}
+
+static void
+__frwr_release(struct rpcrdma_mw *r)
+{
+	int rc;
+
+	rc = ib_dereg_mr(r->r.frmr.fr_mr);
+	if (rc)
+		dprintk("RPC:       %s: ib_dereg_mr status %i\n",
+			__func__, rc);
+	ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+}
+
+static int
+frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+	     struct rpcrdma_create_data_internal *cdata)
+{
+	struct ib_device_attr *devattr = &ia->ri_devattr;
+	int depth, delta;
+
+	ia->ri_max_frmr_depth =
+			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+			      devattr->max_fast_reg_page_list_len);
+	dprintk("RPC:       %s: device's max FR page list len = %u\n",
+		__func__, ia->ri_max_frmr_depth);
+
+	/* Add room for frmr register and invalidate WRs.
+	 * 1. FRMR reg WR for head
+	 * 2. FRMR invalidate WR for head
+	 * 3. N FRMR reg WRs for pagelist
+	 * 4. N FRMR invalidate WRs for pagelist
+	 * 5. FRMR reg WR for tail
+	 * 6. FRMR invalidate WR for tail
+	 * 7. The RDMA_SEND WR
+	 */
+	depth = 7;
+
+	/* Calculate N if the device max FRMR depth is smaller than
+	 * RPCRDMA_MAX_DATA_SEGS.
+	 */
+	if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
+		delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth;
+		do {
+			depth += 2; /* FRMR reg + invalidate */
+			delta -= ia->ri_max_frmr_depth;
+		} while (delta > 0);
+	}
+
+	ep->rep_attr.cap.max_send_wr *= depth;
+	if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
+		cdata->max_requests = devattr->max_qp_wr / depth;
+		if (!cdata->max_requests)
+			return -EINVAL;
+		ep->rep_attr.cap.max_send_wr = cdata->max_requests *
+					       depth;
+	}
+
+	return 0;
+}
+
+/* FRWR mode conveys a list of pages per chunk segment. The
+ * maximum length of that list is the FRWR page list depth.
+ */
+static size_t
+frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
+}
+
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+	struct rpcrdma_mw *r;
+
+	if (likely(wc->status == IB_WC_SUCCESS))
+		return;
+
+	/* WARNING: Only wr_id and status are reliable at this point */
+	r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+	dprintk("RPC:       %s: frmr %p (stale), status %d\n",
+		__func__, r, wc->status);
+	r->r.frmr.fr_state = FRMR_IS_STALE;
+}
+
+static int
+frwr_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	int i;
+
+	INIT_LIST_HEAD(&buf->rb_mws);
+	INIT_LIST_HEAD(&buf->rb_all);
+
+	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
+	dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
+
+	while (i--) {
+		struct rpcrdma_mw *r;
+		int rc;
+
+		r = kzalloc(sizeof(*r), GFP_KERNEL);
+		if (!r)
+			return -ENOMEM;
+
+		rc = __frwr_init(r, pd, device, depth);
+		if (rc) {
+			kfree(r);
+			return rc;
+		}
+
+		list_add(&r->mw_list, &buf->rb_mws);
+		list_add(&r->mw_all, &buf->rb_all);
+		r->mw_sendcompletion = frwr_sendcompletion;
+	}
+
+	return 0;
+}
+
+/* Post a FAST_REG Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+	    int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_device *device = ia->ri_id->device;
+	enum dma_data_direction direction = rpcrdma_data_dir(writing);
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_mw *mw = seg1->rl_mw;
+	struct rpcrdma_frmr *frmr = &mw->r.frmr;
+	struct ib_mr *mr = frmr->fr_mr;
+	struct ib_send_wr fastreg_wr, *bad_wr;
+	u8 key;
+	int len, pageoff;
+	int i, rc;
+	int seg_len;
+	u64 pa;
+	int page_no;
+
+	pageoff = offset_in_page(seg1->mr_offset);
+	seg1->mr_offset -= pageoff;	/* start of page */
+	seg1->mr_len += pageoff;
+	len = -pageoff;
+	if (nsegs > ia->ri_max_frmr_depth)
+		nsegs = ia->ri_max_frmr_depth;
+	for (page_no = i = 0; i < nsegs;) {
+		rpcrdma_map_one(device, seg, direction);
+		pa = seg->mr_dma;
+		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
+			frmr->fr_pgl->page_list[page_no++] = pa;
+			pa += PAGE_SIZE;
+		}
+		len += seg->mr_len;
+		++seg;
+		++i;
+		/* Check for holes */
+		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	dprintk("RPC:       %s: Using frmr %p to map %d segments (%d bytes)\n",
+		__func__, mw, i, len);
+
+	frmr->fr_state = FRMR_IS_VALID;
+
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.wr_id = (unsigned long)(void *)mw;
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
+	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
+	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+	fastreg_wr.wr.fast_reg.page_list_len = page_no;
+	fastreg_wr.wr.fast_reg.length = len;
+	fastreg_wr.wr.fast_reg.access_flags = writing ?
+				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+				IB_ACCESS_REMOTE_READ;
+	key = (u8)(mr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(mr, ++key);
+	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+	if (rc)
+		goto out_senderr;
+
+	seg1->mr_rkey = mr->rkey;
+	seg1->mr_base = seg1->mr_dma + pageoff;
+	seg1->mr_nsegs = i;
+	seg1->mr_len = len;
+	return i;
+
+out_senderr:
+	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+	ib_update_fast_reg_key(mr, --key);
+	frmr->fr_state = FRMR_IS_INVALID;
+	while (i--)
+		rpcrdma_unmap_one(device, --seg);
+	return rc;
+}
+
+/* Post a LOCAL_INV Work Request to prevent further remote access
+ * via RDMA READ or RDMA WRITE.
+ */
+static int
+frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_mr_seg *seg1 = seg;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct ib_send_wr invalidate_wr, *bad_wr;
+	int rc, nsegs = seg->mr_nsegs;
+	struct ib_device *device;
+
+	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
+
+	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
+	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
+	invalidate_wr.opcode = IB_WR_LOCAL_INV;
+	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
+	DECR_CQCOUNT(&r_xprt->rx_ep);
+
+	read_lock(&ia->ri_qplock);
+	device = ia->ri_id->device;
+	while (seg1->mr_nsegs--)
+		rpcrdma_unmap_one(device, seg++);
+	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	read_unlock(&ia->ri_qplock);
+	if (rc)
+		goto out_err;
+	return nsegs;
+
+out_err:
+	/* Force rpcrdma_buffer_get() to retry */
+	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
+	dprintk("RPC:       %s: ib_post_send status %i\n", __func__, rc);
+	return nsegs;
+}
+
+/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
+ * an unusable state. Find FRMRs in this state and dereg / reg
+ * each.  FRMRs that are VALID and attached to an rpcrdma_req are
+ * also torn down.
+ *
+ * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
+ *
+ * This is invoked only in the transport connect worker in order
+ * to serialize with rpcrdma_register_frmr_external().
+ */
+static void
+frwr_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
+	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
+	struct rpcrdma_mw *r;
+	int rc;
+
+	list_for_each_entry(r, &buf->rb_all, mw_all) {
+		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
+			continue;
+
+		__frwr_release(r);
+		rc = __frwr_init(r, pd, device, depth);
+		if (rc) {
+			dprintk("RPC:       %s: mw %p left %s\n",
+				__func__, r,
+				(r->r.frmr.fr_state == FRMR_IS_STALE ?
+					"stale" : "valid"));
+			continue;
+		}
+
+		r->r.frmr.fr_state = FRMR_IS_INVALID;
+	}
+}
+
+static void
+frwr_op_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_mw *r;
+
+	while (!list_empty(&buf->rb_all)) {
+		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
+		list_del(&r->mw_all);
+		__frwr_release(r);
+		kfree(r);
+	}
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
+	.ro_map				= frwr_op_map,
+	.ro_unmap			= frwr_op_unmap,
+	.ro_open			= frwr_op_open,
+	.ro_maxpages			= frwr_op_maxpages,
+	.ro_init			= frwr_op_init,
+	.ro_reset			= frwr_op_reset,
+	.ro_destroy			= frwr_op_destroy,
+	.ro_displayname			= "frwr",
+};
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
new file mode 100644
index 000000000000..ba518af16787
--- /dev/null
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* No-op chunk preparation. All client memory is pre-registered.
+ * Sometimes referred to as ALLPHYSICAL mode.
+ *
+ * Physical registration is simple because all client memory is
+ * pre-registered and never deregistered. This mode is good for
+ * adapter bring up, but is considered not safe: the server is
+ * trusted not to abuse its access to client memory not involved
+ * in RDMA I/O.
+ */
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+static int
+physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
+		 struct rpcrdma_create_data_internal *cdata)
+{
+	return 0;
+}
+
+/* PHYSICAL memory registration conveys one page per chunk segment.
+ */
+static size_t
+physical_op_maxpages(struct rpcrdma_xprt *r_xprt)
+{
+	return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
+		     rpcrdma_max_segments(r_xprt));
+}
+
+static int
+physical_op_init(struct rpcrdma_xprt *r_xprt)
+{
+	return 0;
+}
+
+/* The client's physical memory is already exposed for
+ * remote access via RDMA READ or RDMA WRITE.
+ */
+static int
+physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+		int nsegs, bool writing)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+	rpcrdma_map_one(ia->ri_id->device, seg,
+			rpcrdma_data_dir(writing));
+	seg->mr_rkey = ia->ri_bind_mem->rkey;
+	seg->mr_base = seg->mr_dma;
+	seg->mr_nsegs = 1;
+	return 1;
+}
+
+/* Unmap a memory region, but leave it registered.
+ */
+static int
+physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+
+	read_lock(&ia->ri_qplock);
+	rpcrdma_unmap_one(ia->ri_id->device, seg);
+	read_unlock(&ia->ri_qplock);
+
+	return 1;
+}
+
+static void
+physical_op_reset(struct rpcrdma_xprt *r_xprt)
+{
+}
+
+static void
+physical_op_destroy(struct rpcrdma_buffer *buf)
+{
+}
+
+const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
+	.ro_map				= physical_op_map,
+	.ro_unmap			= physical_op_unmap,
+	.ro_open			= physical_op_open,
+	.ro_maxpages			= physical_op_maxpages,
+	.ro_init			= physical_op_init,
+	.ro_reset			= physical_op_reset,
+	.ro_destroy			= physical_op_destroy,
+	.ro_displayname			= "physical",
+};
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 91ffde82fa0c..2c53ea9e1b83 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,6 +53,14 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static const char transfertypes[][12] = {
 	"pure inline",	/* no chunks */
@@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	struct rpcrdma_write_array *warray = NULL;
 	struct rpcrdma_write_chunk *cur_wchunk = NULL;
 	__be32 *iptr = headerp->rm_body.rm_chunks;
+	int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool);
 
 	if (type == rpcrdma_readch || type == rpcrdma_areadch) {
 		/* a read chunk - server will RDMA Read our memory */
@@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	if (nsegs < 0)
 		return nsegs;
 
+	map = r_xprt->rx_ia.ri_ops->ro_map;
 	do {
-		n = rpcrdma_register_external(seg, nsegs,
-						cur_wchunk != NULL, r_xprt);
+		n = map(r_xprt, seg, nsegs, cur_wchunk != NULL);
 		if (n <= 0)
 			goto out;
 		if (cur_rchunk) {	/* read */
@@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
 	return (unsigned char *)iptr - (unsigned char *)headerp;
 
 out:
-	if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
-		for (pos = 0; nchunks--;)
-			pos += rpcrdma_deregister_external(
-					&req->rl_segments[pos], r_xprt);
-	}
-	return n;
-}
+	if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+		return n;
 
-/*
- * Marshal chunks. This routine returns the header length
- * consumed by marshaling.
- *
- * Returns positive RPC/RDMA header size, or negative errno.
- */
-
-ssize_t
-rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
-{
-	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
-	struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf);
-
-	if (req->rl_rtype != rpcrdma_noch)
-		result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
-					       headerp, req->rl_rtype);
-	else if (req->rl_wtype != rpcrdma_noch)
-		result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
-					       headerp, req->rl_wtype);
-	return result;
+	for (pos = 0; nchunks--;)
+		pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+						      &req->rl_segments[pos]);
+	return n;
 }
 
 /*
@@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	char *base;
 	size_t rpclen, padlen;
 	ssize_t hdrlen;
+	enum rpcrdma_chunktype rtype, wtype;
 	struct rpcrdma_msg *headerp;
 
 	/*
@@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * into pages; otherwise use reply chunks.
 	 */
 	if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
-		req->rl_wtype = rpcrdma_noch;
+		wtype = rpcrdma_noch;
 	else if (rqst->rq_rcv_buf.page_len == 0)
-		req->rl_wtype = rpcrdma_replych;
+		wtype = rpcrdma_replych;
 	else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
-		req->rl_wtype = rpcrdma_writech;
+		wtype = rpcrdma_writech;
 	else
-		req->rl_wtype = rpcrdma_replych;
+		wtype = rpcrdma_replych;
 
 	/*
 	 * Chunks needed for arguments?
@@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * TBD check NFSv4 setacl
 	 */
 	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-		req->rl_rtype = rpcrdma_noch;
+		rtype = rpcrdma_noch;
 	else if (rqst->rq_snd_buf.page_len == 0)
-		req->rl_rtype = rpcrdma_areadch;
+		rtype = rpcrdma_areadch;
 	else
-		req->rl_rtype = rpcrdma_readch;
+		rtype = rpcrdma_readch;
 
 	/* The following simplification is not true forever */
-	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
-		req->rl_wtype = rpcrdma_noch;
-	if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
+	if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+		wtype = rpcrdma_noch;
+	if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
 		dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
 			__func__);
 		return -EIO;
@@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 * When padding is in use and applies to the transfer, insert
 	 * it and change the message type.
 	 */
-	if (req->rl_rtype == rpcrdma_noch) {
+	if (rtype == rpcrdma_noch) {
 
 		padlen = rpcrdma_inline_pullup(rqst,
 						RPCRDMA_INLINE_PAD_VALUE(rqst));
@@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
 			headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
 			hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
-			if (req->rl_wtype != rpcrdma_noch) {
+			if (wtype != rpcrdma_noch) {
 				dprintk("RPC:       %s: invalid chunk list\n",
 					__func__);
 				return -EIO;
@@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 			 * on receive. Therefore, we request a reply chunk
 			 * for non-writes wherever feasible and efficient.
 			 */
-			if (req->rl_wtype == rpcrdma_noch)
-				req->rl_wtype = rpcrdma_replych;
+			if (wtype == rpcrdma_noch)
+				wtype = rpcrdma_replych;
 		}
 	}
 
-	hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
+	if (rtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
+					       headerp, rtype);
+		wtype = rtype;	/* simplify dprintk */
+
+	} else if (wtype != rpcrdma_noch) {
+		hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
+					       headerp, wtype);
+	}
 	if (hdrlen < 0)
 		return hdrlen;
 
 	dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
 		" headerp 0x%p base 0x%p lkey 0x%x\n",
-		__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
+		__func__, transfertypes[wtype], hdrlen, rpclen, padlen,
 		headerp, base, rdmab_lkey(req->rl_rdmabuf));
 
 	/*
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 2e192baa59f3..54f23b1be986 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = {
 static struct rpc_xprt_ops xprt_rdma_procs;	/* forward reference */
 
 static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	char buf[20];
+
+	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
+}
+
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	char buf[40];
+
+	snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
+}
+
+static void
 xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 {
 	struct sockaddr *sap = (struct sockaddr *)
 					&rpcx_to_rdmad(xprt).addr;
-	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	char buf[64];
+	char buf[128];
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		xprt_rdma_format_addresses4(xprt, sap);
+		break;
+	case AF_INET6:
+		xprt_rdma_format_addresses6(xprt, sap);
+		break;
+	default:
+		pr_err("rpcrdma: Unrecognized address family\n");
+		return;
+	}
 
 	(void)rpc_ntop(sap, buf, sizeof(buf));
 	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
@@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt)
 	snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
 	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
 
-	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
-
-	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
-	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
-
 	snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
 	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
 
-	/* netid */
-	xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
 }
 
 static void
@@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args)
 			  xprt_rdma_connect_worker);
 
 	xprt_rdma_format_addresses(xprt);
-	xprt->max_payload = rpcrdma_max_payload(new_xprt);
+	xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
+	if (xprt->max_payload == 0)
+		goto out4;
+	xprt->max_payload <<= PAGE_SHIFT;
 	dprintk("RPC:       %s: transport data payload maximum: %zu bytes\n",
 		__func__, xprt->max_payload);
 
@@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer)
 
 	for (i = 0; req->rl_nchunks;) {
 		--req->rl_nchunks;
-		i += rpcrdma_deregister_external(
-			&req->rl_segments[i], r_xprt);
+		i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
+						    &req->rl_segments[i]);
 	}
 
 	rpcrdma_buffer_put(req);
@@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	int rc = 0;
 
-	if (req->rl_niovs == 0)
-		rc = rpcrdma_marshal_req(rqst);
-	else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL)
-		rc = rpcrdma_marshal_chunks(rqst, 0);
+	rc = rpcrdma_marshal_req(rqst);
 	if (rc < 0)
 		goto failed_marshal;
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e28909fddd30..4870d272e006 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -50,6 +50,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/prefetch.h>
+#include <linux/sunrpc/addr.h>
 #include <asm/bitops.h>
 
 #include "xprt_rdma.h"
@@ -62,9 +63,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-static void rpcrdma_reset_frmrs(struct rpcrdma_ia *);
-static void rpcrdma_reset_fmrs(struct rpcrdma_ia *);
-
 /*
  * internal functions
  */
@@ -188,7 +186,7 @@ static const char * const wc_status[] = {
 	"remote access error",
 	"remote operation error",
 	"transport retry counter exceeded",
-	"RNR retrycounter exceeded",
+	"RNR retry counter exceeded",
 	"local RDD violation error",
 	"remove invalid RD request",
 	"operation aborted",
@@ -206,21 +204,17 @@ static const char * const wc_status[] = {
 static void
 rpcrdma_sendcq_process_wc(struct ib_wc *wc)
 {
-	if (likely(wc->status == IB_WC_SUCCESS))
-		return;
-
 	/* WARNING: Only wr_id and status are reliable at this point */
-	if (wc->wr_id == 0ULL) {
-		if (wc->status != IB_WC_WR_FLUSH_ERR)
+	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
+		if (wc->status != IB_WC_SUCCESS &&
+		    wc->status != IB_WC_WR_FLUSH_ERR)
 			pr_err("RPC:       %s: SEND: %s\n",
 			       __func__, COMPLETION_MSG(wc->status));
 	} else {
 		struct rpcrdma_mw *r;
 
 		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
-		r->r.frmr.fr_state = FRMR_IS_STALE;
-		pr_err("RPC:       %s: frmr %p (stale): %s\n",
-		       __func__, r, COMPLETION_MSG(wc->status));
+		r->mw_sendcompletion(wc);
 	}
 }
 
@@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 	struct rpcrdma_ia *ia = &xprt->rx_ia;
 	struct rpcrdma_ep *ep = &xprt->rx_ep;
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
+	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
 #endif
 	struct ib_qp_attr *attr = &ia->ri_qp_attr;
 	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
@@ -480,9 +474,8 @@ connected:
 		wake_up_all(&ep->rep_connect_wait);
 		/*FALLTHROUGH*/
 	default:
-		dprintk("RPC:       %s: %pI4:%u (ep 0x%p): %s\n",
-			__func__, &addr->sin_addr.s_addr,
-			ntohs(addr->sin_port), ep,
+		dprintk("RPC:       %s: %pIS:%u (ep 0x%p): %s\n",
+			__func__, sap, rpc_get_port(sap), ep,
 			CONNECTION_MSG(event->event));
 		break;
 	}
@@ -491,19 +484,16 @@ connected:
 	if (connstate == 1) {
 		int ird = attr->max_dest_rd_atomic;
 		int tird = ep->rep_remote_cma.responder_resources;
-		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
-			"on %s, memreg %d slots %d ird %d%s\n",
-			&addr->sin_addr.s_addr,
-			ntohs(addr->sin_port),
+
+		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
+			sap, rpc_get_port(sap),
 			ia->ri_id->device->name,
-			ia->ri_memreg_strategy,
+			ia->ri_ops->ro_displayname,
 			xprt->rx_buf.rb_max_requests,
 			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
 	} else if (connstate < 0) {
-		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
-			&addr->sin_addr.s_addr,
-			ntohs(addr->sin_port),
-			connstate);
+		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
+			sap, rpc_get_port(sap), connstate);
 	}
 #endif
 
@@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
 	if (memreg == RPCRDMA_FRMR) {
 		/* Requires both frmr reg and local dma lkey */
-		if ((devattr->device_cap_flags &
+		if (((devattr->device_cap_flags &
 		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
-		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
+		      (devattr->max_fast_reg_page_list_len == 0)) {
 			dprintk("RPC:       %s: FRMR registration "
 				"not supported by HCA\n", __func__);
 			memreg = RPCRDMA_MTHCAFMR;
-		} else {
-			/* Mind the ia limit on FRMR page list depth */
-			ia->ri_max_frmr_depth = min_t(unsigned int,
-				RPCRDMA_MAX_DATA_SEGS,
-				devattr->max_fast_reg_page_list_len);
 		}
 	}
 	if (memreg == RPCRDMA_MTHCAFMR) {
@@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	 */
 	switch (memreg) {
 	case RPCRDMA_FRMR:
+		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
 		break;
 	case RPCRDMA_ALLPHYSICAL:
+		ia->ri_ops = &rpcrdma_physical_memreg_ops;
 		mem_priv = IB_ACCESS_LOCAL_WRITE |
 				IB_ACCESS_REMOTE_WRITE |
 				IB_ACCESS_REMOTE_READ;
 		goto register_setup;
 	case RPCRDMA_MTHCAFMR:
+		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
 		if (ia->ri_have_dma_lkey)
 			break;
 		mem_priv = IB_ACCESS_LOCAL_WRITE;
@@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 		rc = -ENOMEM;
 		goto out3;
 	}
-	dprintk("RPC:       %s: memory registration strategy is %d\n",
-		__func__, memreg);
+	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
+		__func__, ia->ri_ops->ro_displayname);
 
 	/* Else will do memory reg/dereg for each chunk */
 	ia->ri_memreg_strategy = memreg;
@@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
 	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
 	ep->rep_attr.qp_context = ep;
-	/* send_cq and recv_cq initialized below */
 	ep->rep_attr.srq = NULL;
 	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR: {
-		int depth = 7;
-
-		/* Add room for frmr register and invalidate WRs.
-		 * 1. FRMR reg WR for head
-		 * 2. FRMR invalidate WR for head
-		 * 3. N FRMR reg WRs for pagelist
-		 * 4. N FRMR invalidate WRs for pagelist
-		 * 5. FRMR reg WR for tail
-		 * 6. FRMR invalidate WR for tail
-		 * 7. The RDMA_SEND WR
-		 */
-
-		/* Calculate N if the device max FRMR depth is smaller than
-		 * RPCRDMA_MAX_DATA_SEGS.
-		 */
-		if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
-			int delta = RPCRDMA_MAX_DATA_SEGS -
-				    ia->ri_max_frmr_depth;
-
-			do {
-				depth += 2; /* FRMR reg + invalidate */
-				delta -= ia->ri_max_frmr_depth;
-			} while (delta > 0);
-
-		}
-		ep->rep_attr.cap.max_send_wr *= depth;
-		if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
-			cdata->max_requests = devattr->max_qp_wr / depth;
-			if (!cdata->max_requests)
-				return -EINVAL;
-			ep->rep_attr.cap.max_send_wr = cdata->max_requests *
-						       depth;
-		}
-		break;
-	}
-	default:
-		break;
-	}
+	rc = ia->ri_ops->ro_open(ia, ep, cdata);
+	if (rc)
+		return rc;
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
 	ep->rep_attr.cap.max_recv_sge = 1;
@@ -944,21 +895,9 @@ retry:
 		rpcrdma_ep_disconnect(ep, ia);
 		rpcrdma_flush_cqs(ep);
 
-		switch (ia->ri_memreg_strategy) {
-		case RPCRDMA_FRMR:
-			rpcrdma_reset_frmrs(ia);
-			break;
-		case RPCRDMA_MTHCAFMR:
-			rpcrdma_reset_fmrs(ia);
-			break;
-		case RPCRDMA_ALLPHYSICAL:
-			break;
-		default:
-			rc = -EIO;
-			goto out;
-		}
-
 		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+		ia->ri_ops->ro_reset(xprt);
+
 		id = rpcrdma_create_id(xprt, ia,
 				(struct sockaddr *)&xprt->rx_data.addr);
 		if (IS_ERR(id)) {
@@ -1123,91 +1062,6 @@ out:
 	return ERR_PTR(rc);
 }
 
-static int
-rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-	int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ;
-	struct ib_fmr_attr fmr_attr = {
-		.max_pages	= RPCRDMA_MAX_DATA_SEGS,
-		.max_maps	= 1,
-		.page_shift	= PAGE_SHIFT
-	};
-	struct rpcrdma_mw *r;
-	int i, rc;
-
-	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
-
-	while (i--) {
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (r == NULL)
-			return -ENOMEM;
-
-		r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr);
-		if (IS_ERR(r->r.fmr)) {
-			rc = PTR_ERR(r->r.fmr);
-			dprintk("RPC:       %s: ib_alloc_fmr failed %i\n",
-				__func__, rc);
-			goto out_free;
-		}
-
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-	return 0;
-
-out_free:
-	kfree(r);
-	return rc;
-}
-
-static int
-rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_frmr *f;
-	struct rpcrdma_mw *r;
-	int i, rc;
-
-	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
-
-	while (i--) {
-		r = kzalloc(sizeof(*r), GFP_KERNEL);
-		if (r == NULL)
-			return -ENOMEM;
-		f = &r->r.frmr;
-
-		f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-						ia->ri_max_frmr_depth);
-		if (IS_ERR(f->fr_mr)) {
-			rc = PTR_ERR(f->fr_mr);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_mr "
-				"failed %i\n", __func__, rc);
-			goto out_free;
-		}
-
-		f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
-							ia->ri_max_frmr_depth);
-		if (IS_ERR(f->fr_pgl)) {
-			rc = PTR_ERR(f->fr_pgl);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_page_list "
-				"failed %i\n", __func__, rc);
-
-			ib_dereg_mr(f->fr_mr);
-			goto out_free;
-		}
-
-		list_add(&r->mw_list, &buf->rb_mws);
-		list_add(&r->mw_all, &buf->rb_all);
-	}
-
-	return 0;
-
-out_free:
-	kfree(r);
-	return rc;
-}
-
 int
 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 {
@@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
 	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
 	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
 
-	INIT_LIST_HEAD(&buf->rb_mws);
-	INIT_LIST_HEAD(&buf->rb_all);
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_init_frmrs(ia, buf);
-		if (rc)
-			goto out;
-		break;
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_init_fmrs(ia, buf);
-		if (rc)
-			goto out;
-		break;
-	default:
-		break;
-	}
+	rc = ia->ri_ops->ro_init(r_xprt);
+	if (rc)
+		goto out;
 
 	for (i = 0; i < buf->rb_max_requests; i++) {
 		struct rpcrdma_req *req;
@@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
 	kfree(req);
 }
 
-static void
-rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-	int rc;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		list_del(&r->mw_list);
-
-		rc = ib_dealloc_fmr(r->r.fmr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dealloc_fmr failed %i\n",
-				__func__, rc);
-
-		kfree(r);
-	}
-}
-
-static void
-rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf)
-{
-	struct rpcrdma_mw *r;
-	int rc;
-
-	while (!list_empty(&buf->rb_all)) {
-		r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
-		list_del(&r->mw_all);
-		list_del(&r->mw_list);
-
-		rc = ib_dereg_mr(r->r.frmr.fr_mr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-				__func__, rc);
-		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-		kfree(r);
-	}
-}
-
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
@@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
 	}
 
-	switch (ia->ri_memreg_strategy) {
-	case RPCRDMA_FRMR:
-		rpcrdma_destroy_frmrs(buf);
-		break;
-	case RPCRDMA_MTHCAFMR:
-		rpcrdma_destroy_fmrs(buf);
-		break;
-	default:
-		break;
-	}
+	ia->ri_ops->ro_destroy(buf);
 
 	kfree(buf->rb_pool);
 }
 
-/* After a disconnect, unmap all FMRs.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_fmr_external().
- */
-static void
-rpcrdma_reset_fmrs(struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct list_head *pos;
-	struct rpcrdma_mw *r;
-	LIST_HEAD(l);
-	int rc;
-
-	list_for_each(pos, &buf->rb_all) {
-		r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-		INIT_LIST_HEAD(&l);
-		list_add(&r->r.fmr->list, &l);
-		rc = ib_unmap_fmr(&l);
-		if (rc)
-			dprintk("RPC:       %s: ib_unmap_fmr failed %i\n",
-				__func__, rc);
-	}
-}
-
-/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in
- * an unusable state. Find FRMRs in this state and dereg / reg
- * each.  FRMRs that are VALID and attached to an rpcrdma_req are
- * also torn down.
- *
- * This gives all in-use FRMRs a fresh rkey and leaves them INVALID.
- *
- * This is invoked only in the transport connect worker in order
- * to serialize with rpcrdma_register_frmr_external().
- */
-static void
-rpcrdma_reset_frmrs(struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_xprt *r_xprt =
-				container_of(ia, struct rpcrdma_xprt, rx_ia);
-	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
-	struct list_head *pos;
-	struct rpcrdma_mw *r;
-	int rc;
-
-	list_for_each(pos, &buf->rb_all) {
-		r = list_entry(pos, struct rpcrdma_mw, mw_all);
-
-		if (r->r.frmr.fr_state == FRMR_IS_INVALID)
-			continue;
-
-		rc = ib_dereg_mr(r->r.frmr.fr_mr);
-		if (rc)
-			dprintk("RPC:       %s: ib_dereg_mr failed %i\n",
-				__func__, rc);
-		ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
-
-		r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
-					ia->ri_max_frmr_depth);
-		if (IS_ERR(r->r.frmr.fr_mr)) {
-			rc = PTR_ERR(r->r.frmr.fr_mr);
-			dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
-				" failed %i\n", __func__, rc);
-			continue;
-		}
-		r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
-					ia->ri_id->device,
-					ia->ri_max_frmr_depth);
-		if (IS_ERR(r->r.frmr.fr_pgl)) {
-			rc = PTR_ERR(r->r.frmr.fr_pgl);
-			dprintk("RPC:       %s: "
-				"ib_alloc_fast_reg_page_list "
-				"failed %i\n", __func__, rc);
-
-			ib_dereg_mr(r->r.frmr.fr_mr);
-			continue;
-		}
-		r->r.frmr.fr_state = FRMR_IS_INVALID;
-	}
-}
-
 /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
  * some req segments uninitialized.
  */
@@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
 	}
 }
 
-/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external().
+/* rpcrdma_unmap_one() was already done during deregistration.
  * Redo only the ib_post_send().
  */
 static void
@@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
  */
 
+void
+rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
+{
+	dprintk("RPC:       map_one: offset %p iova %llx len %zu\n",
+		seg->mr_offset,
+		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
+}
+
 static int
 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
 				struct ib_mr **mrp, struct ib_sge *iov)
@@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
 }
 
 /*
- * Wrappers for chunk registration, shared by read/write chunk code.
- */
-
-static void
-rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
-{
-	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
-	seg->mr_dmalen = seg->mr_len;
-	if (seg->mr_page)
-		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
-				seg->mr_page, offset_in_page(seg->mr_offset),
-				seg->mr_dmalen, seg->mr_dir);
-	else
-		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
-				seg->mr_offset,
-				seg->mr_dmalen, seg->mr_dir);
-	if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
-		dprintk("RPC:       %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
-			__func__,
-			(unsigned long long)seg->mr_dma,
-			seg->mr_offset, seg->mr_dmalen);
-	}
-}
-
-static void
-rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
-{
-	if (seg->mr_page)
-		ib_dma_unmap_page(ia->ri_id->device,
-				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-	else
-		ib_dma_unmap_single(ia->ri_id->device,
-				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
-}
-
-static int
-rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
-			int *nsegs, int writing, struct rpcrdma_ia *ia,
-			struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct rpcrdma_mw *mw = seg1->rl_mw;
-	struct rpcrdma_frmr *frmr = &mw->r.frmr;
-	struct ib_mr *mr = frmr->fr_mr;
-	struct ib_send_wr fastreg_wr, *bad_wr;
-	u8 key;
-	int len, pageoff;
-	int i, rc;
-	int seg_len;
-	u64 pa;
-	int page_no;
-
-	pageoff = offset_in_page(seg1->mr_offset);
-	seg1->mr_offset -= pageoff;	/* start of page */
-	seg1->mr_len += pageoff;
-	len = -pageoff;
-	if (*nsegs > ia->ri_max_frmr_depth)
-		*nsegs = ia->ri_max_frmr_depth;
-	for (page_no = i = 0; i < *nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
-		pa = seg->mr_dma;
-		for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
-			frmr->fr_pgl->page_list[page_no++] = pa;
-			pa += PAGE_SIZE;
-		}
-		len += seg->mr_len;
-		++seg;
-		++i;
-		/* Check for holes */
-		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-			break;
-	}
-	dprintk("RPC:       %s: Using frmr %p to map %d segments\n",
-		__func__, mw, i);
-
-	frmr->fr_state = FRMR_IS_VALID;
-
-	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
-	fastreg_wr.wr_id = (unsigned long)(void *)mw;
-	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
-	fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma;
-	fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
-	fastreg_wr.wr.fast_reg.page_list_len = page_no;
-	fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
-	fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
-	if (fastreg_wr.wr.fast_reg.length < len) {
-		rc = -EIO;
-		goto out_err;
-	}
-
-	/* Bump the key */
-	key = (u8)(mr->rkey & 0x000000FF);
-	ib_update_fast_reg_key(mr, ++key);
-
-	fastreg_wr.wr.fast_reg.access_flags = (writing ?
-				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-				IB_ACCESS_REMOTE_READ);
-	fastreg_wr.wr.fast_reg.rkey = mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
-
-	rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
-	if (rc) {
-		dprintk("RPC:       %s: failed ib_post_send for register,"
-			" status %i\n", __func__, rc);
-		ib_update_fast_reg_key(mr, --key);
-		goto out_err;
-	} else {
-		seg1->mr_rkey = mr->rkey;
-		seg1->mr_base = seg1->mr_dma + pageoff;
-		seg1->mr_nsegs = i;
-		seg1->mr_len = len;
-	}
-	*nsegs = i;
-	return 0;
-out_err:
-	frmr->fr_state = FRMR_IS_INVALID;
-	while (i--)
-		rpcrdma_unmap_one(ia, --seg);
-	return rc;
-}
-
-static int
-rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
-			struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	struct ib_send_wr invalidate_wr, *bad_wr;
-	int rc;
-
-	seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
-
-	memset(&invalidate_wr, 0, sizeof invalidate_wr);
-	invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
-	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
-	DECR_CQCOUNT(&r_xprt->rx_ep);
-
-	read_lock(&ia->ri_qplock);
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
-	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
-	read_unlock(&ia->ri_qplock);
-	if (rc) {
-		/* Force rpcrdma_buffer_get() to retry */
-		seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
-		dprintk("RPC:       %s: failed ib_post_send for invalidate,"
-			" status %i\n", __func__, rc);
-	}
-	return rc;
-}
-
-static int
-rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
-			int *nsegs, int writing, struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
-	int len, pageoff, i, rc;
-
-	pageoff = offset_in_page(seg1->mr_offset);
-	seg1->mr_offset -= pageoff;	/* start of page */
-	seg1->mr_len += pageoff;
-	len = -pageoff;
-	if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
-		*nsegs = RPCRDMA_MAX_DATA_SEGS;
-	for (i = 0; i < *nsegs;) {
-		rpcrdma_map_one(ia, seg, writing);
-		physaddrs[i] = seg->mr_dma;
-		len += seg->mr_len;
-		++seg;
-		++i;
-		/* Check for holes */
-		if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
-		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
-			break;
-	}
-	rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma);
-	if (rc) {
-		dprintk("RPC:       %s: failed ib_map_phys_fmr "
-			"%u@0x%llx+%i (%d)... status %i\n", __func__,
-			len, (unsigned long long)seg1->mr_dma,
-			pageoff, i, rc);
-		while (i--)
-			rpcrdma_unmap_one(ia, --seg);
-	} else {
-		seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey;
-		seg1->mr_base = seg1->mr_dma + pageoff;
-		seg1->mr_nsegs = i;
-		seg1->mr_len = len;
-	}
-	*nsegs = i;
-	return rc;
-}
-
-static int
-rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
-			struct rpcrdma_ia *ia)
-{
-	struct rpcrdma_mr_seg *seg1 = seg;
-	LIST_HEAD(l);
-	int rc;
-
-	list_add(&seg1->rl_mw->r.fmr->list, &l);
-	rc = ib_unmap_fmr(&l);
-	read_lock(&ia->ri_qplock);
-	while (seg1->mr_nsegs--)
-		rpcrdma_unmap_one(ia, seg++);
-	read_unlock(&ia->ri_qplock);
-	if (rc)
-		dprintk("RPC:       %s: failed ib_unmap_fmr,"
-			" status %i\n", __func__, rc);
-	return rc;
-}
-
-int
-rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
-			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int rc = 0;
-
-	switch (ia->ri_memreg_strategy) {
-
-	case RPCRDMA_ALLPHYSICAL:
-		rpcrdma_map_one(ia, seg, writing);
-		seg->mr_rkey = ia->ri_bind_mem->rkey;
-		seg->mr_base = seg->mr_dma;
-		seg->mr_nsegs = 1;
-		nsegs = 1;
-		break;
-
-	/* Registration using frmr registration */
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
-		break;
-
-	/* Registration using fmr memory registration */
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
-		break;
-
-	default:
-		return -EIO;
-	}
-	if (rc)
-		return rc;
-
-	return nsegs;
-}
-
-int
-rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
-		struct rpcrdma_xprt *r_xprt)
-{
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	int nsegs = seg->mr_nsegs, rc;
-
-	switch (ia->ri_memreg_strategy) {
-
-	case RPCRDMA_ALLPHYSICAL:
-		read_lock(&ia->ri_qplock);
-		rpcrdma_unmap_one(ia, seg);
-		read_unlock(&ia->ri_qplock);
-		break;
-
-	case RPCRDMA_FRMR:
-		rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
-		break;
-
-	case RPCRDMA_MTHCAFMR:
-		rc = rpcrdma_deregister_fmr_external(seg, ia);
-		break;
-
-	default:
-		break;
-	}
-	return nsegs;
-}
-
-/*
  * Prepost any receive buffer, then post send.
  *
  * Receive buffer is donated to hardware, reclaimed upon recv completion.
@@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 	}
 
 	send_wr.next = NULL;
-	send_wr.wr_id = 0ULL;	/* no send cookie */
+	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
 	send_wr.sg_list = req->rl_send_iov;
 	send_wr.num_sge = req->rl_niovs;
 	send_wr.opcode = IB_WR_SEND;
@@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
 	return rc;
 }
 
-/* Physical mapping means one Read/Write list entry per-page.
- * All list entries must fit within an inline buffer
- *
- * NB: The server must return a Write list for NFS READ,
- *     which has the same constraint. Factor in the inline
- *     rsize as well.
+/* How many chunk list items fit within our inline buffers?
  */
-static size_t
-rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt)
+unsigned int
+rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
 {
 	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
-	unsigned int inline_size, pages;
+	int bytes, segments;
 
-	inline_size = min_t(unsigned int,
-			    cdata->inline_wsize, cdata->inline_rsize);
-	inline_size -= RPCRDMA_HDRLEN_MIN;
-	pages = inline_size / sizeof(struct rpcrdma_segment);
-	return pages << PAGE_SHIFT;
-}
-
-static size_t
-rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
-}
-
-size_t
-rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt)
-{
-	size_t result;
-
-	switch (r_xprt->rx_ia.ri_memreg_strategy) {
-	case RPCRDMA_ALLPHYSICAL:
-		result = rpcrdma_physical_max_payload(r_xprt);
-		break;
-	default:
-		result = rpcrdma_mr_max_payload(r_xprt);
+	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
+	bytes -= RPCRDMA_HDRLEN_MIN;
+	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
+		pr_warn("RPC:       %s: inline threshold too small\n",
+			__func__);
+		return 0;
 	}
-	return result;
+
+	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
+	dprintk("RPC:       %s: max chunk list size = %d segments\n",
+		__func__, segments);
+	return segments;
 }
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 0a16fb6f0885..78e0b8beaa36 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -60,6 +60,7 @@
  * Interface Adapter -- one per transport instance
  */
 struct rpcrdma_ia {
+	const struct rpcrdma_memreg_ops	*ri_ops;
 	rwlock_t		ri_qplock;
 	struct rdma_cm_id 	*ri_id;
 	struct ib_pd		*ri_pd;
@@ -105,6 +106,10 @@ struct rpcrdma_ep {
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
+/* Force completion handler to ignore the signal
+ */
+#define RPCRDMA_IGNORE_COMPLETION	(0ULL)
+
 /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
  *
  * The below structure appears at the front of a large region of kmalloc'd
@@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 	return (struct rpcrdma_msg *)rb->rg_base;
 }
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
  * and complete a reply, asychronously. It needs several pieces of
@@ -213,6 +210,7 @@ struct rpcrdma_mw {
 		struct ib_fmr		*fmr;
 		struct rpcrdma_frmr	frmr;
 	} r;
+	void			(*mw_sendcompletion)(struct ib_wc *);
 	struct list_head	mw_list;
 	struct list_head	mw_all;
 };
@@ -258,7 +256,6 @@ struct rpcrdma_req {
 	unsigned int	rl_niovs;	/* 0, 2 or 4 */
 	unsigned int	rl_nchunks;	/* non-zero if chunks */
 	unsigned int	rl_connect_cookie;	/* retry detection */
-	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
 	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
 	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
 	struct ib_sge	rl_send_iov[4];	/* for active requests */
@@ -340,6 +337,29 @@ struct rpcrdma_stats {
 };
 
 /*
+ * Per-registration mode operations
+ */
+struct rpcrdma_xprt;
+struct rpcrdma_memreg_ops {
+	int		(*ro_map)(struct rpcrdma_xprt *,
+				  struct rpcrdma_mr_seg *, int, bool);
+	int		(*ro_unmap)(struct rpcrdma_xprt *,
+				    struct rpcrdma_mr_seg *);
+	int		(*ro_open)(struct rpcrdma_ia *,
+				   struct rpcrdma_ep *,
+				   struct rpcrdma_create_data_internal *);
+	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
+	int		(*ro_init)(struct rpcrdma_xprt *);
+	void		(*ro_reset)(struct rpcrdma_xprt *);
+	void		(*ro_destroy)(struct rpcrdma_buffer *);
+	const char	*ro_displayname;
+};
+
+extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
+extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;
+
+/*
  * RPCRDMA transport -- encapsulates the structures above for
  * integration with RPC.
  *
@@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
-int rpcrdma_register_external(struct rpcrdma_mr_seg *,
-				int, int, struct rpcrdma_xprt *);
-int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
-				struct rpcrdma_xprt *);
-
 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
 					    size_t, gfp_t);
 void rpcrdma_free_regbuf(struct rpcrdma_ia *,
 			 struct rpcrdma_regbuf *);
 
+unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+static inline void
+rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
+		enum dma_data_direction direction)
+{
+	seg->mr_dir = direction;
+	seg->mr_dmalen = seg->mr_len;
+
+	if (seg->mr_page)
+		seg->mr_dma = ib_dma_map_page(device,
+				seg->mr_page, offset_in_page(seg->mr_offset),
+				seg->mr_dmalen, seg->mr_dir);
+	else
+		seg->mr_dma = ib_dma_map_single(device,
+				seg->mr_offset,
+				seg->mr_dmalen, seg->mr_dir);
+
+	if (ib_dma_mapping_error(device, seg->mr_dma))
+		rpcrdma_mapping_error(seg);
+}
+
+static inline void
+rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
+{
+	if (seg->mr_page)
+		ib_dma_unmap_page(device,
+				  seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+	else
+		ib_dma_unmap_single(device,
+				    seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
+}
+
 /*
  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
  */
@@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *);
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
-ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-size_t rpcrdma_max_payload(struct rpcrdma_xprt *);
 
 /* Temporary NFS request map cache. Created in svc_rdma.c  */
 extern struct kmem_cache *svc_rdma_map_cachep;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 3613e72e858e..70e3dacbf84a 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -591,14 +591,14 @@ void tipc_bearer_stop(struct net *net)
 
 /* Caller should hold rtnl_lock to protect the bearer */
 static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg,
-				struct tipc_bearer *bearer)
+				struct tipc_bearer *bearer, int nlflags)
 {
 	void *hdr;
 	struct nlattr *attrs;
 	struct nlattr *prop;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_BEARER_GET);
+			  nlflags, TIPC_NL_BEARER_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -657,7 +657,7 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (!bearer)
 			continue;
 
-		err = __tipc_nl_add_bearer(&msg, bearer);
+		err = __tipc_nl_add_bearer(&msg, bearer, NLM_F_MULTI);
 		if (err)
 			break;
 	}
@@ -705,7 +705,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_bearer(&msg, bearer);
+	err = __tipc_nl_add_bearer(&msg, bearer, 0);
 	if (err)
 		goto err_out;
 	rtnl_unlock();
@@ -857,14 +857,14 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 }
 
 static int __tipc_nl_add_media(struct tipc_nl_msg *msg,
-			       struct tipc_media *media)
+			       struct tipc_media *media, int nlflags)
 {
 	void *hdr;
 	struct nlattr *attrs;
 	struct nlattr *prop;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_MEDIA_GET);
+			  nlflags, TIPC_NL_MEDIA_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -916,7 +916,8 @@ int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 	rtnl_lock();
 	for (; media_info_array[i] != NULL; i++) {
-		err = __tipc_nl_add_media(&msg, media_info_array[i]);
+		err = __tipc_nl_add_media(&msg, media_info_array[i],
+					  NLM_F_MULTI);
 		if (err)
 			break;
 	}
@@ -963,7 +964,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_media(&msg, media);
+	err = __tipc_nl_add_media(&msg, media, 0);
 	if (err)
 		goto err_out;
 	rtnl_unlock();
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a6b30df6ec02..43a515dc97b0 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1145,11 +1145,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
 		}
 		/* Synchronize with parallel link if applicable */
 		if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) {
-			link_handle_out_of_seq_msg(l_ptr, skb);
-			if (link_synch(l_ptr))
-				link_retrieve_defq(l_ptr, &head);
-			skb = NULL;
-			goto unlock;
+			if (!link_synch(l_ptr))
+				goto unlock;
 		}
 		l_ptr->next_in_no++;
 		if (unlikely(!skb_queue_empty(&l_ptr->deferdq)))
@@ -2013,7 +2010,7 @@ msg_full:
 
 /* Caller should hold appropriate locks to protect the link */
 static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
-			      struct tipc_link *link)
+			      struct tipc_link *link, int nlflags)
 {
 	int err;
 	void *hdr;
@@ -2022,7 +2019,7 @@ static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
-			  NLM_F_MULTI, TIPC_NL_LINK_GET);
+			  nlflags, TIPC_NL_LINK_GET);
 	if (!hdr)
 		return -EMSGSIZE;
 
@@ -2095,7 +2092,7 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
 		if (!node->links[i])
 			continue;
 
-		err = __tipc_nl_add_link(net, msg, node->links[i]);
+		err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI);
 		if (err)
 			return err;
 	}
@@ -2143,7 +2140,6 @@ int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			err = __tipc_nl_add_node_links(net, &msg, node,
 						       &prev_link);
 			tipc_node_unlock(node);
-			tipc_node_put(node);
 			if (err)
 				goto out;
 
@@ -2210,7 +2206,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
 		goto err_out;
 	}
 
-	err = __tipc_nl_add_link(net, &msg, link);
+	err = __tipc_nl_add_link(net, &msg, link, 0);
 	if (err)
 		goto err_out;
 
diff --git a/net/tipc/server.c b/net/tipc/server.c
index ab6183cdb121..77ff03ed1e18 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -102,7 +102,7 @@ static void tipc_conn_kref_release(struct kref *kref)
 		}
 		saddr->scope = -TIPC_NODE_SCOPE;
 		kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
-		sk_release_kernel(sk);
+		sock_release(sock);
 		con->sock = NULL;
 	}
 
@@ -321,12 +321,9 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
 	struct socket *sock = NULL;
 	int ret;
 
-	ret = sock_create_kern(AF_TIPC, SOCK_SEQPACKET, 0, &sock);
+	ret = __sock_create(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock, 1);
 	if (ret < 0)
 		return NULL;
-
-	sk_change_net(sock->sk, s->net);
-
 	ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
 				(char *)&s->imp, sizeof(s->imp));
 	if (ret < 0)
@@ -376,7 +373,7 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
 
 create_err:
 	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sk_release_kernel(sock->sk);
+	sock_release(sock);
 	return NULL;
 }
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index ee90d74d7516..9074b5cede38 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1764,13 +1764,14 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
 int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq)
 {
 	u32 dnode, dport = 0;
-	int err = -TIPC_ERR_NO_PORT;
+	int err;
 	struct sk_buff *skb;
 	struct tipc_sock *tsk;
 	struct tipc_net *tn;
 	struct sock *sk;
 
 	while (skb_queue_len(inputq)) {
+		err = -TIPC_ERR_NO_PORT;
 		skb = NULL;
 		dport = tipc_skb_peek_port(inputq, dport);
 		tsk = tipc_sk_lookup(net, dport);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 433f287ee548..5266ea7b922b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -305,7 +305,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
-		if (dentry && dentry->d_inode == i) {
+		if (dentry && d_backing_inode(dentry) == i) {
 			sock_hold(s);
 			goto found;
 		}
@@ -778,7 +778,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 		if (err)
 			goto fail;
-		inode = path.dentry->d_inode;
+		inode = d_backing_inode(path.dentry);
 		err = inode_permission(inode, MAY_WRITE);
 		if (err)
 			goto put_fail;
@@ -839,7 +839,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	 */
 	err = security_path_mknod(&path, dentry, mode, 0);
 	if (!err) {
-		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 		if (!err) {
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
@@ -905,7 +905,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 			goto out_up;
 		}
 		addr->hash = UNIX_HASH_SIZE;
-		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+		hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
 		spin_lock(&unix_table_lock);
 		u->path = path;
 		list = &unix_socket_table[hash];
diff --git a/net/unix/diag.c b/net/unix/diag.c
index ef542fbca9fe..c512f64d5287 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -25,7 +25,7 @@ static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
 
 	if (dentry) {
 		struct unix_diag_vfs uv = {
-			.udiag_vfs_ino = dentry->d_inode->i_ino,
+			.udiag_vfs_ino = d_backing_inode(dentry)->i_ino,
 			.udiag_vfs_dev = dentry->d_sb->s_dev,
 		};
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 99f7012b23b9..a73a226f2d33 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -95,39 +95,36 @@ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
 
 unsigned int unix_tot_inflight;
 
-
 struct sock *unix_get_socket(struct file *filp)
 {
 	struct sock *u_sock = NULL;
 	struct inode *inode = file_inode(filp);
 
-	/*
-	 *	Socket ?
-	 */
+	/* Socket ? */
 	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
 		struct socket *sock = SOCKET_I(inode);
 		struct sock *s = sock->sk;
 
-		/*
-		 *	PF_UNIX ?
-		 */
+		/* PF_UNIX ? */
 		if (s && sock->ops && sock->ops->family == PF_UNIX)
 			u_sock = s;
 	}
 	return u_sock;
 }
 
-/*
- *	Keep the number of times in flight count for the file
- *	descriptor if it is for an AF_UNIX socket.
+/* Keep the number of times in flight count for the file
+ * descriptor if it is for an AF_UNIX socket.
  */
 
 void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
+
 		spin_lock(&unix_gc_lock);
+
 		if (atomic_long_inc_return(&u->inflight) == 1) {
 			BUG_ON(!list_empty(&u->link));
 			list_add_tail(&u->link, &gc_inflight_list);
@@ -142,10 +139,13 @@ void unix_inflight(struct file *fp)
 void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
+
 	if (s) {
 		struct unix_sock *u = unix_sk(s);
+
 		spin_lock(&unix_gc_lock);
 		BUG_ON(list_empty(&u->link));
+
 		if (atomic_long_dec_and_test(&u->inflight))
 			list_del_init(&u->link);
 		unix_tot_inflight--;
@@ -161,32 +161,27 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 
 	spin_lock(&x->sk_receive_queue.lock);
 	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-		/*
-		 *	Do we have file descriptors ?
-		 */
+		/* Do we have file descriptors ? */
 		if (UNIXCB(skb).fp) {
 			bool hit = false;
-			/*
-			 *	Process the descriptors of this socket
-			 */
+			/* Process the descriptors of this socket */
 			int nfd = UNIXCB(skb).fp->count;
 			struct file **fp = UNIXCB(skb).fp->fp;
+
 			while (nfd--) {
-				/*
-				 *	Get the socket the fd matches
-				 *	if it indeed does so
-				 */
+				/* Get the socket the fd matches if it indeed does so */
 				struct sock *sk = unix_get_socket(*fp++);
+
 				if (sk) {
 					struct unix_sock *u = unix_sk(sk);
 
-					/*
-					 * Ignore non-candidates, they could
+					/* Ignore non-candidates, they could
 					 * have been added to the queues after
 					 * starting the garbage collection
 					 */
 					if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
 						hit = true;
+
 						func(u);
 					}
 				}
@@ -203,24 +198,22 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
 static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
 			  struct sk_buff_head *hitlist)
 {
-	if (x->sk_state != TCP_LISTEN)
+	if (x->sk_state != TCP_LISTEN) {
 		scan_inflight(x, func, hitlist);
-	else {
+	} else {
 		struct sk_buff *skb;
 		struct sk_buff *next;
 		struct unix_sock *u;
 		LIST_HEAD(embryos);
 
-		/*
-		 * For a listening socket collect the queued embryos
+		/* For a listening socket collect the queued embryos
 		 * and perform a scan on them as well.
 		 */
 		spin_lock(&x->sk_receive_queue.lock);
 		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
 			u = unix_sk(skb->sk);
 
-			/*
-			 * An embryo cannot be in-flight, so it's safe
+			/* An embryo cannot be in-flight, so it's safe
 			 * to use the list link.
 			 */
 			BUG_ON(!list_empty(&u->link));
@@ -249,8 +242,7 @@ static void inc_inflight(struct unix_sock *usk)
 static void inc_inflight_move_tail(struct unix_sock *u)
 {
 	atomic_long_inc(&u->inflight);
-	/*
-	 * If this still might be part of a cycle, move it to the end
+	/* If this still might be part of a cycle, move it to the end
 	 * of the list, so that it's checked even if it was already
 	 * passed over
 	 */
@@ -263,8 +255,7 @@ static bool gc_in_progress;
 
 void wait_for_unix_gc(void)
 {
-	/*
-	 * If number of inflight sockets is insane,
+	/* If number of inflight sockets is insane,
 	 * force a garbage collect right now.
 	 */
 	if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
@@ -288,8 +279,7 @@ void unix_gc(void)
 		goto out;
 
 	gc_in_progress = true;
-	/*
-	 * First, select candidates for garbage collection.  Only
+	/* First, select candidates for garbage collection.  Only
 	 * in-flight sockets are considered, and from those only ones
 	 * which don't have any external reference.
 	 *
@@ -320,15 +310,13 @@ void unix_gc(void)
 		}
 	}
 
-	/*
-	 * Now remove all internal in-flight reference to children of
+	/* Now remove all internal in-flight reference to children of
 	 * the candidates.
 	 */
 	list_for_each_entry(u, &gc_candidates, link)
 		scan_children(&u->sk, dec_inflight, NULL);
 
-	/*
-	 * Restore the references for children of all candidates,
+	/* Restore the references for children of all candidates,
 	 * which have remaining references.  Do this recursively, so
 	 * only those remain, which form cyclic references.
 	 *
@@ -350,8 +338,7 @@ void unix_gc(void)
 	}
 	list_del(&cursor);
 
-	/*
-	 * not_cycle_list contains those sockets which do not make up a
+	/* not_cycle_list contains those sockets which do not make up a
 	 * cycle.  Restore these to the inflight list.
 	 */
 	while (!list_empty(&not_cycle_list)) {
@@ -360,8 +347,7 @@ void unix_gc(void)
 		list_move_tail(&u->link, &gc_inflight_list);
 	}
 
-	/*
-	 * Now gc_candidates contains only garbage.  Restore original
+	/* Now gc_candidates contains only garbage.  Restore original
 	 * inflight counters for these as well, and remove the skbuffs
 	 * which are creating the cycle(s).
 	 */